<a href="https://colab.research.google.com/github/mille-s/GEM24_D2T_StratifiedSampling/blob/main/GEM24_D2T_utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# D2T data selection

## Prepare repo

In [None]:
#@title Install packages

# clear_output is called at the end of this cell to wipe the pip logs from the cell output.
from IPython.display import clear_output
# In the code shown in this notebook, `datasets` is only referenced by the commented-out HuggingFace cell;
# `json2html` is used by the cell that renders the HTML tables.
! pip install datasets
! pip install json2html

clear_output()

In [None]:
#@title Import required libraries

import os
import xml.etree.ElementTree as ET
from sklearn.model_selection import train_test_split
import pandas as pd
import csv
import random
import json

## Set parameters

In [None]:
#@title Filepath definition (upload file(s) to the testdata folder after running!)

# Root folder of the project; use the commented-out line instead to work from Google Drive.
# project_dir_path = os.path.join('/', 'content', 'drive', 'MyDrive', 'WebNLG_data_selection')
project_dir_path = '/content'
# Folder the input XML files are read from
rdf_path = os.path.join(project_dir_path, 'testdata')
# Folder the sampling CSV files are written to
csv_path = os.path.join(project_dir_path, 'csv_sampling')

subtask = 'D2T-1'#@param['D2T-1', 'D2T-2']
dataset = 'CFA'#@param['CFA', 'FA', 'FI']
# seed used for GEM'24: 49
seed = 49#@param
# Make sure the seed is an int, whatever was entered in the form field
seed = int(seed)
# e.g. 'D2T-1-CFA'; used to select the input files and to name the output files
datacode = f'{subtask}-{dataset}'

output_path = os.path.join(csv_path, f'{datacode}_samplingData.csv')

# exist_ok=True makes the cell safe to re-run without checking for the folders first
for folder in (rdf_path, csv_path):
  os.makedirs(folder, exist_ok=True)

## Create csv file with sampling info. Run once for each file.

In [None]:
#@title Function for sampling.

def extract_data(rdf_filepath, stratify_categories, exclude_size, data_code=None):

  '''
      Collect the entries of the XML input file(s) and prepare them for stratified sampling.

      This method:
      a. extracts the required fields (RDF triple(s), number of triples, properties and category) from every
         '.xml' file of rdf_filepath whose name contains the data code.
      b. labels each entry as seen/unseen according to a fixed list of categories
         (presumably the categories that are absent from the training set; confirm against the task data).
      c. builds the 'strat_field' key (number of triples, optionally combined with a category label) used for stratified selection.
      d. drops the entries whose 'strat_field' value occurs only once.

      Args:
        rdf_filepath: folder that contains the XML files.
        stratify_categories: 'seenUnseen', 'allCategories' or 'ignoreCategories'.
        exclude_size: 'none', '1 only' or '1 and 2'; the input sizes to leave out of the sampling pool.
        data_code: substring that the file names must contain; defaults to the notebook-level `datacode`.

      Returns:
        A list of dicts with the keys 'id', 'original_id', 'triples', 'property', 'num_triples', 'category',
        'category_all' and 'strat_field'. 'original_id' numbers all the entries read starting from 1, whereas
        'id' numbers only the entries that passed the size filter, starting from 0.

      Raises:
        ValueError: if stratify_categories or exclude_size is not one of the supported values.
  '''
  from collections import Counter

  # Largest excluded size for each exclude_size option (None = keep every size)
  largest_excluded_size = {'none': None, '1 only': 1, '1 and 2': 2}
  supported_stratifications = ('seenUnseen', 'allCategories', 'ignoreCategories')
  # Fail early: an unknown stratification used to surface later as a KeyError on 'strat_field',
  # and an unknown exclude_size used to silently drop every entry
  if stratify_categories not in supported_stratifications:
    raise ValueError(f"Unknown stratify_categories {stratify_categories!r}; expected one of {supported_stratifications}")
  if exclude_size not in largest_excluded_size:
    raise ValueError(f"Unknown exclude_size {exclude_size!r}; expected one of {tuple(largest_excluded_size)}")
  if data_code is None:
    data_code = datacode
  size_threshold = largest_excluded_size[exclude_size]
  unseen_categories = {'Athlete', 'Artist', 'CelestialBody', 'MeanOfTransportation', 'Politician'}

  data = []
  count = 0
  original_id = 1
  # os.listdir returns the names in arbitrary order; sorting keeps the ids reproducible when several files match
  for filename in sorted(os.listdir(rdf_filepath)):
    if not (filename.endswith('.xml') and data_code in filename):
      continue
    root = ET.parse(os.path.join(rdf_filepath, filename)).getroot()

    for entry in root.findall('./entries/entry'):
      num_triples = int(entry.attrib['size'])
      if size_threshold is None or num_triples > size_threshold:
        # extract triples
        triples = [triple.text for triple in entry.find('modifiedtripleset').findall('mtriple')]
        # The property is the second '|'-separated field; whitespace around the separator is kept as is
        properties = [str_triple.split('|')[1] for str_triple in triples]
        category_all = entry.attrib['category']
        category = 'unseen' if category_all in unseen_categories else 'seen'
        if stratify_categories == 'seenUnseen':
          strat_label = category
        elif stratify_categories == 'allCategories':
          strat_label = category_all
        else:
          strat_label = ''
        data.append({
            'id': count,
            'original_id': original_id,
            'triples': triples,
            'property': properties,
            'num_triples': num_triples,
            'category': category,
            'category_all': category_all,
            'strat_field': str(num_triples) + strat_label,
        })
        count += 1
      # original_id counts every entry, including the ones excluded by size
      original_id += 1

  # Remove data points for which there is only one member in a stratify category (triggers an error when stratifying, needs 2 members min)
  strat_field_counts = Counter(datapoint['strat_field'] for datapoint in data)
  clean_data = []
  for datapoint in data:
    if strat_field_counts[datapoint['strat_field']] == 1:
      print(f"  Removed datapoint  {datapoint['strat_field']} because there is only one member!")
    else:
      clean_data.append(datapoint)

  return clean_data

In [None]:
#@title Sampling parameters

# How the strata are built: number of triples combined with the full category ('allCategories'),
# with the seen/unseen label ('seenUnseen'), or number of triples alone ('ignoreCategories')
stratify_categories = 'ignoreCategories'#@param['allCategories', 'seenUnseen', 'ignoreCategories']
# Requested number of samples; stored as a string here and converted to int on the next line
number_samples = "180"#@param[50, 100, 120, 150, 175, 180, 200, 300, 400, 500]
num_samples = int(number_samples)
# Input sizes (number of triples) to leave out of the sampling pool
exclude_size = '1 only'#@param['none', '1 only', '1 and 2']
# Get data
data=extract_data(rdf_path, stratify_categories, exclude_size)

In [None]:
#@title Stratified selection using train_test_split

# tset = pd.DataFrame.from_dict(data)

# X_train, X_test, = train_test_split(tset, test_size=num_samples, random_state=seed, stratify=tset['strat_field'], shuffle=True)
# print(len(X_train), len(X_test))


In [None]:
#@title Balanced selection using groupby

tset = pd.DataFrame.from_dict(data)

# Number of inputs drawn from every stratum, i.e. from every distinct value of 'strat_field'.
# NOTE: the total sample size is this number times the number of strata; num_samples is not used here.
samples_per_stratum = 30

# Fail early with an explicit message: an empty table has no 'strat_field' column, and
# groupby(...).sample(n=...) raises a generic ValueError as soon as one stratum has fewer than n members.
if tset.empty:
  raise ValueError('No data to sample from: check the input files and the sampling parameters.')
stratum_sizes = tset['strat_field'].value_counts()
too_small_strata = stratum_sizes[stratum_sizes < samples_per_stratum]
if not too_small_strata.empty:
  raise ValueError(f'Cannot draw {samples_per_stratum} samples from strata with fewer members: {too_small_strata.to_dict()}')

# Alternatives that were considered; the first one does not take a random_state, the second one does:
# X_test = tset.groupby(tset['strat_field']).apply(lambda s: s.sample(30))
# X_test = tset.sample(frac = 1.0, random_state=seed, axis=0).groupby(tset['strat_field']).head(30)
# groupby(...).sample(...) is the one used: it takes the random_state directly and draws within each group
X_test = tset.groupby(by=tset['strat_field']).sample(n = samples_per_stratum, random_state=seed)

print(X_test)

In [None]:
#@title Print some numbers
# tset['num_triples']
# len(tset.loc[tset['category'] == 'unseen'])
# print(X_test['num_triples'])

# Average number of triples per sampled input, rounded to two decimals
# (column selection with .loc: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)
# print(X_test.loc[:, 'num_triples'].mean())
mean_num_triples = round(X_test['num_triples'].mean(), 2)
print(f"{mean_num_triples} triples per input on average")

def count_num_instances(pd_column):
  '''
      Print how many times each distinct value occurs in pd_column.

      One 'value<TAB>count' line is printed per distinct value, in sorted order of the values,
      followed by a separator line. Nothing is returned.
  '''
  tally = {}
  for value in pd_column:
    tally[value] = tally.get(value, 0) + 1

  for value in sorted(tally):
    print(value, tally[value], sep='\t')
  print('-----------------')

# Distribution of the sample over the input sizes, then over the categories
for column_name in ('num_triples', 'category_all'):
  count_num_instances(X_test[column_name])

In [None]:
#@title Play around with groupby
# df_test_gb = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
#                               'Parrot', 'Parrot'],
#                    'Max Speed': [380., 370., 24., 26.]})
# print(df_test_gb)

# print(df_test_gb.groupby(['Animal']).mean())

In [None]:
#@title Create CSV file
# Write the sampled rows to the CSV path defined in the parameters cell, without the dataframe index column
X_test.to_csv(path_or_buf=output_path, index=False)

# Create HTML tables for inputs. Run once for each file.

In [None]:
#@title Generate HTML tables from pandas dataframe (running the code above before)
from json2html import *
import json
import codecs
import os

# Folder that receives one HTML file per sampled input
path_out_pd = '/content/tables_pd'
os.makedirs(path_out_pd, exist_ok=True)

# Walk through the two columns together instead of looking each cell up by index label
for triple_set, orig_id in zip(X_test['triples'], X_test['original_id']):
  # One table row (Subject / Property / Object) per triple of the input
  list_dico_input = []
  for triple in triple_set:
    triple_parts = triple.split(' | ')
    list_dico_input.append({
        'Subject': triple_parts[0],
        'Property': triple_parts[1],
        'Object': triple_parts[2],
    })
  # orig_id starts numbering at 1, while the lists of outputs texts will start numbering at 0, so we need to remove 1 from the original ID to maintain alignment with output files lines
  table_filename = f'{datacode}_{str(orig_id-1).rjust(4, "0")}.html'
  with open(os.path.join(path_out_pd, table_filename), 'w', encoding='utf-8') as fo:
    fo.write(json2html.convert(json = list_dico_input))

In [None]:
#@title Generate HTML tables from HuggingFace data

# from datasets import load_dataset
# from json2html import *
# import codecs
# import os
# import re
# import json

# # struct2text: common_gen, cs_restaurants, dart, e2e_nlg, totto, web_nlg_en, web_nlg_ru
# # schema_guided_dialog
# dataset_name = 'web_nlg_en'
# dataset = load_dataset('gem', dataset_name)
# # subSets = ['test', 'validation', 'train']
# subSets = ['test']

# for subSet in subSets:
#   x = 0
#   subSet_inputs = dataset[subSet]
#   while x < len (subSet_inputs):
#     # fileName_out = 'out_tables/'+dataset_name+'-'+subSet+'-'+str(x)+'.html'
#     fileName_out = 'tables_/'+subSet_inputs[x]['gem_id']+'.html'
#     if not os.path.exists('tables_'):
#       os.makedirs('tables_')
#     print('Processing '+fileName_out)
#     fo = codecs.open(fileName_out, 'w', 'utf-8')
#     list_dico_input = []
#     if dataset_name == 'web_nlg_en' or dataset_name == 'web_nlg_ru':
#       for triple in subSet_inputs[x]['input']:
#         dico_triples = {}
#         dico_triples['Subject'] = triple.split(' | ')[0]
#         dico_triples['Property'] = triple.split(' | ')[1]
#         dico_triples['Object'] = triple.split(' | ')[2]
#         list_dico_input.append(dico_triples)
#     elif dataset_name == 'common_gen':
#       dico_concepts = {}
#       dico_concepts['Concepts'] = subSet_inputs[x]['concepts']
#       list_dico_input.append(dico_concepts)
#     elif dataset_name == 'cs_restaurants':
#       dico_DAs = {}
#       DA = subSet_inputs[x]['dialog_act'].split('(')[0]
#       triples = subSet_inputs[x]['dialog_act'].split('(')[1].split(')')[0]
#       dico_DAs['Dialogue Act'] = DA
#       if re.search(',', triples):
#         dico_DAs['Topics'] = triples.split(',')
#       else:
#         dico_DAs['Topic'] = triples
#       list_dico_input.append(dico_DAs)
#     elif dataset_name == 'e2e_nlg':
#       list_properties = subSet_inputs[x]['meaning_representation'].split(', ')
#       for input_property in list_properties:
#         dico_properties = {}
#         prop_name = input_property.split('[')[0]
#         prop_value = input_property.split('[')[1].split(']')[0]
#         dico_properties['Property'] = prop_name
#         dico_properties['Value'] = prop_value
#         list_dico_input.append(dico_properties)
#     elif dataset_name == 'schema_guided_dialog':
#       dico_DAs = {}
#       dico_DAs['Dialogue Acts'] = subSet_inputs[x]['dialog_acts']
#       list_dico_input.append(dico_DAs)
#     fo.write(json2html.convert(json = list_dico_input))
#     fo.close()
#     x += 1

## Download files

In [None]:
#@title Zip and download tables
from google.colab import files
# Archive that will hold the whole HTML tables folder
zip_name_inter = '/content/html_tables.zip'
# Shell command; {zip_name_inter} is replaced by the value of the Python variable
!zip -r {zip_name_inter} /content/tables_pd

# Wipe the zip log from the cell output (clear_output is imported in the "Install packages" cell)
clear_output()

files.download(zip_name_inter)

In [None]:
#@title Zip and download CSVs
from google.colab import files
# Archive that will hold the whole sampling CSV folder
zip_name_inter = '/content/CSVs.zip'
# Shell command; {zip_name_inter} is replaced by the value of the Python variable
!zip -r {zip_name_inter} /content/csv_sampling

# Wipe the zip log from the cell output (clear_output is imported in the "Install packages" cell)
clear_output()

files.download(zip_name_inter)

# Create individual files for sampled system outputs

First upload all system outputs to a folder named "sys_outputs" (under /content), then generate the corresponding CSV file(s) with the code above (or upload them manually to a folder called "csv_sampling", also under /content).

In [None]:
#@title Create one text file per sampled input/output per team per language per test set
import os
import glob
from pandas import read_csv
import codecs

# Root folder for the sampled outputs; one subfolder per data code, language and team is created below
out_sampled_folder = '/content/d2t_outputs-sampled'

# The 3 CSV files for each task have the same IDs sampled, so we can just use one file per task
list_csv_D2T1_OIDs = read_csv('/content/csv_sampling/D2T-1-FA_samplingData.csv')['original_id'].tolist()
list_csv_D2T2_OIDs = read_csv('/content/csv_sampling/D2T-2-FA_samplingData.csv')['original_id'].tolist()

# The original IDs are numbered starting from 1, we want a number starting from 0 to align with list indices in the system output files; make it into a dic for easy access afterwards
list_csv_ids = {
    'D2T-1': [OID1-1 for OID1 in sorted(list_csv_D2T1_OIDs)],
    'D2T-2': [OID2-1 for OID2 in sorted(list_csv_D2T2_OIDs)],
}

for sys_output_path in glob.glob(os.path.join('/content/sys_outputs', '*.txt')):
  tail = os.path.basename(sys_output_path)
  # Get parameters of every output file: the team ID is what precedes the first '-', and the last two
  # '_'-separated fields of the name (extension removed) are the data code and the language
  team_ID = tail.split('-', 1)[0]
  name_parts = tail.rsplit('.', 1)[0].rsplit('_', 1)
  lang_out = name_parts[1]
  data_code_out = name_parts[0].rsplit('_', 1)[1]
  # The subtask is the data code without its last '-'-separated field (e.g. 'D2T-1-FA' gives 'D2T-1')
  subtask_code_out = data_code_out.rsplit('-', 1)[0]
  # print(tail)
  # print(f'  {team_ID}')
  # print(f'  {lang_out}')
  # print(f'  {data_code_out}')
  # print(f'  {subtask_code_out}')

  # Create subfolder to store sampled system outputs
  dest_folder_sample = os.path.join(out_sampled_folder, data_code_out, lang_out, team_ID)
  os.makedirs(dest_folder_sample, exist_ok=True)

  # Read sys output; the encoding is stated explicitly (the sampled files are written as UTF-8 below)
  # and the file is closed as soon as its lines are read
  with open(sys_output_path, encoding='utf-8') as sys_output_file:
    sys_output_all_lines = sys_output_file.readlines()

  for id_sampled in list_csv_ids[subtask_code_out]:
    # Create text files the last part of the name of which matches the name of the sampled input files
    sample_name = f'[{team_ID}_{lang_out}]_{data_code_out}_{str(id_sampled).rjust(4, "0")}.txt'
    dest_filename_sample = os.path.join(dest_folder_sample, sample_name)
    with open(dest_filename_sample, 'w', encoding='utf-8') as fo:
      fo.write(sys_output_all_lines[id_sampled].strip())


In [None]:
#@title Zip and download sampled output text files
from IPython.display import clear_output
from google.colab import files
# Archive that will hold the whole folder of sampled system outputs
zip_name_inter = '/content/d2t_outputs-sampled.zip'
# Shell command; {zip_name_inter} is replaced by the value of the Python variable
!zip -r {zip_name_inter} /content/d2t_outputs-sampled
# Wipe the zip log from the cell output
clear_output()

files.download(zip_name_inter)