<a href="https://colab.research.google.com/github/mille-s/WebNLG_datasets/blob/main/WebNLG_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Google Translate (first because otherwise we need to restart session)
# https://pypi.org/project/googletrans/
from IPython.display import clear_output
# Install one pinned googletrans release (3.1.0a0)
! pip install googletrans==3.1.0a0
# Wipe the pip log from the cell output
clear_output()

In [None]:
#@title Shared packages and functions
from IPython.display import clear_output

# Package for parsing xml files (WebNLG 23 and Enhanced WebNLG)
! pip install xmltodict

# Install SPARQLWrapper for making queries to DBpedia/Wikidata
! pip install SPARQLWrapper

# datasets is for loading datasets from HuggingFace (WebNLG 17, 18, 20)
! pip install datasets
from datasets import load_dataset

# gdown is the command used further down to download the WebNLG 23 training data
! pip install --upgrade gdown

# NOTE(review): dicttoxml is not used in the cells visible here; presumably needed by later cells - confirm
! pip install dicttoxml

# Clone repos containing WebNLG processing modules and processed data
! git clone 'https://github.com/mille-s/Mod-D2T.git'
! git clone 'https://github.com/mille-s/M-FleNS_NLG-Pipeline.git'
! git clone 'https://github.com/mille-s/UD_Converter.git'
! git clone 'https://github.com/mille-s/DCU_TCD_FORGe_WebNLG23.git'
# Delete the notebook file that ships inside each cloned repo to avoid confusion
! rm '/content/UD_Converter/UD_Converter_release.ipynb'
! rm '/content/Mod-D2T/Mod-D2T.ipynb'
! rm '/content/M-FleNS_NLG-Pipeline/M-FleNS_NLG-Pipeline.ipynb'
! rm '/content/DCU_TCD_FORGe_WebNLG23/DCU_TCD_FORGe_WebNLG23.ipynb'

# Wipe the install/clone logs from the cell output
clear_output()

def extractTripleElements(dataset, element):
  """Return the unique subjects, properties or objects found in the triple sets.

  Args:
    dataset: iterable of entries whose first item is a list of triples; each
      triple is a string of the form 'subject | property | object'.
    element: which part of the triples to collect: 'subject', 'property'
      or 'object'.

  Returns:
    A list of the distinct values, in order of first appearance.

  Raises:
    ValueError: if element is not one of the three accepted names.
  """
  positions = {'subject': 0, 'property': 1, 'object': 2}
  if element not in positions:
    # Fail with a clear error instead of printing and then crashing on a bad list index
    raise ValueError('Error, the second argument of extractTripleElements must be "subject", "property" or "object".')
  n = positions[element]
  # A dict keeps first-seen order and gives constant-time membership checks
  unique_elements = {}
  for entry in dataset:
    for input_triple in entry[0]:
      unique_elements.setdefault(input_triple.split(' | ')[n], None)
  return list(unique_elements)

In [None]:
# @title Functions for wikidata and dbpedia queries
# Function list

import requests
import csv
import re
import progressbar
from SPARQLWrapper import SPARQLWrapper, JSON

# Module-level placeholder that callers hand to createProgressBar
bar = ''
def createProgressBar(bar, max):
  """Create a progress bar whose upper bound is max.

  The bar argument is not read; a new ProgressBar is always built and returned.
  """
  return progressbar.ProgressBar(max_value=max)

# Translation table for format_entity_dbp:
#  - reserved characters are prefixed with a backslash
#  - quotes, semi-colons and other characters that are usually errors or hacks are dropped
_DBP_ESCAPE_TABLE = str.maketrans({
  **{char: '\\' + char for char in "/.+,&-()'"},
  **{char: None for char in '";~<>'},
})

def format_entity_dbp(entity):
  """Escape an entity name so it can be used in a DBpedia query.

  Used for the 2024 experiments.

  The characters / . + , & - ( ) ' each get a backslash in front of them,
  and the characters " ; ~ < > are removed. Parentheses, underscores and
  ampersands are deliberately kept (escaped where needed) rather than stripped
  or rewritten.

  Args:
    entity: the entity name as a string.

  Returns:
    The escaped entity name.
  """
  # One translate pass replaces the former chain of re.sub calls, whose
  # non-raw patterns relied on invalid escape sequences such as '\/' or '\('
  return entity.translate(_DBP_ESCAPE_TABLE)

def format_entity_wkd(entity):
  """Turn an entity name into a plain label for a Wikidata search.

  Used for the GEM 2023-2024 data.

  Everything from the first comma onwards is dropped, then everything from
  the first '_(' onwards; remaining underscores become spaces and double
  quotes are removed.
  """
  before_comma = entity.partition(',')[0]
  before_parenthesis = before_comma.partition('_(')[0]
  return before_parenthesis.replace('_', ' ').replace('"', '')

# Ordered (pattern, class label) rules used by assign_classRegEx; the first
# pattern that matches decides the class, so the order matters.
_CLASS_REGEX_RULES = tuple((re.compile(pattern), label) for pattern, label in (
  (r'gramPerCubicCentimetres', 'concentration_gPerCubCm'),
  (r'kilogramPerCubicMetres', 'concentration_kgPerCubM'),
  (r'inhabitants per square kilometre', 'populationDensity'),
  (r'[0-9\.,]+.*square.*metre', 'area_measurement'),
  (r'bombing', 'event'),
  (r'[Uu]niversity', 'university'),
  (r'Dodge', 'car'),
  (r'^"*[0-9]{4}-[0-9]{2}-[0-9]{2}"*$', 'date'),
  (r'^"*[0-9]+\s*-*(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|October|Oct|November|Nov|December|Dec)\s*-*[0-9]+"*$', 'date'),
  (r'^"*(January|February|March|April|May|June|July|August|September|October|November|December)\s*-*[0-9]{4}"*$', 'month'),
  (r'^"*(January|February|March|April|May|June|July|August|September|October|November|December)"*$', 'month'),
  (r'^"*[0-9\.,]+.*(litres|cubic)', 'volume_measurement'),
  (r'^"*[0-9\.,]+\s*m"*$', 'distance_meters'),
  (r'^"*[0-9\.,]+\s*in"*$', 'distance_inches'),
  (r'^"*[0-9\.,]+\s*yd"*$', 'distance_yards'),
  (r'^"*[0-9\.,]+\s*ft"*$', 'distance_feet'),
  (r'[0-9\.,]+.*millimetres', 'distance_millimetres'),
  (r'[0-9\.,]+.*centimetres', 'distance_centimetres'),
  (r'[0-9\.,]+.*metres', 'distance_metres'),
  (r'[0-9\.,]+.*inches', 'distance_inches'),
  (r'[0-9\.,]+.*yards', 'distance_yards'),
  (r'[0-9\.,]+.*feet', 'distance_feet'),
  (r'[0-9\.,]+.*seconds', 'duration_seconds'),
  (r'[0-9\.,]+.*minutes', 'duration_minutes'),
  (r'[0-9\.,]+.*hours', 'duration_hours'),
  (r'[0-9\.,]+.*days', 'duration_days'),
  (r'[0-9\.,]+.*weeks', 'duration_weeks'),
  (r'[0-9\.,]+.*months', 'duration_months'),
  (r'[0-9\.,]+.*years', 'duration_years'),
  (r'[0-9\.,]+.* (engine|horsepower)', 'engine'),
  (r'[0-9\.,]+.*euros', 'moneyQuantity_euros'),
  (r'[0-9\.,]+.*dollars', 'moneyQuantity_dollars'),
  (r'[0-9\.,]+.*kilometrePerSeconds', 'speed_kmPerSec'),
  (r'[0-9\.,]+.*degreeCelsius', 'temperature_celsius'),
  (r'[0-9\.,]+.*kelvins', 'temperature_kelvin'),
  (r'[0-9\.,]+-speed', 'transmission'),
  (r'^"*[0-9\.,]+.*(\sg|grams)', 'weight_grams'),
  (r'^"*[0-9\.,]+.*\skg', 'weight_kilograms'),
  (r'^"*[0-9\.,]+.*tonnes', 'weight_tonnes'),
  (r'^"*[0-9\.,]+.*pounds', 'weight_pounds'),
  (r'^"*[0-9]+/[0-9]+"*$', 'fraction'),
  (r'^"*[0-9a-zA-Z]+/[0-9a-zA-Z\s\']+"*$', 'runwayName'),
  (r'^"*[0-9]{4}[-–][0-9]{4}"*$', 'issnNumber'),
  (r'^"*[0-9]+[-–][0-9]+[-–][0-9]+[-–][0-9]+[-–]*[0-9]*"*$', 'isbnNumber'),
  (r'^"*[0-9]+[-–][0-9]+[-–]*[0-9]*[-–]*[0-9]*[-–]*[0-9]*"*$', 'unknownIdentifier'),
  (r'^"*[0-9]{2} [a-zA-Z]+"*$', 'celestialBody'),
  (r'^"*[0-9]{3} [a-zA-Z]+"*$', 'celestialBody'),
  (r'^"*[0-9]+_[a-zA-Z]+"*$', 'celestialBody'),
  (r'_FC_', 'footballClub'),
  (r'(season|EPSTH|league|League|Liga|Season|Bundesliga|Eredivisie|Football_Conference|Lega_Pro|Regionalliga|Serie_A|Serie_B|Topklasse|Campeonato)', 'sportsSeason'),
  (r'[Mm]onument', 'monument'),
  (r'^"*[0-9]{4} [a-zA-Z]+', 'celestialBody'),
  # Rule labelled 'address' is special-cased in assign_classRegEx (epoch dates)
  (r'^"*[0-9-]+[stndr]*[\s\-_][^:]*[\(\)a-zA-Z\']+"*$', 'address'),
  (r'^"*[\+-]*[0-9\.,]+"*$', 'unknownQuantity'),
  (r'[0-9\.,]+, [0-9\.,]+', 'unknownQuantity_multiple'),
))

def assign_classRegEx(entity):
  """Guess a coarse class for an entity string from regular expressions.

  The rules in _CLASS_REGEX_RULES are tried in order and the first match
  wins. The two concentration rules are part of that same chain, so a
  'gramPerCubicCentimetres' value keeps its concentration class instead of
  being re-labelled by a later rule (e.g. as 'distance_metres' because
  'Centimetres' contains 'metres').

  Args:
    entity: the entity (subject or object value) as a string.

  Returns:
    The class label, or '' when no rule matches.
  """
  for pattern, label in _CLASS_REGEX_RULES:
    if not pattern.search(entity):
      continue
    # Values mentioning the JD2457600 epoch look like addresses to the regex but are dates
    if label == 'address' and 'JD2457600' in entity:
      return 'date_epoch'
    return label
  return ''

# Code below adapted from ChatGPT
def get_wikidata_id(entity_label, timeout=30):
  """Search Wikidata for a label and return the ID of the first hit.

  Args:
    entity_label: the text to search for.
    timeout: seconds to wait for the Wikidata API before giving up, so a
      stalled connection cannot block a bulk run forever.

  Returns:
    The ID of the first search result (assumed to be the most relevant one),
    or None when nothing is found or the request fails.
  """
  # Define the Wikidata API endpoint
  wikidata_api_url = "https://www.wikidata.org/w/api.php"

  # Set the parameters for the API request
  params = {
    "action": "wbsearchentities",
    "format": "json",
    "language": "en",  # You can change the language if needed
    "search": entity_label,
  }

  try:
    # Send a GET request to the Wikidata API; a timeout is also a RequestException
    response = requests.get(wikidata_api_url, params=params, timeout=timeout)
    response.raise_for_status()
    # Parse the JSON response
    data = response.json()
  except requests.exceptions.RequestException as e:
    print("Error connecting to the Wikidata API:", e)
    return None

  # Check if any entities were found
  if "search" in data and data["search"]:
    # Get the first entity (assuming it's the most relevant)
    return data["search"][0]["id"]
  return None  # Entity not found

# Example usage:
# entity_label = '23 g'
# wikidata_id = get_wikidata_id(entity_label)
# if wikidata_id:
#   print(f"The Q-ID for {entity_label} is {wikidata_id}.")
# else:
#   print(f"No entity found for {entity_label}.")
# print(assign_classRegEx(entity_label))

def get_wikidata_id_bulk(rows, list_entities, bar):
  """Look up every entity on Wikidata and append one row per entity to rows.

  Each appended row is [wikidata id or '???', cleaned label, original entity,
  regex class of the original entity]. rows is modified in place; nothing is
  returned.
  """
  bar = createProgressBar(bar, len(list_entities)-1)
  for position, entity in enumerate(list_entities):
    bar.update(position)
    # One entity needs a hand-picked search label
    if entity == 'School of Business and Social Sciences at the Aarhus University':
      clean_entity = 'Aarhus School of Business'
    else:
      clean_entity = format_entity_wkd(entity)
    wikidata_id = get_wikidata_id(clean_entity)
    # '???' marks entities for which the search returned nothing
    rows.append([
      wikidata_id if wikidata_id else '???',
      clean_entity,
      entity,
      assign_classRegEx(entity),
    ])

# ChatGPT prompt: Please write some Python code to get the value of an named entity's "gold:hypernym" property according to DBpedia
def get_dbpedia_hypernym(entity_name):
  """Query DBpedia for the gold:hypernym of an entity.

  The name is escaped with format_entity_dbp and its spaces are replaced by
  underscores before it is placed in the query.

  Returns:
    The first hypernym value, reduced to the part after its last '/', or
    None when the query yields no binding.
  """
  # For DBpedia specifically, spaces must become underscores to avoid query errors
  resource_name = format_entity_dbp(entity_name).replace(' ', '_')

  endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
  sparql_query = f"""
  SELECT ?hypernym
  WHERE {{
    dbr:{resource_name} gold:hypernym ?hypernym
  }}
    """
  endpoint.setQuery(sparql_query)
  endpoint.setReturnFormat(JSON)
  response = endpoint.query().convert()

  if 'results' not in response or 'bindings' not in response['results']:
    return None
  matches = response['results']['bindings']
  if not matches:
    return None
  # Only the first value is used; keep what follows the last slash, if any
  first_value = matches[0]['hypernym']['value']
  return first_value.rsplit('/', 1)[-1]

# ChatGPT prompt: Please write some Python code to get the value of an named entity's "gold:hypernym" property according to Wikidata
def get_wikidata_hypernym(entity_ID, timeout=30):
  """Return the label of the first wdt:P31 value of a Wikidata entity.

  Args:
    entity_ID: the Wikidata entity ID to place after the wd: prefix.
    timeout: seconds to wait for the query service before giving up.

  Returns:
    The label of the first binding, or None when there is no binding or the
    request fails (the error is printed, as in get_wikidata_id).
  """
  # Define the Wikidata Query Service endpoint URL
  wikidata_endpoint = "https://query.wikidata.org/sparql"

  # Define the SPARQL query
  query = f"""
  SELECT ?hypernymLabel
  WHERE {{
    wd:{entity_ID} wdt:P31 ?hypernym.
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
  }}
  """

  # Set up the request headers
  headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'application/json'
  }

  # Set up the request parameters
  params = {
    'query': query,
    'format': 'json'
  }

  try:
    # Make the API request; the timeout keeps a stalled connection from hanging the notebook
    response = requests.get(wikidata_endpoint, headers=headers, params=params, timeout=timeout)
    # Surface HTTP errors here instead of failing later on a non-JSON error page
    response.raise_for_status()
    # Parse the JSON response
    data = response.json()
  except (requests.exceptions.RequestException, ValueError) as e:
    print("Error connecting to the Wikidata Query Service:", e)
    return None

  # Extract and return the hypernym value
  if 'results' in data and 'bindings' in data['results']:
    bindings = data['results']['bindings']
    if bindings:
      return bindings[0]['hypernymLabel']['value']

  return None

def get_Wikidata_id_property(dbpedia_prop):
  """Ask DBpedia which Wikidata properties are equivalent to a DBpedia property.

  Args:
    dbpedia_prop: the full URI of the DBpedia property.

  Returns:
    The distinct Wikidata IDs (last path segment of each result URI), in the
    order they were returned. Any error is printed and whatever was collected
    so far is returned.
  """
  endpoint = SPARQLWrapper("http://dbpedia.org/sparql")

  # Keep only equivalent properties that live in the Wikidata entity namespace
  sparql_query = f"""
  SELECT ?wikidataProperty
  WHERE {{
    <{dbpedia_prop}> owl:equivalentProperty ?wikidataProperty .
    FILTER(STRSTARTS(STR(?wikidataProperty), "http://www.wikidata.org/entity/"))
  }}
  """
  endpoint.setQuery(sparql_query)
  endpoint.setReturnFormat(JSON)

  wikidata_ids = []
  try:
    response = endpoint.query().convert()
    property_uris = [binding["wikidataProperty"]["value"] for binding in response["results"]["bindings"]]
    for property_uri in property_uris:
      # The ID (e.g. "P569") is the last segment of the URI
      property_id = property_uri.rsplit('/', 1)[-1]
      if property_id not in wikidata_ids:
        wikidata_ids.append(property_id)
  except Exception as e:
    print(f"An error occurred: {e}")

  return wikidata_ids

# Original WebNLG

In [None]:
#@title Install and download
# Download training data of WebNLG 23
# (the argument is the ID of the file to fetch - presumably a Google Drive file ID, confirm)
!gdown 110L41fyQpxDhgzUsraRNMqwo3AkvNgu7

# clear_output comes from the IPython import done in the earlier cells
clear_output()

In [None]:
#@title Load datasets

import xmltodict

# Path to downloaded xml
# path_file = '/content/ga_train.xml'
# xml_file = open(path_file, 'r').read()
# dataset_23_dict = xmltodict.parse(xml_file)

# Load existing datasets from HuggingFace
# (load_dataset is imported in the shared-packages cell, which must run first)
# NOTE(review): trust_remote_code=True presumably allows the datasets' own loading scripts to run - confirm
webnlg_2017 = load_dataset('web_nlg', 'webnlg_challenge_2017', trust_remote_code=True)
webnlg_enriched = load_dataset('enriched_web_nlg', 'en',trust_remote_code=True)
webnlg_2020 = load_dataset('web_nlg', 'release_v3.0_en', trust_remote_code=True)
# webnlg_2020 = load_dataset('GEM/web_nlg', 'en')

def load_webnlg(webnlg, split, data_origin):
  """Collect [triples, texts] pairs from one split of a WebNLG dataset.

  data_origin selects which triples are taken: 'modified' reads the first
  modified triple set, 'original' reads the first original triple set. Any
  other value yields an empty list.
  """
  triple_fields = {
    'modified': ('modified_triple_sets', 'mtriple_set'),
    'original': ('original_triple_sets', 'otriple_set'),
  }
  outer_key, inner_key = triple_fields.get(data_origin, (None, None))
  pairs = []
  for sample in webnlg[split]:
    if outer_key is None:
      # Unknown origin: nothing is collected for this sample
      continue
    pairs.append([sample[outer_key][inner_key][0], sample['lex']['text']])
  return pairs

def load_webnlg_cat(webnlg, split, data_origin):
  """Collect [triples, texts, category] entries from one split of a WebNLG dataset.

  Same selection as load_webnlg ('modified' or 'original' triples, anything
  else yields an empty list), with the sample's category added as a third
  item. Kept as a separate function so that code relying on the two-item
  output of load_webnlg keeps working.
  """
  triple_fields = {
    'modified': ('modified_triple_sets', 'mtriple_set'),
    'original': ('original_triple_sets', 'otriple_set'),
  }
  outer_key, inner_key = triple_fields.get(data_origin, (None, None))
  entries = []
  for sample in webnlg[split]:
    if outer_key is None:
      # Unknown origin: nothing is collected for this sample
      continue
    entries.append([sample[outer_key][inner_key][0], sample['lex']['text'], sample['category']])
  return entries

# One data point is 2 lists: triples, english texts
# 1 triple example: [['Arianespace | country | France'], ['Arianespace is located in France.']]
# Modified triples for each dataset/split
dataset_17_train = load_webnlg(webnlg_2017, 'train', 'modified')
dataset_18_train = load_webnlg(webnlg_enriched, 'train', 'modified')
dataset_20_train = load_webnlg(webnlg_2020, 'train', 'modified')
dataset_20_dev = load_webnlg(webnlg_2020, 'dev', 'modified')
dataset_20_test = load_webnlg(webnlg_2020, 'test', 'modified')

# Same WebNLG 2020 splits, but with the original triples
dataset_20_train_orig = load_webnlg(webnlg_2020, 'train', 'original')
dataset_20_dev_orig = load_webnlg(webnlg_2020, 'dev', 'original')
dataset_20_test_orig = load_webnlg(webnlg_2020, 'test', 'original')

# One data point is 3 lists: triples, english texts, irish texts
# 1 triple example: [['Arianespace | country | France'], ['Arianespace is located in France.'], ['Tá Arianspace lonnaithe sa Fhrainc.']]
# NOTE: stays empty as long as the loop that fills it (right below) is commented out
dataset_23_train = []

# Build WebNLG 2023 data structure like the one from HuggingFace for the other datasets
# for entry in dataset_23_dict['benchmark']['entries']['entry']:
#   data_point = []
#   mtriples_list = []
#   text_list_en = []
#   text_list_ga = []
#   # Get modified triples
#   if isinstance(entry['modifiedtripleset']['mtriple'], list):
#     for mtriple in entry['modifiedtripleset']['mtriple']:
#       mtriples_list.append(mtriple)
#   else:
#     mtriples_list.append(entry['modifiedtripleset']['mtriple'])
#   # Get reference texts
#   for lex in entry['lex']:
#     if lex['@lang'] == 'ga':
#       text_list_ga.append(lex['#text'])
#     else:
#       text_list_en.append(lex['#text'])
#   data_point.append(mtriples_list)
#   data_point.append(text_list_en)
#   data_point.append(text_list_ga)
#   dataset_23_train.append(data_point)

# WebNLG 2020 splits with original triples plus the category of each entry
dataset_20_train_cat = load_webnlg_cat(webnlg_2020, 'train', 'original')
dataset_20_dev_cat = load_webnlg_cat(webnlg_2020, 'dev', 'original')
dataset_20_test_cat = load_webnlg_cat(webnlg_2020, 'test', 'original')

# print(dataset_23_train)

# Wipe the loading logs from the cell output
clear_output()

In [None]:
#@title Print examples
# For each dataset: print the split sizes and the data point at index 2222 as a sample
print('WebNLG-2017')
print('  train:', len(dataset_17_train))
print(' ', dataset_17_train[2222])
print('WebNLG-2018 (Enhanced)')
print('  train:', len(dataset_18_train))
print(' ', dataset_18_train[2222])
print('WebNLG-2020')
print('  train:', len(dataset_20_train))
print('  test:', len(dataset_20_test))
print('  dev:', len(dataset_20_dev))
print(' ', dataset_20_train[2222])
# print('WebNLG-2023')
# print('  train:', len(dataset_23_train))
# print(' ', dataset_23_train[2222])
# Same index in the variant that carries the category (original triples)
print(' ', dataset_20_train_cat[2222])

print('WebNLG-2020_orig')
print('  train:', len(dataset_20_train_orig))
print('  test:', len(dataset_20_test_orig))
print('  dev:', len(dataset_20_dev_orig))
print(' ', dataset_20_train_orig[2222])

In [None]:
#@title Get examples for properties
import codecs
import re

filepath = '/content/230528-WebNLG23_EN-GA_properties.txt'

# a line contains semantically equivalent props separated by a vertical bar
# 1stRunwayLengthFeet|1stRunwayLengthMetre|1st_runway_LengthFeet
# Read inside a with-block so the file handle is closed once the lines are loaded
with codecs.open(filepath, 'r', 'utf-8') as props_file:
  lines_props = props_file.readlines()

def getExamples(props_list, dataset):
  """Find one example per group of equivalent properties.

  Args:
    props_list: lines of '|'-separated property names; the names on one line
      are treated as equivalent.
    dataset: entries of the form [triples, texts, texts], where the second
      and third items are lists indexed at position 0.

  Returns:
    One row per line of props_list, made of 4 lists:
    [property names, example triple, first text of entry[1], first text of entry[2]].
    The last three lists stay empty when no single-triple entry uses any of
    the property names.
  """
  # e.g. [[['1stRunwayLengthFeet', '1stRunwayLengthMetre'], [], [], []], ...]
  property_examples = [[props.strip().split('|'), [], [], []] for props in props_list]

  for entry in dataset:
    # Only entries that verbalise exactly one triple are used as examples
    if len(entry[0]) != 1:
      continue
    single_triple = entry[0][0]
    prop_name = single_triple.split(' | ')[1]
    for names, triples, first_texts, second_texts in property_examples:
      if prop_name not in names:
        continue
      # Keep only the first example found for each group
      if not triples:
        triples.append(single_triple)
        first_texts.append(entry[1][0])
      if not second_texts:
        second_texts.append(entry[2][0])

  return property_examples

# Collect one example per property group from the WebNLG 23 data
# NOTE: dataset_23_train is filled in the "Load datasets" cell; while it is empty, every example list stays empty
my_examples = getExamples(lines_props, dataset_23_train)

# One row per property group
for example in my_examples:
  print(example)

In [None]:
#@title Get texts for translation
# list_texts will look like this: [[[EN-sents0], [GA-sents0]], [[EN-sents1], [GA-sents1]], [[EN-sents2], [GA-sents2]], etc. ]
# [[['The Aarhus is the airport of Aarhus, Denmark.', 'Aarhus Airport serves the city of Aarhus, Denmark.'], ['Is aerfoirt Aarhus, an Danmhairg.', 'Aerfort Aarhus seirbhísíonn an bhaile Aarhus, an Danmhairg.']],...]
# One [EN texts, GA texts] pair per input
# (no index variable is needed; the previous loop bound the name `id`, which hid the builtin for the rest of the session)
list_texts_ENGA = []
for entry in dataset_23_train:
  list_texts_ENGA.append([entry[1], entry[2]])

# to store one English text per input
texts_en_single = []
# to store all English texts for each input
texts_en_all = []
for texts_ENGA in list_texts_ENGA:
  # texts_ENGA contains the EN and GA texts for one input; texts_ENGA[0] contains the English texts
  texts_en_single.append(texts_ENGA[0][0])
  texts_en_all.extend(texts_ENGA[0])

# print(len(texts_en_single))
# print(len(texts_en_all))

def count_words(dataset):
  """Return the total number of space-separated tokens over all texts.

  Each text is split on single spaces, so an empty text counts as one token
  and consecutive spaces produce empty tokens that are counted too.
  """
  return sum(len(text.split(' ')) for text in dataset)

words_in_text_single = count_words(texts_en_single)
words_in_text_all = count_words(texts_en_all)

# The text lists are empty whenever dataset_23_train is empty; report an average of 0
# in that case instead of failing with a ZeroDivisionError
words_per_text_single = words_in_text_single/len(texts_en_single) if texts_en_single else 0
words_per_text_all = words_in_text_all/len(texts_en_all) if texts_en_all else 0

print(f'There are {words_in_text_single:,} words in the single-ref dataset ({words_per_text_single} words per text.)')
print(f'There are {words_in_text_all:,} words in the multi-ref dataset ({words_per_text_all} words per text.)')

# print(len(texts_en_all))

In [None]:
#@title Get all property combinations for entities of each category (download manually when over)
# For Sem accuracy experiments, I need a dico in which for each WebNLG category, I have all possible input configurations, so as to use these as templates to create new inputs that mirror the WebNLG configurations.
import json

# All person-like categories are grouped under a single 'Person' key
# Note: artists can be bands or people. Included here because anyway some properties of e.g. Athletes don't apply to other people
dico_map_categories = {'Artist':'Person', 'Astronaut':'Person', 'Athlete':'Person', 'Politician':'Person'}

# {category: {'propA##propB': number of inputs with exactly that property combination}}
dico_category_tripleConfigs = {}
for datapoint in dataset_20_train_cat:
  # datapoint is a list with 3 elements: triples, texts, category: [['Arianespace | locationCountry | France'], ['Arianespace is located in France.'], 'MeanOfTransportation']
  # Mapped categories use their group name, all others keep their own name
  category_triple = dico_map_categories.get(datapoint[2], datapoint[2])
  configs_for_category = dico_category_tripleConfigs.setdefault(category_triple, {})
  # The configuration key is the sorted list of property labels of the input
  list_properties = [triple.split(' | ')[1] for triple in datapoint[0]]
  str_list_props = '##'.join(sorted(list_properties))
  configs_for_category[str_list_props] = configs_for_category.get(str_list_props, 0) + 1

# Sort combinations by frequency, most frequent first
for key in dico_category_tripleConfigs:
  ranked_configs = sorted(dico_category_tripleConfigs[key].items(), key=lambda item: item[1], reverse=True)
  dico_category_tripleConfigs[key] = dict(ranked_configs)

# Save dico_category_tripleConfigs in a json
with open("dico_category_tripleConfigs.json", "w") as outfile:
  json.dump(dico_category_tripleConfigs, outfile)

## Compare properties of different datasets

In [None]:
def comparePropertyLists(list1, list2):
  """Compare two property lists in both directions.

  Returns:
    A tuple (items of list1 missing from list2, items of list2 missing from
    list1); each list keeps the order and the repetitions of its source.
  """
  missing_from_list2 = [prop for prop in list1 if prop not in list2]
  missing_from_list1 = [prop for prop in list2 if prop not in list1]
  return missing_from_list2, missing_from_list1

# Unique property labels of each training set
properties_17_train = extractTripleElements(dataset_17_train, 'property')
properties_18_train = extractTripleElements(dataset_18_train, 'property')
properties_20_train = extractTripleElements(dataset_20_train, 'property')
# NOTE: this list is empty whenever dataset_23_train is empty
properties_23_train = extractTripleElements(dataset_23_train, 'property')

# Each call returns (properties missing from the second list, properties missing from the first list)
_17notIn18, _18notIn17 = comparePropertyLists(properties_17_train, properties_18_train)
# _17notIn20, _20notIn17 = comparePropertyLists(properties_17, properties_20)
# _17notIn23, _23notIn17 = comparePropertyLists(properties_17, properties_23)
_18notIn20, _20notIn18 = comparePropertyLists(properties_18_train, properties_20_train)
_18notIn23, _23notIn18 = comparePropertyLists(properties_18_train, properties_23_train)
_20notIn23, _23notIn20 = comparePropertyLists(properties_20_train, properties_23_train)

In [None]:
def _print_property_diff(missing_properties, missing_header, all_found_message):
  """Print the sorted missing properties under a header, or a message when none is missing."""
  if len(missing_properties) > 0:
    print(missing_header)
    print(sorted(missing_properties))
  else:
    print(all_found_message)

print("WebnLG 17 VS Enhanced WebNLG\n-----------------------------")
_print_property_diff(_17notIn18, "Properties not found in 18:", "All properties from WebNLG 17 are in Enhanced WebNLG")
_print_property_diff(_18notIn17, "Properties not found in 17:", "All properties from Enhanced WebNLG are in WebNLG 17")

# Disabled comparison: _17notIn20 and _20notIn17 are not computed at the moment
# print()
# print("WebnLG 17 VS WebNLG 20\n-----------------------------")
# _print_property_diff(_17notIn20, "Properties not found in 20:", "All properties from WebNLG 17 are in WebNLG 20")
# _print_property_diff(_20notIn17, "Properties not found in 17:", "All properties from WebNLG 20 are in WebNLG 17")

print()
print("Enhanced WebNLG VS WebnLG 20\n-----------------------------")
_print_property_diff(_18notIn20, "Properties not found in 20:", "All properties from Enhanced WebNLG are in WebNLG 20")
_print_property_diff(_20notIn18, "Properties not found in Enhanced:", "All properties from WebNLG 20 are in Enhanced WebNLG")

print()
print("Enhanced WebNLG VS WebnLG 23\n-----------------------------")
_print_property_diff(_18notIn23, "Properties not found in 23:", "All properties from Enhanced WebNLG are in WebNLG 23")
_print_property_diff(_23notIn18, "Properties not found in Enhanced:", "All properties from WebNLG 23 are in Enhanced WebNLG")

print()
print("WebnLG 20 VS WebnLG 23\n-----------------------------")
_print_property_diff(_20notIn23, "Properties not found in 23:", "All properties from WebNLG 20 are in WebNLG 23")
_print_property_diff(_23notIn20, "Properties not found in 20:", "All properties from WebNLG 23 are in WebNLG 20")

## Get mappings between properties

In [None]:
#@title Get mappings original - modified - Wikidata
# Outcome of this cell: There is a major problem when looking for mappings between original and modified property labels, in that the triples are very often but not necessarily aligned in the "original" and "modified" fields of the WebNLG data.
# Possible solution: filter out mappings that happen only once, since they are probably coming from data errors.
import os
import json

# Get mapping from a dbpedia property to a wikidata one; I already compiled a similar list in queryDBpediaProps.py
# wkd_id_list = get_Wikidata_id_property('http://dbpedia.org/ontology/birthDate')
# print(wkd_id_list)

def compare_Orig_Modif_props(dataset_orig, dataset_modif, dico_mappings):
  """Count which property labels of dataset_modif sit opposite those of dataset_orig.

  The triple at position y of data point x in dataset_orig is paired with the
  triple at the same position in dataset_modif; triples are strings of the
  form 'subject | property | object'.

  Args:
    dataset_orig: entries whose first item is the list of source triples.
    dataset_modif: entries aligned with dataset_orig, whose first item is the
      list of triples to map to.
    dico_mappings: dict updated in place, shaped as
      {orig_prop: {modif_prop: count, ...}, ...}. Pass the same dict to
      several calls to accumulate counts over several splits.

  Returns:
    dico_mappings (the same object that was passed in).

  Raises:
    IndexError: if dataset_modif has fewer entries, or an entry has fewer
      triples, than its counterpart in dataset_orig.
  """
  for x, datapoint_orig in enumerate(dataset_orig):
    # Indexing (rather than zip) keeps a misaligned dataset a loud error instead of a silent truncation
    triples_modif = dataset_modif[x][0]
    for y, triple_orig in enumerate(datapoint_orig[0]):
      prop_orig = triple_orig.split(' | ')[1]
      prop_modif = triples_modif[y].split(' | ')[1]
      counts_for_prop = dico_mappings.setdefault(prop_orig, {})
      counts_for_prop[prop_modif] = counts_for_prop.get(prop_modif, 0) + 1
  return dico_mappings

def analyse_mappings(dico_mappings):
  """Return, sorted, the properties whose only recorded mapping is to the same label.

  Args:
    dico_mappings: {prop: {mapped_prop: count, ...}, ...}.
  """
  unchanged_props = {
    prop_orig
    for prop_orig, mapped_props in dico_mappings.items()
    if len(mapped_props) == 1 and list(mapped_props) == [prop_orig]
  }
  return sorted(unchanged_props)


# Original -> Modified: counts accumulated over the dev, train and test splits of WebNLG 2020
dico_mappings_O2M = {}
# Fill up dico_mappings
dico_mappings_O2M = compare_Orig_Modif_props(dataset_20_dev_orig, dataset_20_dev, dico_mappings_O2M)
dico_mappings_O2M = compare_Orig_Modif_props(dataset_20_train_orig, dataset_20_train, dico_mappings_O2M)
dico_mappings_O2M = compare_Orig_Modif_props(dataset_20_test_orig, dataset_20_test, dico_mappings_O2M)
# Written to the current working directory
with open("dico_map_Orig2Modif.json", "w") as outfile:
  json.dump(dico_mappings_O2M, outfile)

# Modified -> Original: same function, with the two datasets swapped
dico_mappings_M2O = {}
# Fill up dico_mappings
dico_mappings_M2O = compare_Orig_Modif_props(dataset_20_dev, dataset_20_dev_orig, dico_mappings_M2O)
dico_mappings_M2O = compare_Orig_Modif_props(dataset_20_train, dataset_20_train_orig, dico_mappings_M2O)
dico_mappings_M2O = compare_Orig_Modif_props(dataset_20_test, dataset_20_test_orig, dico_mappings_M2O)
with open("dico_map_Modif2Orig.json", "w") as outfile:
  json.dump(dico_mappings_M2O, outfile)

# Check what the mappings look like
# (properties whose single recorded mapping is the identical label)
list_mappings_straight = analyse_mappings(dico_mappings_O2M)
print(f'\n\nThere are {len(list_mappings_straight)} properties that have always the same label in Orig and Modif vocabularies.\n  {str(sorted(list_mappings_straight))}')

In [None]:
#@title Check whether we have all WebNLG properties (original and Modified) in our predArg template file.
# The idea is that we want to be able to use the generator on real DBpedia properties, without going through the modified property labels. I'm pretty sure I did not check that all original property labels are mapped to something.
# Actually checking the "240202_WebNLG23_EN-GA_properties.txt" file, which contains all properties according to the "#@title Check predArg template files : Do NOT edit!" cell of the Small_codes Notebook.
import codecs
import os

# Load the two property mappings and the template property file; the handles are
# closed as soon as the content has been read.
with open("dico_map_Orig2Modif.json") as infile:
  dico_map_Orig2Modif = json.load(infile)
with open("dico_map_Modif2Orig.json") as infile:
  dico_map_Modif2Orig = json.load(infile)
with codecs.open('/content/240202_WebNLG23_EN-GA_properties.txt', 'r', 'utf-8') as infile:
  webnlg_properties_file = infile.readlines()

# webnlg_properties_templates_full will contain full names used in the template mappings (e.g. "position[Subject_eq_Person]"). It is used to check for duplicates.
webnlg_properties_templates_full = []
# webnlg_properties_templates will contain the "normal" property names (e.g. "position")
webnlg_properties_templates = []
# Sets mirroring the two lists above, so that membership tests stay cheap while the lists keep the file order
seen_full_names = set()
seen_clean_names = set()
for line in webnlg_properties_file:
  # Each line holds one or more property names separated by '|'
  prop_names_list = line.strip().split('|')
  for prop_name in prop_names_list:
    # Drop the bracketed condition, if any (everything from the first '[')
    clean_prop_name = prop_name.split('[')[0]
    if prop_name not in seen_full_names:
      seen_full_names.add(prop_name)
      webnlg_properties_templates_full.append(prop_name)
    else:
      print(f'Found duplicate property name: {clean_prop_name}')
    if clean_prop_name not in seen_clean_names:
      seen_clean_names.add(clean_prop_name)
      webnlg_properties_templates.append(clean_prop_name)

webnlg_properties_original = list(dico_map_Orig2Modif.keys())
webnlg_properties_modified = list(dico_map_Modif2Orig.keys())

print(f'Properties from templates: {len(webnlg_properties_templates)}')
print(f'Properties from WebNLG-original: {len(webnlg_properties_original)}')
print(f'Properties from WebNLG-modified: {len(webnlg_properties_modified)}')
print('\n')

# Report every modified, then every original, property that has no template
count_miss_modif = 0
count_miss_orig = 0
for wpm in webnlg_properties_modified:
  if wpm not in seen_clean_names:
    count_miss_modif += 1
    print(f'Property {wpm} from WebNLG-Modified is not in templates')
print('\n')
for wpo in webnlg_properties_original:
  if wpo not in seen_clean_names:
    count_miss_orig += 1
    # Get most likely mapping according to dico_mapping: the modified label with the highest value
    most_likely_mapping = max(dico_map_Orig2Modif[wpo], key=dico_map_Orig2Modif[wpo].get)
    print(f'Property {wpo} from WebNLG-Original is not in templates.\n  Modified: {most_likely_mapping} is the most likely mapping.')
print('\n')

print(f'{count_miss_modif} modified properties are not covered.')
print(f'{count_miss_orig} original properties are not covered.')

In [None]:
#@title Check how many of the actual dbpedia properties we are currently covering, assuming all original properties are mapped to a predArg template (see previous cell).
# For this let's use the list of original property labels extracted from the WebNLG dataset, and the list of all DBpedia properties obtained with a cell of the Small_codes notebook (see C:\Users\sfmil\Desktop\DCU-24\2025-2026_ADAPT\MyPapers\2025-01_FORGe)

# Load the mappings and the DBpedia property values; handles are closed right after reading
with open("/content/dico_map_Orig2Modif.json") as infile:
  dico_map_Orig2Modif = json.load(infile)
with open("/content/dico_map_Modif2Orig.json") as infile:
  dico_map_Modif2Orig = json.load(infile)
with open("/content/dico_count_occurrences_dbp_props.json") as infile:
  dico_dbp_props = json.load(infile)

webnlg_properties_original = list(dico_map_Orig2Modif.keys())
webnlg_properties_modified = list(dico_map_Modif2Orig.keys())
# dico_dbp_props is ordered by count, so the most frequent properties will appear first in the dbpedia_properties lists
# Only the part after the last '/' of each key is kept (every key must contain a '/')
dbpedia_properties_all = [key.rsplit('/', 1)[1] for key in dico_dbp_props]
dbpedia_properties_at_least_one_instance = [key.rsplit('/', 1)[1] for key, count in dico_dbp_props.items() if count > 0]
dbpedia_properties_zero_instance = [key.rsplit('/', 1)[1] for key, count in dico_dbp_props.items() if count == 0]

print(len(webnlg_properties_original), webnlg_properties_original)
print(len(webnlg_properties_modified), webnlg_properties_modified)
print(len(dbpedia_properties_at_least_one_instance), dbpedia_properties_at_least_one_instance)
print(len(dbpedia_properties_zero_instance), dbpedia_properties_zero_instance)

def compare_property_lists(candidate_list, dbpedia_properties_zero_instance, dbpedia_properties_at_least_one_instance):
  """ Sorts the properties of candidate_list into three lists, according to where they are found.

  Returns a tuple of three lists (order and duplicates of candidate_list are preserved):
    - the candidates found in dbpedia_properties_zero_instance,
    - the candidates found in dbpedia_properties_at_least_one_instance (and not in the zero-instance list),
    - the candidates found in neither list.
  """
  # Sets give constant-time membership tests instead of scanning the lists for every candidate
  zero_instance = set(dbpedia_properties_zero_instance)
  at_least_one_instance = set(dbpedia_properties_at_least_one_instance)

  props_in_dbpedia_zero_instance = []
  props_in_dbpedia_at_least_one_instance = []
  props_not_in_dbpedia = []

  for webnlg_property in candidate_list:
    # The zero-instance list is checked first, so it wins if a property is in both lists
    if webnlg_property in zero_instance:
      props_in_dbpedia_zero_instance.append(webnlg_property)
    elif webnlg_property in at_least_one_instance:
      props_in_dbpedia_at_least_one_instance.append(webnlg_property)
    else:
      props_not_in_dbpedia.append(webnlg_property)
  return props_in_dbpedia_zero_instance, props_in_dbpedia_at_least_one_instance, props_not_in_dbpedia

print('\n')
# Original WebNLG properties: split them by their presence in DBpedia and report each group
(props_in_dbpedia_zero_instance_o,
 props_in_dbpedia_at_least_one_instance_o,
 props_not_in_dbpedia_o) = compare_property_lists(webnlg_properties_original, dbpedia_properties_zero_instance, dbpedia_properties_at_least_one_instance)
print(f'{len(props_in_dbpedia_zero_instance_o)} Original WebNLG properties have 0 instances in DBpedia.', sorted(props_in_dbpedia_zero_instance_o))
print(f'{len(props_in_dbpedia_at_least_one_instance_o)} Original WebNLG properties have at least one instance in DBpedia.', sorted(props_in_dbpedia_at_least_one_instance_o))
print(f'{len(props_not_in_dbpedia_o)} Original WebNLG properties are not in DBpedia.', sorted(props_not_in_dbpedia_o))
print('\n')

# Modified WebNLG properties: same split
(props_in_dbpedia_zero_instance_m,
 props_in_dbpedia_at_least_one_instance_m,
 props_not_in_dbpedia_m) = compare_property_lists(webnlg_properties_modified, dbpedia_properties_zero_instance, dbpedia_properties_at_least_one_instance)

# Keep only the Modified properties that are not already in the corresponding Original group
m_notIn_o_atLeastOneInstance = [
  prop_m for prop_m in props_in_dbpedia_at_least_one_instance_m
  if prop_m not in props_in_dbpedia_at_least_one_instance_o
]
m_notIn_o_zeroInstance = [
  prop_m for prop_m in props_in_dbpedia_zero_instance_m
  if prop_m not in props_in_dbpedia_zero_instance_o
]

print(f'{len(m_notIn_o_zeroInstance)} Modified WebNLG properties that are different from Original WebNLG properties have 0 instances in DBpedia.', sorted(m_notIn_o_zeroInstance))
print(f'{len(m_notIn_o_atLeastOneInstance)} Modified WebNLG properties that are different from Original WebNLG properties have at least one instance in DBpedia.', sorted(m_notIn_o_atLeastOneInstance))




In [None]:
#@title Read file with all properties covered by FORGe
import os
import codecs

props_list_path = os.path.join('/content', 'DCU_TCD_FORGe_WebNLG23', 'code', 'sorted_properties.txt')
# Read the whole file and close the handle straight away
with codecs.open(props_list_path, 'r', 'utf-8') as fd:
  lines_properties = fd.readlines()
# Each line can hold several property names separated by '-'; collect them all in one flat list
list_properties = []
for line_properties in lines_properties:
  line_prop_list = line_properties.strip().split('-')
  list_properties.extend(line_prop_list)

print(len(list_properties))

## Get subject and object values and Wikidata Q-ID

In [None]:
# @title Create lists all entities
import codecs

# Extract the unique subjects, objects and properties of each WebNLG'20 split (train/dev/test).
# extractTripleElements returns a list of unique values for the requested triple position.
subjects20_train = extractTripleElements(dataset_20_train, 'subject')
subjects20_dev = extractTripleElements(dataset_20_dev, 'subject')
subjects20_test = extractTripleElements(dataset_20_test, 'subject')
objects20_train = extractTripleElements(dataset_20_train, 'object')
objects20_dev = extractTripleElements(dataset_20_dev, 'object')
objects20_test = extractTripleElements(dataset_20_test, 'object')
properties20_train = extractTripleElements(dataset_20_train, 'property')
properties20_dev = extractTripleElements(dataset_20_dev, 'property')
properties20_test = extractTripleElements(dataset_20_test, 'property')

def extend_elementList(list_all, new_list):
  """Appends to list_all (in place) every element of new_list it does not contain yet, and returns list_all.

  Meant to be chained: the first call receives an empty list as list_all, the following calls receive the result of the previous one.
  """
  for candidate in new_list:
    # Already collected (from a previous list or earlier in this one): nothing to do
    if candidate in list_all:
      continue
    list_all.append(candidate)
  return list_all

# Merge the train, dev and test lists (in that order, without duplicates), then sort the result
all_properties = []
for split_properties in (properties20_train, properties20_dev, properties20_test):
  all_properties = extend_elementList(all_properties, split_properties)
all_properties = sorted(all_properties)

all_subjects = []
for split_subjects in (subjects20_train, subjects20_dev, subjects20_test):
  all_subjects = extend_elementList(all_subjects, split_subjects)
all_subjects = sorted(all_subjects)

all_objects = []
for split_objects in (objects20_train, objects20_dev, objects20_test):
  all_objects = extend_elementList(all_objects, split_objects)
all_objects = sorted(all_objects)

# Entities are the subjects and the objects together
all_entities = []
for entity_values in (all_subjects, all_objects):
  all_entities = extend_elementList(all_entities, entity_values)
all_entities = sorted(all_entities)

for merged_list in (all_properties, all_subjects, all_objects, all_entities):
  print(len(merged_list))

In [None]:
#@title Save all_subjects and all_objects lists in text files, one word per line
# Write each list to its own UTF-8 text file, one item per line
for out_name, items_to_save in (
    ('all_subjects.txt', all_subjects),
    ('all_objects.txt', all_objects),
    ('all_properties.txt', all_properties),
):
  with open(out_name, 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in items_to_save)


In [None]:
#@title Create lists filtered entities (UPLOAD FR_subj/obj_props_to_translate.txt)
import codecs

# Lists of properties (one per line in the uploaded files) whose subject / object values are to be extracted.
# The files are read inside "with" blocks so the handles get closed.
with codecs.open('/content/FR_subj_props_to_translate.txt', 'r', 'utf-8') as props_file:
  list_props_subj = [line.strip() for line in props_file.readlines()]
with codecs.open('/content/FR_obj_props_to_translate.txt', 'r', 'utf-8') as props_file:
  list_props_obj = [line.strip() for line in props_file.readlines()]

# I compiled the next two lists manually to use them to find out which properties trigger their appearance in the lists of entities to be translated
# subj_list_I_dont_want_them_translated_why_were_they = ['1089_Tama', '108_St_Georges_Terrace', 'AIDA_Cruises', 'Audi_A1', 'Rock_and_roll', 'The_Fellowship_of_the_Ring', 'The_Honeymoon_Killers_(American_band)', 'Train_(band)', 'A-Rosa Luna', 'AIDS (journal)', 'A Severed Wasp', 'Bananaman', 'Chinabank', 'English Without Tears', 'Let It Breed', 'Nord (Year of No Light album)', 'Soho Press', 'Sony Music Entertainment', 'Take It Off!']
subj_list_I_dont_want_them_translated_why_were_they = ['Alcatraz_Versus_the_Evil_Librarians', 'A_Fortress_of_Grey_Ice', 'A_Long_Long_Way', 'A_Severed_Wasp', '1634:_The_Ram_Rebellion', '1634:_The_Bavarian_Crisis', '1634:_The_Baltic_War', 'Ring_of_Fire_II']
# I was looking in the wrong file with all values instead of the file with filtered values only; the actual files look better, no big issues
# obj_list_I_dont_want_them_translated_why_were_they = ['"2"', '"03R/21L"', '"110 million (dollars)"', '"DL1, DL2, DL3"', '0.0068 (kilometrePerSeconds)', '0.54 (square kilometres)', '1.1 (kilograms)', '1104.1 (inhabitants per square kilometre)', '1202.846 (days)', '125800.0 (millimetres)', '2.0 (gramPerCubicCentimetres)', 'Deșteaptă-te, române!', 'DeSoto Firedome', 'GMA New Media', "Hook 'em (mascot)", 'Kissing Spell Records', 'Lotus Eaters (band)', 'Mark Sixma', 'Marry Banilow', 'Max Benedict', 'Mike Akhigbe', 'Oberbürgermeister', 'Osmosys Records', 'Pools of Light (Brian Kelly album)', 'Riverside Art Museum', 'Roadside Attractions', 'Soho Press', 'South Capitol Street']

def extractFilteredEntities(dataset, entity_type, list_properties):
  """ Returns a list of subjects or objects, extracted from triple sets, filtered by property. e.g we want the subject values of the properties 'birthDate' and 'birthPlace'

  dataset: iterable of entries; entry[0] is the list of input triples, each one a 'subject | property | object' string.
  entity_type: 'subject' or 'object'.
  list_properties: the properties whose subject/object values are collected.

  The returned values are unique and in order of first appearance.
  Raises ValueError if entity_type is neither 'subject' nor 'object'.
  """
  # Position of the requested element in a split triple
  positions = {'subject': 0, 'object': 2}
  if entity_type not in positions:
    # Fail right away with a clear message instead of hitting an obscure indexing error later
    raise ValueError(f'The second argument of extractFilteredEntities must be "subject" or "object", got {entity_type!r}.')
  n = positions[entity_type]
  # Sets make the membership tests cheap; element_list keeps the order of first appearance
  wanted_properties = set(list_properties)
  seen_elements = set()
  element_list = []
  for entry in dataset:
    for input_triple in entry[0]:
      triple_parts = input_triple.split(' | ')
      property_name = triple_parts[1]
      if property_name in wanted_properties:
        element_name = triple_parts[n]
        # For debugging
        # Result: 55/66 errors come from the subject of "location", but in many cases it's better to translate the subj so don't act.
        # if entity_type == 'subject' and element_name in subj_list_I_dont_want_them_translated_why_were_they:
        #   print(f'  {element_name} is subject of {property_name}')
        if element_name not in seen_elements:
          seen_elements.add(element_name)
          element_list.append(element_name)
  return element_list

# Extract, for each split, the subjects (resp. objects) of the properties listed in
# list_props_subj (resp. list_props_obj); a progress message is printed before each extraction.
print('Extracting subjects from train')
filt_subj20_train = extractFilteredEntities(dataset_20_train, 'subject', list_props_subj)
print('Extracting subjects from dev')
filt_subj20_dev = extractFilteredEntities(dataset_20_dev, 'subject', list_props_subj)
print('Extracting subjects from test')
filt_subj20_test = extractFilteredEntities(dataset_20_test, 'subject', list_props_subj)
print('Extracting objects from train')
filt_obj20_train = extractFilteredEntities(dataset_20_train, 'object', list_props_obj)
print('Extracting objects from dev')
# NOTE(review): "fitl_obj20_dev" looks like a typo for "filt_obj20_dev"; it is used consistently
# below, so it works. Check the other cells before renaming it.
fitl_obj20_dev = extractFilteredEntities(dataset_20_dev, 'object', list_props_obj)
print('Extracting objects from test')
filt_obj20_test = extractFilteredEntities(dataset_20_test, 'object', list_props_obj)
# print(len(filtered_subjects_train), filtered_subjects_train)
# print(len(filtered_objects_train), filtered_objects_train)
# Merge train, dev and test without duplicates and sort; filt_entities is subjects + objects
filt_subjects = sorted(extend_elementList(extend_elementList(extend_elementList([], filt_subj20_train), filt_subj20_dev), filt_subj20_test))
filt_objects = sorted(extend_elementList(extend_elementList(extend_elementList([], filt_obj20_train), fitl_obj20_dev), filt_obj20_test))
filt_entities = sorted(extend_elementList(extend_elementList([], filt_subjects), filt_objects))

print(len(filt_subjects))
print(len(filt_objects))
print(len(filt_entities))

In [None]:
#@title Save filt_subjects and filt_objects lists in text files, one word per line
# Write each filtered list to its own UTF-8 text file, one item per line
for out_name, items_to_save in (
    ('filtered_subjects.txt', filt_subjects),
    ('filtered_objects.txt', filt_objects),
):
  with open(out_name, 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in items_to_save)

In [None]:
# @title Create CSV GEM QIDs
rows = []
header = ['Q-ID', 'Wikidata label', 'WebNLG label', 'Class_RegEx']
# get_wikidata_id_bulk and bar are defined in another cell; the call is expected to fill
# "rows" in place, since its return value is not used here (TODO confirm)
get_wikidata_id_bulk(rows, all_entities, bar)

# newline='' lets the csv writer control the line endings itself, as the csv module
# documentation requires; without it, platforms that translate '\n' end up with '\r\r\n'
with open('WebNLG_QIDs.csv', 'w', encoding='utf-8', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(header)
  writer.writerows(rows)

## GEM shared task: Fix inputs and check uploaded files



In [None]:
# !pip install xmltodict

In [None]:
#@title Fix inputs D2T
import glob
import os
import codecs
import re
import random

# Folder in which the XML files to fix are looked for
path_input_XML = '/content'
# Only the XML files whose name ends with a digit right before ".xml" are selected
list_XML_files = glob.glob(os.path.join(path_input_XML, '*[0-9].xml'))

def getRandomListElementFromDict(dictListValue):
  """ Picks one element at random in the given list (typically the list stored as value of a dico entry) and returns it as a string"""
  return str(dictListValue[random.randrange(len(dictListValue))])

def write_line(out_file, new_line):
  """ Writes new_line to out_file after normalising its indentation; lines made of 6 or 8 spaces only are not written.

  Only lines starting with one of the patterns below are modified (the fixed patterns expect '\r\n' line endings); any other line is written as is.
  """
  # Lines that only contain indentation are dropped
  for blank_pattern in ('^      \r\n', '^        \r\n'):
    if re.search(blank_pattern, new_line):
      return

  # Opening <entry ...> tag: reduce the indentation, keep the rest of the line
  if re.search('^      <entry ', new_line):
    out_file.write(re.sub('^      <entry ', '    <entry ', new_line))
    return

  # Tags for which the whole line is replaced by a fixed string
  fixed_rewrites = (
    ('^          </entry>\r\n', '    </entry>\r\n'),
    ('^      </entry>\r\n', '    </entry>\r\n'),
    ('^            <modifiedtripleset>\r\n', '      <modifiedtripleset>\r\n'),
    ('^            </modifiedtripleset>\r\n', '      </modifiedtripleset>\r\n'),
  )
  for tag_pattern, fixed_line in fixed_rewrites:
    if re.search(tag_pattern, new_line):
      out_file.write(fixed_line)
      return

  # <mtriple> lines: reduce the indentation, keep the rest of the line
  if re.search('^              <mtriple>', new_line):
    out_file.write(re.sub('^              <mtriple>', '        <mtriple>', new_line))
    return

  # Anything else is written unchanged
  out_file.write(new_line)

# For fictional wikidata dataset: DateOfBirth, DateOfDeath, EndOfWorkPeriod
# Maps a property to the candidate values used when a triple with that property has no object
missing_values = { 'DateOfBirth' : ['1975-05-30', '2242-09-22', '1326-05-19', '119-02-24'], 'EndOfWorkPeriod' : ['1684', '3875', '1998', '318'], 'DateOfDeath' : ['1594-12-06', '2532-11-26', '1654-01-09', '221-09-09']}
# For counterfactual webnlg dataset
# For address, buildDate, gridReference, timeshiftChannel, training
# Maps a property to the candidate values that replace the object of every triple with that property
# NOTE(review): the key below is 'builtDate' whereas the comment above says "buildDate" - confirm which label the data uses
replace_obj_of = {'address' : ['103_Colmore_Row', '108_St_Georges_Terrace', '11_Diagonal_Street', '20_Fenchurch_Street', '200_Public_Square', '101 Ukrop Way'], 'builtDate' : ['1986-04-15', '2013-11-04', '1875-03-04', '1894-11-20', '1934-01-01', '2012-12-27'], 'gridReference' : ['NZ289147'], 'timeshiftChannel' : ['HBO_East,_HBO_West'], 'training' : ['School_of_Applied_Arts_in_Stuttgart'] }
# For specific wrong object values
# replace_value = {'"In_Soldevanahalli_Acharya_' : {'creator' : ['Steve_Bright', 'Marie_Curie', 'Olga_Bondareva'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'distributor' : ['Lionsgate', 'Alliance_Films_Corporation', 'Roadside_Attractions'], 'editing' : ['Stacy_Katzman', 'Max_Benedict'], 'gridReference' : ['NZ289147'], 'precededBy' : ['The_Hobbit', 'Let_It_Breed', 'This''ll_Be_My_Year'], 'president' : ['Stacy_Katzman', 'John_F._Kennedy', 'Virginia_DeMarce'], 'producer' : ['The_Velvet_Underground', 'Year_of_No_Light', 'Anatole_de_Grunwald'], 'recordLabel' : ['Polydor_Records', 'Columbia_Records', 'Sony_Music_Entertainment', 'Universal_Music_Group'], 'spouse' : ['Casey_Ribicoff', 'Steve_Bright', 'Marie_Curie'], 'timeshiftChannel' : ['HBO' 'East', 'HBO' 'West'], 'training' : ['School_of_Applied_Arts_in_Stuttgart'], 'type' : ['City', 'Compilation_Album', 'Public_company']}, '"May_1950_-_August_1956' : {'campus' : ['Dijon', '"In' 'Soldevanahalli', 'Acharya' 'Dr.' 'Sarvapalli' 'Radhakrishnan' 'Road', 'Hessarghatta' 'Main' 'Road', 'Bangalore' '–' '560090."'], 'creator' : ['Steve_Bright', 'Marie_Curie', 'Amund_Bjørklund'], 'designer' : ['Sarah_Teale', 'Virginia_DeMarce', 'Olga_Bondareva'], 'director' : ['Sarah_Teale', 'Marie_Curie', 'Stacy_Katzman'], 'editing' : ['Casey_Ribicoff', 'Stacy_Katzman', 'Max_Benedict'], 'musicSubgenre' : ['Southern_sludge', 'Proto-punk', 'Indie_pop'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva'], 'training' : ['School_of_Applied_Arts_in_Stuttgart']}, 'andminus;7' :{'academicStaffSize' : ['500', '986', '4', '42'], 'editing' : ['Casey_Ribicoff', 'Stacy_Katzman', 'Max_Benedict'], 'meaning' : ['Opening_of_hope']}, 'School_of_Applied_Arts_in_Stuttgart' : {'background' : ['non_performing_personnel'], 'buildDate' : ['1986-04-15', '2013-11-04', '1875-03-04', '1894-11-20', '1934-01-01', '2012-12-27'], 'editing' : ['Robert_A._M._Stern', 'Amund_Bjørklund'], 'precededBy' : ['The_Hobbit', 'Let_It_Breed', 
"This'll_Be_My_Year"]}, 'Alliance_Films_Corporation' : {'award' : ['State_Award_for_Superior_Achievement', 'Distinguished_Service_Medal_(United_States_Navy)'], 'buildDate' : ['1986-04-15', '2013-11-04', '1875-03-04', '1894-11-20', '1934-01-01', '2012-12-27'], 'campus' : ['Dijon', '"In_Soldevanahalli,_Acharya_Dr._Sarvapalli_Radhakrishnan_Road,_Hessarghatta_Main_Road,_Bangalore_–_560090."'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'nickname' : ['Asa_Gigante', 'Alvinegro'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']}, '"DeMarce_short_stories_in_the_The_Grantville_Gazettes' : {'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']}, "This'll_Be_My_Year" : {'buildDate' : ['1986-04-15', '2013-11-04', '1875-03-04', '1894-11-20', '1934-01-01', '2012-12-27'], 'campus' : ['Dijon', '"In_Soldevanahalli,_Acharya_Dr._Sarvapalli_Radhakrishnan_Road,_Hessarghatta_Main_Road,_Bangalore_–_560090."'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'nickname' : ['Asa_Gigante', 'Alvinegro'], 'producer' : ['The_Velvet_Underground', 'Year_of_No_Light', 'Anatole_de_Grunwald'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva'], 'training' : ['School_of_Applied_Arts_in_Stuttgart']}  }
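# Same as the dict above, but without the properties already covered by replace_obj_of, and with the problematic values edited.
# Structure: {beginning of a wrong object value : {property : [candidate replacement values]}}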
replace_value = {'\"In_Soldevanahalli_Acharya_' : {'creator' : ['Steve_Bright', 'Marie_Curie', 'Olga_Bondareva'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'distributor' : ['Lionsgate', 'Alliance_Films_Corporation', 'Roadside_Attractions'], 'editing' : ['Stacy_Katzman', 'Max_Benedict'], 'precededBy' : ['The_Hobbit', 'Let_It_Breed'], 'president' : ['Stacy_Katzman', 'John_F._Kennedy', 'Virginia_DeMarce'], 'producer' : ['The_Velvet_Underground', 'Year_of_No_Light', 'Anatole_de_Grunwald'], 'recordLabel' : ['Polydor_Records', 'Columbia_Records', 'Sony_Music_Entertainment', 'Universal_Music_Group'], 'spouse' : ['Casey_Ribicoff', 'Steve_Bright', 'Marie_Curie'], 'type' : ['City', 'Compilation_Album', 'Public_company']}, '"May_1950_-_August_1956' : {'campus' : ['Dijon', 'Bangalore_–_560090'], 'creator' : ['Steve_Bright', 'Marie_Curie', 'Amund_Bjørklund'], 'designer' : ['Sarah_Teale', 'Virginia_DeMarce', 'Olga_Bondareva'], 'director' : ['Sarah_Teale', 'Marie_Curie', 'Stacy_Katzman'], 'editing' : ['Casey_Ribicoff', 'Stacy_Katzman', 'Max_Benedict'], 'musicSubgenre' : ['Southern_sludge', 'Proto-punk', 'Indie_pop'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']}, 'andminus;7' :{'academicStaffSize' : ['500', '986', '4', '42'], 'editing' : ['Casey_Ribicoff', 'Stacy_Katzman', 'Max_Benedict'], 'meaning' : ['Opening_of_hope']}, 'School_of_Applied_Arts_in_Stuttgart' : {'background' : ['non_performing_personnel'], 'editing' : ['Robert_A._M._Stern', 'Amund_Bjørklund'], 'precededBy' : ['The_Hobbit', 'Let_It_Breed']}, 'Alliance_Films_Corporation' : {'award' : ['State_Award_for_Superior_Achievement', 'Distinguished_Service_Medal_(United_States_Navy)'], 'campus' : ['Dijon', 'Bangalore_–_560090'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'nickname' : ['Asa_Gigante', 'Alvinegro'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']}, '"DeMarce_short_stories_in_the_The_Grantville_Gazettes' : {'spouse' : 
['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']}, 'This\'ll_Be_My_Year' : {'campus' : ['Dijon', 'Bangalore_–_560090'], 'director' : ['Sarah_Teale', 'Virginia_DeMarce', 'Stacy_Katzman'], 'nickname' : ['Asa_Gigante', 'Alvinegro'], 'producer' : ['The_Velvet_Underground', 'Year_of_No_Light', 'Anatole_de_Grunwald'], 'spouse' : ['Steve_Bright', 'Amund_Bjørklund', 'Olga_Bondareva']} }

# Lists to store how many fixes are made (index of the fixed lines) and in which files
fixed_missing_obj = []
fixed_missing_obj_file = []
fixed_obj_of = []
fixed_obj_of_file = []
fixed_obj = []
fixed_obj_file = []
for XML_file_path in list_XML_files:
  print(XML_file_path)
  # Read the whole input and close it right away
  with codecs.open(XML_file_path, 'r', 'utf-8') as fi:
    xml_file = fi.readlines()
  # The fixed version is written next to the input, with an "-EDIT" suffix
  new_file_path = str(XML_file_path.rsplit('.', 1)[0])+'-EDIT.xml'
  with codecs.open(new_file_path, 'w', 'utf-8') as fo:
    # Values already generated for the current input, per property: the otriples and mtriples of an input need to get the same value.
    # A property absent from the dico has not been seen yet in the current input, so a new value is generated for it.
    # Both dicos are emptied at each new input (right now multiple substitutions of the same property are not covered).
    stored_value_obj_of = {}
    stored_value_obj = {}
    for count_fix, line in enumerate(xml_file):
      new_line = line
      # print('LINE : "'+line+'"')
      # Fix missing objects (nothing between the last ' | ' and the closing tag), apply to all files
      if re.search(r' \| <', line):
        match_property = 'no'
        for property_empty_obj in missing_values:
          if re.search(re.escape(property_empty_obj)+r' \| <', line):
            match_property = 'yes'
            # Get a random value from the list
            random_value = getRandomListElementFromDict(missing_values[property_empty_obj])
            new_line = re.sub(r' \| <', ' | '+str(random_value)+'<', line)
            fixed_missing_obj.append(count_fix)
            if XML_file_path not in fixed_missing_obj_file:
              fixed_missing_obj_file.append(XML_file_path)
        # If there is no proposed value for the property, just write unknown
        if match_property == 'no':
          new_line = re.sub(r' \| <', ' | Unknown <', line)
        print(new_line)
      else:
        # Change the object values of all instances of a specific list of properties; only apply to webnlg counterfactual
        if XML_file_path == '/content/rdf-to-text-generation-test-data-with-refs-en-counterfactual-v0.7.xml':
          for prop_replace_obj_of in replace_obj_of:
            if re.search(r' \| '+re.escape(prop_replace_obj_of)+r' \| ', line):
              # First time this property is seen in the input: randomly select a value; otherwise reuse the stored one
              if prop_replace_obj_of not in stored_value_obj_of:
                stored_value_obj_of[prop_replace_obj_of] = getRandomListElementFromDict(replace_obj_of[prop_replace_obj_of])
              new_line = re.sub(r' \| [^<\|]+<', ' | '+str(stored_value_obj_of[prop_replace_obj_of])+'<', line)
              fixed_obj_of.append(count_fix)
              if XML_file_path not in fixed_obj_of_file:
                fixed_obj_of_file.append(XML_file_path)
              print(new_line)
        # Change specific object values if found with specific properties; apply to all files (excludes properties handled in the if above so there is no overlap between the 2)
        for obj_value in replace_value:
          # The keys of replace_value are matched as the beginning of the object value
          if re.search(r' \| '+re.escape(obj_value)+r'[^<\|]+<', line):
            for target_property in replace_value[obj_value]:
              if re.search(r' \| '+re.escape(target_property)+r' \| ', line):
                # First time this property is seen in the input: randomly select a value; otherwise reuse the stored one
                if target_property not in stored_value_obj:
                  stored_value_obj[target_property] = getRandomListElementFromDict(replace_value[obj_value][target_property])
                new_line = re.sub(r' \| [^<\|]+<', ' | '+str(stored_value_obj[target_property])+'<', line)
                fixed_obj.append(count_fix)
                if XML_file_path not in fixed_obj_file:
                  fixed_obj_file.append(XML_file_path)
                print(new_line)
      write_line(fo, new_line)
      # When we see a new input, forget the values generated for the previous one
      if re.search('    <entry category=', line):
        stored_value_obj_of.clear()
        stored_value_obj.clear()

print('Fixed '+str(len(fixed_missing_obj))+' missing objects in '+str(fixed_missing_obj_file))
print('Fixed '+str(len(fixed_obj_of))+' objects of a targeted prop in '+str(fixed_obj_of_file))
print('Fixed '+str(len(fixed_obj))+' targeted objects in '+str(fixed_obj_file))

In [None]:
#@title Check uploaded files
import glob
import os
import re
import codecs
import json

# Folder that contains the submitted files to check
out_folder = 'GEM_test'

# Every entry found directly inside out_folder is treated as a submission
paths_submissions = glob.glob(os.path.join(out_folder, '*'))

def check_GEM_submissions(path_submission, sys_output):
  """ Checks the name and the contents of a GEM submission file and prints the outcome.

  path_submission: path of the file; only its basename is examined. The expected shape is
    <system name><task suffix><language suffix><extension>, e.g. "mySystem_D2T-1-FA_en.txt".
  sys_output: the opened file (readable, text mode).

  Checks are made in this order, and the first one that fails is reported: extension, language
  suffix, task suffix, presence of a system name, contents. For txt files with a D2T-1 or D2T-2
  task ID, the number of lines is checked; for jsonl files, every non-empty line must be valid json.
  Nothing is returned.
  """
  # Check name
  task_suffixes = ('_D2T-1-FA', '_D2T-1-FI', '_D2T-1-CFA', '_D2T-2-FA', '_D2T-2-FI', '_D2T-2-CFA', '_Summ-1', '_Summ-2', '_Summ-3')
  d2t1_IDs = ['D2T-1-FA', 'D2T-1-FI', 'D2T-1-CFA']
  d2t2_IDs = ['D2T-2-FA', 'D2T-2-FI', 'D2T-2-CFA']
  languages = ('_en', '_zh', '_de', '_ru', '_es', '_ko', '_hi', '_sw', '_ar')
  extensions = ('.txt', '.jsonl')

  filename = os.path.basename(path_submission)
  print(filename)

  # Check extension
  if not filename.endswith(extensions):
    print(f'  Error filename extension!\n\t{filename} should have one of these extensions (according to task): {extensions}.')
    return
  filename_noExt, extension = filename.rsplit('.', 1)

  # Check language ID
  if not filename_noExt.endswith(languages):
    print(f'  Error filename language suffix!\n\t{filename_noExt} should end with one of these language suffixes: {languages}.')
    return
  filename_noExt_noLang = filename_noExt.rsplit('_', 1)[0]

  # Check task identifier
  if not filename_noExt_noLang.endswith(task_suffixes):
    print(f'  Error filename task suffix!\n\t{filename_noExt} should contain one of these task suffixes: {task_suffixes}.')
    return
  filename_noExt_noLang_noTask, task_ID = filename_noExt_noLang.rsplit('_', 1)

  # There must be a system name before the task suffix
  if len(filename_noExt_noLang_noTask) == 0:
    print(f'  Error filename system name!\n\t{filename_noExt} should have a name before the task suffix.')
    return

  # The name is fine: open the files and check inside
  # txt files are for the D2T task; D2T-1 should have 1,779 lines, D2T-2 should have 1,800 lines.
  if extension == 'txt':
    file_lines = sys_output.readlines()
    # Check line numbers in D2T-1 data
    if task_ID in d2t1_IDs and not len(file_lines) == 1779:
      print(f'  Error line numbers!\n\t{filename} should have 1,779 lines (found {len(file_lines)}).')
    # Check line numbers in D2T-2 data
    elif task_ID in d2t2_IDs and not len(file_lines) == 1800:
      print(f'  Error line numbers!\n\t{filename} should have 1,800 lines (found {len(file_lines)}).')
    else:
      print('  OK!')
  # jsonl files are for the summ task; check well-formedness, one json document per line.
  # (The accepted extension is ".jsonl", so testing for "json" here would skip the check entirely.)
  elif extension == 'jsonl':
    try:
      for json_line in sys_output:
        # Blank lines are tolerated
        if json_line.strip():
          json.loads(json_line)
    except json.JSONDecodeError:
      print(f'  Error json formatting! Check {filename_noExt}.')
    else:
      print('  OK!')
    # There should additionally be code to check the number of outputs in the submitted files

for path_submission in paths_submissions:
  # check_GEM_submissions receives "path_submission" and "sys_output" (the opened file);
  # the "with" block closes each file once it has been checked
  with codecs.open(path_submission, 'r', 'utf-8') as sys_output:
    check_GEM_submissions(path_submission, sys_output)



In [None]:
#@title Extract one reference text per input in the test data
import random
import codecs

# Fixed seed so that the same references are selected at every run
random.seed(42)

with codecs.open('0-references_D2T-1-FA_en.txt', 'w', 'utf-8') as fo:
  # The test data on HuggingFace contains first the D2T data and then the Text-to-Triple data
  for dtp_test in dataset_20_test[:1779]:
    # The reference texts of the data point are stored at index 1
    reference_texts = dtp_test[1]
    # Draw the index of one of these texts at random and write that text on its own line
    id_select = random.randint(0, len(reference_texts)-1)
    # print(len(reference_texts), id_select)
    fo.write(reference_texts[id_select])
    fo.write('\n')

## Get French labels for WebNLG entities

In [None]:
#@title Examine existing files
# We don't really remember what was done for Irish, so I want to understand what the different files used in FromTriples2PredArg contain and what they are for
# Conclusions: all_obj and all_subj contain all other train/dev/test files.
# Conclusions: objValuesTest_dbpediaTranslations is entirely contained in objValues_dbpediaTranslations except for 4 values that I think were cleaned manually.
import codecs

# train_obj = [line.strip() for line in codecs.open('/content/train_objValues.txt', 'r', 'utf-8').readlines()]
# train_subj = [line.strip() for line in codecs.open('/content/train_subValues.txt', 'r', 'utf-8').readlines()]
# test_obj = [line.strip() for line in codecs.open('/content/test_objValues.txt', 'r', 'utf-8').readlines()]
# test_subj = [line.strip() for line in codecs.open('/content/test_subValues.txt', 'r', 'utf-8').readlines()]
# dev_obj = [line.strip() for line in codecs.open('/content/dev_objValues.txt', 'r', 'utf-8').readlines()]
# dev_subj = [line.strip() for line in codecs.open('/content/dev_subValues.txt', 'r', 'utf-8').readlines()]
# all_obj = [line.strip() for line in codecs.open('/content/all_objValues.txt', 'r', 'utf-8').readlines()]
# all_subj = [line.strip() for line in codecs.open('/content/all_subValues.txt', 'r', 'utf-8').readlines()]
# dbpedia_obj = [line.strip() for line in codecs.open('/content/objValues_dbpediaTranslations.txt', 'r', 'utf-8').readlines()]
# dbpedia_objTest = [line.strip() for line in codecs.open('/content/objValuesTest_dbpediaTranslations.txt', 'r', 'utf-8').readlines()]

# dbpedia_obj = [line.strip() for line in codecs.open('/content/objValues_dbpediaTranslations.txt', 'r', 'utf-8').readlines()]
# translate_obj = [line.strip() for line in codecs.open('/content/objValues_googleTranslations.txt', 'r', 'utf-8').readlines()]
# dbpedia_subj = [line.strip() for line in codecs.open('/content/subValues_dbpediaTranslations.txt', 'r', 'utf-8').readlines()]
# translate_subj = [line.strip() for line in codecs.open('/content/subValues_googleTranslations.txt', 'r', 'utf-8').readlines()]

def check_overlap_lists(list1, list2):
  """
  Report on stdout how much of list1 is covered by list2.
  Prints one of three messages: everything found, nothing found, or the count and the list of missing elements.
  Returns nothing.
  """
  not_found = [element for element in list1 if element not in list2]
  # Full coverage (this is also what an empty list1 reports)
  if not not_found:
    print('All elements of the first list are in the second list!')
    return
  # Every single element is missing
  if len(not_found) == len(list1):
    print('No overlap found between the two lists!')
    return
  # Partial coverage: list what is missing
  print(f'{len(not_found)} elements of the first list are not in the second list: {not_found}')

# check_overlap_lists(dbpedia_subj, translate_subj)

In [None]:
#@title Functions to get DBpedia and Google translations
# Adapted from ChatGPT: Please write some Python code which, given an English entity label (with underscores instead of spaces), retrieves the corresponding French label according to DBpedia.
from googletrans import Translator
import requests

def get_translated_label_dbpedia(language, entity_label, timeout=30):
  """
  Query DBpedia for the rdfs:label of an entity in the requested language.

  language: language tag used in the SPARQL FILTER (e.g. 'fr', 'ga').
  entity_label: English entity label; spaces are replaced by underscores before formatting.
  timeout: seconds to wait for the SPARQL endpoint before giving up (new, optional).

  Returns the first label found, or None when DBpedia has no label in that language
  or when the request fails (network error, HTTP error status, non-JSON answer).
  Returning None on failure lets the caller fall back to machine translation
  instead of aborting a long batch because of one bad request.
  """
  # Format entity to escape/remove all reserved characters
  formatted_entity_label = format_entity_dbp(entity_label.replace(' ', '_'))
  # DBpedia SPARQL endpoint
  sparql_endpoint = "http://dbpedia.org/sparql"
  # Construct the SPARQL query
  query = f"""
  SELECT ?label WHERE {{
    dbr:{formatted_entity_label} rdfs:label ?label .
    FILTER (lang(?label) = '{language}')
  }}
  """

  # Send the request to the SPARQL endpoint; without a timeout a stalled connection would hang the cell forever
  try:
    response = requests.get(sparql_endpoint, params={'query': query, 'format': 'json'}, timeout=timeout)
    response.raise_for_status()
    data = response.json()
  except (requests.RequestException, ValueError) as error:
    # ValueError covers JSON decoding errors raised by older versions of requests
    print(f'  DBpedia request failed for {entity_label}: {error}')
    return None

  # Extract the label in the requested language from the results
  try:
    return data['results']['bindings'][0]['label']['value']
  except (IndexError, KeyError, TypeError):
    return None  # Return None if no label is found for that language

# Example usage
# entity = "People\\'s_Party_(Spain)"  # Example entity
# entity = "Fried_chicken"  # Example entity
# french_label = get_translated_label_dbpedia('fr', entity)
# print(f"French label for {entity}: {french_label.replace(' ', '_')}")

translator = Translator()
def get_translated_labels_google(dest_language, entity_list):
  """
  Translate a list of entity labels into dest_language with the module-level googletrans translator.
  Underscores are turned into spaces before translation; the returned list holds the stripped translated texts.
  """
  # Google Translate receives the labels with spaces instead of underscores
  spaced_labels = []
  for entity_label in entity_list:
    spaced_labels.append(entity_label.replace('_', ' '))
  # We're not reintroducing underscores since that's what I initially sent to the triple2predarg module and I don't want to break anything
  return [translation.text.strip() for translation in translator.translate(spaced_labels, dest=dest_language)]

In [None]:
#@title Translate entities from GA files using DBpedia or Google Translate if not found on DBpedia
import os
import codecs

dest_language = 'fr'#@param['fr', 'ga']

def _read_entity_lines(filepath, first_column_only=False):
  """
  Read a UTF-8 file and return one stripped string per line; the file is closed before returning.
  With first_column_only=True, only the text before the first asterisk of each line is kept.
  """
  with codecs.open(filepath, 'r', 'utf-8') as entity_file:
    lines = entity_file.readlines()
  if first_column_only:
    return [line.split('*')[0].strip() for line in lines]
  return [line.strip() for line in lines]

# The files' lines are an english word and an irish word separated by an asterisk (asterisk possibly surrounded by tabs)
# For Google translate, we need spaces between the words, not for DBpedia (the DBpedia function takes care of formatting)
# Update: if we remove underscores here we're losing the alignment with the entities in the WebNLG inputs; now done within the Google function
ga_dbpedia_obj = []
ga_translate_obj = []
ga_dbpedia_subj = []
ga_translate_subj = []
# If we use uploaded GA files as input (only the presence of the first file is checked; the other three are expected alongside it)
if os.path.exists('/content/objValues_dbpediaTranslations.txt'):
  print('Starting to process files coming from GA experiments...')
  # WAS: ga_dbpedia_obj = [line.split('*')[0].replace('_', ' ').strip() for line in codecs.open('/content/objValues_dbpediaTranslations.txt', 'r', 'utf-8').readlines()]
  ga_dbpedia_obj = _read_entity_lines('/content/objValues_dbpediaTranslations.txt', first_column_only=True)
  ga_translate_obj = _read_entity_lines('/content/objValues_googleTranslations.txt', first_column_only=True)
  ga_dbpedia_subj = _read_entity_lines('/content/subValues_dbpediaTranslations.txt', first_column_only=True)
  ga_translate_subj = _read_entity_lines('/content/subValues_googleTranslations.txt', first_column_only=True)
# if we use the subjects and objects collected from the WebNLG data using the cells above
elif os.path.exists('/content/filtered_objects.txt'):
  print('Starting to process files with only filtered WebNLG entities...')
  ga_dbpedia_obj = _read_entity_lines('/content/filtered_objects.txt')
  ga_dbpedia_subj = _read_entity_lines('/content/filtered_subjects.txt')
elif os.path.exists('/content/all_objects.txt'):
  print('Starting to process files with all WebNLG entities...')
  ga_dbpedia_obj = _read_entity_lines('/content/all_objects.txt')
  ga_dbpedia_subj = _read_entity_lines('/content/all_subjects.txt')

# These are the lists that will contain english and translated entities aligned; we split files by subject/object and dbpedia/google because that's what the triple2predarg code expects
list_gtrans_en_subj = []
list_gtrans_en_obj = []
list_gtrans_dest_subj = []
list_gtrans_dest_obj = []
list_dbp_en_subj = []
list_dbp_en_obj = []
list_dbp_dest_subj = []
list_dbp_dest_obj = []

def get_DBpedia_labels_and_list_for_gtrans(dest_language, list_en, list_dbp_en, list_dbp_dest, list_for_gtrans):
  """
  Retrieve the DBpedia labels of English entities in the target language, and collect the entities that still need an MT system.

  dest_language: language code passed on to get_translated_label_dbpedia.
  list_en: English entities to look up (spaces or underscores; found entities are stored with underscores).
  list_dbp_en, list_dbp_dest: lists extended in place with the aligned English / target-language DBpedia labels.
  list_for_gtrans: list extended in place with the entities for which DBpedia returned nothing.

  The three output lists are modified in place, so several calls can accumulate into the same lists.
  They are also returned, in the order (list_dbp_en, list_dbp_dest, list_for_gtrans), for convenience.
  """
  for i, entity in enumerate(list_en):
    print(f'  Processing entity {i+1}/{len(list_en)} - {entity}...')
    result_query = get_translated_label_dbpedia(dest_language, entity)
    if result_query is None:
      # No DBpedia label: keep the entity unchanged for machine translation
      list_for_gtrans.append(entity)
    else:
      # We need to reintroduce underscores for the entities to match the ones in the WebNLG inputs
      list_dbp_en.append(entity.replace(' ', '_'))
      list_dbp_dest.append(result_query.replace(' ', '_'))
  return list_dbp_en, list_dbp_dest, list_for_gtrans

# Get lists for subject entities
# The DBpedia lookup fills list_dbp_en_subj / list_dbp_dest_subj in place and queues the misses in list_gtrans_en_subj
print('Getting lists for subject entities...')
get_DBpedia_labels_and_list_for_gtrans(dest_language, ga_dbpedia_subj, list_dbp_en_subj, list_dbp_dest_subj, list_gtrans_en_subj)
# With the GA input files, the entities that were Google-translated for GA are also tried on DBpedia for the current language
if os.path.exists('/content/objValues_dbpediaTranslations.txt'):
  get_DBpedia_labels_and_list_for_gtrans(dest_language, ga_translate_subj, list_dbp_en_subj, list_dbp_dest_subj, list_gtrans_en_subj)
# Translate subject entities not found on DBpedia
if len(list_gtrans_en_subj) > 0:
  list_gtrans_dest_subj = get_translated_labels_google(dest_language, list_gtrans_en_subj)
# The "expected" counts are the sizes of the input lists; the DBp/GTr lines below show how the entities were actually split
print(f'Subjects: expected {len(ga_dbpedia_subj)+len(ga_translate_subj)} items (DBp: {len(ga_dbpedia_subj)}; GTr: {len(ga_translate_subj)}).')
print(f'DBp en:\t{len(list_dbp_en_subj)} {list_dbp_en_subj}')
print(f'DBp {dest_language}:\t{len(list_dbp_dest_subj)} {list_dbp_dest_subj}')
print(f'GTr en:\t{len(list_gtrans_en_subj)} {list_gtrans_en_subj}')
print(f'GTr {dest_language}:\t{len(list_gtrans_dest_subj)} {list_gtrans_dest_subj}')
print('\n')

# Get lists for object entities (same steps as for the subjects, with the *_obj lists)
print('Getting lists for object entities...')
get_DBpedia_labels_and_list_for_gtrans(dest_language, ga_dbpedia_obj, list_dbp_en_obj, list_dbp_dest_obj, list_gtrans_en_obj)
if os.path.exists('/content/objValues_dbpediaTranslations.txt'):
  get_DBpedia_labels_and_list_for_gtrans(dest_language, ga_translate_obj, list_dbp_en_obj, list_dbp_dest_obj, list_gtrans_en_obj)
# Translate object entities not found on DBpedia
if len(list_gtrans_en_obj) > 0:
  list_gtrans_dest_obj = get_translated_labels_google(dest_language, list_gtrans_en_obj)
print(f'Objects: expected {len(ga_dbpedia_obj)+len(ga_translate_obj)} items (DBp: {len(ga_dbpedia_obj)}; GTr: {len(ga_translate_obj)}).')
print(f'DBp en:\t{len(list_dbp_en_obj)} {list_dbp_en_obj}')
print(f'DBp {dest_language}:\t{len(list_dbp_dest_obj)} {list_dbp_dest_obj}')
print(f'GTr en:\t{len(list_gtrans_en_obj)} {list_gtrans_en_obj}')
print(f'GTr {dest_language}:\t{len(list_gtrans_dest_obj)} {list_gtrans_dest_obj}')

In [None]:
#@title Build file with WebNLG EN and destination values
import re
import codecs

def build_tr_file(list_en, list_dest, dest_language, subj_obj, dbp_mt):
  """
  Write the file '<dest_language>_<subj_obj>Values_<dbp_mt>Translations.txt' in the current directory.

  list_en / list_dest: aligned lists of English entities and their translations (list_dest[i] translates list_en[i]).
  dest_language: target language code; it prefixes the file name and selects the coordinating conjunction.
  subj_obj: 'sub' or 'obj', only used in the file name.
  dbp_mt: 'dbpedia' (separator '*') or anything else (separator TAB*TAB).

  For each pair whose two values differ, one line 'source<separator>destination' is written, plus variants of it:
  a version with the last comma of a long enumeration replaced by a conjunction, a version without double quotes,
  and lowercased / underscored versions of the source.
  """
  # Conjunction used in the destination when the last comma of an enumeration is replaced.
  # 'et' used to be hard-coded, which produced French inside Irish entities; unknown languages keep the old 'et'.
  coordinators = {'fr': 'et', 'ga': 'agus'}
  dest_coordinator = coordinators.get(dest_language, 'et')
  # Stam's code had no tabs in the dbpedia files, so I keep doing the same to not break anything
  separator = '*' if dbp_mt == 'dbpedia' else '\t*\t'
  # Raw strings: the patterns contain \s, \( and \g, which are invalid escapes in regular string literals
  enumeration = r'[^,]+,[\s_][^,]+,[\s_][^,]+,[\s_][^,]+$'
  coordinated_enumeration = r'[^,]+,[\s_][^,]+,[\s_][^,]+[\s_](and|or)[^,]+$'
  last_comma = r'([^,]+,[\s_][^,]+,[\s_][^,]+),([\s_])([^,]+)$'
  # The context manager closes the file even if a pair raises (e.g. list_dest shorter than list_en)
  with codecs.open(dest_language+'_'+subj_obj+'Values_'+dbp_mt+'Translations.txt', 'w', 'utf-8') as fo:
    for x, source_entity in enumerate(list_en):
      dest_entity_full = list_dest[x]
      # Only write in file if values are actually different (this was added at a later stage, so the first version of the files have all pairs regardless of the values)
      if source_entity == dest_entity_full:
        continue
      dest_entity_noParenth = re.sub(r'_\(.*$', '', dest_entity_full)
      # removing commas may not be a great idea, that's how we write addresses naturally in french (39, dargle road)
      source_entity_final = []
      dest_entity_final = []
      # If the entity ends with several commas, replace the last one by a coordination, and put that updated version with "and" in the list of entities to write
      if re.search(enumeration, source_entity) and not re.search(coordinated_enumeration, source_entity):
        coord_source = re.sub(last_comma, r'\g<1>\g<2>and\g<2>\g<3>', source_entity)
        coord_dest = re.sub(last_comma, r'\g<1>\g<2>' + dest_coordinator + r'\g<2>\g<3>', dest_entity_full)
        source_entity_final.append(coord_source)
        dest_entity_final.append(coord_dest)
        # If the entity has quotes, also add a non-quoted version of the coordinated entity
        if re.search('^"', source_entity):
          source_entity_final.append(coord_source.replace('"', ''))
          dest_entity_final.append(coord_dest.replace('"', ''))

      # If the entity has quotes, put a version of it without quotes in the list of entities to write
      if re.search('"', source_entity):
        source_entity_final.append(source_entity.replace('"', ''))
        dest_entity_final.append(dest_entity_full.replace('"', ''))

      # Also add all entities as such; just add the source as is and the destination with or without parentheses
      source_entity_final.append(source_entity)
      # If the original entity has a parenthesis or a comma, leave the parenthesis in the destination entity (simple check but will work most of the time)
      if re.search(r'\(', source_entity) or re.search(',', source_entity):
        dest_entity_final.append(dest_entity_full)
      else:
        dest_entity_final.append(dest_entity_noParenth)

      # Both lists are always appended to together, so they have the same length
      for source_variant, dest_variant in zip(source_entity_final, dest_entity_final):
        # Write output file
        fo.write(source_variant+separator+dest_variant+'\n')
        # For each input entity, write a lowercase and underscore counterpart, because lowercasing/underscoring is sometimes happening in the triple2predArg conversion
        # I also found that some commas are replaced by "and"
        has_uppercase = re.search('[A-Z]', source_variant)
        has_space = ' ' in source_variant
        if has_uppercase and has_space:
          fo.write(source_variant.lower()+separator+dest_variant.lower()+'\n')
          fo.write(source_variant.replace(' ', '_')+separator+dest_variant+'\n')
          fo.write(source_variant.replace(' ', '_').lower()+separator+dest_variant.lower()+'\n')
        elif has_uppercase:
          fo.write(source_variant.lower()+separator+dest_variant.lower()+'\n')
        elif has_space:
          fo.write(source_variant.replace(' ', '_')+separator+dest_variant+'\n')

# Create output files by pairing english and target language entities
# Four files are written, one per combination of subject/object ('sub'/'obj') and translation source ('dbpedia'/'google');
# the lists come from the translation cell above, so that cell must have been run first
build_tr_file(list_dbp_en_subj, list_dbp_dest_subj, dest_language, 'sub', 'dbpedia')
build_tr_file(list_dbp_en_obj, list_dbp_dest_obj, dest_language, 'obj', 'dbpedia')
build_tr_file(list_gtrans_en_subj, list_gtrans_dest_subj, dest_language, 'sub', 'google')
build_tr_file(list_gtrans_en_obj, list_gtrans_dest_obj, dest_language, 'obj', 'google')

# WebNLG 2020 shared task data (system outputs, human ratings, etc.)

In [None]:
# Clone the WebNLG 2020 challenge repository; the cells below read submissions and human ratings from /content/challenge-2020
! git clone https://github.com/WebNLG/challenge-2020.git
# Hide the git output (clear_output is imported from IPython.display in the setup cells)
clear_output()

In [None]:
#@title Print outputs of the different systems
import glob
import os
import codecs
from IPython.display import HTML, display

# Make long lines wrap in the output of every cell
def set_css():
  """Display a <style> element that sets white-space: pre-wrap on <pre> elements."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))
# Run set_css before each cell execution
get_ipython().events.register('pre_run_cell', set_css)

keepers = [5,6,20,23]

# Read the primary output file of every system once (instead of once per kept text) and close it right away.
# Each item is a pair (system name, list with all outputs of that system, one per line).
all_system_outputs = []
for folder_path in sorted(glob.glob('/content/challenge-2020/submissions/rdf2text/en/*')):
  with codecs.open(os.path.join(folder_path, 'primary.en'), 'r', 'utf-8') as outputs_file:
    # The system name is the last component of the folder path
    all_system_outputs.append((folder_path.rsplit('/', 1)[1], outputs_file.readlines()))

# text_id = 24
# (the loop variable used to be called "id", which shadowed the builtin for the rest of the session)
for text_id in keepers:
  print(f'Text #{text_id}\n----------\n')
  for input_triple in dataset_20_test[text_id][0]:
    print(input_triple)
  print('\nReferences')
  for reference in dataset_20_test[text_id][1]:
    print(reference)
  print('\n')

  # Print the output of each system for the same position
  for system_name, outputs_file_lines in all_system_outputs:
    print(system_name)
    print(outputs_file_lines[text_id])

In [None]:
#@title Get system outputs according to human scores

import json
import codecs
import statistics

# The threshold t will be used to filter the texts that have a rating >= t when averaging all annotators for the selected criteria
threshold = '95' #@param[100, 99, 98, 97, 96, 95, 94, 93, 92, 91, 90, 85, 80]
threshold = int(threshold)

def _load_human_ratings(system_folder):
  """Load the primary.json human ratings of one system; the file is closed once parsed."""
  ratings_path = '/content/challenge-2020/evaluation/human-evaluation/results/en/' + system_folder + '/primary.json'
  with codecs.open(ratings_path, 'r', 'utf-8') as ratings_file:
    return json.load(ratings_file)

def _load_submitted_texts(system_folder):
  """Return the lines of the primary.en submission of one system (one text per line); the file is closed afterwards."""
  outputs_path = '/content/challenge-2020/submissions/rdf2text/en/' + system_folder + '/primary.en'
  with codecs.open(outputs_path, 'r', 'utf-8') as outputs_file:
    return outputs_file.readlines()

FORGe_ratings = _load_human_ratings('Baseline-FORGE2020')
Amazon_ratings = _load_human_ratings('Amazon_AI_(Shanghai)')
OSU_ratings = _load_human_ratings('OSU_Neural_NLG')
FB_ratings = _load_human_ratings('FBConvAI')

Amazon_1textPerLine = _load_submitted_texts('Amazon_AI_(Shanghai)')
OSU_1textPerLine = _load_submitted_texts('OSU_Neural_NLG')
FB_1textPerLine = _load_submitted_texts('FBConvAI')


def select_accurate_outputs(system_ratings, threshold):
  """
  Return the IDs of the outputs whose mean human score reaches the threshold.

  system_ratings: mapping output ID -> evaluator -> criterion -> score.
  threshold: minimum mean (over all evaluators and selected criteria) for an output to be kept.
  Prints a message for each ID without scores, then the number and the list of kept IDs.
  """
  # Criteria taken into account; 'TextStructure' and 'Fluency' are left out
  selected_criteria = ('DataCoverage', 'Relevance', 'Correctness')
  kept_outputs = []
  for system_output_id in system_ratings:
    ratings_by_evaluator = system_ratings[system_output_id]
    scores = [
      ratings_by_evaluator[evaluator][criterion]
      for evaluator in ratings_by_evaluator
      for criterion in selected_criteria
    ]
    if not scores:
      print(f'No scores for datapoint #{system_output_id}')
      continue
    if statistics.mean(scores) >= threshold:
      kept_outputs.append(system_output_id)
  print(f'{len(kept_outputs)}/{len(system_ratings)} outputs can be used!')
  print(kept_outputs)
  return kept_outputs

def combine_lists(main_list, list_to_add):
  """Append to main_list, in place, every element of list_to_add that it does not contain yet."""
  for candidate in list_to_add:
    if candidate in main_list:
      continue
    main_list.append(candidate)

# NOTE(review): despite its name, this list receives the integer IDs built below, not strings
list_all_IDs_string = []
# select_accurate_outputs returns a list of ID as strings; convert these to integers
print('FORGe')
list_FORGe_ids = [int(x) for x in select_accurate_outputs(FORGe_ratings, threshold)]
print('AmazonAI')
list_Amazon_ids = [int(x) for x in select_accurate_outputs(Amazon_ratings, threshold)]
print('OSU')
list_OSU_ids = [int(x) for x in select_accurate_outputs(OSU_ratings, threshold)]
print('FBConvAI')
list_FB_ids = [int(x) for x in select_accurate_outputs(FB_ratings, threshold)]

# Get a list with all IDs for which we have at least one reference
# (the FORGe IDs are printed above but not combined: only the three other systems provide target texts here)
combine_lists(list_all_IDs_string, list_Amazon_ids)
combine_lists(list_all_IDs_string, list_OSU_ids)
combine_lists(list_all_IDs_string, list_FB_ids)

print('All LLMs combined')
# Sort the list to maintain alignment with the FORGe outputs later (I think)
list_all_LLMout_IDs = sorted([int(x) for x in list_all_IDs_string])
print(f'{len(list_all_LLMout_IDs)} IDs selected: {list_all_LLMout_IDs}.')

# Now build a resource that looks like dataset_20_test (see "Make data post-processing experiments")
# So we can use the same functions as for the train/dev/test splits
# NOTE(review): the rating IDs are used directly as 0-based line indices of the primary.en files - confirm that the IDs in primary.json are 0-based
dataset_system_outputs = []
for c, ID in enumerate(list_all_LLMout_IDs):
  # For each datapoint, create two lists, the first one for the input (empty here), the second one for the texts
  dataset_system_outputs.append([[], []])
  if ID in list_Amazon_ids:
    dataset_system_outputs[c][1].append(Amazon_1textPerLine[ID].strip())
  # The OSU and FB texts are only added if the same text was not already added for this datapoint
  if ID in list_OSU_ids:
    if OSU_1textPerLine[ID].strip() not in dataset_system_outputs[c][1]:
      dataset_system_outputs[c][1].append(OSU_1textPerLine[ID].strip())
    # else:
    #   print(f'Duplicate OSU {ID}')
  if ID in list_FB_ids:
    if FB_1textPerLine[ID].strip() not in dataset_system_outputs[c][1]:
      dataset_system_outputs[c][1].append(FB_1textPerLine[ID].strip())
    # else:
    #   print(f'Duplicate FB {ID}')
print(dataset_system_outputs)

# list_Amazon_texts = [Amazon_1textPerLine[int(i)].strip() for i in list_Amazon_ids]
# list_OSU_texts = [OSU_1textPerLine[int(i)].strip() for i in list_OSU_ids]
# list_FB_texts = [FB_1textPerLine[int(i)].strip() for i in list_FB_ids]

In [None]:
#@title Make fine-tuning data for post-processing experiments
# The idea is to compile:
# - all pairs <WebNLG-Triple>/<unique-ref-text> for train, dev, test
# - all pairs <FORGe-output>/<unique-ref-text> for train, dev test
# - all pairs <FORGe-output>/<accurate-and-fluent-LLM-output> for test
# And the same 3 datasets as above but with multiple references for each input
# We already stored the data quite simply: dataset_20_test[0][0] contains the triples of the first datapoint, dataset_20_test[0][1] contains the ref texts of the first datapoint
# E.g. [['Ballistic_(comicsCharacter) | creator | Doug_Moench', 'Ballistic_(comicsCharacter) | creator | "Michael Manley"'], ['The creators of the comic character Ballistic were Michael Manley and Doug Moench.', 'Doug Moench and Michael Manley, created the comic character Ballistic.', 'The comic book character Ballistic was created by Michael Manley and Doug Moench.']]

# Colab form fields: each 'yes' enables the corresponding "if create_csv_... == 'yes'" section at the end of this cell
create_csv_triple2ref = 'no'#@param['yes', 'no']
create_csv_forge2ref = 'no'#@param['yes', 'no']
create_csv_forge2llm = 'yes'#@param['yes', 'no']

# Folder that is created (or emptied) before the FORGe texts are written, and that is passed to postProcess.py
txt_folder = '/content/textFiles'

import csv
import codecs
import glob
import os

# Number of datapoints to read from each split
num_inputs_data20_train = len(dataset_20_train)
num_inputs_data20_dev = len(dataset_20_dev)
# Hard-coded for test: only the first 1779 items of the HuggingFace test data are RDF2Text items with references
num_inputs_data20_test = 1779

def check_num_datapoints_unique(count_original_inputs_test, pairs_input_test, count_original_inputs_dev, pairs_input_dev, count_original_inputs_train, pairs_input_train):
  """
  Sanity check for the unique-reference data: compare, for each split, the number of inputs read and the number of
  input/text pairs built with the expected values (test 1779/5150, dev 1667/4464, train 13211/35426).
  Prints one error line per split that differs, or '  All good!' when all three splits match.
  """
  # One row per split: observed (inputs, pairs), expected (inputs, pairs), message printed on mismatch
  split_checks = (
    ((count_original_inputs_test, pairs_input_test), (1779, 5150), '  Error number test items!'),
    ((count_original_inputs_dev, pairs_input_dev), (1667, 4464), '  Error number dev items!'),
    ((count_original_inputs_train, pairs_input_train), (13211, 35426), '  Error number train items!'),
  )
  errors = 0
  for observed, expected, message in split_checks:
    if observed != expected:
      print(message)
      errors += 1
  if not errors:
    print('  All good!')

def check_num_datapoints_multi(count_original_inputs_test, pairs_input_test, count_original_inputs_dev, pairs_input_dev, count_original_inputs_train, pairs_input_train):
  """
  Sanity check for the multiple-reference data: for each split, both the number of inputs read and the number of
  rows built must equal the expected number of datapoints (test 1779, dev 1667, train 13211).
  Prints one error line per split that differs, or '  All good!' when all three splits match.
  """
  # One row per split: observed (inputs, rows), expected (inputs, rows), message printed on mismatch
  split_checks = (
    ((count_original_inputs_test, pairs_input_test), (1779, 1779), '  Error number test items!'),
    ((count_original_inputs_dev, pairs_input_dev), (1667, 1667), '  Error number dev items!'),
    ((count_original_inputs_train, pairs_input_train), (13211, 13211), '  Error number train items!'),
  )
  errors = 0
  for observed, expected, message in split_checks:
    if observed != expected:
      print(message)
      errors += 1
  if not errors:
    print('  All good!')

def create_csv(rows, dataset, split, reference_num, threshold = ''):
  """Write rows to 'WebNLG_<dataset>_<split>_<reference_num><threshold>.csv' in the current directory, one CSV line per row."""
  csv_name = f'WebNLG_{dataset}_{split}_{reference_num}{threshold}.csv'
  # newline='' leaves the line endings to the csv module
  with open(csv_name, 'w', encoding='UTF8', newline='') as csv_file:
    csv.writer(csv_file).writerows(rows)

def write_text_file(list_texts, fileprefix, split):
  """Write each text of list_texts on its own line in the file '<fileprefix>_<split>.txt'."""
  target_path = f'{fileprefix}_{split}.txt'
  with open(target_path, 'w', encoding='UTF8') as text_file:
    for text in list_texts:
      text_file.write(text + '\n')

def gather_ModD2T_texts_in_file(filepath, txt_folder):
  """
  Read a ModD2T file and store all texts (the lines starting with '# text = ') in a list.

  filepath: path to the .conllu file.
  txt_folder: currently unused; the output files go to the relative folder 'textFiles'.
    NOTE(review): this only matches txt_folder when the working directory is its parent - confirm.

  For the three known Mod-D2T paths (test/dev/train), the texts are also written to
  'textFiles/WebNLG_ModD2T-out_<split>.txt' to compare to the official WebNLG'20 submission.
  Returns the list of texts.
  """
  # Local import: this cell does not import re itself and should not depend on another cell having done so
  import re
  with codecs.open(filepath, 'r', 'utf-8') as conllu_file:
    lines = conllu_file.readlines()
  texts_ModD2T = []
  for line in lines:
    if line.startswith('# text = '):
      # Split only on the first ' = ' so that texts which contain ' = ' themselves are not truncated;
      # then turn the escaped '\&' back into '&'
      texts_ModD2T.append(re.subn(r'\\&', '&', line.split(' = ', 1)[1])[0].strip())
  # From the list, write the ModD2T FORGe outputs in a separate file to compare to the official WebNLG'20 submission
  split_by_path = {
    '/content/Mod-D2T/conllu-en_INLG23/test/01-PredArgNorm.conllu': 'test',
    '/content/Mod-D2T/conllu-en_INLG23/dev/01-PredArgNorm.conllu': 'dev',
    '/content/Mod-D2T/conllu-en_INLG23/train/01-PredArgNorm.conllu': 'train',
  }
  split = split_by_path.get(filepath)
  if split is not None:
    write_text_file(texts_ModD2T, 'textFiles/WebNLG_ModD2T-out', split)
  return texts_ModD2T

def align_input_uniqueRefText(dataset, num_inputs, forge_texts = None):
  """
  dataset = a list of the following: [['Ballistic_(comicsCharacter) | creator | Doug_Moench', 'Ballistic_(comicsCharacter) | creator | "Michael Manley"'], ['The creators of the comic character Ballistic were Michael Manley and Doug Moench.', 'Doug Moench and Michael Manley, created the comic character Ballistic.', 'The comic book character Ballistic was created by Michael Manley and Doug Moench.']]
  num_inputs = an integer, the number of datapoints to read from the start of dataset
  forge_texts = optional list of texts aligned with dataset; when given, forge_texts[i] replaces the triples of datapoint i as input
  To create the triple2ref data and the forge2ref data.
  With this function, we aim at building alignments that can be exported into a CSV.
  Each CSV row must be a list of 2 columns, and all rows must be kept in a list. Here's what one row should look like:
  ['Nie_Haisheng | birthDate | 1964-10-13, Nie_Haisheng | occupation | Fighter_pilot', 'Nie Haisheng born on 10/13/1964 is a fighter pilot.']
  Returns the list of rows and the number of datapoints read. A datapoint without any text contributes no row.
  """
  pairs = []
  num_datapoints_read = 0
  # On HuggingFace, they concatenated all RDF2Text and Text2RDF test sets; the RDF2Text data with references consists of the first 1779 items (the next 1779 items are without refs, then it's Text2RDF)
  # So we need to select only the first 1779 text lists of the test data (we use num_inputs to capture this number).
  for c in range(num_inputs):
    reference_texts = dataset[c][1]
    if reference_texts:
      # If we are dealing with input triples, join the triples of each input into one string; otherwise, just copy the text
      if forge_texts is None:
        input_side = ', '.join(dataset[c][0])
      else:
        input_side = forge_texts[c]
      # One row per reference text, all with the same input. Appending the rows directly (instead of computing
      # their positions from a running counter) keeps datapoints without texts from shifting the positions.
      for reference_text in reference_texts:
        pairs.append([input_side, reference_text])
    num_datapoints_read += 1
  return pairs, num_datapoints_read

def align_input_multipleRefTexts(dataset, num_inputs, forge_texts = None):
  """
  Build one row per datapoint: [input, list of all texts of the datapoint].
  dataset and num_inputs are as for the unique-reference alignment: dataset[i][0] holds the triples and dataset[i][1]
  the texts of datapoint i, and only the first num_inputs datapoints are read.
  forge_texts = optional list of texts aligned with dataset; when given, forge_texts[i] replaces the triples as input.
  The second column is the datapoint's own list of texts (not a copy).
  Returns the list of rows and the number of datapoints read.
  """
  pairs = []
  num_datapoints_read = 0
  for c in range(num_inputs):
    # If we are dealing with input triples, join the triples of each input into one string
    if forge_texts is None:
      input_side = ', '.join(dataset[c][0])
    # Otherwise, just copy the FORGe text as input
    else:
      input_side = forge_texts[c]
    # Add the reference texts
    pairs.append([input_side, dataset[c][1]])
    num_datapoints_read += 1
  return pairs, num_datapoints_read

# Section 1: pairs <WebNLG-Triple>/<ref-text> for train, dev and test, with unique and with multiple references
if create_csv_triple2ref == 'yes':
  print('Creating triple2ref data')
  # Align inputs and outputs for unique ref
  # (each call returns the rows for the CSV and the number of datapoints that were read)
  pairs_input_uniqueRef_test20, count_original_inputs_test20_u = align_input_uniqueRefText(dataset_20_test, num_inputs_data20_test)
  pairs_input_uniqueRef_dev20, count_original_inputs_dev20_u = align_input_uniqueRefText(dataset_20_dev, num_inputs_data20_dev)
  pairs_input_uniqueRef_train20, count_original_inputs_train20_u = align_input_uniqueRefText(dataset_20_train, num_inputs_data20_train)
  # Align inputs and outputs for multiple refs
  pairs_input_multiRefs_test20, count_original_inputs_test20_m = align_input_multipleRefTexts(dataset_20_test, num_inputs_data20_test)
  pairs_input_multiRefs_dev20, count_original_inputs_dev20_m = align_input_multipleRefTexts(dataset_20_dev, num_inputs_data20_dev)
  pairs_input_multiRefs_train20, count_original_inputs_train20_m = align_input_multipleRefTexts(dataset_20_train, num_inputs_data20_train)
  # Create CSVs for unique ref (files WebNLG_triple2ref_<split>_uniqueRef.csv)
  create_csv(pairs_input_uniqueRef_test20, 'triple2ref', 'test', 'uniqueRef')
  create_csv(pairs_input_uniqueRef_dev20, 'triple2ref', 'dev', 'uniqueRef')
  create_csv(pairs_input_uniqueRef_train20, 'triple2ref', 'train', 'uniqueRef')
  # Create CSVs for multiple refs (files WebNLG_triple2ref_<split>_multiRef.csv)
  create_csv(pairs_input_multiRefs_test20, 'triple2ref', 'test', 'multiRef')
  create_csv(pairs_input_multiRefs_dev20, 'triple2ref', 'dev', 'multiRef')
  create_csv(pairs_input_multiRefs_train20, 'triple2ref', 'train', 'multiRef')
  # Check number of inputs and number of rows against the expected values hard-coded in the check functions
  print('Checking triple2ref data unique ref')
  check_num_datapoints_unique(count_original_inputs_test20_u, len(pairs_input_uniqueRef_test20), count_original_inputs_dev20_u, len(pairs_input_uniqueRef_dev20), count_original_inputs_train20_u, len(pairs_input_uniqueRef_train20))
  print('Checking triple2ref data multiple refs')
  check_num_datapoints_multi(count_original_inputs_test20_m, len(pairs_input_multiRefs_test20), count_original_inputs_dev20_m, len(pairs_input_multiRefs_dev20), count_original_inputs_train20_m, len(pairs_input_multiRefs_train20))

# Section 2: pairs <FORGe-output>/<ref-text> for train, dev and test, with unique and with multiple references
if create_csv_forge2ref == 'yes':
  print('Creating forge2ref data')
  path_dev = '/content/Mod-D2T/conllu-en_INLG23/dev/01-PredArgNorm.conllu'
  path_test = '/content/Mod-D2T/conllu-en_INLG23/test/01-PredArgNorm.conllu'
  path_train = '/content/Mod-D2T/conllu-en_INLG23/train/01-PredArgNorm.conllu'
  # Collect all FORGe texts into files
  # Start from an empty folder: create it, or delete every file it already contains
  if not os.path.exists(txt_folder):
    os.makedirs(txt_folder)
  else:
    files = glob.glob(txt_folder+'/*')
    for f in files:
      os.remove(f)
  # Get latest texts for all splits from Mod-D2T dataset
  gather_ModD2T_texts_in_file(path_test, txt_folder)
  gather_ModD2T_texts_in_file(path_dev, txt_folder)
  gather_ModD2T_texts_in_file(path_train, txt_folder)
  # Once we have collected the texts, apply same postprocessing as regular FORGe outputs
  print('Post processing of FORGe texts:')
  ! python '/content/M-FleNS_NLG-Pipeline/code/postProcess.py' 'EN' {txt_folder}
  # Finally, put the post_processed texts into a list to build our dataset
  # (the *_postproc.txt files are presumably written by postProcess.py next to its input files - confirm)
  forge_test_texts = [text.strip() for text in codecs.open('textFiles/WebNLG_ModD2T-out_test_postproc.txt', 'r', 'utf-8').readlines()]
  forge_dev_texts = [text.strip() for text in codecs.open('textFiles/WebNLG_ModD2T-out_dev_postproc.txt', 'r', 'utf-8').readlines()]
  forge_train_texts = [text.strip() for text in codecs.open('textFiles/WebNLG_ModD2T-out_train_postproc.txt', 'r', 'utf-8').readlines()]
  # Align inputs and outputs for unique ref
  # (the FORGe text at position i is used as the input of datapoint i instead of its triples)
  pairs_forge_uniqueRef_test20, count_original_inputs_test20_u = align_input_uniqueRefText(dataset_20_test, num_inputs_data20_test, forge_test_texts)
  pairs_forge_uniqueRef_dev20, count_original_inputs_dev20_u = align_input_uniqueRefText(dataset_20_dev, num_inputs_data20_dev, forge_dev_texts)
  pairs_forge_uniqueRef_train20, count_original_inputs_train20_u = align_input_uniqueRefText(dataset_20_train, num_inputs_data20_train, forge_train_texts)
  # Align inputs and outputs for multiple refs
  pairs_forge_multiRefs_test20, count_original_inputs_test20_m = align_input_multipleRefTexts(dataset_20_test, num_inputs_data20_test, forge_test_texts)
  pairs_forge_multiRefs_dev20, count_original_inputs_dev20_m = align_input_multipleRefTexts(dataset_20_dev, num_inputs_data20_dev, forge_dev_texts)
  pairs_forge_multiRefs_train20, count_original_inputs_train20_m = align_input_multipleRefTexts(dataset_20_train, num_inputs_data20_train, forge_train_texts)
  # Create CSVs for unique ref (files WebNLG_forge2ref_<split>_uniqueRef.csv)
  create_csv(pairs_forge_uniqueRef_test20, 'forge2ref', 'test', 'uniqueRef')
  create_csv(pairs_forge_uniqueRef_dev20, 'forge2ref', 'dev', 'uniqueRef')
  create_csv(pairs_forge_uniqueRef_train20, 'forge2ref', 'train', 'uniqueRef')
  # Create CSVs for multiple refs (files WebNLG_forge2ref_<split>_multiRef.csv)
  create_csv(pairs_forge_multiRefs_test20, 'forge2ref', 'test', 'multiRef')
  create_csv(pairs_forge_multiRefs_dev20, 'forge2ref', 'dev', 'multiRef')
  create_csv(pairs_forge_multiRefs_train20, 'forge2ref', 'train', 'multiRef')
  # Check number of inputs and number of rows against the expected values hard-coded in the check functions
  print('Checking forge2ref data unique ref')
  check_num_datapoints_unique(count_original_inputs_test20_u, len(pairs_forge_uniqueRef_test20), count_original_inputs_dev20_u, len(pairs_forge_uniqueRef_dev20), count_original_inputs_train20_u, len(pairs_forge_uniqueRef_train20))
  print('Checking forge2ref data muliple refs')
  check_num_datapoints_multi(count_original_inputs_test20_m, len(pairs_forge_multiRefs_test20), count_original_inputs_dev20_m, len(pairs_forge_multiRefs_dev20), count_original_inputs_train20_m, len(pairs_forge_multiRefs_train20))

# Section 3: pairs <FORGe-output>/<accurate-LLM-output> for the test split only
# Requires list_all_LLMout_IDs, dataset_system_outputs and threshold from the "Get system outputs according to human scores" cell
if create_csv_forge2llm == 'yes':
  path_test = '/content/Mod-D2T/conllu-en_INLG23/test/01-PredArgNorm.conllu'
  # Collect all FORGe texts into files
  # Start from an empty folder: create it, or delete every file it already contains
  if not os.path.exists(txt_folder):
    os.makedirs(txt_folder)
  else:
    files = glob.glob(txt_folder+'/*')
    for f in files:
      os.remove(f)
  # Get latest texts for test split from Mod-D2T dataset
  gather_ModD2T_texts_in_file(path_test, txt_folder)
  # Once we have collected the texts, apply same postprocessing as regular FORGe outputs
  print('Post processing of FORGe texts:')
  ! python '/content/M-FleNS_NLG-Pipeline/code/postProcess.py' 'EN' {txt_folder}
  # Now select the text for which we have accurate LLM outputs (IDs saved in list_all_LLMout_IDs from the cell above)
  # The texts are kept in file order and list_all_LLMout_IDs is sorted, so forge_test_texts[k] goes with dataset_system_outputs[k]
  forge_test_texts = [text.strip() for i, text in enumerate(codecs.open('textFiles/WebNLG_ModD2T-out_test_postproc.txt', 'r', 'utf-8').readlines()) if i in list_all_LLMout_IDs]
  # forge_test_texts = []
  # forge_1text_per_line = codecs.open('textFiles/WebNLG_ModD2T-out_test_postproc.txt', 'r', 'utf-8').readlines()
  # for iforge, text in enumerate(forge_1text_per_line):
  #   if iforge in list_all_LLMout_IDs:
  #     print(iforge)
  #     forge_test_texts.append(text)
  # Align inputs and outputs for unique ref
  pairs_forge_uniqueLLM_test20, count_original_inputs_test20_u = align_input_uniqueRefText(dataset_system_outputs, len(list_all_LLMout_IDs), forge_test_texts)
  # Align inputs and outputs for multiple refs
  pairs_forge_multiLLM_test20, count_original_inputs_test20_m = align_input_multipleRefTexts(dataset_system_outputs, len(list_all_LLMout_IDs), forge_test_texts)
  # Create CSVs for unique ref (the threshold is appended to the file name, e.g. WebNLG_forge2llm_test_uniqueRef95.csv)
  create_csv(pairs_forge_uniqueLLM_test20, 'forge2llm', 'test', 'uniqueRef', threshold)
  # Create CSVs for multiple refs
  create_csv(pairs_forge_multiLLM_test20, 'forge2llm', 'test', 'multiRef', threshold)



In [None]:
#@title Make test data for post-processing experiments
import xmltodict
import glob
import os
import csv
import codecs

# Folders with the downloaded GEM24 test inputs (xml files) and the FORGe outputs (txt files)
path_GEM_test, path_GEM_forge = '/content/GEM24_test', '/content/GEM24_forge'

# List the files found in each folder: '*.xml' for the inputs, '*.txt' for the FORGe outputs
paths_GEM_input_files, paths_GEM_FORGe_output_files = [
  glob.glob(os.path.join(folder, pattern))
  for folder, pattern in ((path_GEM_test, '*.xml'), (path_GEM_forge, '*.txt'))
]

def create_csv(rows, dataset, inputType):
  """ Writes rows to the file '<dataset>_<inputType>.csv' (relative path), one csv row per item of rows."""
  # NOTE(review): an earlier cell calls create_csv with five arguments, so this definition
  # presumably replaces another create_csv once this cell has run - consider renaming one of them.
  csv_name = f'{dataset}_{inputType}.csv'
  # newline='' leaves the handling of line endings to the csv writer
  with open(csv_name, 'w', encoding='UTF8', newline='') as csv_file:
    csv.writer(csv_file).writerows(rows)

def create_test_data_GEM(input_file_path):
  """ Converts one GEM file into a single-column csv file written with create_csv.

  The input type is derived from the file extension:
    - '.xml' (input type 'triples'): the modified triples of each entry are joined
      with ', ' into one row; the dataset name is the part of the file name before
      the first '_'.
    - '.txt' (input type 'forge'): each line, stripped, becomes one row; the dataset
      name is the second '_'-separated field of the file name (IndexError if the
      file name contains no '_').
  A file with any other extension is reported and skipped.
  The csv file is named '<dataset name>_<input type>.csv'; nothing is returned.
  """
  tail = os.path.basename(input_file_path)
  # Get input type
  extension = os.path.splitext(tail)[1]
  if extension == '.xml':
    input_type = 'triples'
  elif extension == '.txt':
    input_type = 'forge'
  else:
    # Without this guard an unknown extension would produce an empty, misnamed '_.csv' file
    print(f'Skipping {input_file_path}: expected a .xml or .txt file.')
    return

  list_inputs = []
  if input_type == 'triples':
    # Get dataset name
    dataset_id = tail.split('_', 1)[0]
    with codecs.open(input_file_path, 'r', 'utf-8') as input_xml_file:
      input_data_dict = xmltodict.parse(input_xml_file.read())
    # Build a data structure like the one from HuggingFace for the other datasets
    entries = input_data_dict['benchmark']['entries']['entry']
    # xmltodict yields a single item instead of a list when an element occurs only once
    if not isinstance(entries, list):
      entries = [entries]
    for entry in entries:
      # Get modified triples (same single-item case as for the entries)
      mtriples_list = entry['modifiedtripleset']['mtriple']
      if not isinstance(mtriples_list, list):
        mtriples_list = [mtriples_list]
      list_inputs.append([', '.join(mtriples_list)])
  else:
    # Get dataset name
    dataset_id = tail.split('_')[1]
    with codecs.open(input_file_path, 'r', 'utf-8') as forge_file:
      forge_texts = forge_file.readlines()
    list_inputs = [[text.strip()] for text in forge_texts]

  print(input_type, dataset_id)
  create_csv(list_inputs, dataset_id, input_type)

# Convert every input xml file into a '<dataset>_triples.csv' file
for file_path_i in paths_GEM_input_files:
  create_test_data_GEM(file_path_i)

# Convert every FORGe output txt file into a '<dataset>_forge.csv' file
for file_path_f in paths_GEM_FORGe_output_files:
  create_test_data_GEM(file_path_f)

In [None]:
# Mount Google Drive under /content/drive (Colab only); the prediction and submission
# folders used in the next cell are located under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
#@title Extract predictions csv
import csv
import codecs
import glob
import os

# path_pred: folder searched for the prediction csv files; path_sub: folder in which the submission txt files are written
path_pred = '/content/drive/MyDrive/M-FleNS/Papers&Slides/M-FleNS_papers/2024-03_GEM-SharedTask/Test data/FORGe_T5Base_prediction/fulltune'
path_sub = '/content/drive/MyDrive/M-FleNS/Papers&Slides/M-FleNS_papers/2024-03_GEM-SharedTask/Submissions/[FORGe]+[T5-Base(forge2text)]'
files_pred_paths = glob.glob(os.path.join(path_pred, '*.csv'))

# Turn each prediction csv into a text file that contains the 'predictions' column, one value per row
for file_pred_path in files_pred_paths:
  head, tail = os.path.split(file_pred_path)
  # Output name: 'FORGe-T5base_' + part of the csv file name before the first '_' + '_en.txt'
  filenameSub = 'FORGe-T5base_'+tail.rsplit('.', 1)[0].split('_')[0]+'_en.txt'
  print(filenameSub)
  with codecs.open(os.path.join(path_sub, filenameSub), 'w', 'utf-8') as fo:
    # NOTE(review): no encoding is given for the csv, so it is decoded with the platform default while the output is written as UTF-8
    with open(file_pred_path, newline='') as csvfile:
      reader = csv.DictReader(csvfile)
      # NOTE(review): row_current_count is not read anywhere in the visible part of this loop
      row_current_count = 0
      for row in reader:
        # Keep only the 'predictions' column, followed by a line break
        fo.write(row['predictions'])
        fo.write('\n')