<a href="https://colab.research.google.com/github/mille-s/DCU_TCD_FORGe_WebNLG23/blob/main/DCU_TCD_FORGe_WebNLG23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Prepare repo

# Run this cell to download and unzip the working folder and install Java 8

from IPython.display import clear_output, HTML, display
import os
import shutil

# clone main repo
! git clone https://github.com/mille-s/DCU_TCD-FORGe_WebNLG23.git
# Delete locally to avoid confusion
! rm 'DCU_TCD-FORGe_WebNLG23/DCU_TCD_FORGe_WebNLG23.ipynb'

# clone M-FleNS repo (generation pipeline)
! git clone https://github.com/mille-s/M-FleNS_NLG-Pipeline.git
# Delete locally to avoid confusion
! rm 'M-FleNS_NLG-Pipeline/M_FleNS_pipe_v2.ipynb'

# Download FORGe
# Version used for WebNLG (fails to generate a few structures of the training data)
# ! gdown 1lsh8pwUp9mc0Z_aFbSy1WTIpSx9YwFFD
# ! unzip /content/FORGe_colab_v3_WebNLG.zip
# Version used for Mod-D2T (minor improvements on WebNLG)
# ! gdown 196w_EtORTkR3idaXDMq0xl3pOtBrGbiE
# ! unzip /content/FORGe_colab_v4.zip
# Version used for GEM (now supporting some Wikidata properties)
# ! gdown 1gaTZVGFjtR_zBNskJXCeIJVug95aGFkf
# ! unzip /content/FORGe_colab_v5.zip
# Version now supporting FR WebNLG data (including some minor improvements on aggregation)
! gdown 1M0yk7aLUpHiT4UfT72g-rNIPd4W8WT44
! unzip /content/FORGe_colab_v6.zip

# Download triple to predArg conversion
# Folder (relative to the working directory) that receives the unzipped triples2predArg archive
triple2predArg = 'triples2predArg'
# exist_ok=True: re-running this cell must not fail because the folder is already there
os.makedirs(triple2predArg, exist_ok=True)
# ! gdown 1Fr_ThZHGPLkoi3XQthSsaM5uVGLxgVat
# ! unzip 'triples2predArg2.zip' -d {triple2predArg}
# ! rm 'triples2predArg2.zip'
! gdown 1NKuoIqWj-VBUSCWos7k7Ps5gywVS0IGC
! unzip 'triples2predArg3.zip' -d {triple2predArg}
! rm 'triples2predArg3.zip'

# Download Morphology generator
! gdown 1vk1utEjeZ_2YO1H20DPDTjVSevgRJNM_
morph_folder_name = 'test_irish_morph_gen_v5.0'
zip_name = morph_folder_name+'.zip'
! unzip {zip_name}

# Folders read and written by the morphology generator
morph_input_folder = '/content/'+morph_folder_name+'/Inputs'
morph_output_folder = '/content/'+morph_folder_name+'/Outputs'
# exist_ok=True: re-running this cell must not fail because the folders are already there
os.makedirs(morph_input_folder, exist_ok=True)
os.makedirs(morph_output_folder, exist_ok=True)

# Make morphology flookup executable
! 7z a -sfx {morph_folder_name}'/flookup.exe' {morph_folder_name}'/flookup'
! chmod 755 {morph_folder_name}'/flookup'

# Package for parsing XML files
!pip install xmltodict
# Install SPARQLWrapper
! pip install SPARQLWrapper

# Clean
! rm '/content/FORGe_colab_v3_WebNLG.zip'
! rm '/content/FORGe_colab_v4.zip'
! rm '/content/FORGe_colab_v5.zip'
! rm '/content/FORGe_colab_v6.zip'
! rm '/content/test_irish_morph_gen_v5.0.zip'
clear_output()
print('Working folder ready!\n--------------\nInstalling Java 8...\n')

# Switch to Java 1.8 (needed for FORGe to run correctly)
def install_java():
  """Install OpenJDK 8 through apt, point JAVA_HOME at it, make it the default `java` and print the active version."""
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !java -version       #check java version
install_java()

# To wrap texts in cells
def set_css():
  """Display a <style> element that sets `white-space: pre-wrap` on <pre> elements."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))

get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# @title Set parameters, create empty folders
import os
# Run this cell to set parameters for generation

# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
# E.g. if a PredArg_... or DSynt_... module is selected, the input predicate-argument structures should be placed in the structures/00-PredArg folder
# I'll make the instructions and names clearer in a later (actually usable) version.

############# Select language #############
language = 'GA' #@param['EN', 'FR', 'GA']

############# Choose whether or not to look for class and gender information of entities via DBpedia #############
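# If 'yes', the "Get and write class and gender information" cell collects the new subject/object values of the input XMLs and passes them to getClassGenderDBpedia.py; if 'no', that step is skipped.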
get_class_gender = 'no' #@param['yes', 'no']

############# Choose whether or not to concatenate output files #############
# Not implemented yet, right now it concatenates the files whatever happens
concatenate_output_files = 'yes' #@param['yes', 'no']

############# Select module grouping #############
# Group consecutive modules for the same system or call each module separately.
# Select 'no' to get all intermediate representations, 'yes' if you're only interested in the output.
generate_intermediate_representations = 'no' #@param['yes', 'no']
# Modules are grouped exactly when the intermediate representations are NOT requested
group_modules_prm = 'no' if generate_intermediate_representations == 'yes' else 'yes'

############# Select dataset split #############
split = "dev" #@param['dev', 'test','train','ukn']

#######################################################################

# Modules to run, with type of processing (FORGe, Model1, SimpleNLG, etc.).
# Only FORGe is supported for this prototype version.
PredArg_Normalisation = 'FORGe'
# To have an external module assigning triples to aggregate
PredArg_AggregationMark = 'None'
PredArg_Aggregation = 'FORGe'
PredArg_PoSTagging = 'FORGe'
PredArg_CommStructuring = 'FORGe'
DSynt_Structuring = 'FORGe'
SSynt_Structuring = 'FORGe'
SSynt_Aggregation = 'FORGe'
RE_Generation = 'FORGe'
DMorph_AgreementsLinearisation = 'FORGe'
SMorph_Processing = 'FORGe'

# # Tests Lin only (also modify M-FleNS.py)
# PredArg_Normalisation = 'None'
# PredArg_AggregationMark = 'None'
# PredArg_Aggregation = 'None'
# PredArg_PoSTagging = 'None'
# PredArg_CommStructuring = 'None'
# DSynt_Structuring = 'None'
# SSynt_Structuring = 'None'
# SSynt_Aggregation = 'None'
# RE_Generation = 'None'
# DMorph_AgreementsLinearisation = 'FORGe'
# SMorph_Processing = 'FORGe'

#######################################################################
# Paths to python files
path_MFleNS = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS.py'
path_checkOutputs = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS-checkOutputs.py'
path_postProc = '/content/M-FleNS_NLG-Pipeline/code/postProcess.py'
path_FORGe2Morph = '/content/DCU_TCD-FORGe_WebNLG23/code/FORGe2Morph.py'
path_concatenate = '/content/M-FleNS_NLG-Pipeline/code/concatenate_files.py'
path_getClassGenderDBp = '/content/M-FleNS_NLG-Pipeline/code/getClassGenderDBpedia.py'
path_splitFiles = '/content/M-FleNS_NLG-Pipeline/code/splitFiles.py'
# path_MorphGen = '/content/DCU_TCD-FORGe_WebNLG23/code/IrishNLP_MorphGen.py'

#######################################################################
# Paths to FORGe/MATE folders and property files
FORGe_input_folder = '/content/FORGe/buddy_project/struct'
path_MATE = '/content/FORGe/buddy-patched.jar'
path_props_resources_template = '/content/FORGe/mateColabDrive.properties'
path_props_levels = '/content/FORGe/mateLevels.properties'
path_props = '/content/FORGe/mate.properties'

# Paths to general folders
# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
path_strs = '/content/FORGe/structures'
path_input_XML = '/content/input_XMLs'
str_PredArg_folder = os.path.join(path_strs, '00-PredArg')
str_PredArgNorm_folder = os.path.join(path_strs, '01-PredArgNorm')
str_PredArgAggMark_folder = os.path.join(path_strs, '02-PredArgAggMark')
str_PredArgAgg_folder = os.path.join(path_strs, '03-PredArgAgg')
str_PredArgPoS_folder = os.path.join(path_strs, '04-PredArgPoS')
str_PredArgComm_folder = os.path.join(path_strs, '05-PredArgComm')
str_DSynt_folder = os.path.join(path_strs, '06-DSynt')
str_SSynt_folder = os.path.join(path_strs, '07-SSynt')
str_SSyntAgg_folder = os.path.join(path_strs, '08-SSyntAgg')
str_REG_folder = os.path.join(path_strs, '09-REG')
str_DMorphLin_folder = os.path.join(path_strs, '10-DMorphLin')
str_SMorphText_folder = os.path.join(path_strs, '11-SMorphText')
log_folder = '/content/FORGe/log'

temp_input_folder_morph = '/content/FORGe-out'
# Create each working folder that does not exist yet
for required_folder in (log_folder, path_input_XML, temp_input_folder_morph):
  if not os.path.exists(required_folder):
    os.makedirs(required_folder)

def clear_files(folder):
  """Empty `folder` (files, links and sub-folders) while keeping the folder itself.

  Does nothing when `folder` is not an existing directory. A failure to delete
  one entry is printed and the remaining entries are still processed.
  """
  if not (os.path.exists(folder) and os.path.isdir(folder)):
    return
  for entry_name in os.listdir(folder):
    entry_path = os.path.join(folder, entry_name)
    try:
      # A link to a directory is unlinked like a file, never followed
      if os.path.isdir(entry_path) and not os.path.islink(entry_path):
        shutil.rmtree(entry_path)
      elif os.path.isfile(entry_path) or os.path.islink(entry_path):
        os.unlink(entry_path)
    except Exception as error:
      print('Failed to delete %s. Reason: %s' % (entry_path, error))

def clear_folder(folder):
  """Delete `folder` together with its contents.

  Does nothing when `folder` is not an existing directory; a failure to delete
  is printed instead of being raised.
  """
  is_existing_dir = os.path.exists(folder) and os.path.isdir(folder)
  if not is_existing_dir:
    return
  try:
    shutil.rmtree(folder)
  except Exception as error:
    print('Failed to delete %s. Reason: %s' % (folder, error))

def removeReservedCharsFileName(entityName):
  """Return str(entityName) without the characters that are reserved in file names.

  Removed characters: # % & { } \\ < > * ? / space $ ! ' " : @ + ` | =

  entityName: any object; it is converted with str() first.
  Returns the cleaned string (possibly empty).
  """
  # Imported here because this cell does not import `re` itself
  import re
  # A single substitution removes every occurrence, so no loop is needed
  return re.sub(r'[#%&\{\}\\<>\*\?/ \$!\'":@\+`\|=]', "", str(entityName))

# Run generation pipeline

Run all cells to get the concatenated texts, intermediate representations and log files

## 1 (Alternative 1) - RDF to PredArg (Using pre-generated WebNLG files)

In [None]:
# For the moment, you can download the outputs of the conversion and copy the desired inputs of the same split in the str_PredArg_folder (see below).
! gdown 1B_8UXCiC71hqqiBhcgTQ1Jg_6Y-Nc5su
! unzip /content/00-PredArg-train.zip
! rm '/content/00-PredArg-train.zip'

! gdown 1YXcDXNS8lnU9EWBbVpYtHIJFQLHqAg0w
! unzip /content/00-PredArg-test.zip
! rm '/content/00-PredArg-test.zip'

! gdown 1L0D3pyUW43qep5C2jvmbdQyzbZesMFOb
! unzip /content/00-PredArg-dev.zip
! rm '/content/00-PredArg-dev.zip'

clear_output()

In [None]:
# Copy some PredArg structures in the input folder used for generation

import glob
import os

# Imported here so that the cell does not depend on the import made in the setup cell
import shutil

# empty FORGe input folder
clear_files(str_PredArg_folder)

predArg_conv_folder = '/content/00-PredArg-'+split
# predArg_conv_folder = '/content/00-PredArg-dev'
# predArg_conv_folder = '/content/00-PredArg-test'
# predArg_conv_folder = '/content/00-PredArg-train'
list_predArgPaths = glob.glob(os.path.join(predArg_conv_folder, '*.conll'))
c = 0
for predArgPath in list_predArgPaths:
  PAfilename = os.path.split(predArgPath)[-1]
  # Copy from Python rather than through one `cp` shell call per file: no shell quoting
  # problems with unusual file names, and the destination is the folder that was just emptied.
  shutil.copy(predArgPath, os.path.join(str_PredArg_folder, PAfilename))
  c += 1
print('Copied '+str(c)+' files.')

In [None]:
# Empty input folder to copy other inputs instead
# list_predArgPathsCC = glob.glob(os.path.join('/content/FORGe/structures/00-PredArg/', '*.conll'))
# c = 0
# for predArgPathCC in list_predArgPathsCC:
#   ! rm {predArgPathCC}
#   c += 1
# print('Removed '+str(c)+' files.')

## 1 (Alternative 2) - RDF to PredArg (Using XML files in the WebNLG format that you upload in the input_XMLs folder on the left.)

In [None]:
#@title Pre-processing of input XMls (Upload XML)
import codecs
import re
import glob
import os

# All XML files uploaded to the input folder; each one is rewritten in place below
list_XML_files = glob.glob(os.path.join(path_input_XML, '*.xml'))

months_map = {'January':'01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06', 'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12' }
months_list = list(months_map.keys())
for XML_file_path in list_XML_files:
  print(XML_file_path)
  # Read the whole file and close it before it is reopened for writing
  with codecs.open(XML_file_path, 'r', 'utf-8') as fi:
    xml_file = fi.readlines()
  count_matches = 0
  with codecs.open(XML_file_path, 'w', 'utf-8') as fo:
    for line in xml_file:
      # Every substitution is applied to new_line so that the fixes accumulate
      # (several dates on one line, or a date on a line that also has an eid="Id...").
      new_line = line
      # Reformat dates so they are tagged correctly by XML2PredArg conversion
      for month in months_list:
        month_num = months_map[month]
        # Month_D_YYYY -> YYYY-MM-0D
        new_line, count_one_digit = re.subn(month+r'_([0-9])_([0-9]{4})', r'\g<2>-'+month_num+r'-0\g<1>', new_line)
        # Month_DD_YYYY -> YYYY-MM-DD
        new_line, count_two_digits = re.subn(month+r'_([0-9]{2})_([0-9]{4})', r'\g<2>-'+month_num+r'-\g<1>', new_line)
        if count_one_digit or count_two_digits:
          count_matches += 1
      # Remove the ID prefix to the IDs, breaks the XML2PredArg conversion
      new_line = re.sub('eid="Id', 'eid="', new_line)
      fo.write(new_line)
  print('Replaced '+str(count_matches)+' lines.')

In [None]:
#@title Get and write class and gender information
import json
import glob
import xmltodict
import shutil
import codecs
import re

def extractTripleElements(dataset, element, existing_list):
  """Return the new, unique subjects, properties or objects found in triple sets.

  dataset: iterable of entries, each entry an iterable of 'subject | property | object' strings.
  element: 'subject', 'property' or 'object' -- which part of each triple to extract.
  existing_list: values that are already known and must not be returned.

  Returns a list, in order of first appearance, of the extracted values with spaces
  replaced by underscores. Values containing a backtick or a double quote are left
  out because they give SPARQL query errors.

  Raises ValueError when `element` is not one of the three accepted names.
  """
  positions = {'subject': 0, 'property': 1, 'object': 2}
  if element not in positions:
    # Fail with an explicit message instead of an unrelated indexing error further down
    raise ValueError('Error, the second argument of extractTripleElements must be "subject", "property" or "object".')
  n = positions[element]
  # Sets make the "already seen?" checks constant-time; the list keeps the order
  already_covered = set(existing_list)
  already_added = set()
  element_list = []
  for entry in dataset:
    for input_triple in entry:
      element_name = input_triple.split(' | ')[n]
      # To filter out values that give SPARQL query errors
      if '`' in element_name or '"' in element_name:
        continue
      new_element_name = '_'.join(element_name.split(' '))
      if new_element_name in already_added or new_element_name in already_covered:
        continue
      already_added.add(new_element_name)
      element_list.append(new_element_name)
  return element_list

if get_class_gender == 'yes':
  # List that contains all unique new triples from the input files
  triple_sets_list = []

  # Fill triple_sets_list with triples extracted from the input XML
  for XML_file_path in list_XML_files:
    print('Reading '+XML_file_path+'...')
    xml_file = open(XML_file_path, 'r').read()
    XML_data = xmltodict.parse(xml_file)

    for entry in XML_data['benchmark']['entries']['entry']:
      mtriples_list = []
      # Get modified triples
      if isinstance(entry['modifiedtripleset']['mtriple'], list):
        for mtriple in entry['modifiedtripleset']['mtriple']:
          mtriples_list.append(mtriple)
      else:
        mtriples_list.append(entry['modifiedtripleset']['mtriple'])
      triple_sets_list.append(mtriples_list)

  path_covered_subj = '/content/triples2predArg/classMembership/all_subValues.txt'
  path_covered_obj = '/content/triples2predArg/classMembership/all_objValues.txt'

  covered_subj_list_raw = codecs.open(path_covered_subj, 'r', 'utf-8').readlines()
  covered_obj_list_raw = codecs.open(path_covered_obj, 'r', 'utf-8').readlines()

  covered_subj_list = []
  covered_obj_list = []
  for covered_subj in covered_subj_list_raw:
    clean_subj = covered_subj.strip()
    covered_subj_list.append(clean_subj)
  for covered_obj in covered_obj_list_raw:
    clean_obj = covered_obj.strip()
    covered_obj_list.append(clean_obj)

  # Convert lists of subjets and objects to JSON to pass them as argument
  list_subj = sorted(extractTripleElements(triple_sets_list, 'subject', covered_subj_list))
  list_obj = sorted(extractTripleElements(triple_sets_list, 'object', covered_obj_list))

  json_subj = json.dumps(list_subj)
  json_obj = json.dumps(list_obj)
  filepath_subj = os.path.join('/content/triples2predArg/classMembership', 'new_subj_values.json')
  filepath_obj = os.path.join('/content/triples2predArg/classMembership', 'new_obj_values.json')

  with codecs.open(filepath_subj, 'w', 'utf-8') as fo1:
    fo1.write(json_subj)
  with codecs.open(filepath_obj, 'w', 'utf-8') as fo2:
    fo2.write(json_obj)

  ! python {path_getClassGenderDBp} {filepath_subj} {filepath_obj}

In [None]:
#@title Function for adding grammatical gender/definiteness/number common nouns FR
import codecs
import glob
import os
import re

def _read_stripped_lines(path):
  """Return the lines of the UTF-8 text file at `path`, each stripped of surrounding whitespace."""
  # `with` closes the file as soon as its lines have been read
  with codecs.open(path, 'r', 'utf-8') as handle:
    return [line.strip() for line in handle.readlines()]

# Noun lists as found in the resource files, plus a lowercased copy of each
list_Fem_N = _read_stripped_lines('/content/M-FleNS_NLG-Pipeline/resources/FR_feminine_nouns.txt')
list_Fem_N_lower = [fnoun.lower() for fnoun in list_Fem_N]
list_Masc_N = _read_stripped_lines('/content/M-FleNS_NLG-Pipeline/resources/FR_masculine_nouns.txt')
list_Masc_N_lower = [mnoun.lower() for mnoun in list_Masc_N]

# print(list_Fem_N[:20])
# print(list_Fem_N_lower[:20])
# print(list_Masc_N[:20])
# print(list_Masc_N_lower[:20])

def get_all_translation(language):
  """Build a dictionary of translations from the four translation files of triples2predArg.

  language: not used yet.

  The files are read in this order: DBpedia subjects, DBpedia objects, Google subjects,
  Google objects. In each line, the text before the first separator is the key and the
  stripped text of the second field is the translation. A key keeps the translation from
  the first file in which it appears. Lines without a separator are skipped.

  Returns a dict mapping each key to its translation.
  """
  translations_folder = '/content/triples2predArg/translations/'
  separator_dbp = '*'
  separator_gtr = '\t*\t'
  # (file name, separator), in decreasing order of priority
  sources = [
    ('subValues_dbpediaTranslations.txt', separator_dbp),
    ('objValues_dbpediaTranslations.txt', separator_dbp),
    ('subValues_googleTranslations.txt', separator_gtr),
    ('objValues_googleTranslations.txt', separator_gtr),
  ]

  all_translations = {}
  for file_name, separator in sources:
    # `with` closes each file as soon as its lines have been read
    with codecs.open(translations_folder+file_name, 'r', 'utf-8') as handle:
      lines = handle.readlines()
    for line in lines:
      fields = line.split(separator)
      if len(fields) < 2:
        # No separator (e.g. a blank line): nothing to record
        continue
      # setdefault keeps the translation from the higher-priority source
      all_translations.setdefault(fields[0], fields[1].strip())

  return all_translations

def is_plural(noun, list_Fem_N, list_Masc_N):
  """
  Guess whether a French noun is plural.

  noun: the noun or multi-word label; only its first word (the text before the first
    space, or else before the first underscore) is examined.
  list_Fem_N, list_Masc_N: collections of known singular feminine / masculine nouns.

  Returns True if the noun is considered plural, False otherwise. The noun is considered
  singular when its first word is empty, is found in one of the two lists, or starts with
  an uppercase letter; otherwise it is plural exactly when the first word ends in "s",
  "x" or "z".
  """
  # Only the first word of a multi-word label is looked at
  if ' ' in noun:
    noun_to_check = noun.split(' ')[0]
  elif '_' in noun:
    noun_to_check = noun.split('_')[0]
  else:
    noun_to_check = noun

  # Nothing to inspect (empty label, or label starting with a separator)
  if not noun_to_check:
    return False

  # Nouns that end in "s", "x", or "z" might be plural but could also be singular, so we check if they are in the fem/masc lists, which only contain singular nouns.
  # Both lists are checked the same way: with the lowercased first word and with the label as given.
  for candidate in (noun_to_check.lower(), noun):
    if candidate in list_Fem_N or candidate in list_Masc_N:
      return False

  # If noun starts with an uppercase letter, it's probably a proper noun (although not very true in the WebNLG data...)
  if noun_to_check[0].isupper():
    return False

  # The special plural endings "eaux", "aux" and "eux" all end in "x", so one test covers them
  return noun_to_check.endswith(("s", "x", "z"))

def FR_add_gender_definiteness(path_t2p_out_split, path_t2p_out_split_final, list_Fem_N, list_Fem_N_lower, list_Masc_N, list_Masc_N_lower):
  """Copy the '*.conll' files of one folder to another, adding French-specific information.

  path_t2p_out_split: folder containing the input '*.conll' files.
  path_t2p_out_split_final: folder in which a file with the same name is written for each input file.
  list_Fem_N, list_Masc_N: feminine / masculine noun lists.
  list_Fem_N_lower, list_Masc_N_lower: lowercased versions of the same lists.

  Lines that do not match '(su|o)bj=yes' are copied unchanged. On the other lines:
  - the second column is replaced by its backup translation when get_all_translation has one;
  - '|gender=FEM' is appended to the 7th column when the first word of the second column is
    a feminine noun only and the line has no 'gender=' yet;
  - 'definiteness=DEF' replaces 'dpos=NP' (or is appended to the 7th column) when the first
    word is a listed noun, is not a determiner, and the line has no 'definiteness=',
    'class=Person' or 'class=Band';
  - '|number=PL' is appended to the 7th column when is_plural says so and the line has no 'number=' yet;
  - Irish and English month names are replaced by the French ones.
  """
  dico_months_irish = {"Eanáir":"Janvier", "Feabhra":"Février", "Márta":"Mars", "Aibreán":"Avril", "Bealtaine":"Mai", "Meitheamh":"Juin", "Iúil":"Juillet", "Lúnasa":"Août", "Meán_Fómhair":"Septembre", "Deireadh_Fómhair":"Octobre", "Samhain":"Novembre", "Nollaig":"Décembre"}
  # "March" was missing from this table (only "Mars" was listed), so March dates were never
  # translated. "Mars" is kept so that commas are still removed from dates that already use it.
  dico_months_english = {"January":"Janvier", "February":"Février", "March":"Mars", "Mars":"Mars", "April":"Avril", "May":"Mai", "June":"Juin", "July":"Juillet", "August":"Août", "September":"Septembre", "October":"Octobre", "November":"Novembre", "December":"Décembre"}
  # First seven tab-separated columns of a line; new features are appended right after them
  first_seven_columns = r'^([^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+\t[^\t]+)'
  # Words that already are a determiner: no definiteness is added after them
  determiner_patterns = ("(L|l)('|a|e|es)$", "(U|u)ne*$", "(D|d)(u|es)$", "(A|a)$")
  list_conll_files = glob.glob(os.path.join(path_t2p_out_split, '*.conll'))
  dico_backup_translations = get_all_translation('FR')
  for conll_file in list_conll_files:
    # split filename into head and tail
    head, tail = os.path.split(conll_file)
    with codecs.open(conll_file, 'r', 'utf-8') as fi:
      lines_conll = fi.readlines()
    with codecs.open(os.path.join(path_t2p_out_split_final, tail), 'w', 'utf-8') as fo:
      for line in lines_conll:
        if re.search('(su|o)bj=yes', line):
          # Add backup translation if something went wrong during triple2predArg mapping
          full_entity = line.split('\t')[1]
          if full_entity in dico_backup_translations:
            translation = str(dico_backup_translations.get(full_entity))
            # A function replacement inserts the translation literally, even if it contains a backslash
            line = re.sub(r'^([^\t]+\t)[^\t]+', lambda match: match.group(1)+translation, line)
          # get the first word in the second column of the line
          entity = line.split('\t')[1]
          if '_' in entity:
            word = entity.split('_')[0]
          elif ' ' in entity:
            word = entity.split(' ')[0]
          else:
            word = entity
          # Get nouns that should be added gender=FEM in the CoNLL
          if not re.search('gender=', line):
            # list_Fem_N_lower contains all feminine nouns in lower case (same for Masc)
            if word.lower() in list_Fem_N_lower and not word.lower() in list_Masc_N_lower:
              line = re.sub(first_seven_columns, r'\g<1>|gender=FEM', line)
          # Add definiteness when needed
          is_determiner = any(re.match(pattern, word) for pattern in determiner_patterns)
          if not re.search('definiteness=', line) and not re.search('class=Person', line) and not re.search('class=Band', line) and not is_determiner:
            # Get nouns that should be added definiteness=DEF in the CoNLL (i.e. if a lowercased entity E is in list_Fem_N, in which supposedly only common nouns are lowercased, then E is likely a common noun, hence likely needs a det)
            if word.lower() in list_Fem_N or word.lower() in list_Masc_N:
              if re.search('dpos=NP', line):
                line = re.sub('dpos=NP', 'definiteness=DEF', line)
              else:
                line = re.sub(first_seven_columns, r'\g<1>|definiteness=DEF', line)
          # Add number=PL when needed (an empty word cannot be plural and is not passed on)
          if not re.search('number=', line):
            if word and is_plural(word, list_Fem_N, list_Masc_N):
              line = re.sub(first_seven_columns, r'\g<1>|number=PL', line)
          # Replace Irish month names by french names
          for month_i in dico_months_irish.keys():
            if month_i in line.split('\t')[1]:
              # for Irish, let's just replace any month label by the french one (note there are months with an underscore in the middle).
              line = re.sub(month_i, dico_months_irish.get(month_i), line)
          for month_e in dico_months_english.keys():
            if month_e in line.split('\t')[1]:
              # for English, get rid of commas if any; also check if we actually have a date to avoid possible errors (e.g. a month in a book name)
              line = re.sub(month_e+r',*(_[0-9]{4})', dico_months_english.get(month_e)+r'\g<1>', line)
        fo.write(line)

In [None]:
#@title Create FORGe input file in conll format (TEMPORARY FOR FR: upload manually '241008_WebNLG23_FR.conll' and the translation files, and use 'GA' block in this cell)
import shutil
import os

# empty FORGe input folder
clear_files(str_PredArg_folder)

language_t2p = language.lower()
path_t2p_out = os.path.join(triple2predArg, 'out/')
clear_files(path_t2p_out)

path_t2p_out_split = os.path.join(triple2predArg, 'out_split/')
if not os.path.exists(path_t2p_out_split):
  os.makedirs(path_t2p_out_split)
else:
  clear_files(path_t2p_out_split)

path_t2p_out_split_final = os.path.join(triple2predArg, 'out_split_final/')
if not os.path.exists(path_t2p_out_split_final):
  os.makedirs(path_t2p_out_split_final)
else:
  clear_files(path_t2p_out_split_final)

name_conll_templates = ''
name_properties_list = ''

if language == 'GA':
  # name_conll_templates_outdated = '221130_WebNLG23_GA.conll'
  # name_properties_list_outdated = '231027-WebNLG23_EN-GA_properties.txt'
  # name_conll_templates = '241008_WebNLG23_FR.conll'
  name_conll_templates = '240202_WebNLG23_GA.conll'
  name_properties_list = '240202_WebNLG23_EN-GA_properties.txt'
# elif language == 'FR':
  # For FR for now: upload manually '241008_WebNLG23_FR.conll' and the translation files with the same name as the GA files (FR is not yet accepted by triples2predarg)
  # IF UPDATED: ALSO UPDATE LANGUAGE FOR LANGUAGE-SPECIFIC PROCESSING JUST AFTER THE CONVERSION
  # name_conll_templates = '241008_WebNLG23_FR.conll'
  # name_properties_list = '240202_WebNLG23_EN-GA_properties.txt'
else:
  # name_conll_templates = '230528-WebNLG23_EN.conll'
  name_conll_templates = '240202_WebNLG23_EN.conll'
  name_properties_list = '240202_WebNLG23_EN-GA_properties.txt'

# newEntityName = removeReservedCharsFileName(entity_name)

for XML_file_path in list_XML_files:
  inputFilename = XML_file_path.rsplit('/', 1)[1].rsplit('.', 1)[0]
  # Copy input files to triples2predArg repo
  shutil.copy(XML_file_path, os.path.join(triple2predArg, inputFilename)+'.xml')

  # Convert xml into predArg
  print('Converting '+inputFilename+' to PredArg...')
  !java -jar '/content/triples2predArg/webNLG_triples2conll.jar' '/content/triples2predArg/' {name_conll_templates} {name_properties_list} {path_t2p_out} {language_t2p} {inputFilename}  # -> "log.txt"

  clear_output()

  # File splitting: the generator cannot process files that are too big, so they need to be split. For regular-sized inputs, 450 inputs per file should do.
  # Whether split or not, the output files will be copied in the folder in [5]
  # Parameters:
  # [1] path to input folder
  # [2] encoding of input files
  # [3] number of structures per file
  # [4] split once ('first'), or every time the threshold in [3] is reached ('all')
  # [5] path to temp folder used to store split files
  ! python {path_splitFiles} {path_t2p_out} 'utf-8' 300 'all' {path_t2p_out_split}

  # Add language-specific info, such as gender, definiteness info; creates new path '/content/triples2predArg/out_split_final'
  # Next line should be FR, cheating for now
  if language == 'GA':
    FR_add_gender_definiteness(path_t2p_out_split, path_t2p_out_split_final, list_Fem_N, list_Fem_N_lower, list_Masc_N, list_Masc_N_lower)
  else:
    if os.path.exists(path_t2p_out_split_final):
      shutil.rmtree(path_t2p_out_split_final)
    shutil.copytree(path_t2p_out_split, path_t2p_out_split_final)

  # Copy conll file from final conversion folder to FORGe input folder
  # The following "if" is not needed but I keep it in case we don't use splitFiles in the future.
  if len(os.listdir(path_t2p_out_split_final)) == 0:
    shutil.copy(os.path.join(path_t2p_out, inputFilename+'_'+language_t2p+'.conll'), str_PredArg_folder)
    print('Copied files from '+str(path_t2p_out))
  else:
    list_conll_files = glob.glob(os.path.join(path_t2p_out_split_final, '*.conll'))
    for conll_file in list_conll_files:
      # conllFilename = conll_file.rsplit('/', 1)[1].rsplit('.', 1)[0]
      shutil.copy(conll_file, str_PredArg_folder)
    print('Copied files from '+str(path_t2p_out_split_final))

In [None]:
#@title Check for missing mappings
# File read below; each of its lines is reported as a property whose mapping was not found
path_debug_triples2predArg = '/content/triples2predArg/out/test_missingProperties.txt'

# `with` closes the file as soon as its lines have been read
with codecs.open(path_debug_triples2predArg, 'r', 'utf-8') as debug_file:
  pdt_lines = debug_file.readlines()

if len(pdt_lines) == 0:
  print('All input properties were mapped successfully!')
else:
  print('CRITICAL WARNING: the mapping of some input properties was not found!')
  for pdt_line in pdt_lines:
    print(f'  {pdt_line.strip()}')

In [None]:
#@title Zip and download conll inputs
download_inputs = 'yes'#@param['yes', 'no']
split = 'dev' #@param ['train', 'dev', 'test']

# language = 'FR'
if download_inputs == 'yes':
  from google.colab import files
  zip_name_log = '/content/00-PredArg-'+split+'.zip'
  !zip -r {zip_name_log} /content/triples2predArg/out_split_final

  clear_output()

  files.download(zip_name_log)


## 2 - PredArg to uninflected text (FORGe via M-FleNS pipeline code)

In [None]:
#@title Launch generation process
# ! python '/content/M-FleNS_NLG-Pipeline/code/M-FleNS.py' {language} {split} {group_modules_prm} {PredArg_Normalisation} {PredArg_AggregationMark} {PredArg_Aggregation} {PredArg_PoSTagging} {PredArg_CommStructuring} {DSynt_Structuring} {SSynt_Structuring} {SSynt_Aggregation} {RE_Generation} {DMorph_AgreementsLinearisation} {SMorph_Processing} {FORGe_input_folder} {path_MATE} {path_props_resources_template} {path_props_levels} {path_props} {str_PredArg_folder} {str_PredArgNorm_folder} {str_PredArgAggMark_folder} {str_PredArgAgg_folder} {str_PredArgPoS_folder} {str_PredArgComm_folder} {str_DSynt_folder} {str_SSynt_folder} {str_SSyntAgg_folder} {str_REG_folder} {str_DMorphLin_folder} {str_SMorphText_folder} {log_folder}
! python {path_MFleNS} {language} {split} {group_modules_prm} {PredArg_Normalisation} {PredArg_AggregationMark} {PredArg_Aggregation} {PredArg_PoSTagging} {PredArg_CommStructuring} {DSynt_Structuring} {SSynt_Structuring} {SSynt_Aggregation} {RE_Generation} {DMorph_AgreementsLinearisation} {SMorph_Processing} {FORGe_input_folder} {path_MATE} {path_props_resources_template} {path_props_levels} {path_props} {str_PredArg_folder} {str_PredArgNorm_folder} {str_PredArgAggMark_folder} {str_PredArgAgg_folder} {str_PredArgPoS_folder} {str_PredArgComm_folder} {str_DSynt_folder} {str_SSynt_folder} {str_SSyntAgg_folder} {str_REG_folder} {str_DMorphLin_folder} {str_SMorphText_folder} {log_folder}


In [None]:
#@title Check outputs and copy files to morph folder if GA
import codecs

# # Read original check script
# check_script = open(path_checkOutputs, 'r')
# lines_check_script = check_script.readlines()
# # Update check script
# with codecs.open(path_checkOutputs, 'w', 'utf-8') as f:
#   for line in lines_check_script:
#     if line.startswith('log_folder = sys.argv[3]'):
#       f.write('log_folder = sys.argv[3]\ntemp_input_folder_morph = sys.argv[4]\nlanguage = sys.argv[5]\n')
#     elif line.startswith('            count_perLevel.append(count)\n'):
#       f.write('            count_perLevel.append(count)\n            if language == "GA":\n              shutil.copy(new_file_path, temp_input_folder_morph)\n')
#     else:
#       f.write(line)

! python {path_checkOutputs} {str_PredArg_folder} {str_SMorphText_folder} {log_folder} {temp_input_folder_morph} {language}

if not language == 'GA':
  clear_folder(os.path.join(temp_input_folder_morph, split))
  # For GA, files are copied from the python code called above
  if concatenate_output_files == 'yes':
    ! python {path_concatenate} {str_SMorphText_folder} {temp_input_folder_morph} {split}
  else:
    ! python {path_concatenate} {str_SMorphText_folder} {temp_input_folder_morph} {split}

## 3 - Morphology processing (Irish NLP Tools)

In [None]:
#@title Use if you upload structures generated from another pipeline instead of using the previous cells
# import shutil
# clear_folder('/content/FORGe/structures')
# ! unzip /content/FORGe-train.zip

In [None]:
#@title Process raw FORGe output and format it for Morphology

# Read the FORGe log summary, if there is one, to find out how many texts are expected
if os.path.isfile('/content/FORGe/log/summary.txt'):
  # `with` closes the log file as soon as its lines have been read
  with open('/content/FORGe/log/summary.txt', 'r') as FORGe_log:
    lines_log = FORGe_log.readlines()
  # Get number of expected texts (the last 'Outputs: ' line wins; 0 if there is none)
  count_strs_all_FORGe = 0
  for line in lines_log:
    if line.startswith('Outputs: '):
      count_strs_all_FORGe = int(line.strip().split('Outputs: ')[-1])

  print('Expected texts: '+str(count_strs_all_FORGe)+'.\n')

if language == 'GA':
  ! python {path_FORGe2Morph} {language} {temp_input_folder_morph} {morph_input_folder}
  clear_files(temp_input_folder_morph)

In [None]:
#@title Call morph generator
# v3 (fast, ~2sec/450 texts)

# Run the morphology generation
from IPython.display import HTML, display
import progressbar
import glob
import codecs
from termcolor import colored
import re

show_input = False #@param {type:"boolean"}

if language == 'GA':
  clear_files(morph_output_folder)
  # To store how many texts we have in each file (used to )
  count_strs_all_Morph = []
  for filepath in sorted(glob.glob(os.path.join(morph_input_folder, '*.*'))):
    count_strs_all = 0
    head, tail = os.path.split(filepath)
    filename = tail.rsplit('.')[0]
    print('Processing '+filename)
    fo = codecs.open(morph_output_folder+'/'+filename+'_out.txt', 'w', 'utf-8')
    list_inflected_words = ! cat {filepath} | {morph_folder_name}'/flookup' -a {morph_folder_name}'/allgen.fst'
    # print(list_inflected_words)

    # Create a variable to store the outputs
    text = ''
    # morph returns this as list_inflected_words: ['imir+Verb+Vow+PresInd\timríonn', '', 'Agremiação_Sportiva_Arapiraquense+Noun+Masc+Com+Pl\t+?', '', ',\t+?',...]
    for word in list_inflected_words:
      empty = 'yes'
      input_string = ''
      morph_returned = ''
      morph_backup = ''
      if re.search('\t', word):
        # for every space an empty string is returned; we'll ignore them later. Between two consecutive texts there is a simple "\t" with nothing around. I use this to introduce linebreaks later.
        empty = 'no'
        input_string = word.split('\t')[0]
        morph_returned = word.split('\t')[1]
        if re.search('\+', word):
          morph_backup = input_string.split('+', 1)[0]
        else:
          morph_backup = input_string
      out_line = ''
      # Create each output line with the required contents
      if show_input == True:
        if empty == 'no':
          if morph_returned == '':
            if input_string == '':
              out_line = out_line + '\n'
              count_strs_all += 1
          else:
            out_line = out_line + input_string + ': ' +'\x1b[5;30;47m'+morph_returned+'\x1b[0m'+'\n'
      else:
        if empty == 'no':
          if morph_returned == '+?':
            out_line = out_line + morph_backup + ' '
          # If the line is empty, add a line break (empty lines separate different texts in the input)
          elif morph_returned == '':
            if input_string == '':
              out_line = out_line + '\n'
              count_strs_all += 1
          else:
            out_line = out_line + morph_returned + ' '
      # add line to the other lines of the same file
      text = text + out_line

    # print('\n----------------------\n'+text+'\n')
    count_strs_all_Morph.append(count_strs_all)
    fo.write(text+'\n')
    fo.close()

  # Check
  if os.path.isfile('/content/FORGe/log/summary.txt'):
    with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
      fo.write('\nMorphology debug\n==================\n\n')
      if not sum(count_strs_all_Morph) == count_strs_all_FORGe:
        print('\nERROR! Mismatch with FORGe outputs!')
        fo.write('ERROR! Mismatch with FORGe outputs!\n')
      print('\nThere are '+str(sum(count_strs_all_Morph))+' texts.')
      fo.write('There are '+str(sum(count_strs_all_Morph))+' texts.\n')
      print('Texts per file: '+str(count_strs_all_Morph))
      fo.write('Texts per file: '+str(count_strs_all_Morph)+'\n')
      fo.write('---------------------------------\n')
  clear_files(morph_input_folder)

## 4 - Output post-processing and packaging

In [None]:
#@title Process texts
prefinal_output_folder = ''

if language == 'GA':
  prefinal_output_folder = morph_output_folder
else:
  prefinal_output_folder = os.path.join(temp_input_folder_morph, split)

! python {path_postProc} {language} {prefinal_output_folder}

# Check
list_filepaths = glob.glob(os.path.join(prefinal_output_folder, '*_postproc.txt'))
count_strs_all_postproc = []
for filepath in sorted(list_filepaths):
  count_strs_all = 0
  head, tail = os.path.split(filepath)
  fd = codecs.open(filepath, 'r', 'utf-8')
  lines = fd.readlines()
  x = 0
  for line in lines:
    if not line == '\n':
      count_strs_all += 1
    x += 1
  count_strs_all_postproc.append(count_strs_all)

if os.path.isfile('/content/FORGe/log/summary.txt'):
  with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
    fo.write('\nPost-processing debug\n==================\n\n')
    if not sum(count_strs_all_postproc) == count_strs_all_FORGe:
      print('\nERROR! Mismatch with FORGe outputs!')
      fo.write('ERROR! Mismatch with FORGe outputs!\n')
    print('\nThere are '+str(sum(count_strs_all_postproc))+' texts.')
    fo.write('There are '+str(sum(count_strs_all_postproc))+' texts.\n')
    print('Texts per file: '+str(count_strs_all_postproc))
    fo.write('Texts per file: '+str(count_strs_all_postproc)+'\n')
    fo.write('---------------------------------\n')

In [None]:
#@title Concatenate files

# Post-processed files to merge: the morphology outputs for Irish ('GA'), otherwise the
# files of the current split found in the temporary morphology input folder.
if language == 'GA':
  list_clean_outputs = glob.glob(os.path.join(morph_output_folder, '*_out_postproc.txt'))
else:
  list_clean_outputs = glob.glob(os.path.join(temp_input_folder_morph, split, '*_postproc.txt'))
print(list_clean_outputs)

filename = 'all_'+language+'_'+split+'_out.txt'

with codecs.open(filename, 'w', 'utf-8') as outfile:
  # Files need to be sorted to be concatenated in the right order
  for fname in sorted(list_clean_outputs):
    print('Processing '+fname)
    # Read with the same encoding the merged file is written in
    with open(fname, encoding='utf-8') as infile:
      outfile.write(infile.read())

# Check: the merged file should contain as many lines as FORGe reported outputs
if os.path.isfile('/content/FORGe/log/summary.txt'):
  # Count the lines through a context manager so the handle is not left open
  with open(filename, 'r', encoding='utf-8') as merged_file:
    count_texts_all = len(merged_file.readlines())
  with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
    fo.write('\nConcatenate debug\n==================\n\n')
    if not count_texts_all == count_strs_all_FORGe:
      print('\nERROR! Mismatch with FORGe outputs!')
      fo.write('ERROR! Mismatch with FORGe outputs!\n')
    print('\nThere are '+str(count_texts_all)+' texts.')
    fo.write('There are '+str(count_texts_all)+' texts.\n')


In [None]:
#@title Zip and download FORGe output folder with intermediate representations
download_outputs = 'yes'#@param['yes', 'no']

if download_outputs == 'yes':
  from google.colab import files
  zip_name_inter = '/content/WebNLG_['+language+']_['+split+']_allLevels.zip'
  !zip -r {zip_name_inter} /content/FORGe/structures

  clear_output()

  files.download(zip_name_inter)

In [None]:
#@title Zip and download FORGe log files folder

download_logfiles = 'no'#@param['yes', 'no']

if download_logfiles == 'yes':
  from google.colab import files
  zip_name_log = '/content/WebNLG_['+language+']_['+split+']_logs.zip'
  !zip -r {zip_name_log} /content/FORGe/log

  clear_output()

  files.download(zip_name_log)

In [None]:
#@title Zip and download conll inputs
download_inputs = 'no'#@param['yes', 'no']

if download_inputs == 'yes':
  from google.colab import files
  zip_name_log = '/content/WebNLG_['+language+']_inputs.zip'
  !zip -r {zip_name_log} /content/FORGe/structures/00-PredArg

  clear_output()

  files.download(zip_name_log)

In [None]:
#! unzip /content/WebNLG_[EN]_[test]_allLevels.zip