<a href="https://colab.research.google.com/github/mille-s/DCU_TCD-FORGe_WebNLG23/blob/main/DCU_TCD_FORGe_WebNLG23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell to download and unzip the working folder and install Java 8

from IPython.display import clear_output
import os

# clone main repo
! git clone https://github.com/mille-s/DCU_TCD-FORGe_WebNLG23.git
# Delete locally to avoid confusion
! rm 'DCU_TCD-FORGe_WebNLG23/DCU_TCD_FORGe_WebNLG23.ipynb'

# clone M-FleNS repo (generation pipeline)
! git clone https://github.com/mille-s/M-FleNS_NLG-Pipeline.git
# Delete locally to avoid confusion
! rm 'M-FleNS_NLG-Pipeline/M_FleNS_pipe_v2.ipynb'

# Download FORGe
# Version used for WebNLG (fails to generate a few structures of the training data)
! gdown 1lsh8pwUp9mc0Z_aFbSy1WTIpSx9YwFFD
! unzip /content/FORGe_colab_v3_WebNLG.zip
# Version used for Mod-D2T (minor improvements on WebNLG)
# ! gdown 196w_EtORTkR3idaXDMq0xl3pOtBrGbiE
# ! unzip /content/FORGe_colab_v4.zip

# Download Morphology generator
! gdown 1vk1utEjeZ_2YO1H20DPDTjVSevgRJNM_
morph_folder_name = 'test_irish_morph_gen_v5.0'
zip_name = morph_folder_name+'.zip'
! unzip {zip_name}

morph_input_folder = '/content/'+morph_folder_name+'/Inputs'
morph_output_folder = '/content/'+morph_folder_name+'/Outputs'
os.makedirs(morph_input_folder)
os.makedirs(morph_output_folder)

# Make morphology flookup executable
! 7z a -sfx {morph_folder_name}'/flookup.exe' {morph_folder_name}'/flookup'
! chmod 755 {morph_folder_name}'/flookup'

# Clean
! rm '/content/FORGe_colab_v3_WebNLG.zip'
! rm '/content/FORGe_colab_v4.zip'
! rm '/content/test_irish_morph_gen_v5.0.zip'
clear_output()
print('Working folder ready!\n--------------\nInstalling Java 8...\n')

# Switch to Java 1.8 (needed for FORGe to run correctly)
def install_java():
  """Install OpenJDK 8 and select it as the `java` command of this runtime.

  Runs apt-get quietly to install the headless OpenJDK 8 JDK, sets the JAVA_HOME
  environment variable of the current process to the OpenJDK 8 folder, points the
  `java` alternative to the OpenJDK 8 binary, and finally prints the active Java
  version so the switch can be checked in the cell output.

  Takes no parameters and returns nothing; relies on `os` being imported in the notebook.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !java -version       #check java version
install_java()

Working folder ready!
--------------
Installing Java 8...

update-alternatives: using /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java to provide /usr/bin/java (java) in manual mode
openjdk version "1.8.0_382"
OpenJDK Runtime Environment (build 1.8.0_382-8u382-ga-1~22.04.1-b05)
OpenJDK 64-Bit Server VM (build 25.382-b05, mixed mode)


In [None]:
# Run this cell to set parameters for generation

# Put the input structure(s) of the correct type in the folder that corresponds to the first module called in the next cell.
# E.g. if a module PredArg_... or DSynt_... is selected, the input predicate-argument structures go in the structures/00-PredArg folder.
# I'll make the instructions and names clearer in a later (actually usable) version.

############# Select language #############
language = 'GA' #@param['EN', 'ES', 'GA']

############# Select module grouping #############
# Group consecutive modules for the same system or call each module separately.
# Select 'no' to get all intermediate representations, 'yes' if you're only interested in the output.
group_modules_prm = 'yes' #@param['yes', 'no']

############# Select dataset split #############
split = "test" #@param['dev', 'test','train','ukn']

#######################################################################

# Processing type (FORGe, Model1, SimpleNLG, etc.) for each module of the pipeline.
# Only FORGe is supported for this prototype version.
PredArg_Normalisation = 'FORGe'
# 'None' here; meant for an external module that assigns triples to aggregate
PredArg_AggregationMark = 'None'
PredArg_Aggregation = 'FORGe'
PredArg_PoSTagging = 'FORGe'
PredArg_CommStructuring = 'FORGe'
DSynt_Structuring = 'FORGe'
SSynt_Structuring = 'FORGe'
SSynt_Aggregation = 'FORGe'
RE_Generation = 'FORGe'
DMorph_AgreementsLinearisation = 'FORGe'
SMorph_Processing = 'FORGe'

#######################################################################
# Python scripts called by the following cells
mflens_code_folder = '/content/M-FleNS_NLG-Pipeline/code'
path_MFleNS = os.path.join(mflens_code_folder, 'M-FleNS.py')
path_checkOutputs = os.path.join(mflens_code_folder, 'M-FleNS-checkOutputs.py')
path_postProc = os.path.join(mflens_code_folder, 'postProcess.py')
path_FORGe2Morph = os.path.join('/content/DCU_TCD-FORGe_WebNLG23/code', 'FORGe2Morph.py')
# path_MorphGen = '/content/DCU_TCD-FORGe_WebNLG23/code/IrishNLP_MorphGen.py'

#######################################################################
# FORGe/MATE folders and property files, all located under the unzipped FORGe folder
forge_folder = '/content/FORGe'
FORGe_input_folder = os.path.join(forge_folder, 'buddy_project', 'struct')
path_MATE = os.path.join(forge_folder, 'buddy-patched.jar')
path_props_resources_template = os.path.join(forge_folder, 'mateColabDrive.properties')
path_props_levels = os.path.join(forge_folder, 'mateLevels.properties')
path_props = os.path.join(forge_folder, 'mate.properties')

# One folder per representation level, numbered in pipeline order.
# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
path_strs = os.path.join(forge_folder, 'structures')
str_PredArg_folder = os.path.join(path_strs, '00-PredArg')
str_PredArgNorm_folder = os.path.join(path_strs, '01-PredArgNorm')
str_PredArgAggMark_folder = os.path.join(path_strs, '02-PredArgAggMark')
str_PredArgAgg_folder = os.path.join(path_strs, '03-PredArgAgg')
str_PredArgPoS_folder = os.path.join(path_strs, '04-PredArgPoS')
str_PredArgComm_folder = os.path.join(path_strs, '05-PredArgComm')
str_DSynt_folder = os.path.join(path_strs, '06-DSynt')
str_SSynt_folder = os.path.join(path_strs, '07-SSynt')
str_SSyntAgg_folder = os.path.join(path_strs, '08-SSyntAgg')
str_REG_folder = os.path.join(path_strs, '09-REG')
str_DMorphLin_folder = os.path.join(path_strs, '10-DMorphLin')
str_SMorphText_folder = os.path.join(path_strs, '11-SMorphText')

# Log folder and temporary folder that receives the FORGe outputs; both are created if missing
log_folder = os.path.join(forge_folder, 'log')
temp_input_folder_morph = '/content/FORGe-out'
for needed_folder in (log_folder, temp_input_folder_morph):
  if not os.path.exists(needed_folder):
    os.makedirs(needed_folder)

# Run generation pipeline

Run all cells to get the concatenated texts, intermediate representations and log files

## 1 - RDF to PredArg (TBD)

In [None]:
# For the moment, you can download the outputs of the conversion and copy the desired inputs of the same split in the str_PredArg_folder (see below).
! gdown 1B_8UXCiC71hqqiBhcgTQ1Jg_6Y-Nc5su
! unzip /content/00-PredArg-train.zip
! rm '/content/00-PredArg-train.zip'

! gdown 1YXcDXNS8lnU9EWBbVpYtHIJFQLHqAg0w
! unzip /content/00-PredArg-test.zip
! rm '/content/00-PredArg-test.zip'

! gdown 1L0D3pyUW43qep5C2jvmbdQyzbZesMFOb
! unzip /content/00-PredArg-dev.zip
! rm '/content/00-PredArg-dev.zip'

clear_output()

In [None]:
# Copy some PredArg structures in the input folder used for generation

import glob
import os
import shutil

# Folder that holds the converted structures of the split selected in the parameters cell
predArg_conv_folder = '/content/00-PredArg-'+split
# predArg_conv_folder = '/content/00-PredArg-dev'
# predArg_conv_folder = '/content/00-PredArg-test'
# predArg_conv_folder = '/content/00-PredArg-train'
list_predArgPaths = glob.glob(os.path.join(predArg_conv_folder, '*.conll'))
c = 0
for predArgPath in list_predArgPaths:
  PAfilename = os.path.split(predArgPath)[-1]
  # Pure-Python copy: no shell is spawned per file and paths with special characters are safe.
  # The destination is str_PredArg_folder (set in the parameters cell) rather than a second hard-coded copy of that path.
  shutil.copy(predArgPath, os.path.join(str_PredArg_folder, PAfilename))
  c += 1
print('Copied '+str(c)+' files.')

Copied 4 files.


In [None]:
# Empty input folder to copy other inputs instead
# list_predArgPathsCC = glob.glob(os.path.join('/content/FORGe/structures/00-PredArg/', '*.conll'))
# c = 0
# for predArgPathCC in list_predArgPathsCC:
#   ! rm {predArgPathCC}
#   c += 1
# print('Removed '+str(c)+' files.')

## 2 - PredArg to uninflected text (FORGe via M-FleNS pipeline code)

In [None]:
# Launch generation process
! python {path_MFleNS} {language} {split} {group_modules_prm} {PredArg_Normalisation} {PredArg_AggregationMark} {PredArg_Aggregation} {PredArg_PoSTagging} {PredArg_CommStructuring} {DSynt_Structuring} {SSynt_Structuring} {SSynt_Aggregation} {RE_Generation} {DMorph_AgreementsLinearisation} {SMorph_Processing} {FORGe_input_folder} {path_MATE} {path_props_resources_template} {path_props_levels} {path_props} {str_PredArg_folder} {str_PredArgNorm_folder} {str_PredArgAggMark_folder} {str_PredArgAgg_folder} {str_PredArgPoS_folder} {str_PredArgComm_folder} {str_DSynt_folder} {str_SSynt_folder} {str_SSyntAgg_folder} {str_REG_folder} {str_DMorphLin_folder} {str_SMorphText_folder} {log_folder}


Preparing generation pipeline...
  -> Initial structure is ['PredArg'].
  -> 10 modules were selected.
  -> Sequence: ['PredArgNorm', 'PredArgAgg', 'PredArgPoS', 'PredArgComm', 'DSynt', 'SSynt', 'SSyntAgg', 'REG', 'DMorphLin', 'SMorphText']
  -> The pipeline looks good, proceeding...
--------------------------
Running FORGe
--------------------------
  Running ['10_Con_Sem.rl', '11.1_Con_Agg1.rl', '11.2_Con_Agg2.rl', '11.3_Con_Agg3.rl', '11.4_Con_Agg4.rl', '13_Sem_SemPoS.rl', '15_SemPoS_SemCommMark.rl', '17_SemCommMark_SemComm.rl', '20_SemComm_DSynt.rl', '30_DSynt_SSynt.rl', '35_SSynt_PostProc.rl', '37.1_SSynt_Agg1.rl', '37.2_SSynt_Agg2.rl', '38.1_SSynt_REG1.rl', '38.2_SSynt_REG2.rl', '40_SSynt_DMorph_linearize.rl', '50_DMorph_SMorph.rl', '60_Smorph_Sentence.rl']
All done!


In [None]:
# Check outputs and copy files to morph folder if GA
import codecs

# The commented-out block below is kept for reference only. It rewrites the check script so that it
# reads two extra command-line arguments (temp_input_folder_morph and language) and copies each
# checked file to temp_input_folder_morph when the language is GA.
# NOTE(review): the call at the end of this cell already passes these two extra arguments, so the
# script in the cloned repository presumably includes this change - confirm before deleting the block.
## Read original check script
#check_script = open(path_checkOutputs, 'r')
#lines_check_script = check_script.readlines()
## Update check script
#with codecs.open(path_checkOutputs, 'w', 'utf-8') as f:
#  for line in lines_check_script:
#    if line.startswith('log_folder = sys.argv[3]'):
#      f.write('log_folder = sys.argv[3]\ntemp_input_folder_morph = sys.argv[4]\nlanguage = sys.argv[5]\n')
#    elif line.startswith('            count_perLevel.append(count)\n'):
#      f.write('            count_perLevel.append(count)\n            if language == "GA":\n              shutil.copy(new_file_path, temp_input_folder_morph)\n')
#    else:
#      f.write(line)

# Arguments, in order: input PredArg folder, final text folder, log folder, folder receiving the copies, language
! python {path_checkOutputs} {str_PredArg_folder} {str_SMorphText_folder} {log_folder} {temp_input_folder_morph} {language}

Log files OK!
----


Number of texts OK!
----
Inputs:  1779
Outputs: 1779
Inputs per file:  [450, 450, 450, 429]
Outputs per file: [450, 450, 450, 429]


## 3 - Morphology processing (Irish NLP Tools)

In [None]:
# Process raw FORGe output and format it for Morphology

# Read original check script
FORGe_log = open('/content/FORGe/log/summary.txt', 'r')
lines_log = FORGe_log.readlines()
# Get number of expected texts
count_strs_all_FORGe = 0
for line in lines_log:
  if line.startswith('Outputs: '):
    count_strs_all_FORGe = int(line.strip().split('Outputs: ')[-1])

print('Expected texts: '+str(count_strs_all_FORGe)+'.\n')

! python {path_FORGe2Morph} {language} {temp_input_folder_morph} {morph_input_folder}

Expected texts: 1779.

Processing test_triples_en_ga_utf8_0000-0449__SMorphText.conll_out.txt
Processing test_triples_en_ga_utf8_0450-0899__SMorphText.conll_out.txt
Processing test_triples_en_ga_utf8_0900-1349__SMorphText.conll_out.txt
Processing test_triples_en_ga_utf8_1350-1778__SMorphText.conll_out.txt

There are 1779 texts.
Texts per file: [450, 450, 450, 429]


In [None]:
# Call morph generator
# v3 (fast, ~2sec/450 texts)

# Run the morphology generation
from IPython.display import HTML, display
import progressbar
import glob
import codecs
from termcolor import colored
import re

show_input = False #@param {type:"boolean"}

if language == 'GA':
# To store how many texts we have in each file (used to )
  count_strs_all_Morph = []
  for filepath in sorted(glob.glob(os.path.join(morph_input_folder, '*.*'))):
    count_strs_all = 0
    head, tail = os.path.split(filepath)
    filename = tail.rsplit('.')[0]
    print('Processing '+filename)
    fo = codecs.open(morph_output_folder+'/'+filename+'_out.txt', 'w', 'utf-8')
    list_inflected_words = ! cat {filepath} | {morph_folder_name}'/flookup' -a {morph_folder_name}'/allgen.fst'
    # print(list_inflected_words)

    # Create a variable to store the outputs
    text = ''
    # morph returns this as list_inflected_words: ['imir+Verb+Vow+PresInd\timríonn', '', 'Agremiação_Sportiva_Arapiraquense+Noun+Masc+Com+Pl\t+?', '', ',\t+?',...]
    for word in list_inflected_words:
      empty = 'yes'
      input_string = ''
      morph_returned = ''
      morph_backup = ''
      if re.search('\t', word):
        # for every space an empty string is returned; we'll ignore them later. Between two consecutive texts there is a simple "\t" with nothing around. I use this to introduce linebreaks later.
        empty = 'no'
        input_string = word.split('\t')[0]
        morph_returned = word.split('\t')[1]
        if re.search('\+', word):
          morph_backup = input_string.split('+', 1)[0]
        else:
          morph_backup = input_string
      out_line = ''
      # Create each output line with the required contents
      if show_input == True:
        if empty == 'no':
          if morph_returned == '':
            if input_string == '':
              out_line = out_line + '\n'
              count_strs_all += 1
          else:
            out_line = out_line + input_string + ': ' +'\x1b[5;30;47m'+morph_returned+'\x1b[0m'+'\n'
      else:
        if empty == 'no':
          if morph_returned == '+?':
            out_line = out_line + morph_backup + ' '
          # If the line is empty, add a line break (empty lines separate different texts in the input)
          elif morph_returned == '':
            if input_string == '':
              out_line = out_line + '\n'
              count_strs_all += 1
          else:
            out_line = out_line + morph_returned + ' '
      # add line to the other lines of the same file
      text = text + out_line

    # print('\n----------------------\n'+text+'\n')
    count_strs_all_Morph.append(count_strs_all)
    fo.write(text+'\n')
    fo.close()

  # Check
  with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
    fo.write('\nMorphology debug\n==================\n\n')
    if not sum(count_strs_all_Morph) == count_strs_all_FORGe:
      print('\nERROR! Mismatch with FORGe outputs!')
      fo.write('ERROR! Mismatch with FORGe outputs!\n')
    print('\nThere are '+str(sum(count_strs_all_Morph))+' texts.')
    fo.write('There are '+str(sum(count_strs_all_Morph))+' texts.\n')
    print('Texts per file: '+str(count_strs_all_Morph))
    fo.write('Texts per file: '+str(count_strs_all_Morph)+'\n')
    fo.write('---------------------------------\n')

Processing test_triples_en_ga_utf8_0000-0449__SMorphText
Processing test_triples_en_ga_utf8_0450-0899__SMorphText
Processing test_triples_en_ga_utf8_0900-1349__SMorphText
Processing test_triples_en_ga_utf8_1350-1778__SMorphText

There are 1779 texts.
Texts per file: [450, 450, 450, 429]


## 4 - Output post-processing and packaging

In [None]:
# Process texts
prefinal_output_folder = ''

if language == 'GA':
  prefinal_output_folder = morph_output_folder
else:
  prefinal_output_folder = temp_input_folder_morph

! python {path_postProc} {language} {prefinal_output_folder}

# Check
list_filepaths = glob.glob(os.path.join(prefinal_output_folder, '*_postproc.txt'))
count_strs_all_postproc = []
for filepath in sorted(list_filepaths):
  count_strs_all = 0
  head, tail = os.path.split(filepath)
  fd = codecs.open(filepath, 'r', 'utf-8')
  lines = fd.readlines()
  x = 0
  for line in lines:
    if not line == '\n':
      count_strs_all += 1
    x += 1
  count_strs_all_postproc.append(count_strs_all)

with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
  fo.write('\nPost-processing debug\n==================\n\n')
  if not sum(count_strs_all_postproc) == count_strs_all_FORGe:
    print('\nERROR! Mismatch with FORGe outputs!')
    fo.write('ERROR! Mismatch with FORGe outputs!\n')
  print('\nThere are '+str(sum(count_strs_all_postproc))+' texts.')
  fo.write('There are '+str(sum(count_strs_all_postproc))+' texts.\n')
  print('Texts per file: '+str(count_strs_all_postproc))
  fo.write('Texts per file: '+str(count_strs_all_postproc)+'\n')
  fo.write('---------------------------------\n')

Processing test_triples_en_ga_utf8_0000-0449__SMorphText_out
Processing test_triples_en_ga_utf8_0450-0899__SMorphText_out
Processing test_triples_en_ga_utf8_0900-1349__SMorphText_out
Processing test_triples_en_ga_utf8_1350-1778__SMorphText_out

There are 1779 texts.
Texts per file: [450, 450, 450, 429]


In [None]:
# Concatenate files

# Look in prefinal_output_folder (set in the post-processing cell) so that languages other than GA,
# whose texts do not go through the morphology folder, are picked up as well; for GA it is the same folder as before.
# NOTE(review): assumes the post-processed files of non-GA runs also end in '_out_postproc.txt' - confirm.
list_clean_outputs = glob.glob(os.path.join(prefinal_output_folder, '*_out_postproc.txt'))
filename = 'all_'+language+'_'+split+'_out.txt'

with codecs.open(filename, 'w', 'utf-8') as outfile:
  # Files need to be sorted to be concatenated in the right order
  for fname in sorted(list_clean_outputs):
    print('Processing '+fname)
    # Read with an explicit encoding so it matches the UTF-8 output file
    with open(fname, encoding='utf-8') as infile:
      outfile.write(infile.read())

# Check: one text per line is expected in the concatenated file
with open(filename, encoding='utf-8') as concatenated:
  count_texts_all = len(concatenated.readlines())
with codecs.open('/content/FORGe/log/summary.txt', 'a', 'utf-8') as fo:
  fo.write('\nConcatenate debug\n==================\n\n')
  if not count_texts_all == count_strs_all_FORGe:
    print('\nERROR! Mismatch with FORGe outputs!')
    # Was fo.Write(...), which raised AttributeError exactly when a mismatch had to be logged
    fo.write('ERROR! Mismatch with FORGe outputs!\n')
  print('\nThere are '+str(count_texts_all)+' texts.')
  fo.write('There are '+str(count_texts_all)+' texts.\n')


Processing /content/test_irish_morph_gen_v5.0/Outputs/test_triples_en_ga_utf8_0000-0449__SMorphText_out_postproc.txt
Processing /content/test_irish_morph_gen_v5.0/Outputs/test_triples_en_ga_utf8_0450-0899__SMorphText_out_postproc.txt
Processing /content/test_irish_morph_gen_v5.0/Outputs/test_triples_en_ga_utf8_0900-1349__SMorphText_out_postproc.txt
Processing /content/test_irish_morph_gen_v5.0/Outputs/test_triples_en_ga_utf8_1350-1778__SMorphText_out_postproc.txt

There are 1779 texts.


In [None]:
# Zip and download FORGE output folder to download intermediate representations
from google.colab import files
zip_name_inter = '/content/WebNLG_['+language+']_['+split+']_allLevels.zip'
!zip -r {zip_name_inter} /content/FORGe/structures

clear_output()

files.download(zip_name_inter)
# print('Donwloaded intermediate representations!')
# ! rm {zip_name_inter}

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Zip FORGe log files folder
from google.colab import files
zip_name_log = '/content/WebNLG_['+language+']_['+split+']_logs.zip'
!zip -r {zip_name_log} /content/FORGe/log

clear_output()

files.download(zip_name_log)
# print('Donwloaded log files!')
# ! rm {zip_name_log}