<a href="https://colab.research.google.com/github/mille-s/Mod-D2T/blob/main/ModD2T.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Run this cell to prepare the working folder (now setup to use FORGe-v6) and install Java 8
from IPython.display import clear_output
import os
import sys

# Clone the Mod-D2T repo (provides the scripts folder used in Part 3)
! git clone https://github.com/mille-s/Mod-D2T.git
# Delete the cloned copy of this notebook to avoid confusion with the one being run
! rm 'Mod-D2T/ModD2T.ipynb'

# Clone the M-FleNS repo (generation pipeline called in Part 2)
! git clone https://github.com/mille-s/M-FleNS_NLG-Pipeline.git
# Delete the cloned pipeline notebook to avoid confusion
! rm 'M-FleNS_NLG-Pipeline/M_FleNS_pipe_v2.ipynb'

# Download FORGe (text generator). Only one version is active at a time;
# the older versions are kept below, commented out, for reference.
# V2 was used for the INLG paper
# ! gdown 1K99nCrBX2RTVMhDcPEgF0usfJYnwtE-w
# ! unzip /content/FORGe_colab_v2.zip
# ! rm '/content/FORGe_colab_v2.zip'
# V4 is being tested for the Mod-D2T-GA data
# ! gdown 196w_EtORTkR3idaXDMq0xl3pOtBrGbiE
# ! unzip /content/FORGe_colab_v4.zip
# ! rm '/content/FORGe_colab_v4.zip'
# Version used for GEM (now supporting some Wikidata properties)
# ! gdown 1gaTZVGFjtR_zBNskJXCeIJVug95aGFkf
# ! unzip /content/FORGe_colab_v5.zip
# ! rm '/content/FORGe_colab_v5.zip'
# Version used for French (v6, the active one): download, unpack, then delete the archive
! gdown 1M0yk7aLUpHiT4UfT72g-rNIPd4W8WT44
! unzip /content/FORGe_colab_v6.zip
! rm '/content/FORGe_colab_v6.zip'

# Install parsimonious (used for parsing .str files)
! pip install parsimonious

# Hide the output of the commands above and confirm that the setup finished
clear_output()
print('Working folder ready!\n--------------')

# Run to switch to Java 1.8 (needed for FORGe to run correctly)
def install_java():
  """Install OpenJDK 8 and make it the Java version used in this session.

  Installs the headless JDK 8 package with apt-get (standard output is discarded),
  points the JAVA_HOME environment variable of this Python process to it,
  selects the Java 8 binary through update-alternatives and prints the resulting Java version.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !java -version       #check java version
install_java()

# Part 1: Convert input triples to linguistic structures
The last cell of this block can be used to erase the input files automatically (it's a bit tedious by hand if there are many).

In [None]:
#@title 1. Run this cell to set general parameters

# Target language and dataset split; both values are read again by the download, generation and zip cells below.
language = 'FR' #@param ['EN', 'FR', 'GA']
split = "dev" #@param ['dev', 'test','train']

In [None]:
#@title 2. Download inputs
# For the moment, you can download the outputs of the conversion and copy the desired inputs of the same split in the str_PredArg_folder (see below).
import os

if split == 'train':
  if os.path.exists('/content/00-PredArg-train'):
    ! rm -r '/content/00-PredArg-train'
  if language == 'EN':
    ! gdown 1MQJnUWEUlELpd-mnWmsdTkJNpIPeIA48
    ! unzip /content/00-PredArg-train.zip
    ! rm '/content/00-PredArg-train.zip'
  elif language == 'GA':
    ! gdown 1bv4CjYcqkg14-Mu4TLxX94atok2LsloU
    ! unzip /content/00-PredArg-train_GA.zip
    ! rm '/content/00-PredArg-train_GA.zip'
  elif language == 'FR':
    ! gdown 1EG19aItypQcW-eyNSkioh1EPmxu29beE
    ! unzip /content/00-PredArg-train_FR.zip
    ! rm '/content/00-PredArg-train_FR.zip

elif split == 'dev':
  if os.path.exists('/content/00-PredArg-dev'):
    ! rm -r '/content/00-PredArg-dev'
  if language == 'EN':
    ! gdown 1vm5A5WRGmnjOPrq8GsNjF3JCRGqhxigp
    ! unzip /content/00-PredArg-dev.zip
    ! rm '/content/00-PredArg-dev.zip
  elif language == 'GA':
    ! gdown 1Sg7AaDq14-BGjZYdBuJtndm8A1PmUA1A
    ! unzip /content/00-PredArg-dev_GA.zip
    ! rm '/content/00-PredArg-dev_GA.zip
  elif language == 'FR':
    ! gdown 1e0nl3X37zYkGaWHGrWThZFZzgTTEo1h4
    ! unzip /content/00-PredArg-dev_FR.zip
    ! rm '/content/00-PredArg-dev_FR.zip

elif split == 'test':
  if os.path.exists('/content/00-PredArg-test'):
    ! rm -r '/content/00-PredArg-test'
  if language == 'EN':
    ! gdown 1qOA17TYg__89euDjQliOPrywYwgecwBc
    ! unzip /content/00-PredArg-test.zip
    ! rm '/content/00-PredArg-test.zip'
  elif language == 'GA':
    ! gdown 1w9bKDMnOn-73s8xGRTgxMfOK2YbBTWMG
    ! unzip /content/00-PredArg-test_GA.zip
    ! rm '/content/00-PredArg-test_GA.zip'
  elif language == 'FR':
    ! gdown 1J8tIUkpYhYZguqz-oVOTGAGzdq-2_euI
    ! unzip /content/00-PredArg-test_FR.zip
    ! rm '/content/00-PredArg-test_FR.zip

clear_output()

In [None]:
#@title 3. Copy some PredArg structures in the input folder used for generation

import glob
import os
import shutil

# Folder created by the download cell for the selected split
predArg_conv_folder = '/content/00-PredArg-' + split
# Folder from which the generation pipeline reads its PredArg inputs
predArg_input_folder = '/content/FORGe/structures/00-PredArg/'

list_predArgPaths = glob.glob(os.path.join(predArg_conv_folder, '*.conll'))

# Copy with shutil rather than one shell `cp` per file: file names are not re-interpreted by a shell,
# and a failed copy raises an error instead of being silently counted as copied.
c = 0
for predArgPath in list_predArgPaths:
  PAfilename = os.path.split(predArgPath)[-1]
  shutil.copy(predArgPath, os.path.join(predArg_input_folder, PAfilename))
  c += 1
print('Copied '+str(c)+' files.')

In [None]:
#@title Empty input folder to copy other inputs instead
# Imported here so that this cell also works when the copy cell above has not been run in this session
import glob
import os

flush_input_folder = False #@param  {type:"boolean"}
if flush_input_folder:
  # Delete every .conll file from the input folder used for generation
  list_predArgPathsCC = glob.glob(os.path.join('/content/FORGe/structures/00-PredArg/', '*.conll'))
  for predArgPathCC in list_predArgPathsCC:
    os.remove(predArgPathCC)

# Part 2: Generate texts and intermediate representations

In [None]:
#@title 1. Run this cell to set parameters for generation
import shutil

# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell.
# E.g. if a module PredArg_... or DSynt_... is selected, the input predicate-argument structures should be placed in the structures/00-PredArg folder.
# I'll make the instructions and names clearer in a later (actually usable) version.

############# Select language #############
# GA and ES not supported for this version of the pipeline (ES will break on some structures and Morphology is missing for GA)
# NOTE(review): the language is now set in the general parameters cell of Part 1; the line below is kept for reference only.
# language = 'FR' #@param['EN', 'ES', 'FR', 'GA']

############# Select module grouping #############
# Group consecutive modules for the same system or call each module separately.
# Select 'no' to get all intermediate representations ('no' for Mod-D2T), 'yes' if you're only interested in the output.
group_modules_prm = 'no'

############# Select dataset split #############
# NOTE(review): the split is now set in the general parameters cell of Part 1; the line below is kept for reference only.
# split = "train" #@param['dev', 'test','train']

#######################################################################

# Modules to run, with type of processing (FORGe, Model1, SimpleNLG, etc.).
# Only FORGe is supported for this prototype version.
# All modules are mandatory except: PredArg_Aggregation, SSynt_Aggregation, RE_Generation
PredArg_Normalisation = 'FORGe'
# To have an external module assigning triples to aggregate
PredArg_AggregationMark = 'None'
PredArg_Aggregation = 'FORGe' #@param['FORGe', 'None']
PredArg_PoSTagging = 'FORGe'
PredArg_CommStructuring = 'FORGe'
DSynt_Structuring = 'FORGe'
SSynt_Structuring = 'FORGe'
SSynt_Aggregation = 'FORGe' #@param['FORGe', 'None']
RE_Generation = 'FORGe' #@param['FORGe', 'None']
DMorph_AgreementsLinearisation = 'FORGe'
SMorph_Processing = 'FORGe'

#######################################################################
# Paths to the python scripts of the M-FleNS pipeline (called with `! python` in the following cells)
path_MFleNS = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS.py'
path_checkOutputs = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS-checkOutputs.py'
path_concatenate = '/content/M-FleNS_NLG-Pipeline/code/concatenate_files.py'
path_postProc = '/content/M-FleNS_NLG-Pipeline/code/postProcess.py'

#######################################################################
# Paths to FORGe/MATE folders and property files (all passed as arguments to M-FleNS.py)
FORGe_input_folder = '/content/FORGe/buddy_project/struct'
path_MATE = '/content/FORGe/buddy-patched.jar'
path_props_resources_template = '/content/FORGe/mateColabDrive.properties'
path_props_levels = '/content/FORGe/mateLevels.properties'
path_props = '/content/FORGe/mate.properties'

# Paths to general folders: one numbered folder per level of representation
# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
path_strs = '/content/FORGe/structures'
str_PredArg_folder = os.path.join(path_strs, '00-PredArg')
str_PredArgNorm_folder = os.path.join(path_strs, '01-PredArgNorm')
str_PredArgAggMark_folder = os.path.join(path_strs, '02-PredArgAggMark')
str_PredArgAgg_folder = os.path.join(path_strs, '03-PredArgAgg')
str_PredArgPoS_folder = os.path.join(path_strs, '04-PredArgPoS')
str_PredArgComm_folder = os.path.join(path_strs, '05-PredArgComm')
str_DSynt_folder = os.path.join(path_strs, '06-DSynt')
str_SSynt_folder = os.path.join(path_strs, '07-SSynt')
str_SSyntAgg_folder = os.path.join(path_strs, '08-SSyntAgg')
str_REG_folder = os.path.join(path_strs, '09-REG')
str_DMorphLin_folder = os.path.join(path_strs, '10-DMorphLin')
str_SMorphText_folder = os.path.join(path_strs, '11-SMorphText')
# Create the log folder and the Mod-D2T output folder for .str files if they do not exist yet
log_folder = '/content/FORGe/log'
if not os.path.exists(log_folder):
  os.makedirs(log_folder)
clean_out_str_folder = '/content/Mod-D2T/str'
if not os.path.exists(clean_out_str_folder):
  os.makedirs(clean_out_str_folder)

def clear_folder(folder):
  """Delete `folder` together with everything it contains.

  Nothing happens when `folder` is missing or is not a directory.
  A failed deletion is reported on standard output instead of being raised.
  """
  # isdir() is already False for a path that does not exist
  if not os.path.isdir(folder):
    return
  try:
    shutil.rmtree(folder)
  except Exception as removal_error:
    # Best effort: report the problem and let the notebook carry on
    print('Failed to delete %s. Reason: %s' % (folder, removal_error))

In [None]:
#@title 2. Run this cell to generate the outputs (do not select "use_existing_outputs" unless you know what you're doing).
use_existing_outputs = False #@param  {type:"boolean"}
zip_name = 'FORGe_FRtrain_for-Mod-D2T.zip' #@param  {type:"string"}

if use_existing_outputs == True:
  # Alternative1. Use if you upload structures generated from another pipeline instead of using the previous
  clear_folder('/content/FORGe/structures')
  ! unzip '/content/'{zip_name}
else:
  # Alternative2. Launch generation process
  ! python {path_MFleNS} {language} {split} {group_modules_prm} {PredArg_Normalisation} {PredArg_AggregationMark} {PredArg_Aggregation} {PredArg_PoSTagging} {PredArg_CommStructuring} {DSynt_Structuring} {SSynt_Structuring} {SSynt_Aggregation} {RE_Generation} {DMorph_AgreementsLinearisation} {SMorph_Processing} {FORGe_input_folder} {path_MATE} {path_props_resources_template} {path_props_levels} {path_props} {str_PredArg_folder} {str_PredArgNorm_folder} {str_PredArgAggMark_folder} {str_PredArgAgg_folder} {str_PredArgPoS_folder} {str_PredArgComm_folder} {str_DSynt_folder} {str_SSynt_folder} {str_SSyntAgg_folder} {str_REG_folder} {str_DMorphLin_folder} {str_SMorphText_folder} {log_folder}

In [None]:
#@title 3. (optional) Check outputs
# Run the M-FleNS output checker, giving it the PredArg input folder, the final SMorphText folder,
# the log folder, the Mod-D2T str output folder and the language.
! python {path_checkOutputs} {str_PredArg_folder} {str_SMorphText_folder} {log_folder} {clean_out_str_folder} {language}

In [None]:
#@title 4. Concatenate outputs of each level and copy to Mod-D2T folder
! python {path_concatenate} {str_PredArgNorm_folder} {clean_out_str_folder} {split}
# Not used for now ! python {path_concatenate} {str_PredArgAggMark_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgAgg_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgPoS_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgComm_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_DSynt_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SSynt_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SSyntAgg_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_REG_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_DMorphLin_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SMorphText_folder} {clean_out_str_folder} {split}
clear_output()

# Part 3: Clean and save the dataset, create stats and figures

In [None]:
# IMPORTANT: str files of more than 3M lines may cause a memory exception; split very long files in files < 3M lines.
# The "Split large files" cell below does this automatically, using LINE_LIMIT.

#@title 1. Parameters
# Folders of the Mod-D2T working copy
ROOT        = '/content/Mod-D2T/'
SCRIPTS_DIR = os.path.join(ROOT, 'scripts')
STR_DIR     = os.path.join(ROOT, 'str')
STR_DIR_SPLIT = os.path.join(ROOT, 'str_split')
CONLLU_DIR  = os.path.join(ROOT, 'conllu')
CONLLU_DIR_SPLIT  = os.path.join(ROOT, 'conllu_split')
EXTRACT_DIR = os.path.join(ROOT, 'extracted')
TEX_DIR     = os.path.join(ROOT, 'tex')
# ID of the text whose structures are extracted as an example, and section (split) it is taken from
EXTRACT_ID  = 1
EXTRACT_SEC = split
# Name of the text file that accompanies the .str files, and encoding passed to the scripts
TEXT_FILE   = '00-Text_postproc.txt'
ENCODING    = 'utf-8'
# Up to how many lines in a single file (to avoid reaching the 3M lines)
LINE_LIMIT  = 2000000

# sys.path.append(SCRIPTS_DIR)
if os.path.exists('/content/Mod-D2T/str/.ipynb_checkpoints'):
  ! rmdir '/content/Mod-D2T/str/.ipynb_checkpoints'
if os.path.exists('/content/Mod-D2T/str/train/.ipynb_checkpoints'):
  ! rmdir '/content/Mod-D2T/str/train/.ipynb_checkpoints'
if os.path.exists('/content/Mod-D2T/str/dev/.ipynb_checkpoints'):
  ! rmdir '/content/Mod-D2T/str/dev/.ipynb_checkpoints'
if os.path.exists('/content/Mod-D2T/str/test/.ipynb_checkpoints'):
  ! rmdir '/content/Mod-D2T/str/test/.ipynb_checkpoints'

In [None]:
#@title 2. apply post-processing to outputs
# Folder that holds the concatenated .str files of the selected split
str_out_subfolder = os.path.join(STR_DIR, split)
# Run the M-FleNS post-processing script on that folder for the selected language
! python {path_postProc} {language} {str_out_subfolder}

In [None]:
#@title 3. Split large files and check again number of texts
import codecs
import re

def split_large_str_files(LINE_LIMIT, ROOT_path, STR_DIR_path, STR_DIR_SPLIT, TEXT_FILE_name, encoding=None):
  """Split over-long .str files, and the matching text file, into several smaller files.

  Every sub-folder of STR_DIR_path (e.g. train, dev, test) is inspected. When the longest .str file
  of a sub-folder has more than LINE_LIMIT lines, all .str files of that sub-folder and its text file
  are cut into the same number of consecutive slices, so that slice x of every level still lines up
  with slice x of the texts. The slices are written to
  STR_DIR_SPLIT/str<x>/<sub-folder>/<level>_<x>.str (texts keep the name TEXT_FILE_name).

  Args:
    LINE_LIMIT: maximum number of lines aimed at for a single file (must be at least 1).
    ROOT_path: not used; kept so that existing calls keep working.
    STR_DIR_path: folder that contains one sub-folder of .str files per dataset section.
    STR_DIR_SPLIT: output folder; it is emptied once per call, and only if something has to be split.
    TEXT_FILE_name: name of the text file that sits next to the .str files (one text per line).
    encoding: encoding used to read and write all files. Defaults to the notebook-level ENCODING
      constant when it is defined, otherwise to 'utf-8'.

  Returns:
    The number of slices each file was split into; 1 means that nothing had to be split.
    If several sub-folders are processed, the largest number is returned.

  Raises:
    ValueError: if LINE_LIMIT is smaller than 1.
  """
  import shutil  # local import: the cell that defines this function does not import shutil

  if LINE_LIMIT < 1:
    raise ValueError('LINE_LIMIT must be at least 1, got ' + str(LINE_LIMIT))
  if encoding is None:
    encoding = globals().get('ENCODING', 'utf-8')
  # Text that closes each structure in a .str file
  str_separator = '\n\n\n}\n'
  # File whose number of structures is the reference for the consistency check
  reference_name = '01-PredArgNorm.str'

  max_num_files = 1
  split_dir_is_fresh = False
  for str_folder in sorted(os.listdir(STR_DIR_path)):
    folder_path = os.path.join(STR_DIR_path, str_folder)
    # Ignore stray files and hidden folders such as .ipynb_checkpoints
    if str_folder.startswith('.') or not os.path.isdir(folder_path):
      continue
    txt_file_path = os.path.join(folder_path, TEXT_FILE_name)
    # Files starting with '00-' (the text files) are not structure files
    str_files = sorted(
      name for name in os.listdir(folder_path)
      if not name.startswith('00-') and os.path.isfile(os.path.join(folder_path, name))
    )
    if not str_files:
      print('No structure files found in ' + folder_path + ', skipping.')
      continue

    # Look for the file that has the largest number of lines, and count the structures of each file
    dico_lengths_strs = {}
    max_lines = 0
    print('Reading files\n---------------')
    for str_file in str_files:
      print('  '+str_file)
      with codecs.open(os.path.join(folder_path, str_file), 'r', encoding=encoding) as f:
        read_file = f.read()
      max_lines = max(max_lines, read_file.count('\n') + 1)
      dico_lengths_strs[str_file] = len(read_file.split(str_separator))
    del read_file

    # Number of slices needed so that no file exceeds the line limit (rounded up)
    num_files = max(1, -(-max_lines // LINE_LIMIT))
    max_num_files = max(max_num_files, num_files)

    # Check that all files have the same number of structures
    if reference_name not in dico_lengths_strs:
      reference_name_used = str_files[0]
      print(f'{reference_name} not found in {folder_path}; using {reference_name_used} as reference.')
    else:
      reference_name_used = reference_name
    num_str = dico_lengths_strs[reference_name_used]
    for level_str, found_str in dico_lengths_strs.items():
      if found_str != num_str:
        print(f'!!!ERROR in number of structures in {level_str}. Expected {num_str}, found {found_str}')

    if num_files == 1:
      continue

    print(f'{max_lines} lines found in a file. Each file will be split in {num_files}.')
    # Start from an empty output folder, but only once per call, so that the slices written
    # for a previous sub-folder are not deleted again
    if not split_dir_is_fresh:
      if os.path.exists(STR_DIR_SPLIT):
        shutil.rmtree(STR_DIR_SPLIT)
      os.makedirs(STR_DIR_SPLIT)
      split_dir_is_fresh = True

    # All files get the same number of structures per slice because they share the same text file.
    # This is a simplistic way of defining the size; if all very long texts are at the beginning
    # of a file, the first slice may still be too long.
    num_strs_per_file = -(-num_str // num_files)
    print('\nSplitting files (Number of structures per file: '+str(num_strs_per_file)+')\n---------------')
    boundaries = [(num_strs_per_file * x, num_strs_per_file * (x + 1)) for x in range(num_files)]

    # Texts: read once, then write one slice per group
    with codecs.open(txt_file_path, 'r', encoding) as f_txt:
      texts = f_txt.readlines()
    for x, (boundary_down, boundary_up) in enumerate(boundaries):
      print(f'Group {x}: {boundary_down} (included) to {boundary_up} (excluded).')
      group_folder = os.path.join(STR_DIR_SPLIT, 'str'+str(x), str_folder)
      os.makedirs(group_folder)
      with codecs.open(os.path.join(group_folder, TEXT_FILE_name), 'w', encoding) as fo_t:
        fo_t.writelines(texts[boundary_down:boundary_up])
    del texts

    # Structures: read each file once, then write one slice per group
    for str_file in str_files:
      print('  '+str_file)
      with codecs.open(os.path.join(folder_path, str_file), 'r', encoding=encoding) as f:
        structures = f.read().split(str_separator)
      for x, (boundary_down, boundary_up) in enumerate(boundaries):
        out_name = str_file.split('.')[0]+'_'+str(x)+'.str'
        out_path = os.path.join(STR_DIR_SPLIT, 'str'+str(x), str_folder, out_name)
        with codecs.open(out_path, 'w', encoding) as fo_s:
          fo_s.write(str_separator.join(structures[boundary_down:boundary_up]))
          # Write end of file for intermediate files only (the last file has the closing brackets already)
          if x < num_files-1:
            fo_s.write(str_separator)
      del structures

  return max_num_files

# Split the .str files where needed; number_of_files is read again by the conversion cell below
number_of_files = split_large_str_files(LINE_LIMIT, ROOT, STR_DIR, STR_DIR_SPLIT, TEXT_FILE)

# 1 is the value the function returns when the line limit was not exceeded
if number_of_files == 1:
  print('No files need to be split.')

In [None]:
#@title 4. Convert .str files in STR_DIR to .conllu format and save to CONLLU_DIR
if number_of_files == 1:
  ! python3 {SCRIPTS_DIR}/convert.py -i {STR_DIR} -o {CONLLU_DIR} -t {TEXT_FILE} -e {ENCODING}
elif number_of_files > 1:
  for x in range(number_of_files):
    folder_split = os.path.join(STR_DIR_SPLIT, 'str'+str(x))
    ! python3 {SCRIPTS_DIR}/convert.py -i {folder_split} -o {CONLLU_DIR_SPLIT} -t {TEXT_FILE} -e {ENCODING}
  # Bring the split files back together
  if not os.path.exists(CONLLU_DIR):
    os.makedirs(CONLLU_DIR)
  # List the folders in CONLLU_DIR
  traindevtest_folders = [f for f in os.listdir(CONLLU_DIR_SPLIT) if os.path.isdir(os.path.join(CONLLU_DIR_SPLIT, f))]
  # List the files in each folder
  for traindevtest_folder in traindevtest_folders:
    if os.path.exists(os.path.join(CONLLU_DIR, traindevtest_folder)):
      ! rm -r {os.path.join(CONLLU_DIR, traindevtest_folder)}
    os.makedirs(os.path.join(CONLLU_DIR, traindevtest_folder))
    files_in_folder = [f for f in os.listdir(os.path.join(CONLLU_DIR_SPLIT, traindevtest_folder)) if os.path.isfile(os.path.join(CONLLU_DIR_SPLIT, traindevtest_folder, f))]
    # Get the first file for each layer
    for file_in_folder in files_in_folder:
      head, tail = os.path.split(file_in_folder)
      final_filename = tail.rsplit('_', 1)[0]
      # Create final file if extension is _0
      if tail.rsplit('_', 1)[1] == '0.conllu':
        with open(os.path.join(CONLLU_DIR, traindevtest_folder, final_filename+ '.conllu'), 'w') as outfile:
          # Write the lines of the split file in the output file
          with open(os.path.join(CONLLU_DIR_SPLIT, traindevtest_folder, file_in_folder), 'r') as infile:
            outfile.write(infile.read())
          # Get the number of files we need to look for, which is the total number of split files -1 because we already have the file with extension _0
          for y in range(number_of_files-1):
            with open(os.path.join(CONLLU_DIR_SPLIT, traindevtest_folder, final_filename+'_'+str(y+1)+'.conllu'), 'r') as infile:
              outfile.write(infile.read())

In [None]:
#@title 5. (optional) Prepare files for visualising chosen example in first cell of Part 3, get stats
# Extract the structures of the text whose ID is EXTRACT_ID, from the section given by EXTRACT_SEC, into EXTRACT_DIR.
! python3 {SCRIPTS_DIR}/extract.py -x {EXTRACT_ID} -i {CONLLU_DIR}/{EXTRACT_SEC} -o {EXTRACT_DIR} -e {ENCODING}
# Export the extracted structures as LaTeX tables into TEX_DIR.
! python3 {SCRIPTS_DIR}/export.py -i {EXTRACT_DIR} -o {TEX_DIR} -e {ENCODING}
# Compile statistics over the whole CONLLU_DIR; the output also goes to TEX_DIR.
! python {SCRIPTS_DIR}/stats.py -i {CONLLU_DIR} -o {TEX_DIR} -e {ENCODING}

# Part 4: Zip and download

In [None]:
#@title 6. Zip and download Mod-D2T data
from google.colab import files
zip_name_conllu = '/content/ModD2T_['+language+'].zip'
!zip -r {zip_name_conllu} /content/Mod-D2T/conllu

clear_output()

files.download(zip_name_conllu)

In [None]:
#@title 7. (optional) Zip and download tex files
from google.colab import files
zip_name_tex = '/content/ModD2T_['+language+']_stats-examplesTex.zip'
!zip -r {zip_name_tex} /content/Mod-D2T/tex

clear_output()

files.download(zip_name_tex)

In [None]:
#@title 8. (optional) Zip and download FORGE output folder
from google.colab import files
zip_name_inter = '/content/FORGe_['+language+']_['+split+']_allLevels.zip'
!zip -r {zip_name_inter} /content/FORGe/structures

clear_output()

files.download(zip_name_inter)

In [None]:
#@title 9. (optional) Zip and download FORGe debugging
from google.colab import files
zip_name_log = '/content/FORGe_['+language+']_log.zip'
!zip -r {zip_name_log} /content/FORGe/log

clear_output()

files.download(zip_name_log)