<a href="https://colab.research.google.com/github/mille-s/Mod-D2T/blob/main/ModD2T.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell to prepare the working folder and install Java 8
from IPython.display import clear_output
import os
import sys

# clone Mod-D2T repo
! git clone https://github.com/mille-s/Mod-D2T.git
# Delete locally to avoid confusion
! rm '/content/Mod-D2T/Mod-D2T.ipynb'

# clone M-FleNS repo (generation pipeline)
! git clone https://github.com/mille-s/M-FleNS_NLG-Pipeline.git
# Delete locally to avoid confusion
! rm '/content/M-FleNS_NLG-Pipeline/M_FleNS_pipe_v2.ipynb'

# Download FORGe (text generator)
# V2 was used for the INLG paper
! gdown 1K99nCrBX2RTVMhDcPEgF0usfJYnwtE-w
! unzip /content/FORGe_colab_v2.zip
! rm '/content/FORGe_colab_v2.zip'
# ! gdown 1lsh8pwUp9mc0Z_aFbSy1WTIpSx9YwFFD
# ! unzip /content/FORGe_colab_v3_WebNLG.zip
# ! rm '/content/FORGe_colab_v3_WebNLG.zip'

# Install parsimonious (used for parsing .str files)
! pip install parsimonious

# Clean
clear_output()
print('Working folder ready!\n--------------')

# Run to switch to Java 1.8 (needed for FORGe to run correctly)
def install_java():
  """Install OpenJDK 8 and select it as the `java` used by this runtime.

  Steps, in order: install the headless OpenJDK 8 package with apt-get
  (output discarded), point JAVA_HOME at the Java 8 installation for the
  current process, switch the `java` alternative to the Java 8 binary, and
  print the version reported by `java -version`.

  Takes no arguments and returns None.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  !java -version       #check java version
# The function is called right away so that running the cell performs the switch.
install_java()

# Part 1: Convert input triples to linguistic structures
Don't run all cells at once in this block! The last cell deletes the `.conll` input files from the generation input folder (`/content/FORGe/structures/00-PredArg/`).

In [None]:
# For the moment, you can download the outputs of the conversion and copy the desired inputs of the same split in the str_PredArg_folder (see below).
! gdown 1MQJnUWEUlELpd-mnWmsdTkJNpIPeIA48
! unzip /content/00-PredArg-train.zip
! rm '/content/00-PredArg-train.zip'

! gdown 1qOA17TYg__89euDjQliOPrywYwgecwBc
! unzip /content/00-PredArg-test.zip
! rm '/content/00-PredArg-test.zip'

! gdown 1vm5A5WRGmnjOPrq8GsNjF3JCRGqhxigp
! unzip /content/00-PredArg-dev.zip
! rm '/content/00-PredArg-dev.zip'

clear_output()

In [3]:
# Copy some PredArg structures in the input folder used for generation

import glob
import os
import shutil

# Folder with the converted structures of the split to generate from
# (keep exactly one of the three lines below uncommented)
predArg_conv_folder = '/content/00-PredArg-dev'
# predArg_conv_folder = '/content/00-PredArg-test'
# predArg_conv_folder = '/content/00-PredArg-train'

# Folder that receives the copies
predArg_input_folder = '/content/FORGe/structures/00-PredArg/'
os.makedirs(predArg_input_folder, exist_ok=True)

list_predArgPaths = glob.glob(os.path.join(predArg_conv_folder, '*.conll'))
if not list_predArgPaths:
  # An empty match usually means the archives were not downloaded or the
  # wrong folder is selected; say so instead of silently doing nothing.
  print(f'No .conll file found in {predArg_conv_folder}')
for predArgPath in list_predArgPaths:
  PAfilename = os.path.split(predArgPath)[-1]
  # Pure-Python copy: no shell is spawned per file, and file names with
  # spaces or shell metacharacters are copied correctly.
  shutil.copy(predArgPath, os.path.join(predArg_input_folder, PAfilename))

In [12]:
# Empty input folder to copy other inputs instead
# Only the .conll files directly inside the folder are deleted; the folder itself is kept.
# os.remove is used instead of a shell `rm` per file so that paths containing
# spaces or shell metacharacters are handled correctly.
list_predArgPathsCC = glob.glob(os.path.join('/content/FORGe/structures/00-PredArg/', '*.conll'))
for predArgPathCC in list_predArgPathsCC:
  os.remove(predArgPathCC)

# Part 2: Generate texts and intermediate representations

In [4]:
# 1. Run this cell to set parameters for generation

# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
# E.g. if a module PredArg_... or DSynt_... is selected, the input predicate-argument structures should be placed in the structures/00-PredArg folder
# I'll make the instructions and names clearer in a later (actually usable) version.
# The lines carrying a "#@param" annotation are Colab form fields: keep the annotation on the same line as the assignment.

############# Select language #############
# GA and ES not supported for this version of the pipeline (ES will break on many structures and for GA Morphology processing is missing)
language = 'EN' #@param['EN', 'ES', 'GA']

############# Select module grouping #############
# Group consecutive modules for the same system or call each module separately.
# Select 'no' to get all intermediate representations ('no' for Mod-D2T), 'yes' if you're only interested in the output.
group_modules_prm = 'no' #@param['yes', 'no']

############# Select dataset split #############
# The same value is reused later in the notebook (concatenation, post-processing, extraction).
split = "dev" #@param['dev', 'test','train','ukn']

#######################################################################

# Modules to run, with type of processing (FORGe, Model1, SimpleNLG, etc.).
# Only FORGe is supported for this prototype version.
# Every value is a plain string that is passed as a command-line argument to the generation script.
PredArg_Normalisation = 'FORGe'
# To have an external module assigning triples to aggregate
# (note: this is the string 'None', not the Python None object)
PredArg_AggregationMark = 'None'
PredArg_Aggregation = 'FORGe'
PredArg_PoSTagging = 'FORGe'
PredArg_CommStructuring = 'FORGe'
DSynt_Structuring = 'FORGe'
SSynt_Structuring = 'FORGe'
SSynt_Aggregation = 'FORGe'
RE_Generation = 'FORGe'
DMorph_AgreementsLinearisation = 'FORGe'
SMorph_Processing = 'FORGe'
#######################################################################
# Python scripts of the M-FleNS pipeline
path_MFleNS       = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS.py'
path_checkOutputs = '/content/M-FleNS_NLG-Pipeline/code/M-FleNS-checkOutputs.py'
path_concatenate  = '/content/M-FleNS_NLG-Pipeline/code/concatenate_files.py'
path_postProc     = '/content/M-FleNS_NLG-Pipeline/code/postProcess.py'

#######################################################################
# FORGe/MATE folders and property files
FORGe_input_folder            = '/content/FORGe/buddy_project/struct'
path_MATE                     = '/content/FORGe/buddy-patched.jar'
path_props_resources_template = '/content/FORGe/mateColabDrive.properties'
path_props_levels             = '/content/FORGe/mateLevels.properties'
path_props                    = '/content/FORGe/mate.properties'

# General folders: one numbered sub-folder of path_strs per level of representation.
# The input structure(s) of the correct type should be placed in the folder that corresponds to the first module called in the next cell
path_strs = '/content/FORGe/structures'

def _level_folder(level_name):
  """Return the path of the sub-folder of path_strs called level_name."""
  return os.path.join(path_strs, level_name)

str_PredArg_folder        = _level_folder('00-PredArg')
str_PredArgNorm_folder    = _level_folder('01-PredArgNorm')
str_PredArgAggMark_folder = _level_folder('02-PredArgAggMark')
str_PredArgAgg_folder     = _level_folder('03-PredArgAgg')
str_PredArgPoS_folder     = _level_folder('04-PredArgPoS')
str_PredArgComm_folder    = _level_folder('05-PredArgComm')
str_DSynt_folder          = _level_folder('06-DSynt')
str_SSynt_folder          = _level_folder('07-SSynt')
str_SSyntAgg_folder       = _level_folder('08-SSyntAgg')
str_REG_folder            = _level_folder('09-REG')
str_DMorphLin_folder      = _level_folder('10-DMorphLin')
str_SMorphText_folder     = _level_folder('11-SMorphText')
# Folder passed to the generation and output-checking scripts as their last argument
log_folder = '/content/FORGe/log'
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, so the cell can be re-run safely.
os.makedirs(log_folder, exist_ok=True)
# Folder passed to the concatenation script as destination of the concatenated files
clean_out_str_folder = '/content/Mod-D2T/str'
os.makedirs(clean_out_str_folder, exist_ok=True)

In [None]:
# 2. Launch generation process
# Requires the parameter cell above to have been run: every value is interpolated
# from a notebook variable and passed positionally, in this order:
#   language, split, grouping flag,
#   the 11 per-module settings (PredArg_Normalisation ... SMorph_Processing),
#   the 5 FORGe/MATE paths (input folder, jar, 3 property files),
#   the 12 structure folders (00-PredArg ... 11-SMorphText),
#   the log folder.
# The values are not quoted, so none of them may contain spaces.
! python {path_MFleNS} {language} {split} {group_modules_prm} {PredArg_Normalisation} {PredArg_AggregationMark} {PredArg_Aggregation} {PredArg_PoSTagging} {PredArg_CommStructuring} {DSynt_Structuring} {SSynt_Structuring} {SSynt_Aggregation} {RE_Generation} {DMorph_AgreementsLinearisation} {SMorph_Processing} {FORGe_input_folder} {path_MATE} {path_props_resources_template} {path_props_levels} {path_props} {str_PredArg_folder} {str_PredArgNorm_folder} {str_PredArgAggMark_folder} {str_PredArgAgg_folder} {str_PredArgPoS_folder} {str_PredArgComm_folder} {str_DSynt_folder} {str_SSynt_folder} {str_SSyntAgg_folder} {str_REG_folder} {str_DMorphLin_folder} {str_SMorphText_folder} {log_folder}

In [None]:
# 3. (optional) Check outputs
# The checking script receives the first-level folder (00-PredArg), the
# last-level folder (11-SMorphText) and the log folder.
# NOTE(review): presumably it compares what was given as input with what was generated — confirm in M-FleNS-checkOutputs.py.
! python {path_checkOutputs} {str_PredArg_folder} {str_SMorphText_folder} {log_folder}

In [7]:
# 4. Concatenate outputs of each level and copy to Mod-D2T folder
! python {path_concatenate} {str_PredArgNorm_folder} {clean_out_str_folder} {split}
# Not used for now ! python {path_concatenate} {str_PredArgAggMark_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgAgg_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgPoS_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_PredArgComm_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_DSynt_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SSynt_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SSyntAgg_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_REG_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_DMorphLin_folder} {clean_out_str_folder} {split}
! python {path_concatenate} {str_SMorphText_folder} {clean_out_str_folder} {split}
clear_output()

# Part 3: Clean and save the dataset, create stats and figures

In [18]:
# 1. Parameters
import shutil

ROOT        = '/content/Mod-D2T/'
SCRIPTS_DIR = os.path.join(ROOT, 'scripts')    # scripts run by the cells below
STR_DIR     = os.path.join(ROOT, 'str')        # .str files (input of the conversion)
CONLLU_DIR  = os.path.join(ROOT, 'conllu')     # .conllu files (output of the conversion)
EXTRACT_DIR = os.path.join(ROOT, 'extracted')  # structures extracted for one text
TEX_DIR     = os.path.join(ROOT, 'tex')        # LaTeX tables and statistics
EXTRACT_ID  = 1                                # ID of the text to extract
EXTRACT_SEC = split                            # section (split) the text is taken from
TEXT_FILE   = '00-Text_postproc.txt'
ENCODING    = 'utf-8'

# sys.path.append(SCRIPTS_DIR)
# Remove the notebook checkpoint folder from STR_DIR when there is one.
# rmtree is used because the folder may contain files, in which case `rmdir`
# fails and leaves the folder in place.
# NOTE(review): presumably the folder is removed so that the scripts reading
# STR_DIR do not treat it as a dataset section — confirm.
checkpoints_dir = os.path.join(STR_DIR, '.ipynb_checkpoints')
if os.path.isdir(checkpoints_dir):
  shutil.rmtree(checkpoints_dir)

In [None]:
# 2. Apply post-processing to outputs
# The post-processing script is run on the sub-folder of STR_DIR named after
# the selected split, for the selected language.
str_out_subfolder = os.path.join(STR_DIR, split)
! python {path_postProc} {language} {str_out_subfolder}

In [None]:
# 3. Convert .str files in STR_DIR to .conllu format and save to CONLLU_DIR
# Options passed to convert.py: -i input folder, -o output folder,
# -t TEXT_FILE, -e file encoding.
! python3 {SCRIPTS_DIR}/convert.py -i {STR_DIR} -o {CONLLU_DIR} -t {TEXT_FILE} -e {ENCODING}

In [None]:
# 4. (optional) Prepare files for visualising the example chosen in the first cell of Part 3 (EXTRACT_ID, EXTRACT_SEC), and get stats
# Extract structures for a text specified by its ID. They will be taken from the section specified by EXTRACT_SEC.
! python3 {SCRIPTS_DIR}/extract.py -x {EXTRACT_ID} -i {CONLLU_DIR}/{EXTRACT_SEC} -o {EXTRACT_DIR} -e {ENCODING}
# Export structures as LaTeX tables (reads EXTRACT_DIR, writes to TEX_DIR).
! python3 {SCRIPTS_DIR}/export.py -i {EXTRACT_DIR} -o {TEX_DIR} -e {ENCODING}
# Compile statistics (reads the whole CONLLU_DIR, writes to TEX_DIR).
! python {SCRIPTS_DIR}/stats.py -i {CONLLU_DIR} -o {TEX_DIR} -e {ENCODING}