<a href="https://colab.research.google.com/github/mille-s/Text_Structuring/blob/main/Parsers_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stanza: retrain parser on UD data

In [None]:
#@title Prepare and setup repo
from IPython.display import clear_output
import os

#https://stanfordnlp.github.io/stanza/retrain_ud.html
! git clone 'https://github.com/stanfordnlp/stanza.git'
udbase = '/content/Universal_Dependencies/git'
ubdase_EN_EWT = os.path.join(udbase, 'UD_English-EWT')
! git clone 'https://github.com/UniversalDependencies/UD_English-EWT.git' {ubdase_EN_EWT}

# Set environment variables
os.environ['PYTHONPATH'] = '/content'
os.environ['STANZA_RESOURCES_DIR'] = '/content/stanza_resources'
os.environ['TOKENIZE_DATA_DIR'] = '/content/data/tokenize'
os.environ['DEPPARSE_DATA_DIR'] = '/content/data/depparse'
os.environ['UDBASE'] = udbase

# Needed to run some of the code
! pip install emoji

# Copy files in stanza/stanza to folder above (stanza) otherwise the code doesn't find them
! cp '/content/stanza/stanza/_version.py' '/content/stanza/_version.py'
! cp '/content/stanza/stanza/__init__.py' '/content/stanza/__init__.py'

stanza_folder =  '/content/stanza'
! cp -r '/content/stanza/stanza/models' {stanza_folder}
! cp -r '/content/stanza/stanza/pipeline' {stanza_folder}
! cp -r '/content/stanza/stanza/protobuf' {stanza_folder}
! cp -r '/content/stanza/stanza/resources' {stanza_folder}
! cp -r '/content/stanza/stanza/server' {stanza_folder}
! cp -r '/content/stanza/stanza/tests' {stanza_folder}
! cp -r '/content/stanza/stanza/utils' {stanza_folder}

clear_output()
print('Done!')

## Train tokenizer (not needed for parser)

In [None]:
#@title Read the dataset from UDBASE and write it to TOKENIZE_DATA_DIR
# The last element (UD_English-EWT) is searched in the udbase subfolder of the same name
! python -m stanza.utils.datasets.prepare_tokenizer_treebank UD_English-EWT

In [None]:
#@title Train the tokenizer model
# This puts the result in saved_models/tokenize/en_ewt_tokenizer.pt
! python -m stanza.utils.training.run_tokenizer UD_English-EWT

# Note: Every 200 epochs, the model is saved if better than the previous one; the training seems to continue even though the Dev doesn't improve
# 400 epochs:   [~5 min CPU] 89.486 Dev;  [~7 sec GPU] 88.603 Dev
# 1,000 epochs: [~10 min CPU] 92.681 Dev; [~18 sec GPU] 92.701 Dev
# 1,800 epochs: [~20 min CPU] 93.596 Dev; [~30 sec GPU] 93.539 Dev
# 2,800 epochs: [~30 min CPU] 93.934 Dev; [~50 sec GPU] 93.709 Dev
# 5,000 epochs:                           [~90 sec GPU] 94.420 Dev
# 9,800 epochs:                           [~200 sec GPU] 94.474 Dev
# 14200 epochs:                           [~5 min GPU] 94.568 Dev
# 2024-03-28 14:52:32 INFO: Dev score: 94.390	Stopping training after 5200 steps with no improvement
# 2024-03-28 14:52:32 INFO: Best dev score=0.9479763240178896 at step 15400

# If you already have a model saved, the training script will not overwrite that model. You can make that happen with --force:
# ! python -m stanza.utils.training.run_tokenizer UD_English-EWT --force

# If you want a different save name:
# ! python -m stanza.utils.training.run_tokenizer UD_English-EWT --save_name en_ewt_variant_tokenizer.pt

In [None]:
#@title Evaluate tokenizer model
! python -m stanza.utils.training.run_tokenizer UD_English-EWT --score_dev
! python -m stanza.utils.training.run_tokenizer UD_English-EWT --score_test

## Train parser (GPU needed)

In [None]:
#@title Copy my own files
import shutil
import os

path_textStruct = '/content/drive/MyDrive/M-FleNS/Papers-Slides/M-FleNS_papers/2024-05_Fluency_Improvements/Parsing4/'
path_fake_UD = os.path.join(udbase, 'UD_English-EWT2')
if not os.path.exists(path_fake_UD):
  os.makedirs(path_fake_UD)

data_code = '2AOe1p1d2r2c3'

def copyFiles(path_in, data_code, path_out, split):
  """Copy one TextStruct split into `path_out` under the UD English-EWT file name.

  The source file is `<path_in>/<data_code>/<split>-TextStruct_<data_code>.conllu`;
  it is copied to `<path_out>/en_ewt-ud-<split>.conllu`.
  """
  source_name = split+'-TextStruct_'+data_code+'.conllu'
  target_name = 'en_ewt-ud-'+split+'.conllu'
  source_path = os.path.join(path_in, data_code, source_name)
  target_path = os.path.join(path_out, target_name)
  shutil.copy(source_path, target_path)

copyFiles(path_textStruct, data_code, path_fake_UD, 'train')
copyFiles(path_textStruct, data_code, path_fake_UD, 'dev')
copyFiles(path_textStruct, data_code, path_fake_UD, 'test')

In [None]:
#@title Read the dataset from UDBASE and write it to DEPPARSE_DATA_DIR (RUN CELL TWICE!)

# Choose whether the parser training data carries predicted or gold PoS tags
pos_tags = 'gold'#@param['predicted', 'gold']

if pos_tags == 'predicted':
  # The first time the cell is run, it will download files but then won't find them; the second time running the cell will copy the files where expected and it will run
  # The code looks for the following file for an existing pos tagger
  ! cp '/content/stanza_resources/en/pos/ewt_charlm.pt' '/content/stanza_resources/en/pos/ewt.pt'
  # To apply PoS tagger to get predicted PoS for training
  ! python -m stanza.utils.datasets.prepare_depparse_treebank UD_English-EWT
elif pos_tags == 'gold':
  # To use the gold PoS tags, use the --gold flag
  # ! python -m stanza.utils.datasets.prepare_depparse_treebank UD_English-EWT --gold
  # NOTE(review): `path_fake_UD` is not wrapped in {} (or prefixed with $), so IPython passes the
  # literal word "path_fake_UD" to the script instead of the value of the Python variable.
  # Presumably the name of the custom treebank folder (UD_English-EWT2) was intended - TODO confirm,
  # and check that the training/evaluation cells then use the same treebank name.
  ! python -m stanza.utils.datasets.prepare_depparse_treebank path_fake_UD --gold

# Use this flag to use a different pretrained embeddings file: --wordvec_pretrain_file

In [None]:
#@title Train the parser
# This puts the result in saved_models/depparse/en_ewt_charlm_parser.pt
! python -m stanza.utils.training.run_depparse UD_English-EWT

# 100 epochs: [~70 sec GPU] 24.90 LAS Dev
# 200 epochs: [~150 sec GPU] 64.89 LAS Dev
# 400 epochs: [~5 min GPU] 78.03 LAS Dev
# 600 epochs: [~8 min GPU] 82.92 LAS Dev
# 800 epochs: [~10 min GPU] 84.79 LAS Dev
# 1000 epochs: [~13 min GPU] 86.06 LAS Dev
# 1200 epochs: [~16 min GPU] 86.64 LAS Dev
# 1400 epochs: [~19 min GPU] 86.87 LAS Dev

In [None]:
 #@title Evaluate parser
! python -m stanza.utils.training.run_depparse UD_English-EWT --score_dev
! python -m stanza.utils.training.run_depparse UD_English-EWT --score_test

# Maltparser

In [None]:
#@title Maven (just for testing, not needed)
from IPython.display import clear_output

# !apt-get install maven
# !apt-get install openjdk-8-jdk
# !apt-get install openjfx
# # ! mvn --version

# # Download POM for MaltParser
# ! gdown 1GquY2cUDUPLMGUu6GQAO2czkBbrmlztk

# clear_output()

In [None]:
#@title Prepare repo
from IPython.display import clear_output
import os

# def install_java():
#   !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
#   os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
#   !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
#   !java -version       #check java version
# install_java()

# Download and unzip MaltParser code from my gmail drive My Drive/M-FleNS/Parsing
# Local files in C:\Users\sfmil\OneDrive\Desktop\DCU\Papers\2024-05_Fluency-improvements\TextPlanning\parsing
# The version of Maltparser I use has paths changed in the options.xml file, so as to find the local Colab feats files for liblinear/conllu
! gdown 14iRdWjzmzl5QJkEaNE8mukpJxaa3i614
! unzip /content/maltparser-1.9.2.zip

# Download and unzip MaltEval code
! gdown 11Wfoel-N9A3uqD1gsR0wI5UuRIYxRDNw
! unzip /content/MaltEval-dist.zip

# Download and unzip MaltOptimizer
! gdown 1F3WHHAvO35FMSROaWFf38cSUWHYcL-Z6
! unzip /content/MaltOptimizer-1.0.3.zip

# Clone UD EN-EWT repo for testing
udbase = '/content/Universal_Dependencies/git'
ubdase_EN_EWT = os.path.join(udbase, 'UD_English-EWT')
! git clone 'https://github.com/UniversalDependencies/UD_English-EWT.git' {ubdase_EN_EWT}

# Clone repo for conversion of parser output to format needed for evaluation
! git clone 'https://github.com/mille-s/Text_Structuring.git'

clear_output()

path_maltParser = '/content/maltparser-1.9.2/maltparser-1.9.2.jar'
path_maltEval = '/content/dist-20141005/lib/MaltEval.jar'
path_TS = '/content/Text_Structuring'

! java -jar {path_maltParser}
! java -jar {path_maltEval}

In [None]:
#@title Mount drive and set paths to data repos
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

mflens_local_data_folder = '/content/Text_Structuring/data'
if not os.path.exists(mflens_local_data_folder):
  os.makedirs(mflens_local_data_folder)

mflens_data_folder = '/content/drive/MyDrive/M-FleNS/Papers-Slides/M-FleNS_papers/2024-05_Fluency_Improvements/Parsing4'
gold_structuring_folder = '/content/drive/MyDrive/M-FleNS/Papers-Slides/M-FleNS_papers/2024-05_Fluency_Improvements/Parsing4/0_structuring_gold_input'

## Train and run parser on test/dev data

In [None]:
#@title Train parser and/or set parameters for eval (for eval only, uncheck both train_... boxes)
# https://www.maltparser.org/userguide.html
# https://www.maltparser.org/options.html
import shutil
import codecs

# model_name = 'ud-en-ewt_parser'
# training_data = '/content/Universal_Dependencies/git/UD_English-EWT/en_ewt-ud-train.conllu'

learner = 'liblinear'#@param['liblinear', 'libsvm']
parsing_algorithm = 'stackeager'#@param['nivreeager', 'stackproj', 'covnonproj', 'stackeager', 'stacklazy', 'planar']
features = 'allFeats3_Mrg1'#@param['test', 'default', 'libsvmGuide', 'libsvmGuide_noPos', 'libsvmGuide_noFeats', 'MaltOptDefault_2AOe1p1d1r1c3', 'MaltOptDefault_2AOe1p1d2r2c3', 'MaltOptDefault_2AOe1d2r2c3', 'allFeats3', 'allFeats3_noPos', 'allFeats3_noPos_noLem', 'allFeats3_noFeats', 'allFeats3_Mrg1', 'allFeats3_Mrg1_noFeats', 'allFeats3_Mrg2', 'UposDeprel', 'FormDeprel', 'FormUposDeprel', 'FormUposDeprel3', 'FormUposDeprel5', 'FormUposDeprel7']
mflens_data_fileID_train = '2AOe1p1d3r3c3'#@param['2AO', '2AOc1', '2AOc2', '2AOc3', '2AOe1p1', '2AOe1c3', '2AOe1p1c3', '2AOe1p1d1r1c3', '2AOe1p1d2r2', '2AOe1p1d2r2c3', '2AOe1p1d3r3c3', '2TO', '2TOc1', '2TOc2', '2TOc3', '2TOe1p1', '2TOe1c3', '2TOe1p1d1r1c3', '2TOe1p1d2r2c3', '2UO', '3AO', '3AOe1p1d1r1c1', '3TO', '3UO']
# For evaluating text structuring only
train_parser_scrambled_data = True #@param{type:"boolean"}
train_parser_lin_data = True #@param{type:"boolean"}

parameters_concat = learner+'-'+parsing_algorithm+'_'+features

def train_parser(path_maltParser, parsing_algorithm, learner, mflens_data_fileID_train, use_lin_data):
  model_name = 'textStruct_parser_'+mflens_data_fileID_train
  training_data = ''
  if use_lin_data == True:
    training_data = os.path.join(mflens_data_folder, mflens_data_fileID_train+'-lin', 'train-TextStruct_'+mflens_data_fileID_train+'-lin.conllu')
    model_name = model_name+'-lin'
  else:
    training_data = os.path.join(mflens_data_folder, mflens_data_fileID_train, 'train-TextStruct_'+mflens_data_fileID_train+'.conllu')
  # For covnonproj, we need to allow for Shift transition
  if parsing_algorithm == 'covnonproj':
    ! java -jar {path_maltParser} -c {model_name} -i {training_data} -if conllu -l {learner} -a {parsing_algorithm} -m learn -F {parsing_algorithm} -cs true
  else:
    ! java -jar {path_maltParser} -c {model_name} -i {training_data} -if conllu -l {learner} -a {parsing_algorithm} -m learn -F {parsing_algorithm}
  return model_name

# Copy feats file to liblinear/conllu (this path is hardcoded, use it even if training libsvm)
# Two names are needed per algorithm:
# - feature_xml_filename_toCopy: prefix of the static file that contains the features
# - feature_xml_filename_malt: the file name needed by MaltParser
if parsing_algorithm in ('stackeager', 'stacklazy'):
  # Both stack algorithms are always copied to StackSwap.xml; only the MaltOptDefault
  # feature sets come from algorithm-specific source files
  feature_xml_filename_malt = 'StackSwap'
  if not features.startswith('MaltOptDefault'):
    feature_xml_filename_toCopy = 'StackSwap'
  elif parsing_algorithm == 'stackeager':
    feature_xml_filename_toCopy = 'StackEager'
  else:
    feature_xml_filename_toCopy = 'StackLazy'
else:
  # The other algorithms use the same name on both sides; an unknown algorithm gives ''
  feature_xml_filename_toCopy = {
    'nivreeager': 'NivreEager',
    'stackproj': 'StackProjective',
    'covnonproj': 'CovingtonNonProjective',
    'planar': 'PlanarEager',
  }.get(parsing_algorithm, '')
  feature_xml_filename_malt = feature_xml_filename_toCopy

# Get the feature file needed to train the parser
path_feature_xml_toCopy_filename = os.path.join(path_TS, 'data', 'malt_feats', f'{feature_xml_filename_toCopy}-{features}.xml')
# Copy the file to where MaltParser looks for it; reminder: I hardcoded the path to the features file and we always use the one in liblinear
feature_xml_path = os.path.join('/content/maltparser-1.9.2/appdata/features', 'liblinear', 'conllu', f'{feature_xml_filename_malt}.xml')
# Remove default file and copy new file
if os.path.exists(feature_xml_path):
  os.remove(feature_xml_path)
shutil.copy(path_feature_xml_toCopy_filename, feature_xml_path)
print(f'File:\n  {feature_xml_path}\ncopied from:\n  {path_feature_xml_toCopy_filename}')

model_name = ''
if train_parser_scrambled_data == True:
  model_name = train_parser(path_maltParser, parsing_algorithm, learner, mflens_data_fileID_train, False)

if train_parser_lin_data == True:
  model_name = train_parser(path_maltParser, parsing_algorithm, learner, mflens_data_fileID_train, True)

#Print configuration file
! java -jar '/content/maltparser-1.9.2/maltparser-1.9.2.jar' -c {model_name} -m info > log.txt

# clear_output()


In [None]:
#@title Save models to drive (before changing parser parameters)
import glob
import shutil

# Custom path
path_save_models_drive = os.path.join(mflens_data_folder, 'MaltModels', parameters_concat)
if not os.path.exists(path_save_models_drive):
  os.makedirs(path_save_models_drive)

# Save feature configuration file in the same folder
print(feature_xml_path)
shutil.copy(feature_xml_path, path_save_models_drive)

paths_models = glob.glob(os.path.join('/content', '*.mco'))
for path_model in paths_models:
  # Save model
  print(path_model)
  shutil.copy(path_model, path_save_models_drive)
  # Delete model so it isn't copied by mistake somewhere else
  os.remove(path_model)

In [None]:
#@title Parameters for evaluation (before evaluating, SAVE PARAMS!)

# Need to run the evaluation 3 times to get all the numbers:
# - parser + corresponding test file NO LIN: for LAS same
# - parser + 2TO test file NO LIN: for LAS 2T1.4K and ordering (+structuring) evals
# - parser + 2TO test file  LIN: for LAS 2T2.7K and structuring only eval

parser_to_use = '2AOe1p1d3r3c3'#@param['2AO', '2AOc1', '2AOc2', '2AOc3', '2AOe1p1', '2AOe1c3', '2AOe1p1c3', '2AOe1p1d1r1c3', '2AOe1p1d2r2', '2AOe1p1d2r2c3', '2AOe1p1d3r3c3', '2TO', '2TOc1', '2TOc2', '2TOc3', '2TOe1p1', '2TOe1c3', '2TOe1p1d1r1c3', '2UO', '3AO', '3TO', '3AOe1p1d1r1c1', '3UO']
mflens_data_fileID_eval = '2TOe1p1d3r3c3'#@param['2AO', '2AOc1', '2AOc2', '2AOc3', '2AOe1p1', '2AOe1c3', '2AOe1p1c3', '2AOe1p1d1r1c3', '2AOe1p1d2r2', '2AOe1p1d2r2c3', '2AOe1p1d3r3c3', '2TO', '2TOc1', '2TOc2', '2TOc3', '2TOe1p1', '2TOe1c3', '2TOe1p1c3', '2TOe1p1d1r1c3', '2TOe1p1d2r2', '2TOe1p1d2r2c3', '2TOe1p1d3r3c3', '2UO', '3AO', '3AOe1p1d1r1c1', '3TO', '3UO']
data_split_eval = 'test'#@param['dev', 'test']
use_parser_lin_data = True #@param{type:"boolean"}
eval_on_lin_file = True #@param{type:"boolean"}

## Evaluate and convert output format

In [None]:
#@title Run parser on dev/test data
import sys

if use_parser_lin_data == True:
  if not parser_to_use.endswith('-lin'):
    parser_to_use = parser_to_use+'-lin'

model_name_run = 'textStruct_parser_'+parser_to_use
# From what I see the parser need to be on the root for MaltEval to be able to use it, so I coy it from the repo it is saved in
path_saved_model = os.path.join(mflens_data_folder, 'MaltModels', learner+'-'+parsing_algorithm+'_'+features, model_name_run+'.mco')
dest_path_model = os.path.join('/content', model_name_run+'.mco')
shutil.copy(path_saved_model, dest_path_model)
print(f'Copied model from {path_saved_model}')

eval_data = ''
# eval_data = '/content/Universal_Dependencies/git/UD_English-EWT/en_ewt-ud-test.conllu'

folder_Malt_drive = os.path.join(mflens_data_folder, parser_to_use, 'Malt_'+parameters_concat)
if not os.path.exists(folder_Malt_drive):
  os.makedirs(folder_Malt_drive)

pred_file = ''
if eval_on_lin_file == True:
  if not mflens_data_fileID_eval.startswith('2TO'):
    sys.exit('You should choose a file that has the prefix 2T0 for this evaluation.')
  eval_data = os.path.join(gold_structuring_folder, 'conll_gold_struct_'+mflens_data_fileID_eval+'-'+data_split_eval+'.conllu')
  # Save file in folder of parser to use, but the with name of the 2T file
  pred_file = os.path.join(folder_Malt_drive, data_split_eval+'-out_'+mflens_data_fileID_eval+'-lin_fromParser-'+parser_to_use+'.conll')
else:
  eval_data = os.path.join(mflens_data_folder, mflens_data_fileID_eval, data_split_eval+'-TextStruct_'+mflens_data_fileID_eval+'.conllu')
  if parser_to_use == mflens_data_fileID_eval:
    pred_file = os.path.join(mflens_data_folder, mflens_data_fileID_eval, 'Malt_'+parameters_concat, data_split_eval+'-out_'+mflens_data_fileID_eval+'_fromParser-'+parser_to_use+'.conll')
  else:
    pred_file = os.path.join(folder_Malt_drive, data_split_eval+'-out_'+mflens_data_fileID_eval+'_fromParser-'+parser_to_use+'.conll')

! java -jar {path_maltParser} -c {model_name_run} -i {eval_data} -o {pred_file} -m parse -nt true

# clear_output()

In [None]:
#@title Evaluate parser (MaltEval)
import os
import codecs
import re

def convertToCoNLL(filepath):
  """Write a MaltEval-ready copy of a CoNLL-U file and return the path of the copy.

  The copy is written next to the input as `<filepath without extension>_maltEval.conll`.
  Only token lines (an integer ID followed by a tab) and blank lines are kept: metadata
  lines and multiword-token lines (IDs such as 22-23) are dropped.
  On token lines with at least ten columns, columns 9-10 are replaced by a copy of
  columns 7-8; the substitution leaves a tab after the tenth column.
  """
  filepath_noExt = filepath.rsplit('.', 1)[0]
  out_file_path = filepath_noExt+'_maltEval.conll'
  # Compile once instead of on every line; raw strings avoid invalid escapes such as '\g'
  token_line = re.compile(r'^[0-9]+\t')
  column = r'[^\t\n]+\t'
  columns_7_to_10 = re.compile('^(' + column*6 + ')(' + column*2 + ')' + r'[^\t\n]+\t[^\t\n]+')
  # Read the whole input first so its handle is closed before the output is written
  with codecs.open(filepath, 'r', 'utf-8') as fi:
    file_lines = fi.readlines()
  with codecs.open(out_file_path, 'w', 'utf-8') as fo:
    for line in file_lines:
      # Only write lines that are not metadata or multiple words (starting with 22-23)
      if token_line.search(line):
        # Copy columns 7-8 in 9-10 for MaltEval
        fo.write(columns_7_to_10.sub(r'\g<1>\g<2>\g<2>', line))
      elif line.strip() == '':
        fo.write('\n')
  return out_file_path

# Convert files to pure conll
pred_file_maltEval = convertToCoNLL(pred_file)
gold_file_maltEval = convertToCoNLL(eval_data)

! java -jar {path_maltEval} -s {pred_file_maltEval} -g {gold_file_maltEval}

# Running default everything: ! java -jar {path_maltParser} -c {model_name} -i {training_data} -m learn, ! java -jar {path_maltParser} -c {model_name} -i {test_data} -o {pred_file} -m parse, ! java -jar {path_maltEval} -s {pred_file_maltEval} -g {gold_file_maltEval}
# ====================================================
# GroupBy-> Token
# Metric-> LAS
# ====================================================
# accuracy /    Token
# -----------------------
# 0.827         Row mean
# 25094         Row count
# -----------------------

In [None]:
print(pred_file)

In [None]:
#@title Convert CoNLLs to Thiago's format for evaluation and copy to drive
path_save_out = os.path.join('/content/drive/MyDrive/M-FleNS/Papers-Slides/M-FleNS_papers/2024-05_Fluency_Improvements/Parsing4/Files4CastroEval', parameters_concat)
folder_save_out = parser_to_use.split('-', 1)[0]
path_converter = '/content/Text_Structuring/code/conll2castro.py'
pathConllFile = pred_file
print_debug = False #@param {type:"boolean"}

if not os.path.exists(path_save_out):
  os.makedirs(path_save_out)

path_out_converted_files = os.path.join(path_save_out, folder_save_out)
if not os.path.exists(path_out_converted_files):
  os.makedirs(path_out_converted_files)

if mflens_data_fileID_eval.startswith('2TO'):
  pathOutFileOrdering = os.path.join(path_out_converted_files, 'ordering_'+parser_to_use+'-'+data_split_eval+'.out.postprocessed')
  pathOutFileStructuring = os.path.join(path_out_converted_files, 'structuring_'+parser_to_use+'-'+data_split_eval+'.out.postprocessed')
  ! python {path_converter} {pathConllFile} {print_debug} {pathOutFileOrdering} {pathOutFileStructuring}
  # If we use linearised files, the ordering output is not relevant
  if use_parser_lin_data == True and eval_on_lin_file == True:
    ! rm {pathOutFileOrdering}

In [None]:
#@title Erase models
import glob
paths_models_post = glob.glob(os.path.join('/content', '*.mco'))
for path_model_post in paths_models_post:
  # Delete model so it isn't copied by mistake somewhere else
  os.remove(path_model_post)

## Compare order properties converted file

In [None]:
#@title Compare order properties converted file
# I want to see if the structuring parsers always output the same order of properties as in the input
import json
import os
import codecs
import re

learner2 = 'libsvm'#@param['liblinear', 'libsvm']
parsing_algorithm2 = 'stackeager'#@param['nivreeager', 'stackproj', 'covnonproj', 'stackeager', 'stacklazy', 'planar']
features2 = 'MaltOptDefault_2AOe1p1d1r1c3'#@param['test', 'default', 'libsvmGuide', 'libsvmGuide_noPos', 'libsvmGuide_noFeats', 'MaltOptDefault_2AOe1p1d1r1c3', 'MaltOptDefault_2AOe1p1d2r2c3', 'MaltOptDefault_2AOe1d2r2c3', 'allFeats3', 'allFeats3_noPos', 'allFeats3_noFeats', 'allFeats3_Mrg1', 'allFeats3_Mrg2', 'UposDeprel', 'FormDeprel', 'FormUposDeprel', 'FormUposDeprel3', 'FormUposDeprel5', 'FormUposDeprel7']
mflens_data_fileID_eval2 = '2AOe1p1d1r1c3'#@param['2AO', '2AOe1p1c3', '2AOe1p1d1r1c3', '2AOe1p1d2r2', '2AOe1p1d2r2c3', '2AOe1p1d3r3c3', '2TO', '2TOe1p1d1r1c3', '2TOe1p1d2r2c3', '2UO', '3AO', '3AOe1p1d1r1c1', '3TO', '3UO']
data_split_eval2 = 'test'#@param['dev', 'test']

# Load the gold structuring (JSON) and the predicted structuring (one line per input)
path_gold_struct_json = os.path.join('/content/drive/MyDrive/M-FleNS/Papers-Slides/M-FleNS_papers/2024-05_Fluency_Improvements/Thiago-files', 'structuring_gold-'+data_split_eval2+'.json')
with open(path_gold_struct_json, encoding='utf-8') as gold_json_handle:
  file_gold_struct_json = json.load(gold_json_handle)
path_pred_struct = os.path.join(mflens_data_folder, 'Files4CastroEval', learner2+'-'+parsing_algorithm2+'_'+features2, mflens_data_fileID_eval2, 'structuring_'+mflens_data_fileID_eval2+'-lin-'+data_split_eval2+'.out.postprocessed')
with codecs.open(path_pred_struct, 'r', 'utf-8') as pred_handle:
  file_pred_struct = pred_handle.readlines()

# Every gold input needs a predicted line; fail with a clear message instead of an IndexError mid-loop
if len(file_pred_struct) < len(file_gold_struct_json):
  raise ValueError(f'Prediction file has {len(file_pred_struct)} lines but the gold file has {len(file_gold_struct_json)} inputs: {path_pred_struct}')

count_identical_order = 0
for x, gold_entry in enumerate(file_gold_struct_json):
  # Get the order from the first target (all targets have the same order)
  # First remove empty <SNT> tags (<SNT> </SNT>) in the reference, then remove the internal <SNT> tags on both reference and prediction
  # Eventually, we are left with the initial and final tags and, in between them, the list of properties
  gold_struct = ' '.join(gold_entry['targets'][0]['output']).replace(' <SNT> </SNT>', '').replace(' </SNT> <SNT>', '').strip()
  pred_struct = file_pred_struct[x].replace(' </SNT> <SNT>', '').strip()
  # If nothing is printed, it means that the properties in the pred file are always in the same order as the properties in the reference file
  if gold_struct == pred_struct:
    count_identical_order += 1
  else:
    print(f'Mismatch in input #{x}:\n  {gold_struct}\n  {pred_struct}')

if count_identical_order == len(file_gold_struct_json):
  print('Properties are always in the same order in the predicted file and the gold file.')

## Check data !!! RESTART RUNTIME AFTER USING, I'M CHANGING THE ROOT DIR (not sure why)

In [None]:
#@title Check data
import glob
import codecs
import re

# The folders are listed through absolute paths rather than after an os.chdir(mflens_data_folder):
# the chdir was only needed because os.listdir(".") and os.path.isdir(name) work on relative names,
# and it changed the working directory for the rest of the session.

# Get names of folders that have training data (don't select -lin folders at this point, we'll check if they exist with this code)
list_dataTrain_folders = [name for name in os.listdir(mflens_data_folder) if os.path.isdir(os.path.join(mflens_data_folder, name)) and name.startswith(('2A', '3A')) and not name.endswith('-lin')]

def createRegexCheck(stringWithFeats):
  """Build the regex that each token line must match, from the codes found in a folder or file name.

  Codes looked for in `stringWithFeats`:
    e1-e9: column 3 must be filled (otherwise it must be '_')
    p1-p9: column 4 must be filled (otherwise it must be '_')
    d1-d9: a dom_class feature is expected
    r1-r9: a ran_class feature is expected
    c1 / c2 / c3: dom_ID / ran_ID / both are expected
  Column 5 and the last two columns must always be '_'.
  Returns the regex as a string, with literal tab characters between the columns.
  """
  filled = '[^_][^\t]*'
  # A feature value runs up to the next '|' or tab ('\\|' spells the escaped pipe explicitly)
  feat_value = '=[^\t\\|]+'

  # Feats, in the order they are expected in the column
  feat_names = []
  if re.search('d[1-9]', stringWithFeats):
    feat_names.append('dom_class')
  if re.search('r[1-9]', stringWithFeats):
    feat_names.append('ran_class')
  if re.search('c1', stringWithFeats):
    feat_names.append('dom_ID')
  elif re.search('c2', stringWithFeats):
    feat_names.append('ran_ID')
  elif re.search('c3', stringWithFeats):
    feat_names.extend(['dom_ID', 'ran_ID'])

  columns = [
    '^[ 0-9]+',                                                     # 1: token number
    '[^\t]+',                                                       # 2: anything but empty
    filled if re.search('e[1-9]', stringWithFeats) else '_',        # 3
    filled if re.search('p[1-9]', stringWithFeats) else '_',        # 4
    '_',                                                            # 5
    '\\|'.join(name+feat_value for name in feat_names) if feat_names else '_',  # 6: feats
    '[0-9]+',                                                       # 7: head
    '[^\t]+',                                                       # 8: dependency
    '_',                                                            # 9
    '_$',                                                           # 10
  ]
  return '\t'.join(columns)

def checkConllu(pathFolderToCheck, missing_contents):
  """Check every line of the given .conllu files against the pattern implied by each file name.

  pathFolderToCheck: iterable of file paths (despite the name, not a folder path).
  missing_contents: dict updated in place; each file path is mapped to a list holding, for
    every non-empty line that fails the check, the index of the structure (block of lines
    separated by an empty line) it belongs to. A structure with several failing lines is
    therefore listed several times.

  Returns the same `missing_contents` dict.
  """
  for filepath_conllu in pathFolderToCheck:
    tail = os.path.basename(filepath_conllu)
    print(f'  {tail}')
    missing_contents[filepath_conllu] = []
    with codecs.open(filepath_conllu, 'r', 'utf-8') as fi:
      conll_structures = fi.read().split('\n\n')
    # Build regex to check each line of the file (compiled once per file, not once per line)
    line_checker = re.compile(createRegexCheck(tail))
    for i, conll_structure in enumerate(conll_structures):
      for line_conll in conll_structure.split('\n'):
        if line_conll != '' and not line_checker.search(line_conll):
          missing_contents[filepath_conllu].append(i)
  return missing_contents

missing_folders = []
missing_contents_base = {}
missing_contents_2T = {}
missing_contents_lin = {}
missing_contents_2Tlin = {}
missing_contents_GoldStruct = {}

for dataTrain_folder in sorted(list_dataTrain_folders):
  # Only the data code at the start of the folder name is switched from A to T (2A... -> 2T...);
  # substituting on the full path would also rewrite any 'A' in the parent folders
  dataTest_folder = dataTrain_folder.replace('A', 'T', 1)
  dataTrain_folder_fullPath = os.path.join(mflens_data_folder, dataTrain_folder)
  dataTrain_folder_fullPath_2T = os.path.join(mflens_data_folder, dataTest_folder)
  dataTrain_folder_fullPath_lin = dataTrain_folder_fullPath+'-lin'
  dataTrain_folder_fullPath_2Tlin = dataTrain_folder_fullPath_2T+'-lin'
  print(dataTrain_folder_fullPath)
  # Each training folder should come with its 2T, -lin and 2T-lin counterparts
  for required_folder in (dataTrain_folder_fullPath_2T, dataTrain_folder_fullPath_lin, dataTrain_folder_fullPath_2Tlin):
    if not os.path.exists(required_folder):
      missing_folders.append(required_folder)

  # Get dico with errors for each file
  checkConllu(glob.glob(os.path.join(dataTrain_folder_fullPath, '*.conllu')), missing_contents_base)
  checkConllu(glob.glob(os.path.join(dataTrain_folder_fullPath_2T, '*.conllu')), missing_contents_2T)
  checkConllu(glob.glob(os.path.join(dataTrain_folder_fullPath_lin, '*.conllu')), missing_contents_lin)
  checkConllu(glob.glob(os.path.join(dataTrain_folder_fullPath_2Tlin, '*.conllu')), missing_contents_2Tlin)

print(gold_structuring_folder)
checkConllu(glob.glob(os.path.join(gold_structuring_folder, '*.conllu')), missing_contents_GoldStruct)

if len(missing_folders) > 0:
  print(f'One or more folders missing: {str(missing_folders)}')
else:
  print('All required folders were found')


In [None]:
#@title Print missing
print_base = False#@param{type:"boolean"}
print_Lin = False#@param{type:"boolean"}
print_2T = False#@param{type:"boolean"}
print_2TLin = False#@param{type:"boolean"}
print_gold = True#@param{type:"boolean"}

def _print_missing_report(title, missing_contents):
  """Print, for each checked file, either 'OK' or the number and list of flagged structure indices."""
  print(f'\n{title}\n==========')
  for dataset in sorted(missing_contents):
    print(dataset)
    flagged = missing_contents[dataset]
    if len(flagged) > 0:
      print(f'  {len(flagged)} missing: {flagged}')
    else:
      print('  OK')

# One report per checked group; the gold report has its own header
# (it was printed under the '2T-Lin' header before)
for report_enabled, report_title, report_contents in (
  (print_base, 'Base', missing_contents_base),
  (print_Lin, 'Lin', missing_contents_lin),
  (print_2T, '2T', missing_contents_2T),
  (print_2TLin, '2T-Lin', missing_contents_2Tlin),
  (print_gold, 'Gold', missing_contents_GoldStruct),
):
  if report_enabled:
    _print_missing_report(report_title, report_contents)

# NLLB MT

In [None]:
import os
import glob
import codecs

txt_folder = '/content/texts'
out_folder = '/content/outs'

if not os.path.exists(txt_folder):
  os.makedirs(txt_folder)

if not os.path.exists(out_folder):
  os.makedirs(out_folder)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import codecs
import os
import torch


# tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
# model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B")

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")


# Check if CUDA (GPU acceleration) is available on the system
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B").to(device)

# article = "UN Chief says there is no military solution in Syria. This is also a problem for my little cat."
# inputs = tokenizer(article, return_tensors="pt")

# for language in languages:
#   translated_tokens = model.generate(
#       **inputs, forced_bos_token_id=tokenizer.lang_code_to_id[language], max_length=30
  # )
  # print(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])

In [None]:
import re
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

path_drive_NLLB = '/content/drive/MyDrive/Colab-dump/NLLB'
if not os.path.exists(path_drive_NLLB):
  os.makedirs(path_drive_NLLB)

txt_files_paths = glob.glob(os.path.join(txt_folder, '*.txt'))

# languages = ['fra_Latn', 'arb_Arab', 'deu_Latn', 'hin_Deva', 'kor_Hang', 'rus_Cyrl', 'spa_Latn', 'swh_Latn', 'zho_Hans']
languages = ['kor_Hang']
# languages = ['hin_Deva', 'deu_Latn', 'kor_Hang']
language_map = {'arb_Arab':'ar', 'deu_Latn':'de', 'fra_Latn':'fr', 'hin_Deva':'hi', 'kor_Hang':'ko', 'rus_Cyrl':'ru', 'spa_Latn':'es', 'swh_Latn':'sw', 'zho_Hans':'zh'}
# languages = ['fra_Latn', 'spa_Latn']

def extract_sentences(txt_file_path):
  """Read a UTF-8 text file with one text per line and split each text into sentences.

  Each line is stripped and split on '. ' (dot + space), so the separator is removed from
  all sentences but the last one of a text. An empty line yields [''].
  Returns a list (one item per line) of lists of sentences.
  """
  # The context manager closes the file as soon as the lines are read
  with codecs.open(txt_file_path, 'r', 'utf-8') as fi:
    return [text.strip().split('. ') for text in fi.readlines()]

for language in languages:
  # Fail before translating anything if the language has no output suffix
  lang_out = language_map[language]
  # ID of the target-language token that generation is forced to start with.
  # convert_tokens_to_ids replaces the deprecated tokenizer.lang_code_to_id mapping.
  target_lang_id = tokenizer.convert_tokens_to_ids(language)

  for txt_file_path in txt_files_paths:
    # Get filename without extension and language suffix
    filename_noExt = os.path.basename(txt_file_path).rsplit('.', 1)[0].rsplit('_', 1)[0]
    print(f'Translating {filename_noExt} to {lang_out}...')

    list_texts_nllb = extract_sentences(txt_file_path)

    translated_texts = []

    # The texts are translated one sentence at a time:
    # the text file is split into texts, and the texts into sentences.
    for j, text_nllb in enumerate(list_texts_nllb):
      print(f'Text {j}...')
      translated_sentences = []
      for sentence_nllb in text_nllb:
        inputs = tokenizer(sentence_nllb, return_tensors="pt").to(device)
        translated_tokens = model.generate(
          **inputs, forced_bos_token_id=target_lang_id, max_length=100
        )
        translated_sentences.append(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
      # Combine the sentences of the same text with a dot and space, since that's what was removed when splitting
      out_text = '. '.join(translated_sentences)
      # A translation that already ends with a dot would otherwise produce a double dot
      out_text = re.sub(r'\.\.', '.', out_text)
      translated_texts.append(out_text)

    # Write output file, with one text per line and a linebreak for all but last text
    filename_out = os.path.join(path_drive_NLLB, filename_noExt+'_'+lang_out+'.txt')
    with codecs.open(filename_out, 'w', 'utf-8') as fo:
      fo.write('\n'.join(translated_texts))

In [None]:
#@title Zip and download conll inputs
download_inputs = 'yes'#@param['yes', 'no']

from IPython.display import clear_output
import locale
locale.getpreferredencoding = lambda: "UTF-8"

if download_inputs == 'yes':
  from google.colab import files
  zip_name_log = '/content/outs_ar-de-hi-ko.zip'
  !zip -r {zip_name_log} {out_folder}

  clear_output()

  files.download(zip_name_log)