# A Notebook for Basic Auxiliary Task Experiments on MLM Data:

In [None]:
# Mount Google Drive under /content/drive; it is used for access to
# unpublished data and for saving results.
from google.colab import drive
drive.mount('/content/drive')
# List the current directory and print its path to check that 'drive'
# is present in the root directory '/content'.
%ls -l
%pwd

In [None]:
# prepare/clear drive directory
%rm -r /content/BaySIDshot # should fail in initial call
%cd /content
# get a fresh clone of BaySIDshot repo with it's submodules
! git clone https://github.com/XaverKrueckl/BaySIDshot.git --recurse-submodules
# cd into BaySIDshot repo:
%cd /content/BaySIDshot/
%ls -l
%pwd

In [None]:
# prepare MLM data to use with machamp:
# again move into BaySIDshot repo to be safe:

! bash /content/BaySIDshot/scripts/prepare_mlm_data.sh

# finally move into machamp to start training and predictions
%cd /content/BaySIDshot/machamp
%ls -l
%pwd

## General Checks:

In [None]:
# make sure to have a GPU backend selected
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Install the packages listed in machamp's requirements.txt.
# The install uses pip's --user scheme, i.e. it goes into the user site
# rather than the system site-packages.
# (optional: show what the README says about the requirements)
#! cat /content/BaySIDshot/machamp/README.md | grep "requirements"

%cd /content/BaySIDshot/machamp
! pip3 install --user -r /content/BaySIDshot/machamp/requirements.txt

In [None]:
# appends the directory /root/.local/bin to the existing PATH variable,
# allowing executables located in that directory to be run from anywhere in the shell
! export PATH=$PATH:/root/.local/bin

# check if imports for machamp are there
import tensorflow as tf
# check the version
print(tf.__version__)

# check if basic system works
! python3 /content/BaySIDshot/machamp/train.py

## Prepare evaluation data:

In [None]:
# Build the evaluation data. Only the 'alllangs' preparation script is active;
# the dialects and baseline variants are kept commented out as alternatives.
#!bash /content/BaySIDshot/scripts/prepare_evaldata_dialects.sh
#!bash /content/BaySIDshot/scripts/prepare_evaldata_baseline.sh
!bash /content/BaySIDshot/scripts/prepare_evaldata_alllangs.sh

# if issues occur, use the manually created gold set instead:
# /content/BaySIDshot/manual_data/alllangs_eval_data # or just dialects_eval_data, etc.

# return to machamp so the following cells start from that directory
%cd /content/BaySIDshot/machamp

## Basic Continuous Pre-Training MLMxNLU Multitask setting:

In [None]:
# Print the dataset config that the multitask training call passes to --dataset_configs
! cat /content/BaySIDshot/configs/mlm_nlu_x.json

# Print the parameter file that the training call passes to --parameters
! cat /content/BaySIDshot/configs/params_mdeberta.json

In [None]:
# Train the experiment: continuous MLM pre-training of mDeBERTa, using the
# combined MLM x NLU dataset config (mlm_nlu_x.json).
# Make sure to be in ../machamp, or call train.py by its full path as below.
# Set the name of the experiment / logs directory: --name mDeBERTa_exp3_mlmnlu_SEED
# and the random seed that should be used, e.g. --seed 1234
# (keep the seed in --name and the value of --seed in sync).


! python3 /content/BaySIDshot/machamp/train.py --dataset_configs /content/BaySIDshot/configs/mlm_nlu_x.json --parameters /content/BaySIDshot/configs/params_mdeberta.json --device 0 --name mDeBERTa_exp3_mlmnlu_1234 --seed 1234

In [None]:
# save logs dir with model and metrics to drive - change name of logs dir accordingly!

! cp -R /content/BaySIDshot/machamp/logs/mDeBERTa_exp3_mlmnlu_1234* /content/drive/MyDrive/Masterarbeit

In [None]:
# SET and get path(s) to the final model
# last line with text in model_path has model path to nlu model
# (if nlu config was set as last one when sequential/intermediate training)
! ls -d /content/drive/MyDrive/Masterarbeit/mDeBERTa_exp3_mlmnlu_1234*/*/model_* > model_path.txt

import os

with open('model_path.txt', 'r') as file:
  lines = file.readlines()
  # get the path to the final model (maily if sequential experiment)
  model_line = [line.strip() for line in lines if line.strip()][-1]
  model_line = model_line.strip()
  print("Evaluating Model: ", model_line)
  if '/' in model_line:
    # just to show the parts of the path:
    parts = model_line.split('/')
    # get all the necessary path parts:
    model = model_line.split('/')[7]
    time = model_line.split('/')[6]
    experiment_name = model_line.split('/')[5]
    if experiment_name.split("."):
      experiment_name_cleaned = experiment_name.split(".")[0]
    save_dir = model_line.split('/')[:5]
    base_save_dir = '/'.join(save_dir)
  else:
    raise ValueError("No valid model path")


#eval_dir = "/content/BaySIDshot/manual_data/alllangs_eval_data"
eval_dir = "/content/BaySIDshot/alllangs_eval_data"

with open('script.sh', 'w') as file:
    # create predictions folder in directory where model is saved:
    file.write(f"! mkdir -p {base_save_dir}/{experiment_name}/{time}/predictions_{experiment_name_cleaned}\n")
    # prepare prediction files order
    file_list = []
    for filename in os.listdir(eval_dir):
        if filename.endswith('test.conll'): # do not use when also using valid files is desired
          # path to goldfile:
          goldfile = eval_dir + "/" + filename
          # path to outfile
          outfile = base_save_dir + "/" + experiment_name + "/" + time + "/predictions_" + experiment_name_cleaned + "/" + filename + ".out"
          # append goldfile outfile "pairs" to filelist for prediction command
          file_list.append(str(goldfile))
          file_list.append(str(outfile))
    file_list_string = ' '.join(file_list)
    # prediction call for all files in test dir
    # also adds the specific dataset for which prediction should be done - necessary in multitask setting, else possible to also append
    file.write(f"! python3 predict.py {model_line} {file_list_string} --device 0 --dataset NLU\n")
    # dir eval call to get all metrics and save output as json file:
    file.write(f"! python3 /content/BaySIDshot/scripts/dir_eval_out.py {eval_dir} {base_save_dir}/{experiment_name}/{time}/predictions_{experiment_name_cleaned}\n")
    # copy json output also to predictions dir:
    file.write(f"! mkdir -p {base_save_dir}/results\n")
    file.write(f"! cp /content/BaySIDshot/results/* {base_save_dir}/results\n")

print("Prediction Script successfully generated!")

# runs predictions:
! bash script.sh
# and removes scripts:
! rm script.sh
! rm model_path.txt

## Basic Sequential Intermediate Continuous Pre-Training MLM_NLU setting:

In [None]:
# Print the two dataset configs that the sequential training call passes to
# --dataset_configs (MLM config first, NLU config second)
! cat /content/BaySIDshot/configs/mlm_x.json
! cat /content/BaySIDshot/configs/nlu_x.json

# Print the parameter file that the training call passes to --parameters
! cat /content/BaySIDshot/configs/params_mdeberta.json

In [None]:
# Train the experiment: continuous MLM pre-training of mDeBERTa in the
# sequential setting - the MLM config is passed first, the NLU config second,
# together with --sequential.
# Make sure to be in ../machamp, or call train.py by its full path as below.
# Set the name of the experiment / logs directory: --name mDeBERTa_exp3_mlm_nlu_SEED
# and the random seed that should be used, e.g. --seed 1234
# (keep the seed in --name and the value of --seed in sync).


! python3 /content/BaySIDshot/machamp/train.py --dataset_configs /content/BaySIDshot/configs/mlm_x.json /content/BaySIDshot/configs/nlu_x.json --parameters /content/BaySIDshot/configs/params_mdeberta.json --sequential --device 0 --name mDeBERTa_exp3_mlm_nlu_1234 --seed 1234

In [None]:
# save logs dir with model and metrics to drive - change name of logs dir accordingly!

! cp -R /content/BaySIDshot/machamp/logs/mDeBERTa_exp3_mlm_nlu_1234* /content/drive/MyDrive/Masterarbeit

In [None]:
# SET and get path(s) to the final model
# last line with text in model_path has model path to nlu model
# (if nlu config was set as last one when sequential/intermediate training)
! ls -d /content/drive/MyDrive/Masterarbeit/mDeBERTa_exp3_mlm_nlu_1234*/*/model_* > model_path.txt

import os

with open('model_path.txt', 'r') as file:
  lines = file.readlines()
  # get the path to the final model (maily if sequential experiment)
  model_line = [line.strip() for line in lines if line.strip()][-1]
  model_line = model_line.strip()
  print("Evaluating Model: ", model_line)
  if '/' in model_line:
    # just to show the parts of the path:
    parts = model_line.split('/')
    # get all the necessary path parts:
    model = model_line.split('/')[7]
    time = model_line.split('/')[6]
    experiment_name = model_line.split('/')[5]
    if experiment_name.split("."):
      experiment_name_cleaned = experiment_name.split(".")[0]
    save_dir = model_line.split('/')[:5]
    base_save_dir = '/'.join(save_dir)
  else:
    raise ValueError("No valid model path")


#eval_dir = "/content/BaySIDshot/manual_data/alllangs_eval_data"
eval_dir = "/content/BaySIDshot/alllangs_eval_data"

with open('script.sh', 'w') as file:
    # create predictions folder in directory where model is saved:
    file.write(f"! mkdir -p {base_save_dir}/{experiment_name}/{time}/predictions_{experiment_name_cleaned}\n")
    # prepare prediction files order
    file_list = []
    for filename in os.listdir(eval_dir):
        if filename.endswith('test.conll'): # do not use when also using valid files is desired
          # path to goldfile:
          goldfile = eval_dir + "/" + filename
          # path to outfile
          outfile = base_save_dir + "/" + experiment_name + "/" + time + "/predictions_" + experiment_name_cleaned + "/" + filename + ".out"
          # append goldfile outfile "pairs" to filelist for prediction command
          file_list.append(str(goldfile))
          file_list.append(str(outfile))
    file_list_string = ' '.join(file_list)
    # prediction call for all files in test dir
    # also adds the specific dataset for which prediction should be done - necessary in multitask setting, else possible to also append
    file.write(f"! python3 predict.py {model_line} {file_list_string} --device 0 --dataset NLU\n")
    # dir eval call to get all metrics and save output as json file:
    file.write(f"! python3 /content/BaySIDshot/scripts/dir_eval_out.py {eval_dir} {base_save_dir}/{experiment_name}/{time}/predictions_{experiment_name_cleaned}\n")
    # copy json output also to predictions dir:
    file.write(f"! mkdir -p {base_save_dir}/results\n")
    file.write(f"! cp /content/BaySIDshot/results/* {base_save_dir}/results\n")

print("Prediction Script successfully generated!")

# runs predictions:
! bash script.sh
# and removes scripts:
! rm script.sh
! rm model_path.txt

### References for _MaChAmp_ and the MLM Data:

```
@inproceedings{van-der-goot-etal-2021-massive,
    title = "Massive Choice, Ample Tasks ({M}a{C}h{A}mp): A Toolkit for Multi-task Learning in {NLP}",
    author = {van der Goot, Rob  and
      {\"U}st{\"u}n, Ahmet  and
      Ramponi, Alan  and
      Sharaf, Ibrahim  and
      Plank, Barbara},
    booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.eacl-demos.22",
    doi = "10.18653/v1/2021.eacl-demos.22",
    pages = "176--197",
    abstract = "Transfer learning, particularly approaches that combine multi-task learning with pre-trained contextualized embeddings and fine-tuning, have advanced the field of Natural Language Processing tremendously in recent years. In this paper we present MaChAmp, a toolkit for easy fine-tuning of contextualized embeddings in multi-task settings. The benefits of MaChAmp are its flexible configuration options, and the support of a variety of natural language processing tasks in a uniform toolkit, from text classification and sequence labeling to dependency parsing, masked language modeling, and text generation.",
}
@inproceedings{artemova-plank-2023-low,
    title = "Low-resource Bilingual Dialect Lexicon Induction with Large Language Models",
    author = "Artemova, Ekaterina  and Plank, Barbara",
    booktitle = "Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa 2023) (NoDaLiDa)",
    year = "2023",
}

```