# A Notebook for Running Predictions with Already Fine-tuned MaChAmp Models on Natural Bavarian Data:

In [None]:
# Mount Google Drive so that unpublished data can be read and results can be
# saved there.
from google.colab import drive
drive.mount('/content/drive')
# List the current directory and print its path to check that the mounted
# `drive` folder is present (expected in the Colab root directory '/content').
%ls -l
%pwd

In [None]:
## Prepare/clear the working directory
# Remove a checkout left over from a previous run. On the very first run
# there is nothing to remove yet, so this command is expected to report an
# error, which can be ignored.
%rm -r /content/BaySIDshot # should fail in initial call
%cd /content
# Get a fresh clone of the BaySIDshot repo together with its submodules.
! git clone https://github.com/XaverKrueckl/BaySIDshot.git --recurse-submodules
# Change into the BaySIDshot repo, then show its contents and the current path:
%cd /content/BaySIDshot/
%ls -l
%pwd

## General Checks:

In [None]:
# make sure to have a GPU backend selected
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Install the Python packages required by MaChAmp.
# (Optional, disabled: show the README lines that mention the requirements.)
#! cat /content/BaySIDshot/machamp/README.md | grep "requirements"

%cd /content/BaySIDshot/machamp

# `--user` makes pip install into the current user's site directory rather
# than the system-wide one.
! pip3 install --user -r /content/BaySIDshot/machamp/requirements.txt

In [None]:
# appends the directory /root/.local/bin to the existing PATH variable,
# allowing executables located in that directory to be run from anywhere in the shell
! export PATH=$PATH:/root/.local/bin

# check if imports for machamp are there
import tensorflow as tf
# check the version
print(tf.__version__)

# check if basic system works
! python3 /content/BaySIDshot/machamp/train.py

In [None]:
# Prepare the prediction data:
# run the repository's preparation script for the natural evaluation data,
# list the evaluation data directory (presumably populated by that script)
# and change into the machamp directory for the following cells.

!bash /content/BaySIDshot/scripts/prepare_evaldata_natural.sh
!ls -l /content/BaySIDshot/natural_eval_data
%cd /content/BaySIDshot/machamp

In [None]:
# get path to the final model
# last line with text in model_path has model path to nlu model
# (if nlu config was set as last one when sequential/intermediate training)
# example with a path used in this master thesis:
! ls -d /content/drive/MyDrive/Masterarbeit/mDeBERTa_exp4_mlmner_nlu_8446*/*/model_* > model_path.txt

import os

with open('model_path.txt', 'r') as file:
  lines = file.readlines()
  # get the path to the final model (maily if sequential experiment)
  model_line = [line.strip() for line in lines if line.strip()][-1]
  model_line = model_line.strip()
  print("Evaluating Model: ", model_line)
  if '/' in model_line:
    # just to show the parts of the path:
    parts = model_line.split('/')
    # get all the necessary path parts:
    model = model_line.split('/')[7]
    time = model_line.split('/')[6]
    experiment_name = model_line.split('/')[5]
    if experiment_name.split("."):
      experiment_name_cleaned = experiment_name.split(".")[0]
    save_dir = model_line.split('/')[:5]
    base_save_dir = '/'.join(save_dir)
  else:
    raise ValueError("No valid model path")


eval_dir = "/content/BaySIDshot/natural_eval_data"


with open('script.sh', 'w') as file:
    # create predictions folder in directory where model is saved:
    file.write(f"! mkdir -p {base_save_dir}/{experiment_name}/{time}/natural_predictions_{experiment_name_cleaned}\n")
    # prepare prediction files order
    file_list = []
    for filename in os.listdir(eval_dir):
        if filename.endswith('test.conll'): # do not use when also using valid files is desired
          # path to goldfile:
          goldfile = eval_dir + "/" + filename
          # path to outfile
          outfile = base_save_dir + "/" + experiment_name + "/" + time + "/natural_predictions_" + experiment_name_cleaned + "/" + filename + ".out"
          # append goldfile outfile "pairs" to filelist for prediction command
          file_list.append(str(goldfile))
          file_list.append(str(outfile))
    file_list_string = ' '.join(file_list)
    # prediction call for all files in test dir
    # also adds the specific dataset for which prediction should be done - necessary in multitask setting, else possible to also append
    file.write(f"! python3 predict.py {model_line} {file_list_string} --device 0 --dataset NLU\n")
    # dir eval call to get all metrics and save output as json file:
    file.write(f"! python3 /content/BaySIDshot/scripts/dir_eval_out.py {eval_dir} {base_save_dir}/{experiment_name}/{time}/natural_predictions_{experiment_name_cleaned}\n")
    # copy json output also to predictions dir:
    file.write(f"! mkdir -p {base_save_dir}/results\n")
    file.write(f"! cp /content/BaySIDshot/results/* {base_save_dir}/results\n")

print("Prediction Script successfully generated!")

# runs predictions:
! bash script.sh
# and removes scripts:
! rm script.sh
! rm model_path.txt

### Reference for _MaChAmp_ and Bavarian Natural Evaluation Data:

```
@inproceedings{van-der-goot-etal-2021-massive,
    title = "Massive Choice, Ample Tasks ({M}a{C}h{A}mp): A Toolkit for Multi-task Learning in {NLP}",
    author = {van der Goot, Rob  and
      {\"U}st{\"u}n, Ahmet  and
      Ramponi, Alan  and
      Sharaf, Ibrahim  and
      Plank, Barbara},
    booktitle = "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: System Demonstrations",
    month = apr,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.eacl-demos.22",
    doi = "10.18653/v1/2021.eacl-demos.22",
    pages = "176--197",
    abstract = "Transfer learning, particularly approaches that combine multi-task learning with pre-trained contextualized embeddings and fine-tuning, have advanced the field of Natural Language Processing tremendously in recent years. In this paper we present MaChAmp, a toolkit for easy fine-tuning of contextualized embeddings in multi-task settings. The benefits of MaChAmp are its flexible configuration options, and the support of a variety of natural language processing tasks in a uniform toolkit, from text classification and sequence labeling to dependency parsing, masked language modeling, and text generation.",
}
@inproceedings{Winkler2024,
  title = "Slot and Intent Detection Resources for {B}avarian and {L}ithuanian: Assessing Translations vs Natural Queries to Digital Assistants",
  author = "Winkler, Miriam and Juozapaityte, Virginija and van der Goot, Rob and Plank, Barbara",
  booktitle = "Proceedings of The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation",
  year = "2024",
  publisher = "Association for Computational Linguistics",
}
```