# Apply the models on the Test set

In [7]:
import pandas as pd
from tqdm.auto import tqdm, trange
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer

In [2]:
# The project folder can be saved in google drive and accessed through Google colab
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# path to the project folder that contains a folder code, data and models
path_project = "/content/drive/MyDrive/anlp_project/NLLB-200/ANLP_SUBMISSION"

In [9]:
def translate(text, src_lang, tgt_lang, a=16, b=1.5, max_input_length=1024, **kwargs):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_input_length)
    result = model.generate(
        **inputs.to(model.device),
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        **kwargs
    )
    return tokenizer.batch_decode(result, skip_special_tokens=True)

def batched_translate(texts, batch_size=16, **kwargs):
    """Translate texts in batches of similar length"""
    idxs, texts2 = zip(*sorted(enumerate(texts), key=lambda p: len(p[1]), reverse=True))
    results = []
    for i in trange(0, len(texts2), batch_size):
        results.extend(translate(texts2[i: i+batch_size], **kwargs))
    return [p for i, p in sorted(zip(idxs, results))]

In [10]:
data_path = f"{path_project}/data"

df_test = pd.read_csv(f"{data_path}/df_test.csv")

In [11]:
models = {
    "init_none" : [f"{path_project}/models/nllb-de-be_initNONE_best_checkpoint", ["ch_be"]],
    "init_vs" : [f"{path_project}/models/nllb-de-be_initvs_best_checkpoint", ["ch_be", "ch_vs"]],
    "init_gr_large" : [f"{path_project}/models/nllb-de-be_initgrLarge_best_checkpoint", ["ch_be", "ch_gr"]],
    "init_de" :[f"{path_project}/models/nllb-de-be_initde_best_checkpoint", ["ch_be"]],
    "init_average": [f"{path_project}/models/nllb-de-be_init_average_v1_best_checkpoint", ["ch_be"]],
    "init_gr_small": [f"{path_project}/models/nllb-de-be_initgrSmall_best_checkpoint", ["ch_be", "ch_gr"]]
}

for key, value in models.items():
  model_path = f'/content/drive/MyDrive/nlp_project_mira/NLLB-200/{value[0]}'
  model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
  tokenizer = NllbTokenizer.from_pretrained(model_path)

  if len(value[1]) == 2:
    fix_tokenizer2n(tokenizer, new_lang_tokens=value[1])
  elif len(value[1]) == 1:
    fix_tokenizer1n(tokenizer, new_lang_token=value[1][0])

  df_test[key] = [translate(t, "ch_be", 'deu_Latn')[0] for t in tqdm(df_test["ch_be"])]

HFValidationError: ignored

In [None]:
# Filename
df_test.to_excel("df_test_output_new.xlsx")

# Apply the model on the Synthetic Test Set