# 1. Installations & Imports

In [None]:
!pip install sentencepiece transformers==4.33 datasets wandb sacremoses sacrebleu -q

In [None]:
import logging
import os
import random
import torch
import numpy as np
import pandas as pd
import re
import sys
import typing as tp
import unicodedata
import gc
import random
import sacrebleu
import wandb
import matplotlib.pyplot as plt
from sacremoses import MosesPunctNormalizer
from tqdm.auto import tqdm, trange
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig, get_constant_schedule_with_warmup
from transformers.optimization import Adafactor
from sklearn.model_selection import train_test_split

# 2. CHANGE HERE

In [None]:
# The project folder can be stored in Google Drive and accessed from Google Colab.
from google.colab import drive
# Mount Google Drive at /content/drive so the project folder is reachable as a file path.
drive.mount('/content/drive')

In [None]:
#########################
#### CHANGE HERE ########
#########################

# New language tag to add to the model.
# IMPORTANT: the column in df_train that holds this dialect
# must have the same name as new_lang_token.
new_lang_token = 'ch_be'

# Language tag whose embedding is copied to initialize the new tag. Can be German
# or a dialect the model was trained on before; None skips this initialization.
# e.g. None, 'deu_Latn', 'ch_vs', 'ch_gr'
similar_language_token = 'deu_Latn'

# Specify whether the training data for GR should be limited to the size of the
# training data of VS. This is needed for the model with init_gr_small.
# Only relevant when new_lang_token = 'ch_gr'.
limit_data = True

# Path to the project folder that contains the data folder.
# NOTE: os.chdir makes this folder the working directory, so relative paths
# used afterwards are resolved against it.
path_project = "/content/drive/MyDrive/German_to_Swiss_Translation_ANLP_2023"
os.chdir(path_project)

# Path to the folder where the model is stored.
# Can also be a Hugging Face model name.
MODEL_PATH = "facebook/nllb-200-distilled-600M"

# When the model specified in MODEL_PATH already contains an added token of a
# Swiss German dialect, specify which additional token was added,
# e.g. 'ch_vs' or 'ch_gr'. Leave as None for the base model.
token_added = None

# Name / path under which the fine-tuned model is saved.
MODEL_SAVE_PATH = 'model/nllb-de-be_initde_v1'

# Number of training steps.
# To train the large gr-de model, set training_steps to 5000,
# as it needs more iterations to converge.
training_steps = 2200

# WandB API key used to log the results; if set to None, nothing is logged to wandb.
api_key = None

# 3. Data Loading

In [None]:
def prepare_data(data_path, new_lang_token, limit_data):
    """
    Load the train and test CSV files and derive a validation split.

    Reads ``df_train.csv`` and ``df_test.csv`` from ``data_path``, keeps only
    rows where both the ``de`` column and the ``new_lang_token`` column are
    present, and splits 5% of the training rows off as validation data.

    Args:
        data_path: folder that contains ``df_train.csv`` and ``df_test.csv``.
        new_lang_token: name of the dialect column to keep.
        limit_data: if true and ``new_lang_token`` is ``'ch_gr'``, the training
            frame is down-sampled to 2619 rows before the split.

    Returns:
        Tuple ``(df_train, df_val, df_test)`` of data frames.
    """
    sentence_columns = ['de', new_lang_token]

    def _load_complete_rows(file_name):
        # keep only the rows that have a value for German and for this dialect
        frame = pd.read_csv(f"{data_path}/{file_name}")
        is_complete = frame[sentence_columns].notna().all(axis=1)
        return frame[is_complete].reset_index()

    train_frame = _load_complete_rows("df_train.csv")
    test_frame = _load_complete_rows("df_test.csv")

    # all dialects except GR (10475) have between 2700 and 2760 sentences (VS 2619 in train);
    # limit the GR frame to 2619 training sentences to make it comparable
    restrict_gr = limit_data and new_lang_token == 'ch_gr'
    if restrict_gr:
        train_frame = train_frame.sample(2619, random_state=42)
        train_frame = train_frame.reset_index(drop=True)

    # hold out 5% of the training rows for validation (fixed seed)
    train_frame, val_frame = train_test_split(train_frame, test_size=0.05, random_state=42)

    for message in (
        f"Length train set: {len(train_frame)}",
        f"Length validation: {len(val_frame)}",
        f"Length test set: {len(test_frame)}",
    ):
        print(message)

    return train_frame, val_frame, test_frame

In [None]:
# Load the train / validation / test frames for the configured dialect from the "Data/" folder.
df_train, df_val, df_test = prepare_data("Data/", new_lang_token, limit_data)

# 4. Prepare the Model and Tokenizer for the new Language Tag

In [None]:
def fix_tokenizer2n(tokenizer, new_lang_tokens):
    """
    Add new language tokens to the tokenizer vocabulary,
    this should be done each time after its initialization.
    This function is used when the model already contains another newly added token
    e.g. "ch_vs" or "ch_gr"

    The tokenizer is modified in place; the ids of the given tokens and of
    "<mask>" are printed before and after the change.

    Args:
        tokenizer: tokenizer exposing ``lang_code_to_id``, ``id_to_lang_code``,
            ``fairseq_tokens_to_ids``, ``fairseq_ids_to_tokens``, ``sp_model``,
            ``fairseq_offset``, ``_additional_special_tokens`` and the
            ``added_tokens_encoder`` / ``added_tokens_decoder`` dicts.
        new_lang_tokens: list of language tags; the tag at position ``i`` is
            assigned the id ``old_len - i``, where ``old_len`` is the tokenizer
            length without the tags found in ``added_tokens_encoder``.
    """
    probe_tokens = new_lang_tokens[::-1] + ['<mask>']

    print(f"BEFORE IDs: {tokenizer.convert_tokens_to_ids(probe_tokens)}")
    print(f"BEFORE Tokens: {tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(probe_tokens))}")

    # how many of the tokens are actually in tokenizer.added_tokens_encoder
    n_added_tokens = sum(token in tokenizer.added_tokens_encoder for token in new_lang_tokens)

    # get the old/ original length of the tokenizer
    old_len = len(tokenizer) - n_added_tokens

    # move the new tokens in the previous position; both directions of the
    # mapping must use the same id (the reverse map used to get negative keys)
    for i, token in enumerate(new_lang_tokens):
        token_id = old_len - i
        tokenizer.lang_code_to_id[token] = token_id
        tokenizer.id_to_lang_code[token_id] = token

    # always move "mask" to the last position
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}

    # if the token is not yet trained in the model, add it to the special tokens
    for token in new_lang_tokens:
        if token not in tokenizer._additional_special_tokens:
            tokenizer._additional_special_tokens.append(token)

    # clear the added token encoder; otherwise a new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

    print(f"AFTER IDs: {tokenizer.convert_tokens_to_ids(probe_tokens)}")
    print(f"AFTER Tokens: {tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(probe_tokens))}")

def fix_tokenizer1n(tokenizer, new_lang_token):
    """
    Register a single new language tag in the tokenizer (in place).

    Has to be repeated every time the tokenizer is freshly initialized.
    Use this variant when the model does not contain any other newly added tag.

    The new tag receives the last id of the vocabulary as it was before the
    tag was added, "<mask>" is moved behind all language codes, and the ids of
    the new tag and of "<mask>" are printed afterwards.
    """
    # the tokenizer length may already include the new tag as an "added token"
    already_counted = 1 if new_lang_token in tokenizer.added_tokens_encoder else 0
    new_token_id = len(tokenizer) - already_counted - 1

    tokenizer.lang_code_to_id[new_lang_token] = new_token_id
    tokenizer.id_to_lang_code[new_token_id] = new_lang_token

    # always move "mask" to the last position
    mask_id = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
    tokenizer.fairseq_tokens_to_ids["<mask>"] = mask_id

    # refresh the forward mapping, then rebuild the reverse mapping from it
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    reverse_mapping = {}
    for token_text, token_id in tokenizer.fairseq_tokens_to_ids.items():
        reverse_mapping[token_id] = token_text
    tokenizer.fairseq_ids_to_tokens = reverse_mapping

    special_tokens = tokenizer._additional_special_tokens
    if new_lang_token not in special_tokens:
        special_tokens.append(new_lang_token)

    # clear the added token encoder; otherwise a new token may end up there by mistake
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

    check_tokens = [new_lang_token, '<mask>']
    print(f"Length of the tokenizer: {len(tokenizer)}")
    print(f"IDs: {tokenizer.convert_tokens_to_ids(check_tokens)}")
    print(f"IDs: {tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(check_tokens))}")

In [None]:
def adapt_embeddings(tokenizer, model, new_lang_token, similar_language_token):
    """
    Resize the token embeddings of the model to the tokenizer length and move
    the embedding of "<mask>" to its new position, one row after the new
    language token.

    If similar_language_token is not None, the row of the new language token
    is additionally initialized with the embedding of that similar language.

    The model is modified in place; nothing is returned.
    """
    new_token_id = tokenizer.convert_tokens_to_ids(new_lang_token)
    model.resize_token_embeddings(len(tokenizer))

    # fetched after the resize so that the rows of the resized matrix are edited
    embedding_rows = model.model.shared.weight.data

    # the row at the new token's id is copied one position up (the "<mask>" slot)
    embedding_rows[new_token_id + 1] = embedding_rows[new_token_id]

    if similar_language_token is None:
        return

    # initialize the new language token with the embedding of a similar language
    similar_lang_id = tokenizer.convert_tokens_to_ids(similar_language_token)
    embedding_rows[new_token_id] = embedding_rows[similar_lang_id]

In [None]:
# Load the tokenizer and the model from the configured checkpoint.
tokenizer = NllbTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH)

if MODEL_PATH == "facebook/nllb-200-distilled-600M":
    # base checkpoint: only the new language token has to be registered
    fix_tokenizer1n(tokenizer, new_lang_token)
else:
    # Any other checkpoint is treated as already containing one added dialect
    # token. Without its name, None would be registered as a language code,
    # so fail loudly instead.
    if token_added is None:
        raise ValueError(
            "token_added must name the dialect token already contained in "
            f"{MODEL_PATH!r} (e.g. 'ch_vs' or 'ch_gr'), but it is None."
        )
    # fix the tokenizer with the new token and the previously added token
    fix_tokenizer2n(tokenizer, new_lang_tokens=[new_lang_token, token_added])

# resize the embeddings and initialize the row of the new language token
adapt_embeddings(tokenizer, model, new_lang_token, similar_language_token)

# 5. Training

In [None]:
def cleanup():
    """Try to free GPU memory.

    Runs the Python garbage collector first and then asks PyTorch to empty
    its CUDA cache. Returns nothing.
    """
    gc.collect()
    torch.cuda.empty_cache()

# run once right away, before the training cells
cleanup()

In [None]:
# this code is adapted from https://github.com/facebookresearch/stopes/blob/main/stopes/pipelines/monolingual/monolingual_line_processor.py#L214

# Moses punctuation normalizer, created with lang="en".
mpn = MosesPunctNormalizer(lang="en")
# Replace every (pattern, replacement) pair by (compiled pattern, replacement),
# so the patterns are compiled once here rather than on each use.
mpn.substitutions = [
    (re.compile(r), sub) for r, sub in mpn.substitutions
]

def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    """
    Build a function that replaces every non-printing character of a string.

    The translation table is computed once, over all Unicode code points, when
    this factory is called; the returned function only applies it.

    Args:
        replace_by: text that is substituted for each non-printing character.

    Returns:
        A callable that takes a string and returns the cleaned string.
    """
    # see https://www.unicode.org/reports/tr44/#General_Category_Values
    non_printing_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    translation_table = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in non_printing_categories:
            translation_table[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char

# replacer that turns every non-printing character into a single space
replace_nonprint = get_non_printing_char_replacer(" ")


def preproc(text):
    """
    Clean one sentence before tokenization.

    Applies, in this order: Moses punctuation normalization, replacement of
    non-printing characters by spaces, and Unicode NFKC normalization.
    Returns the cleaned string.
    """
    without_nonprinting = replace_nonprint(mpn.normalize(text))
    return unicodedata.normalize("NFKC", without_nonprinting)

In [None]:
# (data frame column, tokenizer language code) for both translation directions
LANGS = [('de', 'deu_Latn'), (new_lang_token, new_lang_token)]

def get_batch_pairs(batch_size, data=None):
    """
    Draw a random batch of parallel sentences in a random translation direction.

    Args:
        batch_size: number of sentence pairs to draw (rows are drawn with
            replacement).
        data: data frame to draw from. Defaults to the current global
            ``df_train``; it is looked up at call time, so re-running the data
            loading cell is picked up without redefining this function.

    Returns:
        Tuple ``(xx, yy, long1, long2)``: the preprocessed source sentences,
        the preprocessed target sentences, and the language codes of the
        source and target side.
    """
    if data is None:
        data = df_train

    # random.sample over both entries yields a random source/target order
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)

    last_row = len(data) - 1
    rows = [data.iloc[random.randint(0, last_row)] for _ in range(batch_size)]
    xx = [preproc(row[l1]) for row in rows]
    yy = [preproc(row[l2]) for row in rows]
    return xx, yy, long1, long2

In [None]:
# model specific parameters
batch_size = 16        # sentence pairs drawn per training / validation step
max_length = 128       # tokenizer max_length; longer inputs are truncated
warmup_steps = 1000    # warm-up steps of the learning-rate schedule
# the remaining values are passed to the Adafactor optimizer
scale_parameter=False
relative_step=False
lr=1e-4
clip_threshold=1.0
weight_decay=1e-3

# Weights & Biases logging is only set up when an API key was configured.
if api_key is not None:

  project = "nllb-200" # don't need to change
  # run name: last path component of MODEL_SAVE_PATH
  name = f'{MODEL_SAVE_PATH.split("/")[-1]}'

  # notebook shell command: log in to wandb with the configured key
  !wandb login {api_key}

  # start a new wandb run to track this script
  wandb.init(
      # set the wandb project where this run will be logged
      project=project,
      name=name,
      save_code=True, # save the main script or notebook to W&B
      # track hyperparameters and run metadata
      config={
        "architecture": "nllb-200",
        "dataset": "Swiss-dial",
        "batch_size": batch_size,
        "max_length": max_length,
        "warmup_steps": warmup_steps,
        "training_steps": training_steps,
        "scale_parameter": scale_parameter,
        "relative_step": relative_step,
        "lr": lr,
        "clip_threshold": clip_threshold,
        "weight_decay": weight_decay
      }
  )

In [None]:
# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(DEVICE)

# Adafactor over all trainable parameters, configured with the values set above.
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=scale_parameter,
    relative_step=relative_step,
    lr=lr,
    clip_threshold=clip_threshold,
    weight_decay=weight_decay,
)

# Per-step losses and the averages that are computed every save_step steps.
# NOTE: re-running this cell resets the lists, and the training loop starts at len(losses).
losses = []
val_losses = []
average_train_losses = []
average_val_losses = []
# learning-rate schedule with warmup_steps warm-up steps
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

In [None]:
# Interval (in steps) at which the average losses are printed and the best
# checkpoint may be saved.
save_step = 100

# drop references to earlier batches before freeing memory
x, y, loss = None, None, None
cleanup()

# to store the best model checkpoint
best_val_loss = float('inf')  # start with +inf so the first eligible average becomes the best
BEST_CHECKPOINT_PATH = f"{MODEL_SAVE_PATH}_best_checkpoint"

# Start at len(losses): when this cell is re-run without resetting the loss
# list, the step counter continues after the steps already recorded.
tq = trange(len(losses), training_steps)
for i in tq:
    ##################
    ### TRAIN LOOP ###
    ##################
    model.train()
    # random batch from df_train in a random translation direction
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        # tokenize the source side with the source language code ...
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # ... and the target side with the target language code
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # replace padding ids in the labels with -100
        # (presumably the ignore index of the model's loss, so padding is not scored - confirm)
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100
        # Forward pass and get the loss
        loss = model(**x, labels=y.input_ids).loss
        # Backward pass: compute the gradients of the loss w.r.t. the model parameters
        loss.backward()
        # Record the scalar training loss of this step
        losses.append(loss.item())
        # Update the model parameters by performing a single optimization step
        optimizer.step()
        # clear the old gradients from optimized variables
        optimizer.zero_grad(set_to_none=True)
        # Advance the learning-rate schedule by one step
        scheduler.step()

    except RuntimeError as e:
        # A failed step is skipped: clear gradients, release the batch and
        # free memory, then continue with the next step (no loss is recorded
        # and no validation is run for this step).
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        # report the longest sentence (in characters) of the failed batch
        print('error', max(len(s) for s in xx + yy), e)
        continue

    #######################
    ### VALIDATION LOOP ###
    #######################

    # Set the model to evaluation mode
    model.eval()

    # turn off gradients for validation
    with torch.no_grad():
        # one random batch from the validation frame per training step
        xx, yy, lang1, lang2 = get_batch_pairs(batch_size, data=df_val)
        tokenizer.src_lang = lang1
        x_val = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        y_val = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        y_val.input_ids[y_val.input_ids == tokenizer.pad_token_id] = -100

        val_loss = model(**x_val, labels=y_val.input_ids).loss

        val_losses.append(val_loss.item())

        # log metrics to wandb
        if api_key is not None:
          wandb.log({"loss-valid": val_loss.item(), "loss-train": loss.item()})

        # switch back to training mode
        model.train()

    if i % save_step == 0:
        # average training loss over the last save_step recorded steps
        average_train_loss = np.mean(losses[-save_step:])
        average_train_losses.append(average_train_loss)

        # average validation loss over the last save_step recorded steps
        average_val_loss = np.mean(val_losses[-save_step:])
        average_val_losses.append(average_val_loss)

        print(i, "| Average train loss: ", average_train_loss, ' | Average validation loss: ', average_val_loss)

        # log metrics to wandb
        if api_key is not None:
          wandb.log({"loss-valid-average": average_val_loss, "loss-train-average": average_train_loss})

        # Check if the current validation loss is the best so far, if yes save the model
        # (step 0 is excluded from checkpointing)
        if average_val_loss < best_val_loss and i > 0:
            best_val_loss = average_val_loss
            # Save the best checkpoint (model and tokenizer)
            model.save_pretrained(BEST_CHECKPOINT_PATH)
            tokenizer.save_pretrained(BEST_CHECKPOINT_PATH)
            print(i,f"| Best model saved at {BEST_CHECKPOINT_PATH}")

    # save current model state (disabled; only the best checkpoint is written)
    #if i % save_step == 0 and i > 0:
    #    model.save_pretrained(MODEL_SAVE_PATH)
    #    tokenizer.save_pretrained(MODEL_SAVE_PATH)

In [None]:
# Close the wandb run, but only if logging was enabled via api_key.
if api_key is not None:
  # this is needed in a notebook to finish the logs
  wandb.finish()