# Training an MT5 model for multilingual paraphrasing using multilingual paraphrase data in English, German, Czech and Slovene on the whole data


Main configuration

In [None]:
# True only for the very first fine-tuning run, which starts from `hf_checkpoint`.
# Set to False to continue training from a checkpoint saved on Google Drive.
initial_finetuning = True
# Hugging Face model id; used for the tokenizer and, on the first run, for the model weights.
hf_checkpoint = 'google/mt5-small'
# Path of the checkpoint to continue from; only read when `initial_finetuning` is False,
# e.g. '/content/drive/MyDrive/models/old-checkpoint-234/'
drive_checkpoint = ''

## Environment Setup

We need a GPU, so we check the availability:

In [None]:
!nvidia-smi

And we install all needed libraries.

In [None]:
!pip install datasets==2.11.0 transformers==4.28.0 nltk==3.8.1 parascore==1.0.5 sentencepiece==0.1.98

We store checkpoints on Google Drive. After we have mounted our Google Drive, the root folder of our Drive is at `/content/drive/MyDrive/`.

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so that checkpoints can be stored there.
drive.mount("/content/drive/")

## Data Download and Preparation

In [None]:
from datasets import load_dataset, interleave_datasets

We use a paraphrase dataset that we created ourselves.

In [None]:
# Load the paraphrase dataset; its 'train', 'validation' and 'test' splits are used later on.
raw_dataset = load_dataset('yawnick/para_crawl_multi_all')

Let's store the splits separately.

In [None]:
# Pull the three splits out of the loaded dataset so each can be handled on its own.
raw_dataset_train, raw_dataset_val, raw_dataset_test = (
    raw_dataset[split_name] for split_name in ('train', 'validation', 'test')
)
# Show a single training example to inspect its columns.
raw_dataset_train[5]

Now, let's prepare the data for training.

In [None]:
from transformers import T5Tokenizer, AutoTokenizer

In [None]:
# Load the tokenizer that belongs to the base checkpoint configured in `hf_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)

Let's see how the tokenizer works:

In [None]:
# Take one sentence pair from the training split as a small tokenizer demo.
s1, s2 = (raw_dataset_train[4][column] for column in ('Original', 'Paraphrase'))
print(s1, s2, sep='\n')

# Tokenize the original as the input and the paraphrase as the target.
inputs = tokenizer(s1, text_target=s2)
# Decode every input id on its own so the individual pieces become visible.
print([tokenizer.decode(token_id) for token_id in inputs['input_ids']])
inputs

Now we create a preprocess function that turns a dataset item into a form that the model can use for training.

In [None]:
# Maximum length (in tokens) handed to the tokenizer when truncating.
max_length = 128

# Task prefix prepended to every source sentence.
# NOTE(review): may have to be adjusted per language when training multilingually - unconfirmed.
prefix = 'paraphrase: '


def preprocess_function(examples):
    """Tokenize a batch of paraphrase pairs for seq2seq training.

    Args:
        examples: Batch with an 'Original' and a 'Paraphrase' column, each
            holding a list of sentences.

    Returns:
        The tokenizer output for the prefixed originals, with the paraphrases
        passed to the tokenizer as ``text_target``.
    """
    prefixed_sources = [prefix + sentence for sentence in examples['Original']]
    # Truncation will rarely trigger, but it guards against overly long sentences.
    return tokenizer(
        prefixed_sources,
        text_target=examples['Paraphrase'],
        max_length=max_length,
        truncation=True,
    )

Now we apply the preprocessing function to the datasets.

In [None]:
# Tokenize every split in the same way (train, then validation, then test).
# The raw text columns are dropped so that only the tokenizer output remains.
tokenized_ds_train, tokenized_ds_val, tokenized_ds_test = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (raw_dataset_train, raw_dataset_val, raw_dataset_test)
)

Now the data is ready.

## Model and Training Preparation

Next, the model and a Datacollator.

In [None]:
from transformers import MT5ForConditionalGeneration

Either load the pretrained model from Hugging Face at the very beginning of fine-tuning (i.e. for the first epoch), or load the model from a previous fine-tuning checkpoint stored on Google Drive.

In [None]:
# When continuing a previous run, a checkpoint path is mandatory. Fail early with
# a clear message instead of letting `from_pretrained('')` raise an obscure error.
if not initial_finetuning and not drive_checkpoint:
    raise ValueError(
        "initial_finetuning is False, but drive_checkpoint is empty; "
        "set drive_checkpoint to the checkpoint directory to continue from."
    )

# First run: start from the pretrained Hugging Face weights.
# Later runs: start from the checkpoint saved on Google Drive.
model = MT5ForConditionalGeneration.from_pretrained(
    hf_checkpoint if initial_finetuning else drive_checkpoint
)

Next, we instantiate a DataCollator.

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the trainer;
# it receives both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Here, I'll skip the example usage of the datacollator, check it out [here](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt#data-collation).

Now, let's continue with metrics. We will use Parascore.

In [None]:
from parascore import ParaScorer

# Single scorer instance created with lang='multi'; it is reused for the demo
# below and inside `compute_metrics`.
scorer = ParaScorer(lang='multi')

Let's quickly go over how Parascore is used (this example is in English, so it's not ideal):

In [None]:
# Candidate paraphrases, aligned index by index with `sources` below.
cands = [
    "A young person is skating.",
    "I like sports.",
    "He catches the ball.",
    "That's very interesting!",
]
# The sentences the candidates are scored against.
sources = [
    "There's a child on a skateboard.",
    "I like to relax.",
    "good morning, everyone!",
    "I find this interesting.",
]
score = scorer.free_score(cands, sources)
# The last element of the result holds the per-pair scores; report their mean.
float(score[-1].mean())

Now, here's the `compute_metrics` function (mostly copied from [here](https://huggingface.co/learn/nlp-course/chapter7/4?fw=pt#metrics)):

In [None]:
import numpy as np

In [None]:
def compute_metrics(eval_preds):
    """Compute the mean ParaScore of the generated paraphrases.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays as handed over
            by the trainer. ``preds`` may itself be a tuple, in which case
            only its first element is used.

    Returns:
        Dict with the single key ``'parascore'`` holding the mean score as a
        float.
    """
    preds, labels = eval_preds
    # In case the model returns more than the predictions
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded, so map it to
    # the pad token first. Predictions get the same treatment as labels,
    # because the trainer may pad gathered predictions with -100 as well.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    # Print a few pairs so the generation quality can be eyeballed per evaluation.
    print(decoded_preds[:5])
    print(decoded_labels[:5])

    parascore = scorer.free_score(decoded_preds, decoded_labels)
    return {'parascore': float(parascore[-1].mean())}
    

In [None]:
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

In [None]:


args = Seq2SeqTrainingArguments(
    # Checkpoints are written here (on Google Drive); at most five are kept.
    output_dir='/content/drive/MyDrive/models/multi-para-all',
    save_strategy='epoch',
    save_total_limit=5,
    # Evaluate once per epoch, using generated text for the metric.
    # NOTE(review): no generation_max_length is set, so generation during evaluation
    # presumably falls back to the model config's default length - confirm that is
    # long enough for the paraphrases.
    evaluation_strategy='epoch',
    predict_with_generate=True,
    # Log every 500 steps.
    logging_strategy='steps',
    logging_steps=500,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# Wire the model, the training arguments, the tokenized splits, the collator and the metric together.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_ds_train,
    eval_dataset=tokenized_ds_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning.
# NOTE(review): no resume_from_checkpoint is passed, so when continuing from `drive_checkpoint`
# presumably only the model weights are reused - confirm that a fresh optimizer/scheduler is intended.
trainer.train()