# Machine Translation & Multilingual NLP

## Goal of the session

**Part I - Machine Translation**: you are going to train a small encoder-decoder from scratch on English-French data. Use backtranslation and iterative backtranslation to improve your results.

**Part II - Multilingual NLP**: you are going to compare CamemBERT & RoBERTa on the French XNLI test set. Fine-tune your previous MT models on in-domain data (similarly to what was presented in the slides) to improve the RoBERTa results.

❗❗❗ SELECT A GPU HARDWARE ❗❗❗

# Part I - Machine Translation

### Requirements

Install packages & download data

In [None]:
!pip install transformers evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m751.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, multiprocess, datasets, 

In [None]:
!npm install -g github-files-fetcher
!fetcher --url=https://github.com/multi30k/dataset/tree/master/data/task1/raw

In [None]:
!gunzip raw/*
!rm raw/*de raw/*cs

In [None]:
!echo raw/*

raw/test_2016_flickr.en raw/test_2016_flickr.fr raw/test_2017_flickr.en raw/test_2017_flickr.fr raw/test_2017_mscoco.en raw/test_2017_mscoco.fr raw/test_2018_flickr.en raw/test_2018_flickr.fr raw/train.en raw/train.fr raw/val.en raw/val.fr


### Train a Transformer encoder-decoder on Multi30k English => French.

**Config**: 4 layers - 4 heads - hidden_dim 128 - feedforward_dim 256

In [None]:
# Imports
import math
import os
import sys
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [None]:
## TODO: Write a dataset class. Use test_2016_flickr as test set

class MyDataset(Dataset):
  """Exercise skeleton for the parallel-text dataset stored under ``./raw/``.

  The constructor only records the data directory and loads the
  ``xlm-roberta-base`` tokenizer; reading the sentences of the requested
  ``split`` is left to be completed. The notebook instantiates it with
  ``split`` set to "train", "val" and "test".
  """

  def __init__(self, split: str) -> None:
    super(MyDataset, self).__init__()

    # Directory the data files were downloaded and decompressed into.
    self.datapath = "./raw/"
    # Tokenizer shared by source and target sentences.
    self.tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    ## To complete

  def __len__(self):
    """Return the number of examples (not implemented yet)."""
    ## To complete: size must be < 4000
    ...

  def __getitem__(self, item):
    """Return one example (not implemented yet).

    Intended contract, per the exercise statement: a dict of torch.Tensor
    with the two keys 'src' and 'tgt'.
    """
    ## To complete: Must return a dict of torch.Tensor with two keys 'src' and 'tgt'
    ...


In [None]:
## TODO: Write a collate_fn to pad your batch - Have a look at section 2 pad_sequence here: https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html

def collate_fn(batch):
  """Merge a list of dataset items into one padded batch (not implemented yet).

  Intended contract, per the exercise statement: return a dict with the two
  keys 'src' and 'tgt', each holding a padded torch.Tensor. It is passed as
  ``collate_fn`` to every DataLoader built in this notebook's Part I.
  """
  ## To complete: Must return a batch dict of padded torch.Tensor with two keys 'src' and 'tgt'
  ...

In [None]:
# One dataset per split, built in the order train / val / test.
train_data, dev_data, test_data = (
    MyDataset(split=split_name) for split_name in ("train", "val", "test")
)

# Settings common to the three loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=16, num_workers=2, collate_fn=collate_fn)

train_loader = DataLoader(train_data, shuffle=True, **_loader_kwargs)
dev_loader = DataLoader(dev_data, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_data, shuffle=False, **_loader_kwargs)

In [None]:
## TODO: Model definition with the following config: 4 layers - 4 heads - hidden_dim 128 - feedforward_dim 256
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first input.

    The encoding table is registered as a non-trainable buffer of shape
    (max_len, 1, d_model): it is indexed by the first dimension of the input
    and broadcast over the second one.

    Args:
        d_model: size of the last (feature) dimension; odd values are supported.
        dropout: dropout probability applied after the encodings are added.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so the angle table is trimmed to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # Insert the broadcast dimension: (max_len, d_model) -> (max_len, 1, d_model).
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        ``x`` is indexed by position along its first dimension, so it must be
        laid out as (seq_len, batch, d_model) with seq_len <= max_len.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MyModel(nn.Module):
  """Exercise skeleton for the encoder-decoder translation model.

  Target configuration given by the exercise: 4 layers - 4 heads -
  hidden_dim 128 - feedforward_dim 256. The ``...`` arguments and the
  ``forward`` body are still to be completed.
  """

  def __init__(self):
    # nn.Module has to be initialised before sub-modules can be assigned to it.
    super().__init__()

    # PositionalEncoding is the class defined in this notebook; torch.nn does not provide one.
    self.positional_encodings = PositionalEncoding(...) # To complete
    self.embeddings = nn.Embedding(...) # To complete (have a look at the number of tokens in the tokenizer)
    self.transformer = nn.Transformer(...) # To complete

  def forward(self, x: torch.Tensor, y: Optional[torch.Tensor] = None):
    """Run the model on source ``x`` and, when given, target ``y`` (not implemented yet)."""
    # To complete: Don't forget to build masks! See here for more details: https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html
    ...

model = MyModel()
# Total number of scalar weights across all parameter tensors.
n_params = sum(p.numel() for p in model.parameters())
print(f"Transformer with {n_params} parameters")  # Print num params

In [None]:
## TODO: Build optimizer and training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = optim.Adam(...) # To complete
loss_fn = nn.CrossEntropyLoss(...)  # To complete: don't forget to ignore the padding index!

NUM_EPOCH = 5

for epoch in range(NUM_EPOCH):
  print(f"Epoch {epoch}")

  # Train
  model.train()
  # Loss histories are reset every epoch, so the printed means are per-epoch.
  train_loss, dev_loss = [], []
  for batch_idx, batch in enumerate(train_loader, 1):

    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass: To complete
    ...

    # Backward pass: To complete
    ...

    # Don't forget to add the loss to train_loss
    if not batch_idx % 10:
      print(f"Epoch {epoch} - Batch idx {batch_idx} - Train loss {np.mean(train_loss)}")

  # Val
  model.eval()
  # The validation loader is created under the name dev_loader.
  for batch in dev_loader:

    batch = {k: v.to(device) for k, v in batch.items()}

    # Eval pass: To complete
    with torch.no_grad():
      ...

    # Don't forget to add the loss to dev_loss

  print(f"Epoch {epoch} - Dev set - Loss {np.mean(dev_loss)}")


In [None]:
## TODO: Find best hyperparameters on dev set for decoding (top-k, temperature)
## Vary top-k and temperature to decode sentences
## Have a look at https://huggingface.co/spaces/evaluate-metric/bleu for using BLEU score
## Don't forget to detokenize output sequence before feeding it to BLEU

import evaluate
bleu = evaluate.load("bleu")

# For every temperature, the list of per-batch probability tensors.
output_logits = {t: [] for t in [0.1, 1.0, 10.0]}  # Different temperature values

# Evaluation mode only needs to be set once, not on every batch.
model.eval()
# The validation loader is created under the name dev_loader.
for batch in dev_loader:

  batch = {k: v.to(device) for k, v in batch.items()}

  with torch.no_grad():
    logits = model(...).detach().cpu() # To complete

    for t in output_logits.keys():
      probs = ... # To complete: turn logits into probabilities with temperature t
      output_logits[t].append(probs)


# Load references from MyDataset
references = [...]  # To complete

# Test different top-k values
# topk_bleu_scores[k][t] holds the BLEU score obtained with top-k = k and temperature = t.
topk_bleu_scores = {k: {t: "" for t in output_logits.keys()} for k in [1, 5, 10]}
for k in [1, 5, 10]:
  for t in output_logits.keys():

    translations = []
    # Sample with top-k value
    for probs in output_logits[t]:

      decoded_translation = ... # To complete: decode translation
      translations.append(decoded_translation)

    # Compute BLEU score
    bleu_score = ... # To complete
    topk_bleu_scores[k][t] = bleu_score

In [None]:
## TODO: Compute BLEU on test set using the selected hyperparameters

### BackTranslation: Train a Transformer encoder-decoder on Multi30k French => English. Add test_2017_flickr and test_2018_flickr as the backtranslated data (using previous model)

**Config**: 4 layers - 4 heads - hidden_dim 128 - feedforward_dim 256

In [None]:
## TODO: use previous model to create synthetic Fr => En parallel data
# NOTE(review): the loop below decodes the 'src' side into en_data and stores
# the model outputs in fr_data, so the loader presumably has to provide the
# English side of test_2017 / test_2018 as 'src' - confirm against the
# "French data" wording on the next line.
additional_training_data = MyDataset(...)  # To complete to load test_2017 and test_2018 French data - Modify MyDataset class if necessary
additional_loader = DataLoader(additional_training_data, batch_size=16, num_workers=2, collate_fn=collate_fn)
tokenizer = additional_training_data.tokenizer

# Parallel lists: fr_data receives the generated translations, en_data the decoded source sentences.
fr_data, en_data = [], []
for batch in additional_loader:

  batch = {k: v.to(device) for k, v in batch.items()}

  en_data += [...]  # To complete: decode src sentences with tokenizer

  model.eval()
  with torch.no_grad():
    logits = model(...).detach().cpu() # To complete

    probs = ... # To complete: turn logits into probabilities with temperature t chosen in previous cells

    # Sample with top-k chosen in previous cells - To complete
    for prob in probs:
      translation = ...
      fr_data.append(...)

In [None]:
## TODO: train a model for Fr => En and evaluate (similar to previous block)

### Iterative BackTranslation:

- Iterative Backtranslation consists of training two models src <=> tgt simultaneously. While the first one trains, the second one is frozen and provides backtranslated data. At the end of the epoch, the first one becomes frozen and provides the backtranslated data while the second one trains.


Train simultaneously two Transformer encoder-decoder on Multi30k French => English & English => French. Add test_2017_flickr and test_2018_flickr as the backtranslated data

**Config**: 4 layers - 4 heads - hidden_dim 128 - feedforward_dim 256

In [None]:
model = MyModel(...)  # To complete
reverse_model = MyModel(...)  # To complete

# First iteration - no backtranslated data
## TODO: Build optimizer and training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
reverse_model.to(device)

optimizer = optim.Adam(...) # To complete
rev_optimizer = optim.Adam(...) # To complete
loss_fn = nn.CrossEntropyLoss(...)  # To complete: don't forget to ignore the padding index!

# This cell is a single pass: define the index used by the log messages below
# instead of relying on a value left over from an earlier cell.
epoch = 0

# Train
model.train()
# Both models have an optimizer and a loss history here, so both are put in training mode.
reverse_model.train()
# One independent list per tracked loss.
train_loss, dev_loss, rev_train_loss, rev_dev_loss = [], [], [], []
for batch_idx, batch in enumerate(train_loader, 1):

  batch = {k: v.to(device) for k, v in batch.items()}

  # Forward pass: To complete
  ...

  # Backward pass: To complete
  ...

  # Don't forget to add the loss to train_loss
  if not batch_idx % 10:
    print(f"Epoch {epoch} - Batch idx {batch_idx} - En => Fr Train loss {np.mean(train_loss)}")
    print(f"Epoch {epoch} - Batch idx {batch_idx} - Fr => En Train loss {np.mean(rev_train_loss)}")

# Val
model.eval()
reverse_model.eval()
# The validation loader is created under the name dev_loader.
for batch in dev_loader:

  batch = {k: v.to(device) for k, v in batch.items()}

  # Eval pass: To complete
  with torch.no_grad():
    ...

  # Don't forget to add the loss to dev_loss

print(f"Epoch {epoch} - Dev set - En => Fr Loss {np.mean(dev_loss)}")
print(f"Epoch {epoch} - Dev set - Fr => En Loss {np.mean(rev_dev_loss)}")


In [None]:
## TODO: Iterate
# Number of back-translation rounds to run.
NUM_ITERATIONS = 5

# Per the exercise statement: in each round one model trains while the other
# is frozen and provides the back-translated data, then the roles are swapped.
for _iter in range(NUM_ITERATIONS):
  ...  # To complete, don't forget to freeze one model when training the other

In [None]:
## TODO: Eval on test_2016 & compare the three methods, which one gives the best BLEU score?

# Part II: Multilingual NLP

### Requirements

Install packages

In [None]:
!pip install transformers[sentencepiece] evaluate datasets sentencepiece accelerate peft sacremoses
!pip install -U sentence-transformers

  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=a3b11dacafac00f082f8363066b3d346542d9f73458f30134bf34fc5b94f44e7
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import accelerate
import peft
from tqdm import tqdm

### Download translation models & classification models & their tokenizers

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fr => En: tokenizer and model used to translate the French test set.
fren_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
translation_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# En => Fr: tokenizer and model used for back-translation.
enfr_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
back_translation_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

# Move both models to the selected device; the last call stays the cell's final expression.
translation_model.to(device)
back_translation_model.to(device)

In [None]:
# Sequence-classification checkpoint "mtheo/camembert-base-xnli" and its matching tokenizer.
camembert_nli = AutoModelForSequenceClassification.from_pretrained("mtheo/camembert-base-xnli")
camembert_tokenizer = AutoTokenizer.from_pretrained("mtheo/camembert-base-xnli")

# Final expression of the cell: moves the model to `device` and echoes its architecture.
camembert_nli.to(device)

tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/354 [00:00<?, ?B/s]

CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

In [None]:
from sentence_transformers import CrossEncoder
# English NLI cross-encoder; it scores (premise, hypothesis) pairs in a later cell.
roberta_nli = CrossEncoder('cross-encoder/nli-roberta-base')
# The underlying model is exposed as `.model`; move it to the selected device.
roberta_nli.model.to(device)

In [None]:
def _count_params(module):
  """Return the total number of elements over all parameters of `module`."""
  return sum(p.numel() for p in module.parameters())

print(f"Number of parameters of the translation model: {_count_params(translation_model)}")
print(f"Number of parameters of CamemBERT: {_count_params(camembert_nli)}")
print(f"Number of parameters of RoBERTa: {_count_params(roberta_nli.model)}")

Number of parameters of the translation model: 75133952
Number of parameters of CamemBERT: 110624259
Number of parameters of RoBERTa: 124647939


### Load data

In [None]:
# Each call loads a single parquet file of the "xnli" dataset. The result is
# indexed with "train" even for the test files - presumably because files
# passed through data_files land in the default split; confirm.
test_data_fr = load_dataset("xnli", data_files="fr/test-00000-of-00001.parquet")["train"]
test_data_en = load_dataset("xnli", data_files="en/test-00000-of-00001.parquet")["train"]
data_en = load_dataset("xnli", data_files="en/train-00000-of-00001.parquet")["train"]
data_fr = load_dataset("xnli", data_files="fr/train-00000-of-00001.parquet")["train"]

In [None]:
from torch.utils.data import DataLoader
# Batch size shared by the Part II loaders.
_BATCH_SIZE = 128

# No shuffle argument is passed, so the French test set is read in order.
test_loader_fr = DataLoader(test_data_fr, batch_size=_BATCH_SIZE, num_workers=2)

### Translate test_data_fr into English using translation_model

Use top-p sampling with p=0.8

In [None]:
# English translations of the French test set, kept in the same column layout.
test_data_en_translated_from_fr = {"premise": [], "hypothesis": [], "label": []}

pad_token_id = fren_tokenizer.pad_token_id
# Sampling settings shared by both text fields: top-p sampling with p=0.8,
# top-k disabled (0), temperature 0.6, no beam search.
generation_kwargs = dict(num_beams=1, do_sample=True, top_p=0.8, top_k=0,
                         temperature=0.6, pad_token_id=pad_token_id)

translation_model.eval()
for sample in tqdm(test_loader_fr):

  with torch.no_grad():
    # Premises are translated before hypotheses, one padded batch per field.
    for field in ("premise", "hypothesis"):
      inps = fren_tokenizer(sample[field], return_tensors="pt", padding=True)

      # Transfer to current device
      inps = {k: v.to(device) for k, v in inps.items()}

      # Compute translations
      tr_ids = translation_model.generate(**inps, **generation_kwargs)

      ## Detokenize
      test_data_en_translated_from_fr[field] += fren_tokenizer.batch_decode(tr_ids, skip_special_tokens=True)

  # Store labels as plain Python ints rather than extending the list with
  # 0-dim tensors; as_tensor accepts both a collated tensor and a list.
  test_data_en_translated_from_fr["label"] += torch.as_tensor(sample["label"]).tolist()


100%|██████████| 40/40 [01:31<00:00,  2.28s/it]


### Compute score of RoBERTa (translate-test baseline)

In [None]:
import time

# One (premise, hypothesis) tuple per translated test example.
inputs = list(zip(test_data_en_translated_from_fr["premise"],
                  test_data_en_translated_from_fr["hypothesis"]))

t0 = time.time()
logits = roberta_nli.predict(inputs)
print(f"Time to compute logits: {time.time() - t0}")

#label_mapping = ['contradiction', 'entailment', 'neutral']
# Maps the classifier's output index to the label id used by the ground truth.
labels_matching = {1: 0, 2: 1, 0: 2}
preds = np.array([labels_matching[lab] for lab in logits.argmax(axis=1)])
gt = np.array(test_data_en_translated_from_fr["label"])

Time to compute logits: 18.42674446105957


In [None]:
## Compute scores
accuracy = accuracy_score(gt, preds)
# The three per-class metrics all use the same "weighted" averaging.
_f1_score, recall, precision = (
    metric(gt, preds, average="weighted")
    for metric in (f1_score, recall_score, precision_score)
)

print(f"Translate test - Accuracy: {accuracy:.3f}")
print(f"Translate test - F1 Score: {_f1_score:.3f}")
print(f"Translate test - Recall: {recall:.3f}")
print(f"Translate test - Precision: {precision:.3f}")

print(confusion_matrix(gt, preds))

Translate test - Accuracy: 0.808
Translate test - F1 Score: 0.807
Translate test - Recall: 0.808
Translate test - Precision: 0.822
[[1143  389  138]
 [  80 1442  148]
 [  29  178 1463]]


### Compute score of CamemBERT

In [None]:
# Release cached, currently unused GPU memory before running the next model.
torch.cuda.empty_cache()

In [None]:
## TODO: Compute XNLI score on test_data_fr.
## Have a look here: https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertForSequenceClassification
## Tokenizer usage:
##    camembert_tokenizer(premise, hypothesis, return_tensors='pt', padding=True)

## Fine-tune the translation model using the method from Artetxe et al. (2023) described in class. We will focus on the domain adaptation method in the MT adaptation section.

To recap:
Back-translate the English training data into French and use the resulting parallel data to fine-tune the translation model using LoRA.

Recompute translations once done and recompute scores of translate-test baseline (RoBERTa).

In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: object exposing ``named_parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    The trainable percentage is reported as 0.00 when the model has no
    parameters at all, instead of dividing by zero.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio for parameter-free models.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [None]:
# Baseline counts for the translation model, before the LoRA adapters are added.
print_trainable_parameters(translation_model)

trainable params: 74609664 || all params: 75133952 || trainable%: 99.30


In [None]:
from peft import LoraConfig, get_peft_model

# LoRA settings: adapters of rank 16 (scaling lora_alpha=16, dropout 0.1) on
# the modules named "q_proj" and "v_proj"; bias terms are left out ("none").
config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
)
# NOTE(review): get_peft_model presumably wraps the same translation_model
# object rather than copying it - confirm before reusing the base model.
lora_translation_model = get_peft_model(translation_model, config)
print_trainable_parameters(lora_translation_model)

trainable params: 589824 || all params: 75723776 || trainable%: 0.78


### Back-translate the English training data into French using the back-translation model (`back_translation_model`)

In [None]:
### TODO: Back translation - reuse code from previous blocks

_BATCH_SIZE = 128
# Only the first 249 batches of the training set are back-translated.
_MAX_BATCHES = 249

train_loader_en = DataLoader(data_en, batch_size=_BATCH_SIZE // 2, num_workers=2, shuffle=False)
# French back-translations of the English training data, same column layout.
training_data_fr_translated_from_en = {"premise": [], "hypothesis": [], "label": []}

pad_token_id = enfr_tokenizer.pad_token_id
# Sampling settings shared by both text fields: top-p sampling with p=0.8,
# top-k disabled (0), temperature 0.6, no beam search.
generation_kwargs = dict(num_beams=1, do_sample=True, top_p=0.8, top_k=0,
                         temperature=0.6, pad_token_id=pad_token_id)

back_translation_model.eval()
# Passing the total lets tqdm show a real progress bar for the truncated run.
for batch_idx, sample in tqdm(enumerate(train_loader_en, 1), total=_MAX_BATCHES):

  if batch_idx > _MAX_BATCHES:
    break

  with torch.no_grad():
    # Premises are translated before hypotheses, one padded batch per field.
    for field in ("premise", "hypothesis"):
      inps = enfr_tokenizer(sample[field], return_tensors="pt", padding=True)

      # Transfer to current device
      inps = {k: v.to(device) for k, v in inps.items()}

      # Compute translations
      tr_ids = back_translation_model.generate(**inps, **generation_kwargs)

      ## Detokenize
      training_data_fr_translated_from_en[field] += enfr_tokenizer.batch_decode(tr_ids, skip_special_tokens=True)

  # Store labels as plain Python ints rather than extending the list with
  # 0-dim tensors; as_tensor accepts both a collated tensor and a list.
  training_data_fr_translated_from_en["label"] += torch.as_tensor(sample["label"]).tolist()


249it [08:57,  2.16s/it]


### Use the back-translated data to fine-tune translation_model (fr => en)

In [None]:
### TODO: fine-tune translation_model on the back-translated data. Reuse code from Part I (training loop)

### Recompute the translate-test results with the domain-adapted translation_model

In [None]:
### TODO: Recompute results - reuse code from previous blocks