# **Low-resource Machine Translation using mBART50**

In [1]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m399.4/510.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/290.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90

In [2]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
from transformers import (
    MBart50TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# %cd /content/drive/MyDrive

## **Dataset**

{ "en": "Rachel Pike : The science behind a climate headline", "vi": "Khoa học đằng sau một tiêu đề về khí hậu" }

In [26]:
class NMTDataset(Dataset):
    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg
        self.src_texts, self.tgt_texts = self.read_data(data_type)

        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        self.labels = self.texts_to_sequences(self.tgt_texts, is_labels=True)


    def __len__(self):
        return np.shape(self.src_texts)[0]

    def read_data(self, data_type):
        data = load_dataset(
            "mt_eng_vietnamese",
            "iwslt2015-en-vi",
            split=data_type
        )
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]

        return src_texts, tgt_texts

    def texts_to_sequences(self, text, is_labels=False):
        data_inputs = self.cfg.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=self.cfg.max_len,
                return_tensors='pt'
        )
        if is_labels:
            labels = []
            for label in data_inputs.input_ids:
                label = [l if l != cfg.tokenizer.pad_token_id else - 100 for l in label]
                labels.append(label)
            return torch.tensor(labels)

        return data_inputs.input_ids
    def __getitem__(self, idx):
        return {
            "input_ids": self.src_input_ids[idx],
            "labels": self.labels[idx]
        }


In [4]:
data = load_dataset(
    "mt_eng_vietnamese",
    "iwslt2015-en-vi"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/181k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [8]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
sentence = "tôi đi học"

In [9]:
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

In [10]:
tokenizer(sentence) # 250004 ma cho english

{'input_ids': [250004, 2259, 2467, 2546, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [11]:
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="vi_VN")

In [12]:
tokenizer(sentence) # 250024 ma cho VietNam , neu khong set mac dinh cho

{'input_ids': [250024, 2259, 2467, 2546, 2], 'attention_mask': [1, 1, 1, 1, 1]}

In [None]:
class NMTDataset(Dataset):
    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.src_texts, self.tgt_texts = self.read_data(data_type)

        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        self.labels = self.texts_to_sequences(self.tgt_texts)

    def read_data(self, data_type):
        data = load_dataset(
            "mt_eng_vietnamese",
            "iwslt2015-en-vi",
            split=data_type
        )
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]
        return src_texts, tgt_texts

    def texts_to_sequences(self, texts):
        data_inputs = self.cfg.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.max_len,
            return_tensors='pt'
        )
        return data_inputs.input_ids

    def __getitem__(self, idx):
        return {
            "input_ids": self.src_input_ids[idx], # attention mask chi dung khi co su ket hop giua cac text
            "labels": self.labels[idx]
        }

    def __len__(self):
        return np.shape(self.src_input_ids)[0]

## **Config**

In [13]:
class BaseConfig:
    """ base Encoder Decoder config """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

class NMTConfig(BaseConfig):
    # Data
    src_lang = 'en'
    tgt_lang = 'vi'
    max_len = 75
    add_special_tokens = True

    # Model
    model_name = "facebook/mbart-large-50-many-to-many-mmt"

    # Training
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 5e-5
    train_batch_size = 16
    eval_batch_size = 16
    num_train_epochs = 2
    save_total_limit = 1
    ckpt_dir = f'./mbart50-{src_lang}-{tgt_lang}'
    eval_steps = 1000

    # Inference
    beam_size = 5

cfg = NMTConfig()

## **Tokenizer, Model, Metric**

In [14]:
# tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name, src_lang="en_XX",tgt_lang = "vi_VN")
cfg.tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_name)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):

    preds, labels = eval_preds # preds : bs x Seq_len , labels : bs x Seq_len

    if isinstance(preds, tuple):
        preds = preds[0]

    preds = np.where(preds != -100, preds, cfg.tokenizer.pad_token_id)
    labels = np.where(labels != -100, preds, cfg.tokenizer.pad_token_id)

    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds , decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # Dua vao BLEU

    bleu_score = metric.compute(decoded_preds, decoded_labels)

    result = {'blue' : blue_score['score']}

    return result



In [15]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds= np.where(preds != -100, preds, cfg.tokenizer.pad_token_id)
    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    labels= np.where(labels != -100, labels, cfg.tokenizer.pad_token_id)
    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != cfg.tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## **Training**

In [27]:
test_dataset = NMTDataset(cfg, data_type="test")

In [28]:
next(iter(test_dataset))

{'input_ids': tensor([250004,  14847,     87,    509,  10176,      6,      4,     87,  17569,
            759,  23295,    509,     70,   2965,     98,     70,  23208,      6,
              4,    136,     87,   3514,    434,   1257,   5367,    214,     10,
          11531,  35839,    619,  41502,     74, 182747,    717,    357,   3033,
              6,      5,    619,  41502,     74,      2,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1]),
 'labels': tensor([250004,  16584,   2259,   3531,  14162,      6,      4,  14343,  14290,
          10371,  38262,  55487,  86630, 111226,    580,  11472,   3042,   6718,
           2671,   2479,   3061,   7385,    544,   2259,   6840,  21780,   9031,
            619,  41502,     74,  33756,    308,  2

In [29]:
train_dataset = NMTDataset(cfg, data_type="train")
valid_dataset = NMTDataset(cfg, data_type="validation")
test_dataset = NMTDataset(cfg, data_type="test")

In [30]:
next(iter(train_dataset))

{'input_ids': tensor([250004, 127055,  66937,     13,    152,    581,  41664,  50155,     10,
         153552,  10336,   2256,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1]),
 'labels': tensor([250004,  67766,   2546, 218877,    858,    889,  10037,   6248,   1893,
          17964,  42254,      2,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,   -100,
           -100,   -100,   -100,   -100,   -100,   

In [None]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    save_strategy='steps',
    save_steps=cfg.eval_steps,
    eval_steps=cfg.eval_steps,
    output_dir=cfg.ckpt_dir,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    learning_rate=cfg.learning_rate,
    save_total_limit=cfg.save_total_limit,
    num_train_epochs=cfg.num_train_epochs,
    load_best_model_at_end=True,
)

data_collator = DataCollatorForSeq2Seq(
    cfg.tokenizer,
    model=model
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=cfg.tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss,Validation Loss,Bleu,Gen Len
1000,0.5019,0.599626,31.8265,32.3656
2000,0.4932,0.581667,32.6807,32.4169
3000,0.4845,0.572604,32.8764,32.5887
4000,0.4846,0.561723,33.0896,32.8487
5000,0.4733,0.555238,33.7726,32.6627
6000,0.4711,0.551521,33.6935,32.7289
7000,0.4648,0.545575,33.7381,32.9535
8000,0.4536,0.53477,34.2136,32.9567
9000,0.3668,0.546183,34.0964,32.6887
10000,0.366,0.546823,33.7523,32.8022


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=16666, training_loss=0.4283612245673909, metrics={'train_runtime': 10167.3268, 'train_samples_per_second': 26.225, 'train_steps_per_second': 1.639, 'total_flos': 4.23218857033728e+16, 'train_loss': 0.4283612245673909, 'epoch': 2.0})

In [None]:
prediction = trainer.predict(test_dataset)

In [None]:
prediction

PredictionOutput(predictions=array([[     2, 250004,  16584, ...,      1,      1,      1],
       [     2, 250004,  23598, ...,      1,      1,      1],
       [     2, 250004,  71717, ...,      1,      1,      1],
       ...,
       [     2, 250004,  14343, ...,      1,      1,      1],
       [     2, 250004, 131785, ...,      1,      1,      1],
       [     2, 250004,      2, ...,      1,      1,      1]]), label_ids=array([[250004,  16584,   2259, ...,      1,      1,      1],
       [250004,  14343,   1408, ...,      1,      1,      1],
       [250004,  71717,   4373, ...,      1,      1,      1],
       ...,
       [250004,  14343,   1274, ...,      1,      1,      1],
       [250004, 131785,  43209, ...,      1,      1,      1],
       [250004,      2,      1, ...,      1,      1,      1]]), metrics={'test_loss': 0.5306801795959473, 'test_bleu': 34.8714, 'test_gen_len': 32.792, 'test_runtime': 134.5483, 'test_samples_per_second': 9.432, 'test_steps_per_second': 0.595})

In [None]:
def inference(
    text,
    tokenizer,
    model,
    device="cpu",
    max_length=75,
    beam_size=5
    ):
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
        )
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    model.to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        early_stopping=True,
        num_beams=beam_size,
        length_penalty=2.0
    )

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return output_str

In [None]:
sentence = 'i go to school'
inference(sentence, cfg.tokenizer, model)

['tôi đi học.']

## **Checkpoint**
https://drive.google.com/drive/folders/1ii_lPm2-1CfIhQM8RVzLgTHMxXDKgnk4?usp=sharing