# **Fine-tuning mBART50 for En-Vi Machine Translation**

## I. Datasets

In [None]:
# install libs
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import Dataset
from datasets import load_dataset

# Load the En-Vi corpus; later cells read its "train", "eval" and "test" splits.
ds = load_dataset("Tohrumi/iwslt15_en-vi_10k")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/605 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.50M [00:00<?, ?B/s]

eval-00000-of-00001.parquet:   0%|          | 0.00/154k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/188k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

In [None]:
# Pull the raw "translation" column out of each split, in train / eval / test order.
train_ds, val_ds, test_ds = (
    ds[split_name]["translation"] for split_name in ("train", "eval", "test")
)

In [None]:
def _to_en_vi_dataset(pairs):
    """Turn an iterable of {"en": ..., "vi": ...} records into a two-column Dataset."""
    english_side = [pair["en"] for pair in pairs]
    vietnamese_side = [pair["vi"] for pair in pairs]
    return Dataset.from_dict({"en": english_side, "vi": vietnamese_side})


train_ds = _to_en_vi_dataset(train_ds)
val_ds = _to_en_vi_dataset(val_ds)
test_ds = _to_en_vi_dataset(test_ds)

# One dataset summary per line, in train / val / test order.
print(train_ds, val_ds, test_ds, sep="\n")

Dataset({
    features: ['en', 'vi'],
    num_rows: 10000
})
Dataset({
    features: ['en', 'vi'],
    num_rows: 1000
})
Dataset({
    features: ['en', 'vi'],
    num_rows: 1268
})


In [None]:
# Sanity check: show the Python type of each rebuilt split, one per line.
print(type(train_ds), type(val_ds), type(test_ds), sep="\n")

<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>
<class 'datasets.arrow_dataset.Dataset'>


## II. Tokenizer

In [None]:
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint that is fine-tuned in this notebook.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Tokenizer loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

## III. Encoder

In [None]:
import torch

MAX_LEN = 75
SRC_LANG = "en_XX"  # mBART-50 language code for English (source side)
TGT_LANG = "vi_VN"  # mBART-50 language code for Vietnamese (target side)


def preprocess_function(examples):
    """Tokenize a batch of En-Vi sentence pairs for seq2seq fine-tuning.

    Args:
        examples: Batch dict holding parallel lists of strings under the
            keys "en" (source) and "vi" (target).

    Returns:
        A dict of plain Python lists with three keys:
        - "input_ids": source token ids, padded/truncated to MAX_LEN.
        - "attention_mask": 1 for real source tokens, 0 for padding.
        - "labels": target token ids, padded/truncated to MAX_LEN, with
          padding positions replaced by -100.
    """
    # mBART-50 prefixes each sequence with a language code. Set both sides
    # explicitly so the targets get the Vietnamese code instead of the
    # tokenizer's source-language code.
    tokenizer.src_lang = SRC_LANG
    tokenizer.tgt_lang = TGT_LANG

    # `text_target` tokenizes the Vietnamese side in target mode and returns
    # it under "labels"; the padding/truncation settings apply to both sides.
    encoded = tokenizer(
        examples["en"],
        text_target=examples["vi"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN,
    )

    # Replace padding ids in the labels with -100 so padded positions are
    # not treated as targets.
    pad_id = tokenizer.pad_token_id
    labels = [
        [-100 if token_id == pad_id else token_id for token_id in label]
        for label in encoded["labels"]
    ]

    # Return the attention mask as well: without it the collator cannot tell
    # the max-length padding apart from real tokens. Plain lists suffice here;
    # `Dataset.map` stores them and the collator builds the tensors.
    return {
        "input_ids": encoded["input_ids"],
        "attention_mask": encoded["attention_mask"],
        "labels": labels,
    }

# Tokenize the training and validation splits in batches (train first, then val).
preprocessed_train_ds, preprocessed_val_ds = (
    split.map(preprocess_function, batched=True) for split in (train_ds, val_ds)
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## IV. Model

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Same checkpoint id as used for the tokenizer above in this notebook.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Load the pretrained seq2seq model weights for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

## V. Evaluation

In [None]:
import numpy as np
import evaluate
# SacreBLEU metric object; `compute_metrics` reads its "score" field as BLEU.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the BLEU metric.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        Tuple ``(preds, labels)``: ``preds`` is a list of stripped strings;
        ``labels`` is a list of single-element lists, one stripped reference
        per prediction.
    """
    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(pred.strip())

    wrapped_labels = []
    for label in labels:
        # Each reference is wrapped in its own list (one reference per prediction).
        wrapped_labels.append([label.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. If ``preds``
            is a tuple, only its first element is used.

    Returns:
        Dict ``{"bleu": <score>}`` taken from the metric's "score" field.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    def _ids_to_text(token_ids):
        # -100 is not a valid token id; swap it for the pad id before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(
            token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )

    decoded_preds, decoded_labels = postprocess_text(
        _ids_to_text(preds), _ids_to_text(labels)
    )

    scores = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": scores["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## VI. Trainer

In [None]:
import os
# Keep Weights & Biases enabled; `report_to="wandb"` below routes the logs there.
os.environ["WANDB_DISABLED"] = "false"

from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq,Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mbart50",
    logging_dir="logs",
    logging_steps=1000,
    # Run generate() during evaluation so compute_metrics receives token ids to score.
    predict_with_generate=True,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_total_limit=1,
    num_train_epochs=1,
    load_best_model_at_end=True,
    # Select the "best" checkpoint by the BLEU value returned from
    # compute_metrics (higher is better) rather than by eval loss.
    metric_for_best_model="bleu",
    greater_is_better=True,
    report_to="wandb",
)

print("Output directory:", training_args.output_dir)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_val_ds,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Explicitly write the final (best) model to the output directory so the
# message below is true even apart from the intermediate step checkpoints.
trainer.save_model(training_args.output_dir)
print("Model saved to:", training_args.output_dir)


Output directory: ./en-vi-mbart50


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mngdangkhanh04[0m ([33mngdangkhanh04-hanoi-university-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Bleu
1000,1.4812,1.424718,29.476099
2000,1.4322,1.390511,30.047066
3000,1.3903,1.336715,30.336105
4000,1.3101,1.274082,31.750771
5000,1.2986,1.244979,32.302603


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Model saved to: ./en-vi-mbart50


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then read the Hugging Face
# token from the environment (None when HF_TOKEN is not set).
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

In [None]:
# Upload the trained model to the Hugging Face Hub, authenticating with the
# token read from the environment (`hf_token`).
trainer.push_to_hub(token=hf_token)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.43k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ngdangkhanh/en-vi-mbart50/commit/4e68420f1088b47da42f834771b528dca2d6e346', commit_message='End of training', commit_description='', oid='4e68420f1088b47da42f834771b528dca2d6e346', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ngdangkhanh/en-vi-mbart50', endpoint='https://huggingface.co', repo_type='model', repo_id='ngdangkhanh/en-vi-mbart50'), pr_revision=None, pr_num=None)

## VII. Inference

In [None]:
# Hub repo id of the fine-tuned model (the repo the push_to_hub output above reports).
model_name = "ngdangkhanh/en-vi-mbart50"
# NOTE: rebinds the notebook-wide `tokenizer` and `model` to the fine-tuned versions.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

### Greedy search

In [None]:
src_text = "Your jupyter notebook connection has expired. Please refresh your browser window if you wish to use the notebook again."
encoded_text = tokenizer(src_text, return_tensors="pt")
# Request greedy decoding explicitly: with no arguments, generate() follows the
# decoding defaults stored with the checkpoint, which may not be greedy.
# This mirrors the explicit settings used in the pipeline call further below.
generated_tokens = model.generate(
    **encoded_text,
    num_beams=1,
    do_sample=False,
)
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['Liên kết của bạn với máy tính xách tay đã hết thời gian . Hãy làm mới cửa sổ trình duyệt của bạn nếu bạn muốn sử dụng máy tính xách tay một lần nữa .']

### Beam search

In [None]:
src_text = "In the next step, we consider the next possible tokens for each of the three branches we created in the previous step."
# Tokenize the single source sentence into PyTorch tensors.
encoded_text = tokenizer(src_text, return_tensors="pt")
# Beam search with 5 beams.
generated_tokens = model.generate(
    **encoded_text,
    num_beams=5,
)
# Decode the generated ids to text, dropping special tokens; shown as the cell output.
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

['Trong bước tiếp theo , chúng tôi xem xét các biểu tượng tiếp theo cho mỗi 3 bộ phận mà chúng tôi tạo ra trong bước trước .']

### Pipeline

In [None]:
from transformers import pipeline

# Build a pipeline from the fine-tuned Hub repo. No task is passed explicitly.
# NOTE(review): the outputs below use the 'generated_text' key — confirm which task is inferred.
translator = pipeline(model="ngdangkhanh/en-vi-mbart50")

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
# Single beam with sampling disabled, i.e. greedy decoding through the pipeline.
translated_text = translator("You are my sunshine", num_beams=1, do_sample=False)
translated_text

[{'generated_text': 'Bạn là ánh nắng của tôi'}]

In [None]:
# Same pipeline, this time decoding with 2 beams.
translated_text = translator("I am from Nghe An Province", num_beams=2)
translated_text

[{'generated_text': 'Tôi đến từ tỉnh Nghe An'}]