In [None]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/542.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.0 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K 

In [None]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
from transformers import MBart50TokenizerFast, AutoModelForSeq2SeqLM , DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

class NMTDataset(Dataset):
    """Parallel-text dataset that yields tokenized source/target pairs.

    The configuration object ``cfg`` must provide:
      * ``src_lang`` / ``target_lang`` -- language keys of the corpus (e.g. "en", "fr"),
      * ``max_len`` -- maximum number of tokens kept per sentence,
      * ``tokenizer`` -- the tokenizer used to encode both sides,
      * ``download_mode`` (optional) -- forwarded to ``load_dataset``;
        defaults to ``'force_redownload'``.

    Sequences are truncated but NOT padded here: padding is left to the batch
    collator so that it can build a correct attention mask and mask label
    padding, instead of treating pre-padded positions as real tokens.
    """

    # Logical split name -> TED-talks release year that backs it.
    _SPLIT_YEARS = {"train": "2014", "validation": "2015", "test": "2016"}

    def __init__(self, cfg, data_type='train'):
        """Load the split named ``data_type`` and tokenize both sides of it."""
        self.cfg = cfg
        self.src_texts, self.target_texts = self.read_data(data_type)
        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        self.labels = self.texts_to_sequences(self.target_texts, is_target=True)

    def read_data(self, data_type):
        """Return ``(src_texts, target_texts)`` for the requested split.

        Raises:
            ValueError: if ``data_type`` is not "train", "validation" or "test".
        """
        if data_type not in self._SPLIT_YEARS:
            raise ValueError(
                f"Unknown data_type {data_type!r}; expected one of {sorted(self._SPLIT_YEARS)}"
            )

        src_lang, target_lang = self.cfg.src_lang, self.cfg.target_lang
        data = load_dataset(
            "ted_talks_iwslt",
            language_pair=(src_lang, target_lang),
            year=self._SPLIT_YEARS[data_type],
            trust_remote_code=True,
            split='train',  # every year is published as a single 'train' split
            download_mode=getattr(self.cfg, "download_mode", 'force_redownload'),
        )

        # Single pass over the data so each sample is read only once.
        src_texts, target_texts = [], []
        for sample in data:
            pair = sample['translation']
            src_texts.append(pair[src_lang])
            target_texts.append(pair[target_lang])

        return src_texts, target_texts

    def _lang_code(self, lang):
        """Resolve a short language key (e.g. "en") to a tokenizer language code.

        Returns ``None`` when the tokenizer exposes no ``lang_code_to_id`` table,
        in which case no language code needs to be set.

        Raises:
            ValueError: if the tokenizer has language codes but none (or several)
                match ``lang``.
        """
        known_codes = getattr(self.cfg.tokenizer, "lang_code_to_id", None)
        if not known_codes:
            return None
        if lang in known_codes:
            return lang

        matches = [code for code in known_codes if code.split("_", 1)[0] == lang]
        if len(matches) != 1:
            raise ValueError(
                f"Cannot resolve language {lang!r} to a unique tokenizer language code "
                f"(candidates: {matches})"
            )
        return matches[0]

    def texts_to_sequences(self, texts, is_target=False):
        """Tokenize ``texts`` into lists of token ids (truncated, not padded).

        Args:
            texts: list of sentences.
            is_target: when True the sentences are encoded as translation targets
                (``text_target``) with the target-language code; otherwise they
                are encoded as model inputs with the source-language code.

        Returns:
            A list with one list of token ids per sentence.
        """
        tokenizer = self.cfg.tokenizer
        encode_kwargs = dict(truncation=True, max_length=self.cfg.max_len)

        if is_target:
            code = self._lang_code(self.cfg.target_lang)
            if code is not None:
                tokenizer.tgt_lang = code
            encoded = tokenizer(text_target=texts, **encode_kwargs)
        else:
            code = self._lang_code(self.cfg.src_lang)
            if code is not None:
                tokenizer.src_lang = code
            encoded = tokenizer(texts, **encode_kwargs)

        return encoded.input_ids

    def __getitem__(self, idx):
        """Return the ``idx``-th example as ``{"input_ids": ..., "labels": ...}``."""
        return {"input_ids": self.src_input_ids[idx], "labels": self.labels[idx]}

    def __len__(self):
        """Number of sentence pairs in the split."""
        return len(self.src_input_ids)

In [None]:
class BaseConfig:
    """Base configuration object: keyword arguments become instance attributes."""

    def __init__(self, **kwargs):
        # Attributes set on the instance shadow same-named class-level defaults.
        for name in kwargs:
            setattr(self, name, kwargs[name])

class NMTConfig(BaseConfig):
    """Default settings for the translation fine-tuning run.

    All values are class-level defaults; any of them can be overridden per
    instance through keyword arguments handled by ``BaseConfig``.
    """

    # data
    src_lang = "en"
    target_lang = "fr"
    max_len = 100  # maximum number of tokens kept per sentence
    add_special_tokens = True

    # model
    model_name = "facebook/mbart-large-50-many-to-many-mmt"

    # training
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # Backward-compatible alias: this attribute was originally (mis)named `model`
    # although it holds the torch device, not a model.
    model = device

    # Fine-tuning a pretrained transformer needs a small step size; the previous
    # value of 0.1 makes the optimization diverge.
    learning_rate = 5e-5
    # NOTE(review): 128 sequences per device may not fit in accelerator memory
    # for a large checkpoint -- confirm on the target hardware.
    train_batch_size = 128
    eval_batch_size = 128
    num_train_epochs = 7
    save_total_limit = 1
    # Evaluated once, when the class body runs, from the class-level defaults above.
    ckpt_dir = f'./mbart50-{src_lang}-{target_lang}'
    eval_steps = 1000

    # inference
    beam_width = 5

# Build the config from its class-level defaults (no overrides are passed).
cfg = NMTConfig ()
# The tokenizer is stored on the config; the dataset and metric code read it as cfg.tokenizer.
# NOTE(review): no source/target language is passed to the tokenizer here --
# confirm its defaults match cfg.src_lang / cfg.target_lang.
cfg.tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name)
# Pretrained seq2seq model; it is handed to the data collator and the trainer later on.
model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_name)

# SacreBLEU metric, used by compute_metrics to score decoded predictions.
metric = evaluate.load('sacrebleu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
def postprocess_text(preds, labels):
    """Strip leading/trailing whitespace from every prediction and every label.

    Returns:
        A ``(preds, labels)`` tuple of new lists with the stripped strings.
    """
    def _strip_all(texts):
        return [text.strip() for text in texts]

    return _strip_all(preds), _strip_all(labels)

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for an evaluation pass.

    Args:
        eval_preds: a ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple only its first element is used.

    Returns:
        ``{"bleu": ..., "gen_len": ...}`` with both values rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    tokenizer = cfg.tokenizer
    pad_id = tokenizer.pad_token_id

    def _restore_and_decode(token_ids):
        # Swap the -100 sentinel for the pad id so the sequences can be decoded.
        restored = np.where(token_ids != -100, token_ids, pad_id)
        texts = tokenizer.batch_decode(
            restored,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        return restored, texts

    preds, decoded_preds = _restore_and_decode(preds)
    _, decoded_labels = _restore_and_decode(labels)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)['score']
    # Generated length = number of non-pad tokens in each prediction.
    gen_len = np.mean([np.count_nonzero(pred != pad_id) for pred in preds])

    return {"bleu": round(bleu, 4), "gen_len": round(gen_len, 4)}

In [None]:
# One dataset per split; each construction loads and tokenizes its data.
train_dataset = NMTDataset(cfg, 'train')
valid_dataset = NMTDataset(cfg, 'validation')
test_dataset = NMTDataset(cfg, 'test')

# Evaluate and checkpoint on the same step interval, and reload the best
# checkpoint once training finishes.
training_args = Seq2SeqTrainingArguments(
    output_dir=cfg.ckpt_dir,
    num_train_epochs=cfg.num_train_epochs,
    learning_rate=cfg.learning_rate,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    evaluation_strategy="steps",
    eval_steps=cfg.eval_steps,
    save_strategy='steps',
    save_steps=cfg.eval_steps,
    save_total_limit=cfg.save_total_limit,
    load_best_model_at_end=True,
    predict_with_generate=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=cfg.tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=cfg.tokenizer,
    compute_metrics=compute_metrics,
)

Downloading builder script:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading builder script:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading builder script:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.67G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Run fine-tuning; per the training arguments, evaluation and checkpointing
# happen every `eval_steps` steps.
trainer.train()