In [1]:
!pip install -q transformers sentencepiece datasets accelerate evaluate sacrebleu

In [2]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
import evaluate
from transformers import (
    MBart50TokenizerFast,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

2024-04-22 03:18:54.637234: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-22 03:18:54.637389: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-22 03:18:54.791691: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
import wandb
wandb.login(key="41546a42a22875e1707e92f43d24a95281b54784")


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
class NMTDataset(Dataset):
    def __init__(self, cfg, data_type="train"):
        super().__init__()
        self.cfg = cfg

        self.src_texts, self.tgt_texts = self.read_data(data_type)

        self.src_input_ids = self.texts_to_sequences(self.src_texts)
        self.labels = self.texts_to_sequences(self.tgt_texts)

    def read_data(self, data_type):
        data = load_dataset(
            "mt_eng_vietnamese",
            "iwslt2015-en-vi",
            split=data_type
        )
        src_texts = [sample["translation"][self.cfg.src_lang] for sample in data]
        tgt_texts = [sample["translation"][self.cfg.tgt_lang] for sample in data]
        return src_texts, tgt_texts

    def texts_to_sequences(self, texts):
        data_inputs = self.cfg.tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=self.cfg.max_len,
            return_tensors='pt'
        )
        return data_inputs.input_ids

    def __getitem__(self, idx):
        return {
            "input_ids": self.src_input_ids[idx],
            "labels": self.labels[idx]
        }

    def __len__(self):
        return np.shape(self.src_input_ids)[0]

In [5]:
class BaseConfig:
    """ base Encoder Decoder config """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

class NMTConfig(BaseConfig):
    # Data
    src_lang = 'en'
    tgt_lang = 'vi'
    max_len = 75
    add_special_tokens = True

    # Model
    model_name = "facebook/mbart-large-50-many-to-many-mmt"

    # Training
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    learning_rate = 5e-5
    train_batch_size = 16
    eval_batch_size = 16
    num_train_epochs = 2
    save_total_limit = 1
    ckpt_dir = f'./mbart50-{src_lang}-{tgt_lang}'
    eval_steps = 1000

    # Inference
    beam_size = 5

cfg = NMTConfig()

In [6]:
# tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name, src_lang="en_XX",tgt_lang = "vi_VN")
cfg.tokenizer = MBart50TokenizerFast.from_pretrained(cfg.model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(cfg.model_name)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [7]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    preds= np.where(preds != -100, preds, cfg.tokenizer.pad_token_id)
    decoded_preds = cfg.tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    labels= np.where(labels != -100, labels, cfg.tokenizer.pad_token_id)
    decoded_labels = cfg.tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != cfg.tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [8]:
train_dataset = NMTDataset(cfg, data_type="train")
valid_dataset = NMTDataset(cfg, data_type="validation")
test_dataset = NMTDataset(cfg, data_type="test")

Downloading data: 100%|██████████| 17.8M/17.8M [00:00<00:00, 36.3MB/s]
Downloading data: 100%|██████████| 181k/181k [00:00<00:00, 870kB/s]
Downloading data: 100%|██████████| 181k/181k [00:00<00:00, 1.12MB/s]


Generating train split:   0%|          | 0/133318 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1269 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1269 [00:00<?, ? examples/s]

In [9]:
next(iter(train_dataset))

{'input_ids': tensor([250004, 127055,  66937,     13,    152,    581,  41664,  50155,     10,
         153552,  10336,   2256,      2,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1]),
 'labels': tensor([250004,  67766,   2546, 218877,    858,    889,  10037,   6248,   1893,
          17964,  42254,      2,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,   

In [10]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=cfg.eval_steps,
    output_dir=cfg.ckpt_dir,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.eval_batch_size,
    learning_rate=cfg.learning_rate,
    save_total_limit=cfg.save_total_limit,
    num_train_epochs=cfg.num_train_epochs,
    load_best_model_at_end=False,
)


data_collator = DataCollatorForSeq2Seq(
    cfg.tokenizer,
    model=model
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=cfg.tokenizer,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnamanh2k2av[0m ([33mnlnamanh2002[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240422_032021-xu4afhz3[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mhardy-wildflower-8[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/nlnamanh2002/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/nlnamanh2002/huggingface/runs/xu4afhz3[0m


Step,Training Loss,Validation Loss,Bleu,Gen Len
1000,0.51,0.593244,32.6002,32.5327
2000,0.4991,0.577715,33.0774,32.7912
3000,0.4919,0.570066,32.7266,32.3176
4000,0.4835,0.559116,32.8136,33.0969
5000,0.4707,0.55386,33.6786,32.6887
6000,0.4756,0.545011,33.5897,32.9283
7000,0.465,0.544703,33.9741,32.8511
8000,0.4612,0.537071,34.5608,32.5666
9000,0.3672,0.549739,34.108,32.8597
10000,0.3684,0.541565,34.2057,32.8952


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=16666, training_loss=0.4288014778495193, metrics={'train_runtime': 17279.4537, 'train_samples_per_second': 15.431, 'train_steps_per_second': 0.964, 'total_flos': 4.23218857033728e+16, 'train_loss': 0.4288014778495193, 'epoch': 2.0})

In [12]:
prediction = trainer.predict(test_dataset)

In [13]:
prediction

PredictionOutput(predictions=array([[     2, 250004,  16584, ...,      1,      1,      1],
       [     2, 250004,  23598, ...,      1,      1,      1],
       [     2, 250004,  71717, ...,      1,      1,      1],
       ...,
       [     2, 250004,  14343, ...,      1,      1,      1],
       [     2, 250004, 131785, ...,      1,      1,      1],
       [     2, 250004,      2, ...,      1,      1,      1]]), label_ids=array([[250004,  16584,   2259, ...,      1,      1,      1],
       [250004,  14343,   1408, ...,      1,      1,      1],
       [250004,  71717,   4373, ...,      1,      1,      1],
       ...,
       [250004,  14343,   1274, ...,      1,      1,      1],
       [250004, 131785,  43209, ...,      1,      1,      1],
       [250004,      2,      1, ...,      1,      1,      1]]), metrics={'test_loss': 0.5291271805763245, 'test_bleu': 34.7657, 'test_gen_len': 32.8668, 'test_runtime': 203.0686, 'test_samples_per_second': 6.249, 'test_steps_per_second': 0.394})

In [14]:
def inference(
    text,
    tokenizer,
    model,
    device="cpu",
    max_length=75,
    beam_size=5
    ):
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
        )
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    model.to(device)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        early_stopping=True,
        num_beams=beam_size,
        length_penalty=2.0
    )

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return output_str

In [15]:
sentence = 'i go to school'
inference(sentence, cfg.tokenizer, model)

['tôi đi học.']

In [16]:
trainer.save_model()

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
