In [7]:
!pip install sentencepiece
!pip install transformers
!pip install --upgrade sentencepiece transformers
!pip install datasets
!pip install sacrebleu
!pip install accelerate -U
!pip install transformers[torch]
!pip install wandb



In [8]:
import torch
import torch.nn as nn
import torch.optim as optim

from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
import re

import spacy
import numpy as np

import random
import math
import time

import wandb

In [9]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [10]:
dataset = load_dataset("mt_eng_vietnamese", "iwslt2015-en-vi")

metric = load_metric("sacrebleu")

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-6f0ee4765353a2f9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6f0ee4765353a2f9/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
def replace_elements(input_tensor, A, B):
    
    mask = input_tensor == A
    input_tensor[mask] = B
    return input_tensor

In [12]:
special_tokens_dict = {
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'unk_token': '<UNK>',
    'sep_token': '<SEP>',
    'pad_token': '<PAD>',
    'cls_token': '<CLS>',
    'mask_token': '<MASK>'
}

tokenizer = AutoTokenizer.from_pretrained("vinai/vinai-translate-en2vi-v2", **special_tokens_dict)
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "vi_VN"

train_data = dataset["train"]
test_data = dataset["test"]
valid_data = dataset["validation"]

In [13]:
# Example usage of the tokenizer
text = "<BOS> Hugging Face transformers are awesome!"
encoded = tokenizer.encode(text, add_special_tokens=True, return_tensors="pt", padding=True, truncation=True)

with tokenizer.as_target_tokenizer():
    # Decoding and ensuring <BOS> token is present
    decoded = tokenizer.decode(encoded[0], skip_special_tokens=False)
print("Encoded:", encoded)
print("Decoded:", decoded)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Encoded: tensor([[66773,  4526,  6343,  6933,  6975,   504,    66, 12995,   160, 66774,
         66750]])
Decoded: <BOS> Hugging Face transformers are awesome!<EOS>en_XX




In [14]:
# Getting special token IDs
bos_token_id = tokenizer.bos_token_id
eos_token_id = tokenizer.eos_token_id
unk_token_id = tokenizer.unk_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
cls_token_id = tokenizer.cls_token_id
mask_token_id = tokenizer.mask_token_id

BOS Token ID: 66773
EOS Token ID: 66774
UNK Token ID: 66775
SEP Token ID: 66776
PAD Token ID: 66777
CLS Token ID: 66778
MASK Token ID: 66779


In [18]:
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "vi"
prefix = "<BOS> "

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [prefix + ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer.__call__(inputs, max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer.__call__(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/2978 [00:00<?, ?ba/s]

  0%|          | 0/19 [00:00<?, ?ba/s]

  0%|          | 0/20 [00:00<?, ?ba/s]

In [19]:
rm_tokenized_datasets = tokenized_datasets.remove_columns(["translation", "attention_mask"])

In [20]:
BATCH_SIZE = 1
data_collator = DataCollatorForSeq2Seq(tokenizer, label_pad_token_id=pad_token_id)
trainloader = DataLoader(rm_tokenized_datasets['train'], batch_size=BATCH_SIZE, collate_fn=data_collator)
testloader = DataLoader(rm_tokenized_datasets['test'], batch_size=BATCH_SIZE, collate_fn=data_collator)

In [21]:
class Encoder(nn.Module):
    def __init__(self,
                 input_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 128):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList([EncoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                     for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, src, src_mask):
        batch_size = src.shape[0]
        src_len = src.shape[1]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            src = layer(src, src_mask)
        return src

class EncoderLayer(nn.Module):
    def __init__(self,
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout,
                 device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim,
                                                                     pf_dim,
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        _src, _ = self.self_attention(src, src, src, src_mask)
        src = self.self_attn_layer_norm(src + self.dropout(_src))
        _src = self.positionwise_feedforward(src)
        src = self.ff_layer_norm(src + self.dropout(_src))
        return src

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()

        assert hid_dim % n_heads == 0

        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads

        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)

        self.fc_o = nn.Linear(hid_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask = None):

        batch_size = query.shape[0]
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim = -1)
        x = torch.matmul(self.dropout(attention), V)
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hid_dim)
        x = self.fc_o(x)
        return x, attention

class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()

        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)
        return x

class DecoderLayer(nn.Module):
    
    def __init__(self,
                 hid_dim,
                 n_heads,
                 pf_dim,
                 dropout,
                 device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim,
                                                                     pf_dim,
                                                                     dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):

        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))
        _trg = self.positionwise_feedforward(trg)
        trg = self.ff_layer_norm(trg + self.dropout(_trg))
        return trg, attention

class Decoder(nn.Module):
    def __init__(self,
                 output_dim,
                 hid_dim,
                 n_layers,
                 n_heads,
                 pf_dim,
                 dropout,
                 device,
                 max_length = 128):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)

        self.layers = nn.ModuleList([DecoderLayer(hid_dim,
                                                  n_heads,
                                                  pf_dim,
                                                  dropout,
                                                  device)
                                     for _ in range(n_layers)])

        self.fc_out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        output = self.fc_out(trg)
        return output, attention

class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder,
                 decoder,
                 src_pad_idx,
                 trg_pad_idx,
                 device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    def make_trg_mask(self, trg):
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        trg_len = trg.shape[1]

        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()
        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)

        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        return output, attention

    def inference(self, sentence, tokenizer, max_input_length=50, beam_width=5):
        self.eval()
        prefix = "<BOS> "
        sentence = prefix + sentence
        tokenized_sentence = tokenizer.__call__(sentence)['input_ids']
        tokenized_sentence = torch.tensor(tokenized_sentence).unsqueeze(0).to(self.device)
        tokenized_sentence = replace_elements(tokenized_sentence, 66750, tokenizer.pad_token_id)
        tokenized_sentence = tokenized_sentence[:, :-1]
        bos_token_id = tokenizer.bos_token_id
        eos_token_id = tokenizer.eos_token_id
        src_mask = self.make_src_mask(tokenized_sentence)
        with torch.no_grad():
            enc_src = self.encoder(tokenized_sentence, src_mask)
        beams = [{'tokens': [bos_token_id], 'score': 0.0}]
        completed_beams = []
        for _ in range(max_input_length):
            new_beams = []
            for beam in beams:
                trg_tensor = torch.LongTensor(beam['tokens']).unsqueeze(0).to(self.device)
                trg_mask = self.make_trg_mask(trg_tensor)
                with torch.no_grad():
                    output, _ = self.decoder(trg_tensor, enc_src, trg_mask, src_mask)
                log_probs = torch.log_softmax(output, dim=-1).squeeze(0)[-1]
                top_scores, top_tokens = torch.topk(log_probs, beam_width)
                for score, token in zip(top_scores.tolist(), top_tokens.tolist()):
                    new_beam = {
                        'tokens': beam['tokens'] + [token],
                        'score': beam['score'] + score
                    }
                    if token == eos_token_id:
                        completed_beams.append(new_beam)
                    else:
                        new_beams.append(new_beam)
            if len(completed_beams) >= beam_width:
                break
            beams = sorted(new_beams, key=lambda x: x['score'], reverse=True)[:beam_width - len(completed_beams)]
        beams = sorted(new_beams, key=lambda x: x['score'], reverse=True)[:beam_width - len(completed_beams)]
        completed_beams.extend(beams)
        completed_beams = sorted(completed_beams, key=lambda x: x['score'], reverse=True)
        translated_sentences = []
        for beam in completed_beams:
            with tokenizer.as_target_tokenizer():
                translated_sentence = tokenizer.decode(beam['tokens'], skip_special_tokens=True)
                translated_sentences.append((translated_sentence, beam['score']))
        return translated_sentences

In [22]:
INPUT_DIM = 66779
OUTPUT_DIM = 66779
HID_DIM = 512
ENC_LAYERS = 8
DEC_LAYERS = 8
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.0
DEC_DROPOUT = 0.0

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

enc = Encoder(INPUT_DIM,
              HID_DIM,
              ENC_LAYERS,
              ENC_HEADS,
              ENC_PF_DIM,
              ENC_DROPOUT,
              DEVICE)

dec = Decoder(OUTPUT_DIM,
              HID_DIM,
              DEC_LAYERS,
              DEC_HEADS,
              DEC_PF_DIM,
              DEC_DROPOUT,
              DEVICE)

model = Seq2Seq(enc, dec, pad_token_id, pad_token_id, DEVICE).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id).to(DEVICE)

LEARNING_RATE = 5e-5
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

The model has 136,431,323 trainable parameters


In [25]:
def train(model, trainloader, optimizer, criterion, clip):

    model.train()
    epoch_loss = 0
    for i, batch in enumerate(trainloader):

        input_ids = batch['input_ids'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        
        
        input_ids = replace_elements(input_ids, 66750, tokenizer.pad_token_id)
        input_ids = input_ids[:, :-1]
        labels = replace_elements(labels, 66770, tokenizer.pad_token_id)
        labels = labels[:, :-1]
        processed_labels = labels.clone()
        processed_labels = replace_elements(processed_labels, tokenizer.eos_token_id, tokenizer.pad_token_id)

        optimizer.zero_grad()
        
        output = model(input_ids, processed_labels[:, :-1])[0]

        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        labels = labels[:, 1:].contiguous().view(-1)
        
        loss = criterion(output, labels)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(trainloader)

In [None]:
N_EPOCHS = 5
CLIP = 1
path_to_save = 'pretrained_transformer.pth'

for epoch in range(N_EPOCHS):
    train_loss = train(model, trainloader, optimizer, criterion, CLIP)
    
    model.to('cpu')
    torch.save(model.state_dict(), path_to_save)
    model.to(DEVICE)
    
    print(f'\tTrain Loss: {train_loss:.3f}')

You're using a MBartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Brother Albert Barnett and his wife , Sister Susan Barnett , from the West Congregation in Tuscaloosa , Alabama
Sample word: Brother Albert Barnett and his wife , Sister Susan Barnett , from the West Congregation in Tuscaloosa , Alabama, translated word: [('Jobs nói :', -8.710390284657478), ('Wozniak, ông nói :', -17.56442255526781), ('Wozniak, ông đã tạo ra một chiếc máy tính của mình, và có thể tạo ra khỏi nhà.', -48.103493608068675), ('Wozniak, ông đã tạo ra một chiếc máy tính của mình, và có thể tạo ra một chiếc máy tính từ chối.', -56.55432908423245), ('Wozniak, ông đã tạo ra một chiếc máy tính của mình, và có thể tạo ra một chiếc máy tính từ năm 1980.', -57.40070529747754)]
	Train Loss: 4.702


In [None]:
model.to('cpu')
torch.save(model.state_dict(), path_to_save)
model.to(DEVICE)

In [None]:
from nltk.translate.bleu_score import corpus_bleu
import torch

def calculate_bleu_score(model, test_data):
    references = []
    translations = []

    model.eval()

    with torch.no_grad():
        
        for i in range(len(test_data)):

            input_text = test_data[i]['en']
            labels_text = test_data[i]['vi']
            output_text = model.inference(input_text, tokenizer)[0][0]
            references.append(labels_text)
            translations.append(output_text)
            
    refer_list = list()
    out_list = list()
    for i in range(len(references)):
        outp = translations[i]
        label = references[i]
        if outp != "":
            outp_list = outp.lower().split()
            label_list = label.lower().split()
            refer_list.append(label_list)
            out_list.append(outp_list)

    # Calculate BLEU score
    if len(refer_list) != 0:
        bleu_score = corpus_bleu(refer_list, out_list)
        print("BLEU score:", bleu_score)
        return bleu_score
    else:
        print("No non-empty translations to compute BLEU score.")
        return None

bleu = calculate_bleu_score(model, test_data['translation'])
print("BLEU score:", bleu)