# Setup

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install datasets transformers evaluate

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datasets
import tqdm
import evaluate
from transformers import AutoTokenizer

In [3]:
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# I - Load dataset

In [5]:
dataset = datasets.load_dataset("harouzie/vi_en-translation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/665 [00:00<?, ?B/s]

(…)-00000-of-00001-8fc21cb8e80d3a2d.parquet:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

(…)-00000-of-00001-858c0e989d9c5637.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

(…)-00000-of-00001-99e7e50144d1c164.parquet:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203272 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25409 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [6]:
train_data, test_data, valid_data = (dataset['train'], dataset['test'], dataset['valid'])

In [7]:
train_data[1]

{'English': 'The pharmacy is on Fresno Street',
 'Vietnamese': 'hiệu thuốc nằm trên đường fresno'}

# II - Setup tokenizer


In [4]:
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
BOS_TOKEN = "<s>"
EOS_TOKEN = "</s>"


In [5]:
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt",
                                          unk_token=UNK_TOKEN,
                                          pad_token=PAD_TOKEN,
                                          bos_token=BOS_TOKEN,
                                          eos_token=EOS_TOKEN)

# III - Prepare data

In [10]:
def convert_to_ids(example, tokenizer):
    en_ids = tokenizer(example["English"], padding="max_length", truncation=True)
    vi_ids = tokenizer(example["Vietnamese"], padding="max_length", truncation=True)
    return {"en_ids": en_ids['input_ids'], "vi_ids": vi_ids['input_ids']}

In [11]:
fn_kwargs = {"tokenizer":tokenizer}

train_data = train_data.map(convert_to_ids, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(convert_to_ids, fn_kwargs=fn_kwargs)
test_data = test_data.map(convert_to_ids, fn_kwargs=fn_kwargs)

Map:   0%|          | 0/203272 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

Map:   0%|          | 0/25409 [00:00<?, ? examples/s]

In [12]:
DATA_TYPE = "torch"
format_columns = ["en_ids", "vi_ids"]

train_data = train_data.with_format(
    type=DATA_TYPE, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=DATA_TYPE,
    columns=format_columns,
    output_all_columns=True,
)

test_data = test_data.with_format(
    type=DATA_TYPE,
    columns=format_columns,
    output_all_columns=True,
)

In [13]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_vi_ids = [example["vi_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_vi_ids = nn.utils.rnn.pad_sequence(batch_vi_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "vi_ids": batch_vi_ids,
        }
        return batch

    return collate_fn

In [14]:
def get_dataloader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [15]:
PAD_INDEX = tokenizer.pad_token_id
UNK_INDEX = tokenizer.unk_token_id

In [16]:
BATCH_SIZE = 16

train_dataloader = get_dataloader(train_data, BATCH_SIZE, PAD_INDEX, shuffle=True)
valid_dataloader = get_dataloader(valid_data, BATCH_SIZE, PAD_INDEX)
test_dataloader = get_dataloader(test_data, BATCH_SIZE, PAD_INDEX)

In [17]:
del train_data, valid_data, test_data

# IV - Model

In [6]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # src = [src_len, batch_size]
        embedded = self.dropout(self.embedding(src))  # [src_len, batch_size, embedding_dim]
        encoder_outputs, (hidden, cell) = self.rnn(embedded)
        # encoder_outputs = [src_len, batch_size, hidden_dim * 2]
        # hidden, cell = [n_layers * 2, batch_size, hidden_dim]

        return encoder_outputs, hidden, cell

In [7]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden = [batch_size, hidden_dim]
        # encoder_outputs = [src_len, batch_size, hidden_dim * 2]
        src_len = encoder_outputs.shape[0]
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, hidden_dim]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)  # [batch_size, src_len, hidden_dim * 2]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [batch_size, src_len, hidden_dim * 3]
        attention = self.v(energy).squeeze(2)  # [batch_size, src_len]
        return torch.softmax(attention, dim=1)  # [batch_size, src_len]

In [8]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim + hidden_dim * 2, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim * 3, output_dim)
        self.attention = attention
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        # input = [batch size]
        input = input.unsqueeze(0)  # [1, batch size]
        embedded = self.dropout(self.embedding(input))  # [1, batch size, embedding_dim]

        attn_weights = self.attention(hidden, encoder_outputs)  # [batch size, src_len]
        attn_weights = attn_weights.unsqueeze(1)  # [batch size, 1, src_len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2)  # [batch size, src_len, hidden_dim * 2]
        context = torch.bmm(attn_weights, encoder_outputs).squeeze(1)  # [batch size, hidden_dim * 2]

        rnn_input = torch.cat((embedded, context.unsqueeze(0)), dim=2)  # [1, batch size, embedding_dim + hidden_dim * 2]

        output, (hidden, cell) = self.rnn(rnn_input, (hidden[-2:], cell[-2:]))
        # output = [seq length, batch size, hidden_dim]
        output = output.squeeze(0)  # [batch size, hidden_dim]
        prediction = self.fc_out(torch.cat((output, context), dim=1))  # [batch size, output_dim]
        return prediction, hidden, cell

In [9]:
class Seq2SeqWAttn(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, device, teacher_forcing_ratio=0.5):
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_length, batch_size, trg_vocab_size).to(device)  # [trg length, batch size, output_dim]
        encoder_outputs, hidden, cell = self.encoder(src)

        # Get <bos> ids
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)  # [batch size]
            input = trg[t] if teacher_force else top1
        return outputs  # [trg length, batch size, output_dim]

# V - Training

In [10]:
class EarlyStopping:
    def __init__(self, patience=5, min_delta=0.001):
        """
        patience: số epoch tối đa không có cải thiện trước khi dừng
        min_delta: mức chênh lệch tối thiểu giữa các epoch để được coi là cải thiện
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.counter = 0

    def __call__(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1

        return self.counter >= self.patience  # Trả về True nếu cần dừng

In [11]:
INPUT_DIM = len(tokenizer)
OUTPUT_DIM = len(tokenizer)
ENCODER_EMBEDDING_DIM = 256
DECODER_EMBEDDING_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = torch.device('cpu')

encoder = Encoder(
    INPUT_DIM,
    ENCODER_EMBEDDING_DIM,
    HIDDEN_DIM,
    N_LAYERS,
    ENCODER_DROPOUT,
)

attention = Attention(HIDDEN_DIM)

decoder = Decoder(
    OUTPUT_DIM,
    DECODER_EMBEDDING_DIM,
    HIDDEN_DIM,
    N_LAYERS,
    DECODER_DROPOUT,
    attention
)

model = Seq2SeqWAttn(encoder, decoder).to(DEVICE)

In [24]:
batch = next(iter(train_dataloader))
src = batch['en_ids'].to(DEVICE)
trg = batch['vi_ids'].to(DEVICE)
outputs = model(src, trg, DEVICE)

In [25]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)


model.apply(init_weights)

Seq2SeqWAttn(
  (encoder): Encoder(
    (embedding): Embedding(250054, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=1024, out_features=512, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(250054, 256)
    (rnn): LSTM(1280, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=1536, out_features=250054, bias=True)
    (attention): Attention(
      (attn): Linear(in_features=1536, out_features=512, bias=True)
      (v): Linear(in_features=512, out_features=1, bias=False)
    )
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [26]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 528,901,830 trainable parameters


In [27]:
from torch.amp import autocast, GradScaler
scaler = GradScaler('cuda')

In [28]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_INDEX)
early_stopping = EarlyStopping(patience=5, min_delta=0.005)

In [29]:
def train_model(
    model, data_loader, optimizer, criterion, scaler, clip, teacher_forcing_ratio, device, early_stopping=None
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["en_ids"].to(device)
        trg = batch["vi_ids"].to(device)
        # src = [src length, batch size]
        # trg = [trg length, batch size]
        optimizer.zero_grad()
        with autocast('cuda'):
          output = model(src, trg, device, teacher_forcing_ratio)
          # output = [trg length, batch size, trg vocab size]
          output_dim = output.shape[-1]
          output = output[1:].view(-1, output_dim)
          # output = [(trg length - 1) * batch size, trg vocab size]
          trg = trg[1:].view(-1)
          # trg = [(trg length - 1) * batch size]
          loss = criterion(output, trg)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        scaler.step(optimizer)
        scaler.update()
        epoch_loss += loss.item()
        if (i + 1) % 100 == 0:
          print(f"Batch: {i + 1}/ {len(data_loader)}: Loss {epoch_loss / (i+1):.3f}")
        if (i + 1) % 3000 == 0:
          torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")
        del src, trg, output, loss, batch
        torch.cuda.empty_cache()
    return epoch_loss / len(data_loader)

In [30]:
def evaluate_model(model, data_loader, device, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["en_ids"].to(device)
            trg = batch["vi_ids"].to(device)
            # src = [src length, batch size]
            # trg = [trg length, batch size]
            with autocast('cuda'):
              output = model(src, trg, 0, device)  # turn off teacher forcing
              # output = [trg length, batch size, trg vocab size]
              output_dim = output.shape[-1]
              output = output[1:].view(-1, output_dim)
              # output = [(trg length - 1) * batch size, trg vocab size]
              trg = trg[1:].view(-1)
              # trg = [(trg length - 1) * batch size]
              loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(data_loader)

In [31]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1"
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

In [32]:
n_epochs = 1
clip = 1.0
teacher_forcing_ratio = 0.5

best_valid_loss = float("inf")

for epoch in tqdm.tqdm(range(n_epochs)):
    train_loss = train_model(
        model,
        train_dataloader,
        optimizer,
        criterion,
        scaler,
        clip,
        teacher_forcing_ratio,
        DEVICE,
        early_stopping
    )
    valid_loss = evaluate_model(
        model,
        valid_dataloader,
        DEVICE,
        criterion,
    )
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "/content/drive/MyDrive/model.pth")
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

  0%|          | 0/1 [00:00<?, ?it/s]

Batch: 100/ 12705: Loss 7.810
Batch: 200/ 12705: Loss 6.915
Batch: 300/ 12705: Loss 6.548
Batch: 400/ 12705: Loss 6.350
Batch: 500/ 12705: Loss 6.228
Batch: 600/ 12705: Loss 6.144
Batch: 700/ 12705: Loss 6.073
Batch: 800/ 12705: Loss 6.021
Batch: 900/ 12705: Loss 5.972
Batch: 1000/ 12705: Loss 5.934
Batch: 1100/ 12705: Loss 5.901
Batch: 1200/ 12705: Loss 5.869
Batch: 1300/ 12705: Loss 5.843
Batch: 1400/ 12705: Loss 5.817
Batch: 1500/ 12705: Loss 5.794
Batch: 1600/ 12705: Loss 5.770
Batch: 1700/ 12705: Loss 5.745
Batch: 1800/ 12705: Loss 5.721
Batch: 1900/ 12705: Loss 5.698
Batch: 2000/ 12705: Loss 5.672
Batch: 2100/ 12705: Loss 5.645
Batch: 2200/ 12705: Loss 5.623
Batch: 2300/ 12705: Loss 5.599
Batch: 2400/ 12705: Loss 5.575
Batch: 2500/ 12705: Loss 5.554
Batch: 2600/ 12705: Loss 5.533
Batch: 2700/ 12705: Loss 5.513
Batch: 2800/ 12705: Loss 5.495
Batch: 2900/ 12705: Loss 5.475
Batch: 3000/ 12705: Loss 5.454
Batch: 3100/ 12705: Loss 5.435
Batch: 3200/ 12705: Loss 5.416
Batch: 3300/ 1270

  0%|          | 0/1 [3:45:58<?, ?it/s]


TypeError: to() received an invalid combination of arguments - got (CrossEntropyLoss), but expected one of:
 * (torch.device device = None, torch.dtype dtype = None, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)
 * (torch.dtype dtype, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)
 * (Tensor tensor, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)


# Evaluation

In [12]:
INPUT_DIM = len(tokenizer)
OUTPUT_DIM = len(tokenizer)
ENCODER_EMBEDDING_DIM = 256
DECODER_EMBEDDING_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENCODER_DROPOUT = 0.5
DECODER_DROPOUT = 0.5
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = torch.device('cpu')

encoder = Encoder(
    INPUT_DIM,
    ENCODER_EMBEDDING_DIM,
    HIDDEN_DIM,
    N_LAYERS,
    ENCODER_DROPOUT,
)

attention = Attention(HIDDEN_DIM)

decoder = Decoder(
    OUTPUT_DIM,
    DECODER_EMBEDDING_DIM,
    HIDDEN_DIM,
    N_LAYERS,
    DECODER_DROPOUT,
    attention
)

model = Seq2SeqWAttn(encoder, decoder).to(DEVICE)

In [14]:
model.load_state_dict(torch.load("saved/seq2seq.pth"))

# test_loss = evaluate_model(model, test_dataloader, criterion, DEVICE)

# print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

  model.load_state_dict(torch.load("saved/seq2seq.pth"))


<All keys matched successfully>

In [15]:
def translate(
    sentence,
    model,
    tokenizer,
    device,
    max_output_length=25,
):
    model.eval()
    with torch.no_grad():
        ids = tokenizer(sentence, padding="max_length", truncation=True)['input_ids']
        tensor = torch.LongTensor(ids).unsqueeze(-1).to(device)
        encoder_outputs, hidden, cell = model.encoder(tensor)
        inputs = [tokenizer.bos_token_id]
        for _ in range(max_output_length):
            inputs_tensor = torch.LongTensor([inputs[-1]]).to(device)
            output, hidden, cell = model.decoder(inputs_tensor, hidden, cell, encoder_outputs)
            predicted_token = output.argmax(-1).item()
            inputs.append(predicted_token)
            if predicted_token == tokenizer.eos_token_id:
                break
        tokens = tokenizer.decode(inputs)
    return tokens[3:-4]

In [20]:
translate("Who am I?", model, tokenizer, DEVICE)

' ai đã phải'

In [19]:
translate("She sells seashell on the seashore", model, tokenizer, DEVICE)

' cô ấy bị trên trên trên trên'

In [39]:
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [40]:
# dataset = datasets.load_dataset("harouzie/vi_en-translation")
test_data= dataset['test']

In [None]:
translations = [
    translate(
        example["English"],
        model,
        tokenizer,
        DEVICE
    )
    for example in test_data
]

In [None]:
predictions = [translation for translation in translations]

references = [[example["Vietnamese"]] for example in test_data]

In [None]:
predictions[0]

In [None]:
references[0]

In [None]:
results = bleu.compute(
    predictions=predictions, references=references
)

In [None]:
results