# NLP Exercise 3: Seq2Seq Model and Attention Mechanisms
---

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datasets
from tqdm import tqdm
import spacy
from pprint import pprint
from transformers import AutoTokenizer
from collections import Counter
import json
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

## Prepare Data and Preprocessing

Load dataset using the `datasets` library.

In [2]:
# Load the 'ncduy/mt-en-vi' dataset; its "en" and "vi" columns are used below.
dataset = datasets.load_dataset('ncduy/mt-en-vi')

In [3]:
# Keep only the first 5,000 training examples, then split them twice with a
# fixed seed: 20% is held out first, and 10% of the remaining 80% is held out
# again (the two held-out parts become the test and validation sets below).
limited_train_data = dataset["train"].select(range(5000))
train_valid_test_split = limited_train_data.train_test_split(test_size=0.2, seed=42)
train_valid_split = train_valid_test_split["train"].train_test_split(test_size=0.1, seed=42)

In [4]:
# Drop the 'source' column from every split; only the sentence pairs are used.
train_data = train_valid_split["train"].remove_columns('source')
valid_data = train_valid_split["test"].remove_columns('source')
test_data = train_valid_test_split["test"].remove_columns('source')

In [None]:
# Print a summary of each split (note the order: train, test, validation).
print(train_data, test_data, valid_data)

In [6]:
# Define special tokens and parameters
max_length = 1000  # max tokens kept per sentence, before <sos>/<eos> are added
lower = True  # lowercase tokens in process_sentence
sos_token = "<sos>"  # prepended to every token sequence
eos_token = "<eos>"  # appended to every token sequence
unk_token = "<unk>"  # stands in for tokens missing from the vocabulary
pad_token = "<pad>"  # used to pad sequences to equal length within a batch
min_freq = 2  # minimum count for a token to enter the vocabulary
# Order matters: build_vocab assigns indices by position in this list.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

In [7]:
# Define tokenizers
# English: the spaCy 'en_core_web_sm' pipeline (only its tokenizer is called below).
en_nlp = spacy.load('en_core_web_sm')
# Vietnamese: the tokenizer of the 'vinai/phobert-base' checkpoint.
vi_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [8]:
# Step 1: Tokenize and Numericalize sentences
def process_sentence(sentence):
    """Tokenize one English/Vietnamese pair and map its tokens to vocabulary ids.

    Reads the module-level tokenizers, vocabularies and settings
    (``max_length``, ``lower``, special tokens).

    :param sentence: Mapping with the raw text under ``"en"`` and ``"vi"``.
    :return: Dict with the raw text, the token lists (wrapped in
        ``<sos>``/``<eos>``) and the matching id lists for both languages.
    """
    en_text, vi_text = sentence["en"], sentence["vi"]

    # Tokenize, truncating before the special tokens are attached.
    en_words = [tok.text for tok in en_nlp.tokenizer(en_text)][:max_length]
    vi_words = vi_tokenizer.tokenize(vi_text)[:max_length]

    if lower:
        en_words = [word.lower() for word in en_words]
        vi_words = [word.lower() for word in vi_words]

    en_tokens = [sos_token, *en_words, eos_token]
    vi_tokens = [sos_token, *vi_words, eos_token]

    # Out-of-vocabulary tokens fall back to the <unk> index.
    en_unk = en_vocab[unk_token]
    vi_unk = vi_vocab[unk_token]

    return {
        "en": en_text,
        "vi": vi_text,
        "en_tokens": en_tokens,
        "vi_tokens": vi_tokens,
        "en_ids": [en_vocab.get(tok, en_unk) for tok in en_tokens],
        "vi_ids": [vi_vocab.get(tok, vi_unk) for tok in vi_tokens],
    }

In [9]:
# Step 2: Build Vocabulary
def build_vocab(data, min_freq, specials):
    """Build a token -> index mapping from tokenized sentences.

    :param data: Iterable of token sequences.
    :param min_freq: Minimum number of occurrences for a token to be kept.
    :param specials: Tokens placed first, indexed by their position.
    :return: Dict mapping the specials, then the kept tokens in sorted
        order, to consecutive integer indices.
    """
    counts = Counter()
    for sequence in data:
        counts.update(sequence)

    vocab = {}
    for position, special in enumerate(specials):
        vocab[special] = position

    frequent = [tok for tok, n in counts.items() if n >= min_freq]
    frequent.sort()
    for tok in frequent:
        # A token that is already a special keeps its existing index.
        vocab.setdefault(tok, len(vocab))
    return vocab

In [None]:
# Step 3: Generate Tokenized Data for Vocabulary Building
def _tokenize_for_vocab(example):
    """Tokenize one example the same way process_sentence does (without <sos>/<eos>).

    :param example: Mapping with the raw text under ``"en"`` and ``"vi"``.
    :return: Dict with the ``"en_tokens"`` and ``"vi_tokens"`` lists.
    """
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    vi_tokens = vi_tokenizer.tokenize(example["vi"])[:max_length]

    # process_sentence lowercases tokens when `lower` is set. The vocabulary
    # must be counted on the same normalized tokens; otherwise words that are
    # capitalised in the corpus are counted under the wrong key and end up
    # mapped to <unk> at numericalization time.
    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        vi_tokens = [token.lower() for token in vi_tokens]

    return {"en_tokens": en_tokens, "vi_tokens": vi_tokens}


tokenized_train_data = train_data.map(_tokenize_for_vocab)

In [11]:
# Build one vocabulary per language from the training split only; both start
# with the same special tokens, in the same order.
en_vocab = build_vocab(tokenized_train_data["en_tokens"], min_freq, special_tokens)
vi_vocab = build_vocab(tokenized_train_data["vi_tokens"], min_freq, special_tokens)

In [None]:
# Step 4: Process Full Dataset
# Each split is replaced by a version that also carries the token and id
# columns returned by process_sentence.
train_data = train_data.map(process_sentence)
valid_data = valid_data.map(process_sentence)
test_data = test_data.map(process_sentence)

In [13]:
# Check that <unk> and <pad> have the same index in both vocabularies, so a
# single unk_index / pad_index can be shared by the source and target sides.
assert en_vocab[unk_token] == vi_vocab[unk_token]
assert en_vocab[pad_token] == vi_vocab[pad_token]

unk_index = en_vocab[unk_token]
pad_index = en_vocab[pad_token]

In [14]:
# Format the id columns as torch tensors; the other columns are still returned.
data_type = "torch"
format_columns = ["en_ids", "vi_ids"]

format_kwargs = {
    "type": data_type,
    "columns": format_columns,
    "output_all_columns": True,
}

train_data = train_data.with_format(**format_kwargs)
valid_data = valid_data.with_format(**format_kwargs)
test_data = test_data.with_format(**format_kwargs)

In [None]:
# Print a summary of the processed training split.
print(train_data)

## Data Loaders

In PyTorch, the `DataLoader` uses a collate function to combine individual samples into a batch. By default, PyTorch's `DataLoader` stacks tensors along a new first dimension, which requires every sample to have the same shape. However, when dealing with sequences of varying lengths (e.g., tokenized text), you need to pad them to the same length within the batch. The collate function below handles that padding for both the English (`en_ids`) and Vietnamese (`vi_ids`) sequences.

In [16]:
def get_collate_fn(pad_index):
    """Create a collate function that pads id sequences with ``pad_index``.

    :param pad_index: Value used to fill the shorter sequences.
    :return: Function mapping a list of examples to a dict with padded
        ``"en_ids"`` and ``"vi_ids"`` tensors of shape [max length, batch size].
    """

    def collate_fn(batch):
        padded = {}
        for key in ("en_ids", "vi_ids"):
            sequences = [example[key] for example in batch]
            # pad_sequence defaults to batch_first=False -> [length, batch]
            padded[key] = nn.utils.rnn.pad_sequence(
                sequences, padding_value=pad_index
            )
        return padded

    return collate_fn

In [17]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a DataLoader that pads each batch with ``pad_index``.

    :param dataset: Dataset whose examples provide ``"en_ids"`` and ``"vi_ids"``.
    :param batch_size: Number of examples per batch.
    :param pad_index: Padding value handed to the collate function.
    :param shuffle: Whether the loader shuffles the examples.
    :return: The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=get_collate_fn(pad_index),
        num_workers=0,  # load batches in the main process
    )

In [31]:
batch_size = 16

# Only the training loader shuffles; validation and test use the default
# shuffle=False.
train_loader = get_data_loader(train_data, batch_size, pad_index, shuffle=True)
valid_loader = get_data_loader(valid_data, batch_size, pad_index)
test_loader = get_data_loader(test_data, batch_size, pad_index)

## Define the Encoder and Decoder for Seq2Seq 

# Encoder

- The encoder reads the input sequence and summarizes its information in internal state vectors, also called context vectors. These context vectors aim to encapsulate the information from all input elements to help the decoder make accurate predictions.
- This implementation involves creating an RNN-based encoder.

In [32]:
class Encoder(nn.Module):
    """Multi-layer LSTM encoder that returns its final hidden and cell states.

    :param input_dim: Size of the source vocabulary.
    :param embedding_dim: Size of the token embeddings.
    :param hidden_dim: Size of the LSTM hidden state.
    :param n_layers: Number of stacked LSTM layers.
    :param dropout: Dropout probability for the embeddings and the LSTM.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(
            num_embeddings=input_dim, embedding_dim=embedding_dim
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        :param src: Token ids, [src length, batch size].
        :return: ``(hidden, cell)``, each [n layers, batch size, hidden dim].
        """
        token_vectors = self.embedding(src)
        # token_vectors = [src length, batch size, embedding dim]
        rnn_input = self.dropout(token_vectors)
        # The per-step outputs are discarded; only the final states are returned.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell

Decoder

In [33]:
class Decoder(nn.Module):
    """Multi-layer LSTM decoder that predicts one target token per call.

    :param output_dim: Size of the target vocabulary.
    :param embedding_dim: Size of the token embeddings.
    :param hidden_dim: Size of the LSTM hidden state.
    :param n_layers: Number of stacked LSTM layers.
    :param dropout: Dropout probability for the embeddings and the LSTM.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim, self.n_layers = hidden_dim, n_layers
        self.embedding = nn.Embedding(
            num_embeddings=output_dim, embedding_dim=embedding_dim
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        :param input: Current token ids, [batch size].
        :param hidden: Previous hidden state, [n layers, batch size, hidden dim].
        :param cell: Previous cell state, [n layers, batch size, hidden dim].
        :return: ``(prediction, hidden, cell)`` where prediction is
            [batch size, output dim] and the states keep their shapes.
        """
        # Add a sequence axis of length 1: [batch size] -> [1, batch size]
        step_tokens = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step_tokens))
        # step_vectors = [1, batch size, embedding dim]
        previous_state = (hidden, cell)
        rnn_out, (hidden, cell) = self.rnn(step_vectors, previous_state)
        # rnn_out = [1, batch size, hidden dim]; drop the length-1 axis
        prediction = self.fc_out(rnn_out.squeeze(0))
        # prediction = [batch size, output dim]
        return prediction, hidden, cell

Seq2Seq Model 

For the final part of the implementation, we'll implement the seq2seq model. This will handle:

- receiving the input/source sentence
- using the encoder to produce the context vectors
- using the decoder to produce the predicted output/target sentence

In [34]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one token at a time.

    :param encoder: Module returning ``(hidden, cell)`` for a source batch and
        exposing ``hidden_dim`` and ``n_layers``.
    :param decoder: Module performing one decoding step and exposing
        ``hidden_dim``, ``n_layers`` and ``output_dim``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # The encoder's final states are fed directly into the decoder, so
        # their shapes have to match.
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce per-step vocabulary scores for the target sequence.

        :param src: Source token ids, [src length, batch size].
        :param trg: Target token ids, [trg length, batch size]; row 0 holds
            the <sos> tokens.
        :param teacher_forcing_ratio: Probability of feeding the ground-truth
            token (instead of the model's own prediction) as the next input,
            e.g. 0.75 uses ground-truth inputs 75% of the time.
        :return: Tensor [trg length, batch size, trg vocab size]. Row 0 is
            left as zeros because decoding starts at position 1.
        """
        batch_size = trg.shape[1]
        trg_length = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        # Allocate the output buffer on the same device as the batch so the
        # model also works when it has been moved off the CPU.
        outputs = torch.zeros(
            trg_length, batch_size, trg_vocab_size, device=trg.device
        )
        # The encoder's final states initialise the decoder.
        hidden, cell = self.encoder(src)
        # First decoder input is the row of <sos> tokens: [batch size]
        input = trg[0, :]
        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            # output = [batch size, output dim]
            outputs[t] = output
            # One draw per step decides teacher forcing for the whole batch.
            teacher_force = random.random() < teacher_forcing_ratio
            # Highest-scoring token for each sequence in the batch.
            top1 = output.argmax(1)
            input = trg[t] if teacher_force else top1
        return outputs

In [35]:
# The encoder consumes Vietnamese ids and the decoder emits English ids.
input_dim = len(vi_vocab)
output_dim = len(en_vocab)
embedding_dim = 256
hidden_dim = 512
n_layers = 2
dropout = 0.5

# Encoder and decoder share embedding size, hidden size, depth and dropout.
encoder = Encoder(
    input_dim,
    embedding_dim,
    hidden_dim,
    n_layers,
    dropout,
)

decoder = Decoder(
    output_dim,
    embedding_dim,
    hidden_dim,
    n_layers,
    dropout,
)

model = Seq2Seq(encoder, decoder)

In [None]:
# Report the vocabulary sizes used as the model's input and output dimensions.
print("Input Dim (Vietnamese Vocab):", input_dim)
print("Output Dim (English Vocab):", output_dim)

In [None]:
def init_weights(m):
    """Re-initialise every parameter of ``m`` in place from U(-0.01, 0.01).

    :param m: Module whose parameters (including those of its submodules)
        are overwritten.
    """
    for weight in m.parameters():
        nn.init.uniform_(weight.data, a=-0.01, b=0.01)


# Module.apply calls init_weights on the model and on each of its submodules.
model.apply(init_weights)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters.

    :param model: Module whose parameters are counted.
    :return: Sum of ``numel()`` over parameters with ``requires_grad`` set.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Report the trainable parameter count with thousands separators.
print(f"The model has {count_parameters(model):,} trainable parameters")

In [39]:
# Sanity check: the decoder is initialised with the encoder's states, so the
# sizes must match (Seq2Seq.__init__ asserts the same conditions).
assert encoder.hidden_dim == decoder.hidden_dim
assert encoder.n_layers == decoder.n_layers

In [40]:
# Adam with a small weight decay over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Target positions equal to pad_index do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio
):
    """Train ``model`` for one pass over ``data_loader``.

    :param model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
    :param data_loader: Iterable of batches with ``"vi_ids"`` (source) and
        ``"en_ids"`` (target), each [length, batch size].
    :param optimizer: Optimizer updating the model's parameters.
    :param criterion: Loss taking flattened scores and flattened targets.
    :param clip: Maximum gradient norm applied before each optimizer step.
    :param teacher_forcing_ratio: Forwarded to the model.
    :return: Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        src, trg = batch["vi_ids"], batch["en_ids"]
        optimizer.zero_grad()
        scores = model(src, trg, teacher_forcing_ratio)
        # scores = [trg length, batch size, trg vocab size]
        vocab_size = scores.shape[-1]
        # Position 0 (<sos>) is never predicted, so skip it on both sides.
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)
        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(data_loader)

In [41]:
def evaluate_fn(model, data_loader, criterion):
    """Compute the mean per-batch loss of ``model`` without updating it.

    :param model: Callable as ``model(src, trg, teacher_forcing_ratio)``.
    :param data_loader: Iterable of batches with ``"vi_ids"`` (source) and
        ``"en_ids"`` (target), each [length, batch size].
    :param criterion: Loss taking flattened scores and flattened targets.
    :return: Mean of the per-batch losses.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in data_loader:
            src, trg = batch["vi_ids"], batch["en_ids"]
            # Teacher forcing is turned off: the decoder sees its own predictions.
            scores = model(src, trg, 0)
            # scores = [trg length, batch size, trg vocab size]
            vocab_size = scores.shape[-1]
            # Position 0 (<sos>) is never predicted, so skip it on both sides.
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)
            running_loss += criterion(flat_scores, flat_targets).item()
    return running_loss / len(data_loader)

In [None]:
# Training configuration.
n_epochs = 30
clip = 1.0
teacher_forcing_ratio = 0.5

# Lowest validation loss seen so far; a checkpoint is written whenever it improves.
best_valid_loss = float("inf")

for epoch in tqdm(range(n_epochs)):
    train_loss = train_fn(
        model, train_loader, optimizer, criterion, clip, teacher_forcing_ratio
    )
    valid_loss = evaluate_fn(model, valid_loader, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), "translate-model.pt")

    # Perplexity is reported as exp(loss).
    print(f"\tTrain Loss: {train_loss:7.3f} | Train PPL: {np.exp(train_loss):7.3f}")
    print(f"\tValid Loss: {valid_loss:7.3f} | Valid PPL: {np.exp(valid_loss):7.3f}")

In [None]:
# Restore the weights from the checkpoint file written by the training loop
# (saved whenever the validation loss improved).
model.load_state_dict(torch.load("translate-model.pt"))

# Evaluate on the held-out test split.
test_loss = evaluate_fn(model, test_loader, criterion)

# Perplexity is reported as exp(loss).
print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")

## BLEU Score Calculation

In [9]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
def calculate_bleu_score(reference, candidate):
    """
    Compute the sentence-level BLEU score of a prediction against one reference.

        :param reference: List of words in the target sentence (ground truth).
        :param candidate: List of words in the predicted sentence.

    Return: BLEU score (float)
    """
    # sentence_bleu takes a list of reference sentences; wrap the single one.
    references = [reference]
    return sentence_bleu(references, candidate)