# NLP Exercise 3: Seq2Seq Model and Attention Mechanisms
---

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datasets
import tqdm
import spacy

  from .autonotebook import tqdm as notebook_tqdm


Load dataset using the 'datasets' library.

In [2]:
# Load the 'ncduy/mt-en-vi' English-Vietnamese parallel corpus through the
# `datasets` library. The result is a DatasetDict keyed by split name.
dataset = datasets.load_dataset('ncduy/mt-en-vi')

In [3]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 2884451
    })
    validation: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11316
    })
    test: Dataset({
        features: ['en', 'vi', 'source'],
        num_rows: 11225
    })
})


In [4]:
# Drop the 'source' column from every split; only the 'en' / 'vi' sentence
# pairs are kept. The generator is unpacked in train / validation / test order.
train_data, val_data, test_data = (
    dataset[split_name].remove_columns('source')
    for split_name in ("train", "validation", "test")
)

# Sanity check: show the first training pair.
print(train_data[0])

{'en': "- Sorry, that question's not on here.", 'vi': '- Xin lỗi, nhưng mà ở đây không có câu hỏi đấy.'}


In [5]:
# spaCy pipelines whose `.tokenizer` is used by `tokenize` below.
# `en_nlp` handles the English side of each pair.
en_nlp = spacy.load('en_core_web_sm')
# `vi_nlp` is applied to the Vietnamese side; note it loads the
# 'xx_ent_wiki_sm' package rather than a Vietnamese-specific one.
vi_nlp = spacy.load('xx_ent_wiki_sm')

In [6]:
def tokenize(example, en_nlp, vi_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize one English/Vietnamese sentence pair.

    Args:
        example: Mapping with the raw sentences under the keys "en" and "vi".
        en_nlp: Object whose ``tokenizer(text)`` yields items with a ``.text``
            attribute; used for the English sentence.
        vi_nlp: Same as ``en_nlp`` but used for the Vietnamese sentence.
        max_length: Maximum number of tokens kept per sentence. The limit is
            applied before the start/end markers are added.
        lower: If truthy, every kept token is lower-cased.
        sos_token: Marker prepended to each token list.
        eos_token: Marker appended to each token list.

    Returns:
        dict with the keys "en_tokens" and "vi_tokens", each a list of strings
        of the form ``[sos_token, ..., eos_token]``.
    """
    result = {}
    per_language = (
        ("en", "en_tokens", en_nlp),
        ("vi", "vi_tokens", vi_nlp),
    )
    for text_key, tokens_key, nlp in per_language:
        # Tokenize first, then truncate to the length budget.
        words = [tok.text for tok in nlp.tokenizer(example[text_key])]
        words = words[:max_length]
        if lower:
            words = [word.lower() for word in words]
        # The sequence markers are not counted against max_length.
        result[tokens_key] = [sos_token, *words, eos_token]
    return result

In [8]:
# Preprocessing settings shared by all three splits.
max_length = 1000  # token budget per sentence, applied before <sos>/<eos> are added
lower = True       # lower-case every token
sos_token, eos_token = "<sos>", "<eos>"

# Keyword arguments forwarded to `tokenize` for every example.
fn_kwargs = dict(
    en_nlp=en_nlp,
    vi_nlp=vi_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Add the 'en_tokens' / 'vi_tokens' columns to each split.
# NOTE(review): the tokenized validation split is bound to `valid_data`, while
# `val_data` keeps pointing at the untokenized split. `train_data` and
# `test_data` are rebound in place. Confirm which name later cells expect.
train_data = train_data.map(tokenize, fn_kwargs=fn_kwargs)
valid_data = val_data.map(tokenize, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 2884451/2884451 [06:21<00:00, 7568.76 examples/s]
Map: 100%|██████████| 11316/11316 [00:01<00:00, 7288.69 examples/s]
Map: 100%|██████████| 11225/11225 [00:01<00:00, 7360.61 examples/s]


In [19]:
# Spot-check one tokenized example picked at random from the first 1,000
# training rows (random.randint is inclusive on both ends: indices 0..999).
# No random seed is set in the visible cells, so the row shown differs per run.
print(train_data[random.randint(0, 999)])

{'en': 'He put it on a personal basis.', 'vi': 'Ông đã dựa trên cơ sở bản thân để nói điều đó.', 'en_tokens': ['<sos>', 'he', 'put', 'it', 'on', 'a', 'personal', 'basis', '.', '<eos>'], 'vi_tokens': ['<sos>', 'ông', 'đã', 'dựa', 'trên', 'cơ', 'sở', 'bản', 'thân', 'để', 'nói', 'điều', 'đó', '.', '<eos>']}


In [22]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordLevelTrainer

In [23]:
def _train_word_level_tokenizer(token_lists):
    """Train a fresh word-level tokenizer on an iterable of token lists.

    Args:
        token_lists: Iterable yielding lists of string tokens, e.g. one
            tokenized column of the training split.

    Returns:
        (tokenizer, trainer): the trained ``Tokenizer`` and the
        ``WordLevelTrainer`` that was used to train it.
    """
    # Initialize tokenizer: unknown words map to "<unk>".
    word_tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
    word_tokenizer.pre_tokenizer = Whitespace()

    # Trainer settings: reserve the special tokens and drop words seen only once.
    word_trainer = WordLevelTrainer(
        special_tokens=["<unk>", "<pad>", "<sos>", "<eos>"], min_frequency=2
    )

    word_tokenizer.train_from_iterator(token_lists, word_trainer)
    return word_tokenizer, word_trainer


# Train one tokenizer per language. Previously a single tokenizer was trained
# on the English tokens only and its vocabulary was used for both `en_vocab`
# and `vi_vocab`, so the "Vietnamese" vocabulary was a copy of the English one.
# `tokenizer` / `trainer` keep their original meaning (English side).
tokenizer, trainer = _train_word_level_tokenizer(train_data["en_tokens"])
vi_tokenizer, vi_trainer = _train_word_level_tokenizer(train_data["vi_tokens"])

# Get vocabularies (token -> id mappings), one per language.
en_vocab = tokenizer.get_vocab()
vi_vocab = vi_tokenizer.get_vocab()

## Define the Encoder and Decoder for Seq2Seq without Attention

# Encoder

- The encoder reads the input sequence and summarizes its information in internal state vectors, also called context vectors. The context vector aims to encapsulate the information from all input elements so that the decoder can make accurate predictions.
- This implementation involves creating an RNN-based encoder.

In [24]:
class Encoder(nn.Module):
    """LSTM encoder that compresses a source sequence into its final states.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        embedding_dim: Dimensionality of the token embeddings.
        hidden_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to
            ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Kept as attributes so Seq2Seq can check encoder/decoder compatibility.
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token ids, shape [src length, batch size].

        Returns:
            (hidden, cell): final LSTM states, each of shape
            [n layers, batch size, hidden dim].
        """
        # [src length, batch size] -> [src length, batch size, embedding dim]
        token_vectors = self.embedding(src)
        rnn_input = self.dropout(token_vectors)
        # The per-step outputs of the top layer are not needed here; only the
        # final hidden and cell states are handed to the decoder.
        _, final_state = self.rnn(rnn_input)
        hidden, cell = final_state
        return hidden, cell

Decoder

In [25]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: Size of the target vocabulary (also the width of the
            prediction scores).
        embedding_dim: Dimensionality of the token embeddings.
        hidden_dim: Hidden size of the LSTM.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used on the embeddings and passed to
            ``nn.LSTM`` as its ``dropout`` argument.
    """

    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        # Exposed so Seq2Seq can size its output buffer and validate shapes.
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Args:
            input: Current token ids, shape [batch size].
            hidden: Previous hidden state, shape [n layers, batch size, hidden dim].
            cell: Previous cell state, shape [n layers, batch size, hidden dim].

        Returns:
            (prediction, hidden, cell) where prediction has shape
            [batch size, output dim] and the states keep their input shapes.
        """
        # Add a sequence axis of length 1: [batch size] -> [1, batch size]
        step_tokens = input.unsqueeze(0)
        # [1, batch size] -> [1, batch size, embedding dim]
        step_embedded = self.dropout(self.embedding(step_tokens))
        # The LSTM is unidirectional and sees one time step, so:
        # rnn_out = [1, batch size, hidden dim]
        # hidden / cell = [n layers, batch size, hidden dim]
        rnn_out, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))
        # Drop the length-1 sequence axis and project to vocabulary scores.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden, cell

Seq2Seq Model without Attention

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper without attention.

    The encoder's final hidden and cell states initialise the decoder, which
    then produces the target sequence one token at a time.

    Args:
        encoder: Module returning ``(hidden, cell)`` for a source batch and
            exposing ``hidden_dim`` and ``n_layers``.
        decoder: Module mapping ``(input, hidden, cell)`` to
            ``(prediction, hidden, cell)`` and exposing ``hidden_dim``,
            ``n_layers`` and ``output_dim``.
        device: Device on which the output buffer is allocated.

    Raises:
        AssertionError: If encoder and decoder disagree on hidden size or
            number of layers.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        # The encoder states are fed to the decoder unchanged, so their shapes
        # have to line up exactly.
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio):
        """Decode ``trg`` step by step, conditioned on ``src``.

        Args:
            src: Source token ids, shape [src length, batch size].
            trg: Target token ids, shape [trg length, batch size]; row 0 holds
                the <sos> tokens.
            teacher_forcing_ratio: Probability, per step, of feeding the
                ground-truth token as the next input instead of the model's
                own prediction (0.75 means ground truth about 75% of the time).

        Returns:
            Tensor of shape [trg length, batch size, output dim] holding the
            decoder scores for each step. Row 0 is never written and stays zero.
        """
        trg_length, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer for the per-step decoder scores.
        outputs = torch.zeros(trg_length, batch_size, vocab_size, device=self.device)

        # Final encoder states become the decoder's initial states.
        hidden, cell = self.encoder(src)

        # Decoding starts from the <sos> row of the target.
        input = trg[0, :]
        for t in range(1, trg_length):
            # One decoder step: scores [batch size, output dim] plus new states.
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            # One coin flip per time step, shared by the whole batch.
            if random.random() < teacher_forcing_ratio:
                # Teacher forcing: feed the ground-truth token.
                input = trg[t]
            else:
                # Otherwise feed the highest-scoring predicted token.
                input = output.argmax(1)
        return outputs

## Define the Attention Mechanism and Decoder with Attention

Decoder with Attention

Define the Seq2Seq Model with Attention

## BLEU Score Calculation

In [9]:
from nltk.translate.bleu_score import sentence_bleu

In [31]:
def calculate_bleu_score(reference, candidate, smoothing_function=None):
    """Calculate the BLEU score for one reference/candidate sentence pair.

    Args:
        reference: List of words in the target sentence (ground truth).
        candidate: List of words in the predicted sentence.
        smoothing_function: Optional smoothing callable forwarded to
            ``nltk.translate.bleu_score.sentence_bleu`` (for example a method
            of ``SmoothingFunction()``). The default ``None`` keeps the
            unsmoothed score, which collapses to a near-zero value whenever a
            short candidate has no higher-order n-gram overlap with the
            reference.

    Returns:
        float: Sentence-level BLEU score.
    """
    # sentence_bleu expects a list of references, so wrap the single one.
    return sentence_bleu(
        [reference], candidate, smoothing_function=smoothing_function
    )

# Example usage: score two hand-written candidates against one reference.
reference = "I am learning NLP".split()
candidate_seq2seq = "I am learni NLP".split()  # Example output without attention
candidate_with_attention = "I am learning NLP".split()  # Example output with attention

# Score both candidates against the same reference, in the order listed.
bleu_seq2seq, bleu_with_attention = (
    calculate_bleu_score(reference, hypothesis)
    for hypothesis in (candidate_seq2seq, candidate_with_attention)
)

print(f"BLEU score for Seq2Seq without attention: {bleu_seq2seq}")
print(f"BLEU score for Seq2Seq with attention: {bleu_with_attention}")


BLEU score for Seq2Seq without attention: 1.0547686614863434e-154
BLEU score for Seq2Seq with attention: 1.0
