## Seq2Seq Model

### Data loaders

The final step of preparing the data is to create the data loaders.
Our goal is to return a batch of data, each batch being a dictionary containing the numericalized sentences (which have also been padded) as PyTorch tensors.

In [1]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import tqdm
import numpy as np
import sentencepiece as spm
import itertools
from torch import nn, optim
from torch.utils.data import DataLoader

In [2]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_src_ids = [example["src_ids"] for example in batch]
        batch_tgt_ids = [example["tgt_ids"] for example in batch]
        batch_src_ids = nn.utils.rnn.pad_sequence(batch_src_ids, padding_value=pad_index)
        batch_tgt_ids = nn.utils.rnn.pad_sequence(batch_tgt_ids, padding_value=pad_index)
        batch = {
            "src_ids": batch_src_ids,
            "tgt_ids": batch_tgt_ids,
        }
        return batch

    return collate_fn

Next, we write the functions which give us our data loaders creating using PyTorch's DataLoader class.
get_data_loader is created using a Dataset, the batch size, the padding token index (which is used for creating the batches in the collate_fn, and a boolean deciding if the examples should be shuffled at the time the data loader is iterated over.
The batch size defines the maximum amount of examples within a batch. If the length of the dataset is not evenly divisible by the batch size then the last batch will be smaller.

In [3]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
    )
    return data_loader

In [4]:
model_name = 'bpe_8000'

In [6]:
train_data = torch.load("data/tokenized_data/bpe_8000_train.pt")
dev_data = torch.load("data/tokenized_data/bpe_8000_dev.pt")
test_data = torch.load("data/tokenized_data/bpe_8000_test.pt")

In [7]:
model_filename = 'tokenizer_models/' + model_name + '.model'
sp = spm.SentencePieceProcessor(model_file=model_filename)

In [8]:
#


# Generate all combinations of hyperparameters
param_combinations = list(itertools.product(*param_grid.values()))


### Building  a model
#### Encoder
convert words into vectors

* input_dim is the input (source) vocabulary size
* embedding_dim is the dimensionality of the embedding layer.
* hidden_dim is the dimensionality of the hidden and cell states.
* n_layers is the number of layers in the RNN.
* dropout is the amount of dropout to use to prevent overfitting.

In [9]:
class Encoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [10]:
class Decoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(output_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

In [11]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        assert (
            encoder.hidden_dim == decoder.hidden_dim
        ), "Hidden dimensions of encoder and decoder must be equal!"
        assert (
            encoder.n_layers == decoder.n_layers
        ), "Encoder and decoder must have equal number of layers!"

    def forward(self, src, tgt, teacher_forcing_ratio):
        batch_size = tgt.shape[1]
        tgt_length = tgt.shape[0]
        tgt_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(tgt_length, batch_size, tgt_vocab_size).to(self.device)
        hidden, cell = self.encoder(src)
        input = tgt[0, :]
        # input = [batch size]
        for t in range(1, tgt_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[t] if teacher_force else top1
        return outputs

In [12]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [13]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [14]:
pad_index = 0
criterion = nn.CrossEntropyLoss(ignore_index=pad_index)

In [15]:
def train_fn(
    model, data_loader, optimizer, criterion, clip, teacher_forcing_ratio, device
):
    model.train()
    epoch_loss = 0
    for i, batch in enumerate(data_loader):
        src = batch["src_ids"].to(device)
        tgt = batch["tgt_ids"].to(device)
        # src = [src length, batch size]
        # tgt = [tgt length, batch size]
        optimizer.zero_grad()
        output = model(src, tgt, teacher_forcing_ratio)
        # output = [tgt length, batch size, tgt vocab size]
        output_dim = output.shape[-1]
        output = output[1:].view(-1, output_dim)
        # output = [(tgt length - 1) * batch size, tgt vocab size]
        tgt = tgt[1:].view(-1)
        # tgt = [(tgt length - 1) * batch size]
        loss = criterion(output, tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [16]:
def evaluate_fn(model, data_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            src = batch["src_ids"].to(device)
            tgt = batch["tgt_ids"].to(device)
            # src = [src length, batch size]
            # tgt = [tgt length, batch size]
            output = model(src, tgt, 0)  # turn off teacher forcing
            # output = [tgt length, batch size, tgt vocab size]
            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            # output = [(tgt length - 1) * batch size, tgt vocab size]
            tgt = tgt[1:].view(-1)
            # tgt = [(tgt length - 1) * batch size]
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(data_loader)

In [18]:
input_dim = sp.get_piece_size()
output_dim = sp.get_piece_size()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameter tuning loop
best_valid_loss = float('inf')
best_params = None

for params in param_combinations:
    batch_size, embedding_dim, hidden_dim, n_layers, dropout, lr, tf_ratio, clip, n_epochs = params

    print(f"Training with parameters: {params}")

    # Update DataLoader with new batch size
    train_data_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=get_collate_fn)
    dev_data_loader = DataLoader(dev_data, batch_size=batch_size, collate_fn=get_collate_fn)

    # Update model with new hyperparameters
    encoder = Encoder(input_dim, embedding_dim, hidden_dim, n_layers, dropout)
    decoder = Decoder(output_dim, embedding_dim, hidden_dim, n_layers, dropout)
    model = Seq2Seq(encoder, decoder, device).to(device)

    # Initialize model weights
    model.apply(init_weights)

    # Update optimizer with new learning rate
    optimizer = optim.Adam(model.parameters())
    print(f"The model has {count_parameters(model):,} trainable parameters")

    # Train and evaluate the model
    train_losses = []
    valid_losses = []

    for epoch in tqdm.tqdm(range(n_epochs)):
        train_loss = train_fn(model, train_data_loader, optimizer, criterion, clip, tf_ratio, device)
        valid_loss = evaluate_fn(model, dev_data_loader, criterion, device)

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_params = params
            torch.save(model.state_dict(), "best_model.pt")

        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {np.exp(train_loss):.3f}')
        print(f'\tValid Loss: {valid_loss:.3f} | Valid PPL: {np.exp(valid_loss):.3f}')

print(f"Best parameters: {best_params}")

Training with parameters: (128, 256, 256, 2, 0.3, 0.001, 0.25, 0.5, 10)
The model has 8,257,344 trainable parameters


  0%|          | 0/10 [00:00<?, ?it/s]


TypeError: 'function' object is not subscriptable

In [None]:
test_data_loader = DataLoader(test_data, batch_size=batch_size)
model.load_state_dict(torch.load("gec-model_2.pt"))

test_loss = evaluate_fn(model, test_data_loader, criterion, device)

print(f"| Test Loss: {test_loss:.3f} | Test PPL: {np.exp(test_loss):7.3f} |")