In [1]:
import random
import time
import math
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from dataset import nmtDataset
import helpers as utils

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Seeding for consistency in reproducibility
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [4]:
hyp_params = {
    "batch_size": 128,
    "num_epochs": 10,

    # Encoder parameters
    "encoder_embedding_size": 512,
    "encoder_dropout": 0.5,

    # Decoder parameters
    "decoder_dropout": 0.5,
    "decoder_embedding_size": 512,

    # Common parameters
    "hidden_size": 512,
    "num_layers": 2
}

In [5]:
log = utils.Logger('logs/emd512-enc2-dec2-bilstm.out')

In [6]:
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[+] Download and installation successful
You can now load the package via spacy.load('de_core_news_sm')


In [7]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        
        self.dropout = nn.Dropout(dropout)
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.LSTM = nn.LSTM(embedding_dim, hidden_size, num_layers, dropout=dropout, bidirectional = True)
        
    def forward(self, x):
        # Shape (embedding) --> [Sequence_length , batch_size , embedding dims]
        embedding = self.dropout(self.embedding(x))
        
        # ************** Multiplied by 2 because of bi-directional LSTM
        # Shape --> (output) [Sequence_length , batch_size , hidden_size * 2]
        # Shape --> (hs, cs) [num_layers * 2, batch_size size, hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedding)
        
        return hidden_state, cell_state
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, dropout, output_size):
        super(Decoder, self).__init__()
        
        self.dropout = nn.Dropout(dropout)
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
         # ************** Multiplying 2 because of bi-directional LSTM
        self.LSTM = nn.LSTM(embedding_dim, hidden_size, num_layers * 2, dropout=dropout)
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x, hidden_state, cell_state):
        # As we are not feeding whole sentence we will each token a time
        # hence our sequence length would be just 1 however shape of x is batch_size
        # to add sequence length we will unsequeeze it
        # Shape (x) --> [batch_size] (see seq2seq model) so making it [1, batch_size]
        x = x.unsqueeze(0)
        
        # Shape (embedded) --> (1, batch_size, embedding dims)
        embedded = self.dropout(self.embedding(x))
        
        # Shape (outputs) --> (1, 32, 1024) [1, batch_size , hidden_size]
        # Shape (hs, cl) --> (2, 32, 1024)  [num_layers * 2, batch_size , hidden_size]
        outputs, (hidden_state, cell_state) = self.LSTM(embedded, (hidden_state, cell_state))

        # Shape (outputs) -->  (1, batch_size, hidden_size)
        # Shape (outputs.squeeze(0)) -->  (batch_size, hidden_size)
        # Shape (predictions) --> (batch_size, target_vocab_size)
        predictions = self.fc(outputs.squeeze(0))
        
        return predictions, hidden_state, cell_state

class SeqtoSeq(nn.Module):
    def __init__(self, gen_params, target_vocab, device):
        super(SeqtoSeq, self).__init__()

        self.Encoder = Encoder(gen_params["input_size_encoder"],
                          gen_params["encoder_embedding_size"],
                          gen_params["hidden_size"],
                          gen_params["num_layers"],
                          gen_params["encoder_dropout"]).to(device)

        self.Decoder = Decoder(gen_params["input_size_decoder"],
                          gen_params["decoder_embedding_size"],
                          gen_params["hidden_size"],
                          gen_params["num_layers"],
                          gen_params["decoder_dropout"],
                          gen_params["output_size"]).to(device)

        self.target_vocab = target_vocab
        self.device = device
    
    def forward(self, source, src_lens, target, tfr=0.5):
        # Shape -> (Sentence length, Batch_size)
        batch_size = source.shape[1]

        target_len = target.shape[0]  # Length of target sentences
        target_vocab_size = len(self.target_vocab)
        
        # here we will store all the outputs
        # so outputs is arrange in a way that sentences are in column and batch size is row and every element
        # will consist of probability of each word from the vocab
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(self.device)

        # Shape --> (hs, cs) (num_layers * 2, batch_size size, hidden_size) (contains encoder's hs, cs - context vectors)
        hidden_state, cell_state = self.Encoder(source)

        # Shape (target) -> (Sentence length, Batch_size)
        # Shape (x) --> (batch_size)
        x = target[0]  # First token (Trigger)
        
        for i in range(1, target_len):
            # Shape (output) --> (batch_size, target_vocab_size)
            # Shape (hs, cl) --> (num_layers * 2, batch_size , hidden_size)
            output, hidden_state, cell_state = self.Decoder(x, hidden_state, cell_state)
            outputs[i] = output
            best_guess = output.argmax(1)  # 0th dimension is batch size, 1st dimension is word embedding
            # Schedule sampling
            x = target[
                i] if random.random() < tfr else best_guess  # Either pass the next word correctly from the dataset
            # or use the earlier predicted word

        # Shape --> (sentence length, batch size, vocab size)
        return outputs

In [8]:
nmtds_train = nmtDataset('datasets/Multi30k/', 'train')
nmtds_valid = nmtDataset('datasets/Multi30k/', 'val', nmtds_train)
nmtds_test = nmtDataset('datasets/Multi30k/', 'test', nmtds_train)

pad_idx = nmtds_train.trg_vocab["<pad>"]

In [9]:
train_dataloader = DataLoader(nmtds_train, batch_size=hyp_params['batch_size'], shuffle=True,
                              collate_fn=lambda batch_size: utils.collate_fn(batch_size, pad_idx, device))

valid_dataloader = DataLoader(nmtds_valid, batch_size=hyp_params['batch_size'], shuffle=True,
                              collate_fn=lambda batch_size: utils.collate_fn(batch_size, pad_idx, device))

In [10]:
hyp_params["input_size_encoder"] = len(nmtds_train.src_vocab)
hyp_params["input_size_decoder"] = len(nmtds_train.trg_vocab)
hyp_params["output_size"] = len(nmtds_train.trg_vocab)

model = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)
optimizer = optim.Adam(model.parameters())

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(device)

In [11]:
min_el = math.inf
patience = 1
best_model = {}
best_epoch = 0

epoch_loss = 0
for epoch in range(hyp_params["num_epochs"]):
    start = time.time()
    
    epoch_loss = utils.train_model(model, train_dataloader, criterion, optimizer)
    eval_loss = utils.evaluate_model(model, valid_dataloader, criterion)
    
    log.log(f"Epoch: {epoch+1}, Train loss: {epoch_loss}, Eval loss: {eval_loss}, patience: {patience}. Time {time.time() - start}")

    
    if eval_loss < min_el:
        best_epoch = epoch+1
        min_el = eval_loss
        best_model = copy.deepcopy(model)
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'eval_loss': min_el
        }, 'model-bilstm.pt')
        patience = 1
    else:
        patience += 1
    
    if patience == 10:
        log.log("[STOPPING] Early stopping in action..")
        log.log(f"Best epoch was {best_epoch} with {min_el} eval loss")
        break

log.close()

100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:37<00:00,  6.06it/s]


Epoch: 1, Train loss: 5.234851837158203, Eval loss: 4.6997880935668945, patience: 1. Time 38.24520540237427


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:36<00:00,  6.29it/s]


Epoch: 2, Train loss: 4.6150126457214355, Eval loss: 4.345252513885498, patience: 1. Time 36.634130239486694


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.34it/s]


Epoch: 3, Train loss: 4.262725830078125, Eval loss: 4.234218597412109, patience: 1. Time 36.38913941383362


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.33it/s]


Epoch: 4, Train loss: 3.9060938358306885, Eval loss: 4.087944984436035, patience: 1. Time 36.37311601638794


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:36<00:00,  6.30it/s]


Epoch: 5, Train loss: 3.684907913208008, Eval loss: 3.9035472869873047, patience: 1. Time 36.553874015808105


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:36<00:00,  6.26it/s]


Epoch: 6, Train loss: 3.521395683288574, Eval loss: 3.8680381774902344, patience: 1. Time 36.788660526275635


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.37it/s]


Epoch: 7, Train loss: 3.381364583969116, Eval loss: 3.8179566860198975, patience: 1. Time 36.15723180770874


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.36it/s]


Epoch: 8, Train loss: 3.2475948333740234, Eval loss: 3.800525188446045, patience: 1. Time 36.21805024147034


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.34it/s]


Epoch: 9, Train loss: 3.146925926208496, Eval loss: 3.7838470935821533, patience: 1. Time 36.34213995933533


100%|████████████████████████████████████████████████████████████████████████████████| 227/227 [00:35<00:00,  6.33it/s]


Epoch: 10, Train loss: 3.0509862899780273, Eval loss: 3.79226016998291, patience: 1. Time 36.379178285598755


In [12]:
model_l = SeqtoSeq(hyp_params, target_vocab=nmtds_train.trg_vocab, device=device)
model_l.load_state_dict(torch.load('model-bilstm.pt', map_location=device)["model_state_dict"])
model_l.eval()

SeqtoSeq(
  (Encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(5893, 512)
    (LSTM): LSTM(512, 512, num_layers=2, dropout=0.5, bidirectional=True)
  )
  (Decoder): Decoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(7853, 512)
    (LSTM): LSTM(512, 512, num_layers=4, dropout=0.5)
    (fc): Linear(in_features=512, out_features=7853, bias=True)
  )
  (target_vocab): Vocab()
)

In [13]:
utils.bleu(model_l, nmtds_test, False, device)

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:08<00:00, 113.71it/s]


0.2086530178785324