In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

import torchtext

from torch.utils.data import DataLoader

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker


from dataset import nmtDataset
import helpers as utils

import spacy
import numpy as np

import random
import math
import time

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Logger comes from the project's `helpers` module (imported as `utils`).
# NOTE(review): presumably it writes run output to this file — confirm in helpers.py.
log = utils.Logger("logs/transformers.out")

In [3]:
# Hyper-parameters for the run, kept in one place so later cells can look them up by key.
hyp_params = dict(
    batch_size=64,
    num_epochs=10,
    # --- Encoder ---
    encoder_embedding_size=512,
    # Dropout is disabled (0). The original note attributes this to using a
    # single-layer LSTM.
    # NOTE(review): this notebook builds a Transformer — confirm 0 is still intended.
    encoder_dropout=0,
    # --- Decoder ---
    decoder_dropout=0,
    decoder_embedding_size=512,
    # --- Shared by encoder and decoder ---
    hidden_size=512,
    num_layers=1,
)

In [4]:
SEED = 1234


def _seed_everything(seed):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) RNGs and request deterministic cuDNN."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


_seed_everything(SEED)

In [5]:
# Load the Multi30k splits. The validation and test datasets receive the training
# dataset as a third argument.
# NOTE(review): presumably so they reuse the training vocabularies — confirm in dataset.py.
nmtds_train = nmtDataset('datasets/Multi30k/', 'train')
nmtds_valid = nmtDataset('datasets/Multi30k/', 'val', nmtds_train)
nmtds_test = nmtDataset('datasets/Multi30k/', 'test', nmtds_train)

# Vocabulary indices of the padding token on each side.
SRC_PAD_IDX = nmtds_train.src_vocab["<pad>"]
TRG_PAD_IDX = nmtds_train.trg_vocab["<pad>"]


def collate_batch(batch):
    """Collate one batch of dataset samples by delegating to ``utils.collate_fn``.

    Args:
        batch: the list of samples the DataLoader drew for this batch.

    Returns:
        Whatever ``utils.collate_fn(batch, SRC_PAD_IDX, device)`` returns.
    """
    # NOTE(review): only SRC_PAD_IDX is forwarded (as in the original code); this
    # assumes source and target share the same pad index — confirm in helpers.py.
    return utils.collate_fn(batch, SRC_PAD_IDX, device)


# The batch size is read from hyp_params so the config cell is the single source of truth.
train_dataloader = DataLoader(
    nmtds_train,
    batch_size=hyp_params["batch_size"],
    shuffle=True,
    collate_fn=collate_batch,
)

# NOTE(review): shuffling the validation set is kept from the original; it is
# usually unnecessary for evaluation.
valid_dataloader = DataLoader(
    nmtds_valid,
    batch_size=hyp_params["batch_size"],
    shuffle=True,
    collate_fn=collate_batch,
)

In [385]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal positional encodings to token embeddings.

    Implements the formulation from "Attention Is All You Need":

        PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
        PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))

    so that every (sin, cos) pair of dimensions shares one frequency.

    Args:
        d_model: size of the embedding dimension.
        dropout: dropout probability applied to the sum of embedding and encoding.
        maxlen: number of positions to precompute. Inputs longer than this are
            rejected in ``forward``.
    """

    def __init__(self, d_model, dropout, maxlen = 12):
        super(PositionalEncoding, self).__init__()

        # All possible positions (indices) 0, 1, 2, ... maxlen - 1
        # Shape (pos) --> [max len, 1]
        pos = torch.arange(0, maxlen).unsqueeze(1)

        # One denominator per (sin, cos) pair: 10000^(2i / d_model).
        # Shape (den) --> [ceil(d_model / 2)]
        den = 10000 ** (torch.arange(0, d_model, 2) / d_model)

        pos_encoding = torch.zeros((maxlen, d_model))

        # Even dimensions get the sine, odd dimensions the cosine of the SAME angle.
        pos_encoding[:, 0::2] = torch.sin(pos / den)
        # For odd d_model there is one fewer odd column than even ones, hence the slice.
        pos_encoding[:, 1::2] = torch.cos(pos / den[: d_model // 2])

        # Shape (pos_encoding) --> [max len, d_model]
        pos_encoding = pos_encoding.unsqueeze(-2)
        # Shape (pos_encoding) --> [max len, 1, d_model]; the middle axis broadcasts over the batch

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: it is moved with the module by .to(device) and
        # saved in state_dict, but it is not a trainable parameter.
        self.register_buffer('pos_encoding', pos_encoding)

    def forward(self, token_embedding):
        """Add positional encodings to ``token_embedding`` and apply dropout.

        Args:
            token_embedding: tensor of shape [sentence len, batch size, d_model].

        Returns:
            Tensor of the same shape as ``token_embedding``.

        Raises:
            RuntimeError: if the sentence length exceeds the precomputed ``maxlen``.
        """
        seq_len = token_embedding.size(0)
        max_len = self.pos_encoding.size(0)
        if seq_len > max_len:
            # Fail with an explicit message rather than an opaque broadcasting error.
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the maximum length {max_len} "
                "supported by PositionalEncoding; construct it with a larger maxlen."
            )

        # Only the encodings of the first `seq_len` positions are needed for this batch.
        return self.dropout(token_embedding + self.pos_encoding[:seq_len])
    
class InputEmbedding(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model)."""

    def __init__(self, vocab_size, d_model):
        super().__init__()

        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, tokens):
        """Embed ``tokens`` and multiply the result by sqrt(d_model).

        Shapes:
            tokens --> [sentence len, batch size]
            output --> [sentence len, batch size, d_model]
        """
        # The embedding lookup needs integer (long) indices.
        token_ids = tokens.to(torch.long)
        scale = math.sqrt(self.d_model)
        return scale * self.embedding(token_ids)

In [386]:
class Seq2SeqTransformer(nn.Module):
    """Embedding front-end of a sequence-to-sequence Transformer.

    Holds separate source and target token embeddings plus a shared positional
    encoding. ``forward`` currently only embeds and positionally encodes the
    target tokens; ``src_inp_emb`` is constructed but not yet used.

    Args:
        src_vocab_size: number of entries in the source embedding table.
        trg_vocab_size: number of entries in the target embedding table.
        d_model: embedding dimension shared by both embeddings.
        dropout: dropout probability passed to the positional encoding.
    """

    def __init__(self, src_vocab_size, trg_vocab_size, d_model, dropout):
        super(Seq2SeqTransformer, self).__init__()

        self.src_inp_emb = InputEmbedding(src_vocab_size, d_model)
        self.trg_inp_emb = InputEmbedding(trg_vocab_size, d_model)

        # PositionalEncoding is built with its default maxlen, which bounds the
        # sentence length this model can accept.
        self.positional_encoding = PositionalEncoding(d_model, dropout=dropout)

    def forward(self, trg):
        """Return the positionally encoded target embeddings.

        Args:
            trg: target token indices, shape [sentence len, batch size].

        Returns:
            Tensor of shape [sentence len, batch size, d_model].
        """
        # The result was previously computed and discarded, so forward returned None.
        return self.positional_encoding(self.trg_inp_emb(trg))
        

In [387]:
# Small throw-away model for a smoke test; the sizes are arbitrary placeholders.
model = Seq2SeqTransformer(
    src_vocab_size=3434,
    trg_vocab_size=1223,
    d_model=2,
    dropout=0.5,
)

In [384]:
# Smoke test with random *integer* token ids of shape [sentence len = 7, batch size = 128].
# torch.rand would produce floats in [0, 1), which the embedding's .long() cast
# truncates to 0 for every position, so only token id 0 would be exercised.
model(torch.randint(0, model.trg_inp_emb.embedding.num_embeddings, (7, 128)))