## Source:
1.  http://nlp.seas.harvard.edu/annotated-transformer/#background
2.  https://github.com/harvardnlp/annotated-transformer/blob/master/AnnotatedTransformer.ipynb

In [6]:
import os
from os.path import exists
import math
import copy
import time

import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torchtext.data.functional import to_map_style_dataset
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets

import pandas as pd
import altair
import spacy
import GPUtil
import warnings

# Suppress all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Global switch consulted by show_example / execute_example before they run
# their callback.
RUN_EXAMPLES = True

In [7]:
# Convenience helper functions

def is_interactive_notebook():
    """Report whether this module is running as the program entry point.

    Returns:
        True when the module's ``__name__`` equals ``"__main__"`` (the code
        is being executed directly rather than imported), False otherwise.
    """
    module_name = __name__
    return module_name == "__main__"

def show_example(fn, args=()):
    """Run ``fn(*args)`` and return its result, but only when examples are enabled.

    The call is made only if this module is executed as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``. The
            default is an immutable empty tuple so that no mutable object is
            shared between calls.

    Returns:
        Whatever ``fn(*args)`` returns, or ``None`` when the call is skipped.
    """
    # Short-circuit keeps RUN_EXAMPLES from being read when not run as main.
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return None
    return fn(*args)
    
def execute_example(fn, args=()):
    """Run ``fn(*args)`` for its side effects when examples are enabled.

    The call is made only if this module is executed as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy. The result of ``fn`` is
    discarded.

    Args:
        fn: Callable to invoke.
        args: Iterable of positional arguments unpacked into ``fn``. The
            default is an immutable empty tuple so that no mutable object is
            shared between calls.

    Returns:
        Always ``None``.
    """
    # Short-circuit keeps RUN_EXAMPLES from being read when not run as main.
    if __name__ != "__main__" or not RUN_EXAMPLES:
        return
    fn(*args)
        
class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    The base ``Optimizer.__init__`` is not called; the only state set up is
    ``param_groups``, holding a single group with a learning rate of 0.
    """

    def __init__(self):
        self.param_groups = [{"lr": 0}]

    def step(self):
        """Do nothing and return ``None``."""

    def zero_grad(self, set_to_none=False):
        """Do nothing and return ``None``; ``set_to_none`` is ignored."""

class DummyScheduler:
    """Learning-rate scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Do nothing and return ``None``."""

## Model Utils

In [11]:
def clones(module, N):
    """Build a stack of ``N`` independent deep copies of ``module``.

    Args:
        module: The layer to replicate (encoder or decoder layer alike).
        N: Number of copies to produce.

    Returns:
        An ``nn.ModuleList`` containing ``N`` deep copies of ``module``; the
        original object itself is not included.
    """
    stack = nn.ModuleList()
    for _ in range(N):
        stack.append(copy.deepcopy(module))
    return stack

class LayerNorm(nn.Module):
    """Hand-written layer normalisation over the last dimension.

    Each vector along the last axis is centred on its mean and divided by
    ``std + eps``, then scaled by the learnable gain ``a_2`` and shifted by
    the learnable bias ``b_2``.
    """

    def __init__(self, features, eps=1e-6):
        """
        Args:
            features: Size of the last dimension; sets the length of the
                gain and bias vectors.
            eps: Constant added to the standard deviation before dividing.
        """
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``x`` normalised along its last dimension."""
        centred = x - x.mean(dim=-1, keepdim=True)
        spread = x.std(dim=-1, keepdim=True) + self.eps
        # Gain is applied before the division to keep the original
        # evaluation order.
        scaled = self.a_2 * centred
        return scaled / spread + self.b_2
    
class SubLayerConnection(nn.Module):
    """Residual connection around a sub-layer, with the norm applied first.

    The input is layer-normalised, passed through the sub-layer, put through
    dropout, and finally added back onto the untouched input.
    """

    def __init__(self, size, prob):
        """
        Args:
            size: Feature size handed to ``LayerNorm``.
            prob: Dropout probability.
        """
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(prob)

    def forward(self, x, sub_layer):
        """Return ``x + dropout(sub_layer(norm(x)))``.

        Args:
            x: Input to the residual branch.
            sub_layer: Callable applied to the normalised input.
        """
        normed = self.norm(x)
        branch = self.dropout(sub_layer(normed))
        return x + branch

## Overall encoder-decoder architecture of Transformer

In [8]:
# Encoder-Decoder
class EncoderDecoder(nn.Module):
    """Container wiring together embeddings, an encoder and a decoder.

    ``generator`` is stored on the instance but is not applied in
    ``forward``; the decoder output is returned as is.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        """
        Args:
            encoder: Callable taking ``(embedded_src, src_mask)``.
            decoder: Callable taking
                ``(embedded_tgt, encoder_output, src_mask, tgt_mask)``.
            src_embed: Callable applied to ``src`` before encoding.
            tgt_embed: Callable applied to ``tgt`` before decoding.
            generator: Output head kept as an attribute for callers to use.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against it; return the decoder output."""
        embedded_src = self.src_embed(src)
        memory = self.encoder(embedded_src, src_mask)
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

class Generator(nn.Module):
    """Output head: a linear projection followed by log-softmax.

    Applied to the decoder output of the transformer to obtain
    log-probabilities over the vocabulary.
    """

    def __init__(self, d_model, vocab):
        """
        Args:
            d_model: Input feature size of the projection.
            vocab: Output size of the projection.
        """
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Return log-softmax over the last dimension of the projected ``x``."""
        logits = self.proj(x)
        return log_softmax(logits, dim=-1)

## Encoder

In [13]:
class Encoder(nn.Module):
    """
    Encoder built from a stack of N encoder layers followed by a final
    layer normalisation. The encoder layer itself is defined as a separate
    module.
    """

    def __init__(self, layer, N):
        """
        Args:
            layer: Prototype encoder layer; it is deep-copied N times and
                must expose a ``size`` attribute used for the final norm.
            N: Number of layers in the stack.
        """
        # nn.Module must be initialised before sub-modules are assigned;
        # without this call the assignments below raise AttributeError.
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Pass ``x`` (with ``mask``) through every layer in order, then normalise.

        Args:
            x: Input to the first layer.
            mask: Mask forwarded unchanged to every layer.

        Returns:
            The output of the last layer after the final normalisation.
        """
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

class EncoderLayer(nn.Module):
    """
    A single encoder layer: self-attention followed by a feed-forward
    network, each wrapped in its own residual sub-layer connection.
    """

    def __init__(self, size, self_attn, feed_forward, p_dropout):
        """
        Args:
            size: Feature size, passed to the sub-layer connections and kept
                as ``self.size``.
            self_attn: Attention callable invoked as
                ``self_attn(query, key, value, mask)``.
            feed_forward: Callable applied in the second sub-layer.
            p_dropout: Dropout probability for the sub-layer connections.
        """
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, p_dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply self-attention and then the feed-forward network to ``x``."""

        def attend(normed):
            # Query, key and value are all the same (normalised) input.
            return self.self_attn(normed, normed, normed, mask)

        attention_residual = self.sublayer[0]
        feed_forward_residual = self.sublayer[1]
        attended = attention_residual(x, attend)
        return feed_forward_residual(attended, self.feed_forward)
    

## Decoder

In [14]:
class Decoder(nn.Module):
    """
    Decoder built from a stack of N decoder layers followed by a final
    layer normalisation. The decoder layer itself is defined as a separate
    module.
    """

    def __init__(self, layer, N):
        """
        Args:
            layer: Prototype decoder layer; it is deep-copied N times and
                must expose a ``size`` attribute used for the final norm.
            N: Number of layers in the stack.
        """
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, encoder_op, src_mask, tgt_mask):
        """Run ``x`` through every decoder layer in order, then normalise.

        Args:
            x: Input to the first decoder layer.
            encoder_op: Encoder output, forwarded unchanged to every layer.
            src_mask: Source mask, forwarded unchanged to every layer.
            tgt_mask: Target mask, forwarded unchanged to every layer.

        Returns:
            The output of the last layer after the final normalisation.
        """
        hidden = x
        for block in self.layers:
            hidden = block(hidden, encoder_op, src_mask, tgt_mask)
        return self.norm(hidden)
        
class DecoderLayer(nn.Module):
    """
    Definition of each Decoder Layer i.e., two Multi Head Attentions (with and without mask) and Feed Forward layer
    """
    def __init__(self, size, self_attn, feed_forward, src_attn, p_dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, p_dropout), 3)
    