<a href="https://colab.research.google.com/github/manikTharaka/annotated-transformer-impl/blob/main/annotated_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

In [None]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [None]:
def is_interactive_notebook():
    """Report whether this module is executing as the top-level script.

    Returns:
        bool: True when ``__name__`` equals ``"__main__"``, False otherwise.
    """
    running_as_main = "__main__" == __name__
    return running_as_main


def show_example(fn, args=()):
    """Run ``fn(*args)`` and return its result when examples are enabled.

    The call only happens when the module runs as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            empty tuple (immutable, so the default cannot be shared and
            mutated across calls).

    Returns:
        Whatever ``fn`` returns, or None when the example is skipped.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)
    return None


def execute_example(fn, args=()):
    """Run ``fn(*args)`` for its side effects when examples are enabled.

    The call only happens when the module runs as ``__main__`` and the
    module-level ``RUN_EXAMPLES`` flag is truthy. The result of ``fn`` is
    discarded.

    Args:
        fn: Callable to invoke.
        args: Positional arguments unpacked into ``fn``. Defaults to an
            empty tuple (immutable, so the default cannot be shared and
            mutated across calls).

    Returns:
        None, always.
    """
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    """Optimizer stand-in whose ``step`` and ``zero_grad`` do nothing.

    ``torch.optim.Optimizer.__init__`` is not invoked; the only state set
    up is a single parameter group with a learning rate of 0.
    """

    def __init__(self):
        placeholder_group = {"lr": 0}
        self.param_groups = [placeholder_group]

    def step(self):
        """Do nothing and return None."""
        return None

    def zero_grad(self, set_to_none=False):
        """Do nothing and return None; ``set_to_none`` is ignored."""
        return None


class DummyScheduler:
    """Scheduler stand-in whose ``step`` does nothing."""

    def step(self):
        """Do nothing and return None."""
        return None

In [None]:
class EncoderDecoder(nn.Module):
    """Container that wires an encoder, a decoder and their embeddings.

    Args:
        encoder: Callable invoked as ``encoder(src_embed(src), src_mask)``.
        decoder: Callable invoked as
            ``decoder(tgt_embed(tgt), memory, src_mask, tgt_mask)``.
        src_embed: Callable applied to ``src`` before encoding.
        tgt_embed: Callable applied to ``tgt`` before decoding.
        generator: Stored on the instance; not called by this class.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the encoder output."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Return ``encoder(src_embed(src), src_mask)``."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Return ``decoder(tgt_embed(tgt), memory, src_mask, tgt_mask)``."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [None]:
class Generator(nn.Module):
    """Linear projection followed by a log-softmax.

    Args:
        d_model: Size of the last dimension of the input.
        vocab: Size of the last dimension of the output.
    """

    def __init__(self, d_model, vocab) -> None:
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Project ``x`` to ``vocab`` features and return log-probabilities.

        The log-softmax is taken over the last dimension (the one produced
        by the projection), so the result is normalized per position for
        inputs of any rank. For 2-D inputs this equals the previous
        ``dim=1`` behavior.
        """
        return log_softmax(self.proj(x), dim=-1)

In [None]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

class Encoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a ``LayerNorm``.

    Args:
        layer: Module to replicate; its ``size`` attribute sets the width
            of the final normalization.
        N: Number of copies to stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through each layer, then normalize."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

In [None]:
class LayerNorm(nn.Module):
    """Normalize over the last dimension with a learned scale and shift.

    Args:
        features: Length of the learned scale (``a_2``) and shift (``b_2``)
            vectors.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return ``a_2 * (x - mean) / (std + eps) + b_2`` along the last axis."""
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

In [None]:
class SubLayerConnection(nn.Module):
    """Residual connection around a sublayer, with the norm applied first.

    Args:
        size: Feature size handed to ``LayerNorm``.
        dropout: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        # Must be named ``dropout``: ``forward`` reads ``self.dropout``.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``.

        Args:
            x: Input tensor.
            sublayer: Callable applied to the normalized input.
        """
        return x + self.dropout(sublayer(self.norm(x)))

In [None]:
class EncoderLayer(nn.Module):
    """Self-attention followed by a feed-forward step, each wrapped in a
    ``SubLayerConnection``.

    Args:
        size: Feature size; stored as ``self.size`` and passed to the
            sublayer connections.
        self_attn: Callable invoked as ``self_attn(x, x, x, mask)``.
        feed_forward: Callable invoked with a single tensor.
        dropout: Dropout value passed to the sublayer connections.
    """

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attention = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, dropout), 2)

    def forward(self, x, mask):
        """Apply the attention sublayer, then the feed-forward sublayer."""

        def attend(inp):
            # Query, key and value are all the same tensor.
            return self.self_attention(inp, inp, inp, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)

In [None]:
class Decoder(nn.Module):
    """Stack of ``N`` copies of ``layer`` followed by a ``LayerNorm``.

    Args:
        layer: Module to replicate; its ``size`` attribute sets the width
            of the final normalization.
        N: Number of copies to stack.
    """

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through each layer with ``memory`` and both masks,
        then normalize the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [None]:
class DecoderLayer(nn.Module):
    """Self-attention, source attention and feed-forward steps, each
    wrapped in a ``SubLayerConnection``.

    Args:
        size: Feature size; stored as ``self.size`` and passed to the
            sublayer connections.
        self_attention: Callable invoked as
            ``self_attention(x, x, x, tgt_mask)``.
        src_attention: Callable invoked as
            ``src_attention(x, memory, memory, src_mask)``.
        feed_forward: Callable invoked with a single tensor.
        dropout: Dropout value passed to the sublayer connections.
    """

    def __init__(self, size, self_attention, src_attention, feed_forward, dropout):
        # nn.Module must be initialized before any submodule is assigned,
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.size = size
        self.self_attention = self_attention
        self.src_attention = src_attention
        self.feed_forward = feed_forward
        self.sublayer = clones(SubLayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply self-attention, then attention over ``memory``, then the
        feed-forward step."""
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attention(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attention(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)