### Importing necessary packages

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline

SyntaxError: invalid syntax (<ipython-input-6-1d7e0c65ec3f>, line 3)

# Model Architecture

## EncoderDecoder

In [None]:
"""
This is a classic encoder decoder architecture
"""
class EncoderDecoder(nn.Module):
    """
    Standard encoder-decoder wrapper.

    Holds the encoder, the decoder, the source/target embedding modules and
    the output generator. Note that ``forward`` returns the decoder output;
    ``generator`` is stored but not applied here.

    Args:
        encoder: callable invoked as ``encoder(embedded_src, src_mask)``.
        decoder: callable invoked as
            ``decoder(embedded_tgt, memory, src_mask, tgt_mask)``.
        src_embed: callable mapping source tokens to embeddings.
        tgt_embed: callable mapping target tokens to embeddings.
        generator: module kept on the model for the caller to apply to the
            decoder output.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        # Must be stored: decode() reads self.tgt_embed.
        self.tgt_embed = tgt_embed
        self.generator = generator

    def forward(self, src, tgt, src_mask, tgt_mask):
        """
        Take in and process masked src and target sequences.

        Encodes ``src`` and feeds the result (the "memory") to the decoder
        together with ``tgt``; returns whatever the decoder returns.
        """
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src`` and run it through the encoder."""
        return self.encoder(self.src_embed(src), src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        """Embed ``tgt`` and run the decoder over it and the encoder memory."""
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

## Generator

In [None]:
class Generator(nn.Module):
    """
    Final projection step: a linear layer followed by a log-softmax.

    Args:
        d_model: size of the last dimension of the incoming features.
        vocab: number of output classes (size of the projected dimension).
    """
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        """Project ``x`` (last dim ``d_model``) to ``vocab`` log-probabilities."""
        logits = self.proj(x)
        # Normalise over the last (vocabulary) dimension.
        return F.log_softmax(logits, dim=-1)

## Encoder

In [None]:
def clones(module, N):
    """
    Build an ``nn.ModuleList`` holding N independent deep copies of ``module``.

    Each copy has its own parameters (``copy.deepcopy``), so the resulting
    layers do not share weights with each other or with ``module``.
    """
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)
class Encoder(nn.Module):
    """
    Encoder core: N copies of ``layer`` applied in sequence, then a LayerNorm.

    Args:
        layer: the layer to replicate; must expose a ``size`` attribute,
            which is used as the feature count of the final LayerNorm.
        N: number of copies to stack.
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with ``mask``) through every layer, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

## LayerNorm

In [None]:
class LayerNorm(nn.Module):
    """
    Layer normalisation over the last dimension of the input.

    The input is centred by its mean and divided by ``std + eps`` along the
    last axis, then scaled by the learnable gain ``a_2`` (initialised to
    ones) and shifted by the learnable bias ``b_2`` (initialised to zeros).

    Args:
        features: size of the last dimension of the inputs (``layer.size``).
        eps: constant added to the standard deviation to avoid division
            by zero.
    """
    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        """Return the normalised, re-scaled and shifted version of ``x``."""
        centered = x - x.mean(-1, keepdim=True)
        # ``Tensor.std`` is used with its defaults here.
        spread = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / spread + self.b_2

    # In the paper, the output of each sub-layer is LayerNorm(x + Sublayer(x)),
    # where Sublayer(x) is the function implemented by the sub-layer itself.

## SublayerConnection

In [None]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sub-layer.

    As implemented here the layer norm is applied to the input *before* the
    sub-layer, dropout is applied to the sub-layer output, and the result is
    added back onto the un-normalised input.

    Args:
        size: feature count passed to LayerNorm.
        dropout: probability of an element being zeroed by the dropout layer.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """
        Apply ``sublayer`` with a residual connection.

        ``sublayer`` is any callable whose output has the same size as ``x``.
        """
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

## EncoderLayer

In [None]:
"""
Each layer has two sub layers. First is the multi-head self attention mechanism, and the second
is a simple position wise, fully connected feed-forward network
"""
class EncoderLayer(nn.Module):
    """
    One encoder layer: self-attention followed by a feed-forward network,
    each wrapped in its own SublayerConnection.

    Args:
        size: feature size; also exposed as ``self.size``.
        self_attn: attention module, called as ``self_attn(q, k, v, mask)``.
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability handed to each SublayerConnection.
    """
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Run self-attention and then the feed-forward block over ``x``."""
        def attend(h):
            # Self-attention: query, key and value are all the same tensor.
            # A closure is needed because SublayerConnection calls the
            # sub-layer with a single argument.
            return self.self_attn(h, h, h, mask)

        attn_connection, ff_connection = self.sublayer
        x = attn_connection(x, attend)
        return ff_connection(x, self.feed_forward)

## Decoder

In [None]:
'''
The decoder is also a stack of N = 6 layers
'''
class Decoder(nn.Module):
    """
    Generic N-layer decoder with masking.

    Stacks N copies of ``layer`` and applies a final LayerNorm sized by
    ``layer.size``.
    """
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Pass ``x`` through every layer (each also sees ``memory`` and both masks)."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

## DecoderLayer 

In [None]:
"""
In addition to the two sublayers in each encoder layer, the decoder inserts a third sublayer
which performs multi head attention over the output of the encoder stack.
Similar to the encoder, residual connections are employed around each of the sub-layers,
followed by layer normalization.
"""
class DecoderLayer(nn.Module):
    """
    Decoder is made of self-attn, src-attn, and feed forward.

    Args:
        size: feature size; also exposed as ``self.size``.
        self_attn: attention module applied to the decoder input itself.
        src_attn: attention module applied with the encoder memory as
            keys and values.
        feed_forward: position-wise feed-forward module.
        dropout: dropout probability handed to each SublayerConnection.
    """
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        # Must be stored: forward() reads self.src_attn.
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Apply self-attention, source attention and feed-forward in turn."""
        m = memory
        # 1) self-attention over the target, restricted by tgt_mask
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        # 2) attention over the encoder memory, restricted by src_mask
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        # 3) position-wise feed-forward
        return self.sublayer[2](x, self.feed_forward)

In [None]:
'''
The self-attention sublayer in the decoder stack is modified
to prevent positions from attending to subsequent positions. This masking,
combined with the fact that the output embeddings are offset by one position, ensures
that the prediction for position i depends only on the known outputs at positions
less than i.'''

def subsequent_mask(size):
    """
    Mask out subsequent positions.

    Returns a tensor of shape ``(1, size, size)`` that is true at ``[0, i, j]``
    when ``j <= i`` and false for every position above the diagonal.
    """
    # Ones strictly above the diagonal mark the "future" positions.
    future = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    # Invert: a position is allowed exactly where ``future`` is zero.
    return torch.from_numpy(future).eq(0)

In [None]:
# Visualise the mask for size 20: index [0] drops the leading singleton
# dimension so imshow receives the 20x20 matrix.
plt.figure(figsize=(5,5))
plt.imshow(subsequent_mask(20)[0])
# Presumably a bare None is used so the notebook cell echoes no return value.
None

## Attention

In [None]:
"""
An attention function can be described as mapping 
a query and a set of key-value pairs to an output.
where the query, keys, values, and output are all vectors
"""

def attention(query, key, value, mask=None, dropout=None):
    """
    Compute scaled dot product attention.

    Args:
        query, key, value: tensors whose last two dimensions are
            (positions, features); ``query`` and ``key`` must share the
            feature size ``d_k``.
        mask: optional tensor broadcastable to the score matrix; positions
            where ``mask == 0`` receive (effectively) zero attention weight.
        dropout: optional callable applied to the attention weights.

    Returns:
        ``(output, p_attn)``: the attention-weighted sum of ``value`` and
        the attention weights themselves.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # masked_fill is out-of-place: the result must be assigned back to
        # ``scores`` or the mask is silently ignored.
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

## MultiHeadedAttention 

In [None]:
class MultiHeadedAttention(nn.Module):
    """
    Multi-head attention built from four ``d_model -> d_model`` linear layers:
    three input projections (query, key, value) and one output projection.
    """
    def __init__(self, h, d_model, dropout=0.1):
        """
        Take in model size and number of heads.

        ``d_model`` must be divisible by ``h``; each head works on
        ``d_k = d_model // h`` features (d_v is assumed equal to d_k).
        """
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Attention weights of the most recent forward pass.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, then merge and project the heads."""
        if mask is not None:
            # Insert a head dimension so the same mask broadcasts over all heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(proj, t):
            # d_model -> (h, d_k), with the head axis moved ahead of positions.
            return proj(t).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_proj, k_proj, v_proj, out_proj = self.linears

        # 1) Linear projections, reshaped to one slice per head.
        query = split_heads(q_proj, query)
        key = split_heads(k_proj, key)
        value = split_heads(v_proj, value)

        # 2) Attention over all heads at once.
        context, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" the heads back together with a view, then project.
        merged = context.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return out_proj(merged)

## Position-wise Feed-Forward Networks

In [None]:
"""
each of the layers in our encoder and decoder contains a fully connected feed-forward
network. This consists of two linear transformations with a ReLU activation in between.
"""
class PositionwiseFeedForward(nn.Module):
    """
    Two-layer feed-forward network: ``d_model -> d_ff -> d_model`` with a
    ReLU and dropout in between (e.g. d_model = 512, d_ff = 2048).
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## Embeddings 

In [None]:
"""
Using learned embeddings to convert the input and output tokens into vectors of
dimension d_model
"""
class Embeddings(nn.Module):
    """
    Learned token embedding table whose output is scaled by ``sqrt(d_model)``.

    Args:
        d_model: embedding dimension.
        vocab: number of rows in the lookup table.
    """
    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the indices ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale
    

## Positional Encoding

In [None]:
"""
Because the model has no convolution and no recurrence, we need to
inject some information about the relative or absolute position of the tokens in the
sequence.
"""
class PositionalEncoding(nn.Module):
    """
    Add fixed sinusoidal position encodings to the input, then apply dropout.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / d_model)``.

    Args:
        d_model: feature size of the inputs (even or odd).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions precomputed; inputs longer than this
            along dimension 1 are not supported.
    """
    def __init__(self, d_model, dropout, max_len = 5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        # Explicit float dtype: do not rely on implicit int -> float promotion.
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)
        # A buffer moves with the module (.to/.cuda) but is not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        # Buffers do not require grad, so no Variable wrapper is needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

In [None]:
# Exemple
plt.figure(figsize=(15, 5))
pe = PositionalEncoding(20, 0)
y = pe.forward(Variable(torch.zeros(1, 100, 20)))
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d"%p for p in [4,5,6,7]])
None

## Full Model 

In [None]:
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    Construct a model from the hyper parameters.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size (also the generator output size).
        N: number of encoder layers and of decoder layers.
        d_model: model / embedding dimension.
        d_ff: hidden size of the position-wise feed-forward blocks.
        h: number of attention heads.
        dropout: dropout probability for the feed-forward blocks, the
            positional encodings and the sub-layer connections.

    Returns:
        An EncoderDecoder whose parameters with more than one dimension are
        Xavier-uniform initialised.
    """
    c = copy.deepcopy
    # NOTE(review): ``dropout`` is not forwarded here, so attention always
    # uses MultiHeadedAttention's own default — confirm this is intended.
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every component gets its own deep copy of the prototypes so that no
    # parameters are shared between them.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # Re-initialise weight matrices only; 1-D parameters (biases, LayerNorm
    # gains) keep their constructor values.
    for p in model.parameters():
        if p.dim() > 1:
            # In-place initialiser; the non-underscore name is deprecated.
            nn.init.xavier_uniform_(p)
    return model
    

In [None]:
# Smoke test: a small model with source and target vocabularies of 10 and N = 2.
tmp_model = make_model(10,10,2)
# Presumably a bare None is used so the notebook cell echoes no return value.
None

In [None]:
# Bare expression: in a notebook this echoes the model's repr.
tmp_model

# Training

## Batches and Masking 

In [None]:
"""
How to train your model
"""
class Batch:
    """
    Object for holding a batch of data with mask during training.

    Attributes set for every batch:
        src: the source tensor as given.
        src_mask: ``src != pad`` with an extra dimension inserted before the
            last one.

    Attributes set only when ``trg`` is given:
        trg: ``trg`` without its last column (the decoder input).
        trg_y: ``trg`` without its first column (the prediction targets).
        trg_mask: padding mask combined with the subsequent-position mask.
        ntokens: number of non-pad entries in ``trg_y``.
    """
    def __init__(self, src, trg=None, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            # Shift by one: the model sees trg[:, :-1] and must predict trg[:, 1:].
            self.trg = trg[:, :-1]
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """
        Create a mask to hide padding and future words.

        The padding mask (with a singleton dimension inserted) is AND-ed with
        the square subsequent-position mask, broadcasting to one square mask
        per sequence.
        """
        tgt_mask = (tgt != pad).unsqueeze(-2)
        # No Variable wrapper is needed; type_as keeps both operands on the
        # same dtype for the bitwise AND.
        future_mask = subsequent_mask(tgt.size(-1)).type_as(tgt_mask)
        return tgt_mask & future_mask

## Training Loop 

In [None]:
"""
Generic training and scoring function to keep track of loss.
We pass in a generic loss compute function that also handles parameter updates
"""
def run_epoch(data_iter, model, loss_compute):
    """
    Standard Training and Logging Function.

    Args:
        data_iter: iterable of batches exposing ``src``, ``trg``, ``src_mask``,
            ``trg_mask``, ``trg_y`` and ``ntokens``. It must yield at least
            one batch with a non-zero token count, otherwise the final
            division is by zero.
        model: object whose ``forward(src, trg, src_mask, trg_mask)`` is called.
        loss_compute: callable ``(out, trg_y, ntokens) -> loss``; as noted in
            the notebook, it may also perform the parameter update.

    Returns:
        The summed loss divided by the total number of tokens.
    """
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0  # tokens seen since the last progress print
    for i, batch in enumerate(data_iter):
        out = model.forward(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        # Accumulate the epoch totals; without this the return is always 0 / 0.
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            # Guard against a zero interval on coarse clocks.
            elapsed = max(time.time() - start, 1e-9)
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f"%
                  (i, loss / batch.ntokens, tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

## Training Data and Batching

The model was trained on the standard WMT 2014 English-German dataset consisting of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding, which has a shared source-target vocabulary of about 37000 tokens. For English-French, we used the significantly larger WMT 2014 English-French dataset consisting of 36M sentences and split tokens into a 32000 word-piece vocabulary.

Sentence pairs were batched together by approximate sequence length. Each training batch contained a set of sentence pairs containing approximately 25000 source tokens and 25000 target tokens.

We will use torchtext for batching. This is discussed in more detail below. Here we define the batch-size function passed to torchtext: it ensures that the batch, once every sequence is padded to the length of the longest one, does not surpass a token threshold (25000 if we have 8 GPUs).

In [None]:
# Running maxima for the batch currently being assembled by batch_size_fn.
max_src_in_batch = 0
max_tgt_in_batch = 0


def batch_size_fn(new, count, sofar):
    """
    Keep augmenting batch and calculate total number of tokens + padding.

    Args:
        new: the example being added; must expose ``src`` and ``trg`` sequences.
        count: number of examples in the batch including ``new``; a value of
            1 starts a new batch and resets the running maxima.
        sofar: unused; accepted for the caller's signature.

    Returns:
        The padded batch size: ``count`` times the longest source length or
        ``count`` times the longest target length (plus 2), whichever is larger.
    """
    # The declaration must live inside the function: a module-level ``global``
    # statement is a no-op, and the assignments below would otherwise make
    # both names local and unbound whenever count != 1.
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    # NOTE(review): the +2 presumably accounts for start/end tokens — confirm.
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)

# Hardware and Schedule 

## Optimizer

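The Adam optimizer was used with $\beta_1 = 0.9$, $\beta_2 = 0.98$ and $\epsilon = 10^{-9}$. The learning rate was varied over the course of training according to $lrate = d_{\text{model}}^{-0.5} \cdot \min(step\_num^{-0.5},\ step\_num \cdot warmup\_steps^{-1.5})$, i.e. it increases linearly for the first $warmup\_steps$ training steps and then decreases proportionally to the inverse square root of the step number.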

In [None]:
class NoamOpt:
    """
    Optim wrapper that implements rate.

    Wraps an optimizer and, on every ``step()``, sets the learning rate of
    all its parameter groups to
    ``factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)``.

    Args:
        model_size: model dimension used in the schedule.
        factor: constant multiplier of the schedule.
        warmup: number of warm-up steps.
        optimizer: the wrapped optimizer; may be None when only ``rate()``
            is evaluated (e.g. for plotting).
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Update parameters and rate."""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """
        Return the learning rate for ``step`` (default: the current step).

        ``step`` must be at least 1: ``0 ** -0.5`` raises ZeroDivisionError.
        """
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) * min(step ** (-0.5), step * self.warmup ** (-1.5)))

    @staticmethod
    def get_std_opt(model):
        """
        Build the standard schedule (factor 2, 4000 warm-up steps) around Adam.

        Declared static because it takes no ``self``; without the decorator it
        could not be called on an instance. ``model`` must expose
        ``src_embed[0].d_model`` and ``parameters()``.
        """
        # beta2 = 0.98, matching the optimizer settings stated in the text.
        return NoamOpt(model.src_embed[0].d_model, 2, 4000,
                       torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    

### Examples

In [7]:
# Three schedules (model_size, factor, warmup); the optimizer is None because
# only rate() is evaluated here, never step().
opts = [NoamOpt(512, 1, 4000, None), 
        NoamOpt(512, 1, 8000, None),
        NoamOpt(256, 1, 4000, None)]
# One row per step 1..19999, one column (curve) per schedule.
plt.plot(np.arange(1, 20000), [[opt.rate(i) for opt in opts] for i in range(1, 20000)])
# Labels are "model_size:warmup", in the same order as opts.
plt.legend(["512:4000", "512:8000", "256:4000"])

NameError: name 'plt' is not defined