# Translation

In [71]:
import torch
import numpy as np

In [72]:
TRAIN_PERC = 0.6
VAL_PERC = 0.2
TEST_PERC = 0.2
VOCAB_SIZE = 10000
MODEL_DIM = 512

## Overview

The basic idea behind machine translation is to take a piece of text in one language and translate it to another language. In this example, we will look at translating English text into Welsh text.

In order to validate our approach, we need examples of English text that have been (correctly) translated to the corresponding Welsh.

## Corpus

### Download

### Load

The corpus is generally too large to load entirely into memory, therefore we need to be able to load batches of translations dynamically.

In [73]:
class EnCyCorpus(torch.utils.data.IterableDataset):
    def __init__(self, en_corpus_file, cy_corpus_file):
        super(EnCyCorpus, self).__init__()
        self.en_corpus_file = en_corpus_file
        self.cy_corpus_file = cy_corpus_file

    def tidy(self, text):
        return text


    def __iter__(self):
        # Create an iterator
        en_itr = open(self.en_corpus_file)
        cy_itr = open(self.cy_corpus_file)
        
        # Map each element using the line_mapper
        mapped_en_itr = map(self.tidy, en_itr)
        mapped_cy_itr = map(self.tidy, cy_itr)
        
        # Zip both iterators
        zipped_itr = zip(mapped_en_itr, mapped_cy_itr)
        
        return zipped_itr

In [74]:
corpus = EnCyCorpus('translation/data/CofnodBachYCynulliad/CofnodBachYCynulliad.en',
                  'translation/data/CofnodBachYCynulliad/CofnodBachYCynulliad.cy')

In [75]:
def load_corpus():
    # Load texts
    with open("translation/data/CofnodYCynulliad/CofnodYCynulliad.en", 'r', encoding='utf-8') as f:
        english_texts = f.read().splitlines()
    with open("translation/data/CofnodYCynulliad/CofnodYCynulliad.cy", 'r', encoding='utf-8') as f:
        welsh_texts = f.read().splitlines()
    texts = list(zip(english_texts, welsh_texts))

    # Split into train / val / test
    n_texts = len(texts)
    corpus = {}
    corpus['train'] = texts[:round(n_texts * TRAIN_PERC)]
    corpus['val'] = texts[round(n_texts * TRAIN_PERC):(round(n_texts * TRAIN_PERC) + round(n_texts * VAL_PERC))]
    corpus['test'] = texts[-round(n_texts * TEST_PERC):]

    # Sort by length to help with batching
    
    return corpus

In [76]:
corpus = load_corpus()

In [77]:
example_text = corpus['train'][1000:1002]
example_text

[('i hope that that will help create the right atmosphere when answering questions .',
  "gobeithiaf y bydd hynny o gymorth i greu'r awyrgylch iawn wrth ateb cwestiynau ."),
 ('extensive consultation was carried out with every relevant party , and the replies are being analysed at present .',
  'cafwyd ymgynghori helaeth gyda phob sefydliad perthnasol a dadansoddir yr atebion ar hyn o bryd .')]

## Model Inputs

The model needs the following as input:
1. The English text, converted into token indices
2. The output Welsh text, converted into token indices
3. The decoder input Welsh text, converted into token indices, which is a shifted version of the output.
4. An attention mask for the English text
5. An attention mask for the Welsh text

### Tokenizer

The first step in processing the text is to break it up into separate tokens, and assign an index to each token. These are typically sub-word level pieces of text. Ideally, we would do this separately for both English and Welsh, since they clearly have different atomic tokens, however for ease of use we will create one tokenizer to deal with both.

https://huggingface.co/course/chapter6/8?fw=pt#building-a-bpe-tokenizer-from-scratch

In [78]:
from tokenizers import Tokenizer
from tokenizers import models, pre_tokenizers, trainers, processors
from tokenizers import normalizers
from tokenizers import decoders
from transformers import PreTrainedTokenizerFast

In [79]:
def create_tokenizer(text):
    tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
    )
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
    )
    special_tokens = ["[BOS]", "[EOS]", "[PAD]", "[MASK]", "[UNK]"]
    tokenizer.model = models.WordPiece(unk_token="[UNK]")
    trainer = trainers.WordPieceTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
    tokenizer.train_from_iterator(text, trainer)
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[BOS] $A [EOS]",
        special_tokens=[
            ("[BOS]", tokenizer.token_to_id("[BOS]")),
            ("[EOS]", tokenizer.token_to_id("[EOS]")),
        ],
    )
    tokenizer.decoder = decoders.WordPiece(prefix="##")

    pretrained_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        bos_token="[BOS]",
        eos_token="[EOS]",
        pad_token="[PAD]",
        mask_token="[MASK]",
        unk_token="[UNK]",
    )
    return pretrained_tokenizer

In [80]:
english_tokenizer = create_tokenizer(
    text=[pair[0] for pair in corpus['train']]
)
welsh_tokenizer = create_tokenizer(
    text=[pair[1] for pair in corpus['train']]
)









In [81]:
example_tokenizer_output = english_tokenizer(
    text=[ex[0] for ex in example_text],
    return_token_type_ids=False, padding=True, truncation=True, return_tensors="pt", return_attention_mask=True
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [82]:
example_tokenizer_output

{'input_ids': tensor([[   0,   45,  741,  131,  131,  167,  813, 2058,  112,  671, 8549,  435,
         5596, 1245,   18,    1,    2,    2,    2,    2,    2,    2],
        [   0, 4402, 1068,  250, 2665,  398,  202,  697, 2062,  768,   16,  136,
          112, 7402,  170,  583, 9528,   90,  246, 1028,   18,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

### Collator

In general, we need to deal with batches of input, and for training the model it is more efficient. The input to the model is:
1. All the English tokens
2. All the Welsh tokens
3. The source attention mask to tell it which source tokens to pay attention to
4. The target attention mask to tell it which target tokens to pay attention to
5. The decoder input ids, which are right-shifted target tokens


In [83]:
def collate_batch(texts):
    src_batch = english_tokenizer(
        text=[text[0] for text in texts],
        return_token_type_ids=False,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True
    )
    src_batch = {'src_' + str(k): v for k,v in src_batch.items()}
    tgt_batch = welsh_tokenizer(
        text=[text[1] for text in texts],
        return_token_type_ids=False,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True
    )
    tgt_batch = {'tgt_' + str(k): v for k,v in tgt_batch.items()}
    tgt_batch['tgt_output_ids'] = tgt_batch['tgt_input_ids'][:, 1:]
    tgt_batch['tgt_input_ids'] = tgt_batch['tgt_input_ids'][:, :-1]
    tgt_batch['tgt_attention_mask'] = tgt_batch['tgt_attention_mask'][:, :-1]

    # Extend shortest input
    src_shape = src_batch['src_input_ids'].shape
    tgt_shape = tgt_batch['tgt_input_ids'].shape
    if src_shape[1] < tgt_shape[1]:
        diff = tgt_shape[1] - src_shape[1]
        src_batch['src_input_ids'] = torch.cat((src_batch['src_input_ids'], torch.full([src_shape[0],diff], 2)), dim=1)
        src_batch['src_attention_mask'] = torch.cat((src_batch['src_attention_mask'], torch.full([src_shape[0],diff], 0)), dim=1)
    elif tgt_shape[1] < src_shape[1]:
        diff = src_shape[1] - tgt_shape[1]
        tgt_batch['tgt_input_ids'] = torch.cat((tgt_batch['tgt_input_ids'], torch.full([tgt_shape[0],diff], 2)), dim=1)
        tgt_batch['tgt_attention_mask'] = torch.cat((tgt_batch['tgt_attention_mask'], torch.full([tgt_shape[0],diff], 0)), dim=1)
        tgt_batch['tgt_output_ids'] = torch.cat((tgt_batch['tgt_output_ids'], torch.full([tgt_shape[0],diff], 2)), dim=1)
    
    # Combine
    batch = {**src_batch, **tgt_batch}
    return batch

In [84]:
example_batch = collate_batch(example_text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [85]:
example_batch

{'src_input_ids': tensor([[   0,   45,  741,  131,  131,  167,  813, 2058,  112,  671, 8549,  435,
          5596, 1245,   18,    1,    2,    2,    2,    2,    2,    2],
         [   0, 4402, 1068,  250, 2665,  398,  202,  697, 2062,  768,   16,  136,
           112, 7402,  170,  583, 9528,   90,  246, 1028,   18,    1]]),
 'src_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'tgt_input_ids': tensor([[   0, 1070,   63,  195,  196,   53, 1577,   47, 1842,   11,   56, 9015,
           583,  326,  682, 1735,   18,    1,    2,    2,    2,    2],
         [   0, 2704, 1213, 2214,  432, 3405, 1984, 2936,   39, 7446,   81,  153,
          2502,  132,  161,   53,  558,   18,    2,    2,    2,    2]]),
 'tgt_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

This is the input that our model will receive, together with the outcome it will be measured against.

## Model

In [89]:
import torch.nn as nn

The model we will build is a canonical encoder-decoder model. On the encoder side, a representation of the input is created, and on the decoder side this representation is used to generate a sequence of tokens in an autoregressive way.

### Encoder

The encoder is a stack of N transformer layers, each of which is composed of a self-attention layer with a dense layer, including a residual connection. Let's start by defining the general Encoder:

#### Embedding

An embedding takes a sequence of token ids and converts it into a sequence of n-dimensional vector representations of each token. Both the encoder and decoder have separate embeddings, since they deal with separate languages.

We will use the pytorch Embedding [https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html].

In [90]:
import math

class Embeddings(nn.Module):
    def __init__(self, d_model=512, n_vocab=20000):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(n_vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

Let's see how this looks for our example batch:

In [91]:
example_embedding_output = Embeddings()(example_batch
['src_input_ids'])
example_embedding_output

tensor([[[-10.7418, -18.5532, -24.5317,  ..., -16.8611, -28.6475,  -9.6731],
         [-29.7550,  -7.0559,   9.6194,  ..., -16.2453, -19.8846, -17.7630],
         [  7.4113,   2.9326, -20.6058,  ..., -30.7097, -10.0630,   3.7162],
         ...,
         [ 12.4295, -35.7591,  -2.5741,  ...,  32.7473,  57.2127,  10.5758],
         [ 12.4295, -35.7591,  -2.5741,  ...,  32.7473,  57.2127,  10.5758],
         [ 12.4295, -35.7591,  -2.5741,  ...,  32.7473,  57.2127,  10.5758]],

        [[-10.7418, -18.5532, -24.5317,  ..., -16.8611, -28.6475,  -9.6731],
         [-14.1038, -21.0192,  48.2898,  ...,   1.6452,  18.7295,  -5.7033],
         [-69.6844,  60.1925, -21.8730,  ...,   0.7527,   0.4434, -19.8733],
         ...,
         [  1.6627,  14.5584,  -1.6639,  ...,   0.5623, -43.7432,   3.2204],
         [  5.0090,  -6.6878, -22.5054,  ...,   3.4984,  -7.9396, -41.7292],
         [ -1.5115,  11.6709,   7.6318,  ...,  -5.2615,   7.4185,   2.5457]]],
       grad_fn=<MulBackward0>)

In [92]:
example_embedding_output.size()

torch.Size([2, 22, 512])

We have converted each token in the batch into a vector of dimension MODEL_DIM. The first rank of the tensor represents the batch no, the second is the token no and the third is the vector coefficient of the embedding.

#### Positional Encoding

In order to tell the model which position each token is in, the token embeddings have to be augmented with position information. We will use a common method that 

In [93]:
from torch.autograd import Variable

class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model=512, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        x = x + Variable(self.pe[:, :x.size(1)], 
                         requires_grad=False)
        return self.dropout(x)

In [94]:
example_pos_enc_output = PositionalEncoding()(example_embedding_output)
example_pos_enc_output

tensor([[[-11.9354, -19.5036, -27.2575,  ..., -17.6234, -31.8306,  -9.6368],
         [-32.1261,  -7.2396,  11.6014,  ..., -16.9392, -22.0939, -18.6256],
         [  9.2451,   2.7960, -21.8548,  ..., -33.0108, -11.1809,   5.2402],
         ...,
         [ 13.9771,  -0.0000,  -3.4132,  ...,  37.4970,  63.5718,  12.8620],
         [ 14.8249, -39.2789,  -2.3832,  ...,  37.4970,  63.5719,  12.8620],
         [ 14.7401, -40.3409,  -1.7637,  ...,  37.4970,  63.5720,  12.8620]],

        [[-11.9354, -19.5036, -27.2575,  ..., -17.6234, -31.8306,  -9.6368],
         [-14.7359, -22.7543,  54.5685,  ...,   2.9391,  20.8107,  -5.2259],
         [-76.4167,   0.0000, -23.2629,  ...,   1.9474,   0.4929, -20.9703],
         ...,
         [  2.0140,  17.2745,  -2.4018,  ...,   1.7359, -48.6013,   4.6894],
         [  6.5799,  -6.9775,  -0.0000,  ...,   4.9982,  -8.8195, -45.2547],
         [ -0.7499,   0.0000,   0.0000,  ...,  -4.7350,   8.2452,   3.9397]]],
       grad_fn=<MulBackward0>)

In [95]:
example_pos_enc_output.size()

torch.Size([2, 22, 512])

The size of the tensor is the same as before, since all that has been added is some positional information to each dimension of each token.

#### Encoder Layer

So far we have transformed each input tokens into a vector representing that token, via an embedding, and added positional information, via the positional encoder. Now, the role of the encoder is to represent the input in such a way that the decoder can make best use of it. In order to do this, each layer of the encoder "shares" semantic information between tokens.

#### Multi-Headed Attention

Multi-headed attention is the mechanism by which this sharing takes place. It allows each token to learn how to "pay attention" to other tokens, and gradually assimilate their semantic meaning into their representations. Since the model has to deal with sequences of varying length, the attention mechanism has to learn how to first assess the relationship between tokens, and then how to share their representations i.e. it cannot say "mix token 1 with token 3".

In [96]:
import copy

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [97]:
import torch.nn.functional as F

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask.unsqueeze(2) == 0, -1e9)
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

In [98]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # 1) Do all the linear projections in batch from d_model => h x d_k 
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, 
                                 dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

To see the output of the multi-headed attention we need to feed it some input. We can feed it separate values for Q, K, and V. For the encoder, these will all be the representations of the tokens, hence the term "self-attention". We also input the attention mask, since we need to tell the attention layer to ignore any tokens that represent padding, coming from our batch.

In [99]:
example_tokenizer_output

{'input_ids': tensor([[   0,   45,  741,  131,  131,  167,  813, 2058,  112,  671, 8549,  435,
         5596, 1245,   18,    1,    2,    2,    2,    2,    2,    2],
        [   0, 4402, 1068,  250, 2665,  398,  202,  697, 2062,  768,   16,  136,
          112, 7402,  170,  583, 9528,   90,  246, 1028,   18,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [100]:
example_attn_output = MultiHeadedAttention(h=8, d_model=MODEL_DIM, dropout=0.1)(
    example_pos_enc_output,
    example_pos_enc_output, 
    example_pos_enc_output,
    example_batch['src_attention_mask']
)
example_attn_output

tensor([[[-1.5848e+01,  3.4616e+00,  6.4872e+00,  ..., -3.3008e+00,
           1.1372e-01,  5.3793e+00],
         [ 6.5011e+00,  1.5313e+01,  2.6866e+00,  ..., -3.4565e+00,
          -7.6228e+00, -2.3052e+00],
         [ 4.1317e+00, -2.0901e+00, -8.5896e-01,  ...,  1.1223e+01,
           7.2100e+00,  1.4491e+00],
         ...,
         [ 6.9053e+00, -6.7535e+00, -4.3447e+00,  ..., -6.9228e+00,
          -1.4999e+00, -1.2341e+01],
         [-1.0241e+00, -1.0723e+01, -9.4821e+00,  ...,  3.4602e+00,
          -7.1168e+00, -1.4010e+01],
         [ 5.1978e-01, -3.0343e+00, -5.6894e+00,  ..., -1.5615e+00,
          -1.0337e+01,  5.8480e+00]],

        [[-1.5500e+01,  3.6269e+00,  7.3738e+00,  ..., -7.0873e+00,
           8.4160e+00,  1.9761e+00],
         [-3.5880e-01,  2.7700e+00, -1.3743e+01,  ..., -3.7603e+00,
           5.8246e+00,  2.0918e+01],
         [-5.8391e+00,  2.0329e+01, -4.8123e+00,  ...,  1.2425e+01,
           1.5544e+00, -6.5014e+00],
         ...,
         [ 1.5815e-01, -7

In [101]:
example_attn_output.size()

torch.Size([2, 22, 512])

#### Layer Normalisation

In [102]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

#### Sub-layer Connection

In [103]:
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

In [104]:
example_attn_sublayer_output = SublayerConnection(size=MODEL_DIM, dropout=0.1)(
    example_pos_enc_output, 
    lambda x: MultiHeadedAttention(h=8, d_model=MODEL_DIM, dropout=0.1)(x, x, x, example_batch['src_attention_mask'])
)
example_attn_sublayer_output

tensor([[[-1.1788e+01, -1.9641e+01, -2.7146e+01,  ..., -1.7288e+01,
          -3.1859e+01, -8.9730e+00],
         [-3.2267e+01, -7.1726e+00,  1.1578e+01,  ..., -1.6820e+01,
          -2.2094e+01, -1.8394e+01],
         [ 9.0569e+00,  2.8178e+00, -2.1855e+01,  ..., -3.2908e+01,
          -1.1329e+01,  5.4037e+00],
         ...,
         [ 1.3910e+01, -5.9531e-02, -3.4236e+00,  ...,  3.7680e+01,
           6.3612e+01,  1.3031e+01],
         [ 1.4747e+01, -3.9312e+01, -2.3949e+00,  ...,  3.7624e+01,
           6.3644e+01,  1.3096e+01],
         [ 1.4697e+01, -4.0440e+01, -1.6874e+00,  ...,  3.7690e+01,
           6.3671e+01,  1.3098e+01]],

        [[-1.1593e+01, -1.9858e+01, -2.7173e+01,  ..., -1.7236e+01,
          -3.1707e+01, -9.0266e+00],
         [-1.4586e+01, -2.2747e+01,  5.4654e+01,  ...,  3.1466e+00,
           2.0957e+01, -5.2655e+00],
         [-7.6384e+01,  0.0000e+00, -2.3228e+01,  ...,  2.4682e+00,
           6.0220e-01, -2.0970e+01],
         ...,
         [ 2.1270e+00,  1

In [105]:
example_attn_sublayer_output.shape

torch.Size([2, 22, 512])

#### Position-Wise Feed-forward Layer

Each token vector is then fed through an identical densely connected layer.

In [106]:
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

In [107]:
example_ffn_output = PositionwiseFeedForward(d_model=MODEL_DIM, d_ff=2048, dropout=0.1)(
    example_attn_sublayer_output
)
example_ffn_output

tensor([[[ -6.8549,   0.1118,  -1.6097,  ...,  -1.1000,  -2.7858,   5.0261],
         [  3.2303,  -3.5483,  -1.9076,  ...,  -4.4952,  -5.0886,   8.3172],
         [  4.7101,   2.1633,   0.6520,  ...,   1.3836,   1.2963,  12.9888],
         ...,
         [  8.9456,   1.2445,  -0.4396,  ...,  -4.1853,   8.7339,   4.8806],
         [  6.0206,   5.1609,   1.1854,  ...,  -7.8664,   9.6261,   5.6694],
         [  7.4900,   6.4613,  -2.6274,  ...,  -3.1340,   7.8336,   6.0030]],

        [[ -3.1191,   3.0587,  -1.2532,  ...,  -1.5697,   2.1519,   4.7717],
         [ -2.5520,   2.5238,  -3.8402,  ..., -11.1359,  11.8362,   6.0863],
         [  5.3625,  -2.4541,   2.3027,  ...,  -9.8394,   2.3857,   1.0974],
         ...,
         [  0.4157,   1.8016,  -7.2268,  ...,  -7.8388,   6.0038,   7.1653],
         [ 10.9670,  -2.8875,   1.2094,  ...,   8.0200,   4.0866,   4.5787],
         [ 11.5248,  -1.4691,  -7.9380,  ...,  -4.2281,  -5.6999,   2.7419]]],
       grad_fn=<ViewBackward0>)

In [108]:
example_ffn_output.shape

torch.Size([2, 22, 512])

This is also wrapped in a residual connection:

In [109]:
example_sublayer_ffn_output = SublayerConnection(size=MODEL_DIM, dropout=0.1)(
    example_attn_sublayer_output, 
    lambda x: PositionwiseFeedForward(d_model=MODEL_DIM, d_ff=2048, dropout=0.1)(x)
)
example_sublayer_ffn_output

tensor([[[-1.1672e+01, -1.9646e+01, -2.6915e+01,  ..., -1.6866e+01,
          -3.1626e+01, -8.8061e+00],
         [-3.2235e+01, -7.1657e+00,  1.1565e+01,  ..., -1.6820e+01,
          -2.2188e+01, -1.8719e+01],
         [ 9.0568e+00,  2.9613e+00, -2.2330e+01,  ..., -3.2912e+01,
          -1.1085e+01,  5.3018e+00],
         ...,
         [ 1.4263e+01, -5.9531e-02, -3.7806e+00,  ...,  3.7680e+01,
           6.3945e+01,  1.2929e+01],
         [ 1.4798e+01, -3.9192e+01, -2.6850e+00,  ...,  3.7624e+01,
           6.3987e+01,  1.3220e+01],
         [ 1.4858e+01, -4.0123e+01, -1.9649e+00,  ...,  3.7987e+01,
           6.4030e+01,  1.3245e+01]],

        [[-1.1544e+01, -1.9902e+01, -2.7173e+01,  ..., -1.6947e+01,
          -3.1506e+01, -8.8545e+00],
         [-1.4717e+01, -2.2579e+01,  5.3989e+01,  ...,  3.2638e+00,
           2.1064e+01, -5.5290e+00],
         [-7.6252e+01,  2.0399e-01, -2.3151e+01,  ...,  2.6145e+00,
           6.2997e-01, -2.0860e+01],
         ...,
         [ 2.2762e+00,  1

In [110]:
example_sublayer_ffn_output.shape

torch.Size([2, 22, 512])

#### Encoder Layer

And that's it! We've reached the end of our first encoder layer. Putting it all together each encoder layer is:

In [111]:
class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Follow Figure 1 (left) for connections."
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)

#### Encoder

And the full encoder is:

In [112]:
class Encoder(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [113]:
c = copy.deepcopy
attn = MultiHeadedAttention(h=2, d_model=MODEL_DIM)
ff = PositionwiseFeedForward(d_model=MODEL_DIM, d_ff=2048, dropout=0.1)
dropout = 0.1
example_encoder_output = Encoder(
    layer=EncoderLayer(
        size=MODEL_DIM, 
        self_attn=c(attn), 
        feed_forward=c(ff), 
        dropout=0.1), 
    N=2)(example_pos_enc_output, example_batch['src_attention_mask'])
example_encoder_output

tensor([[[-4.7675e-01, -8.5384e-01, -1.1210e+00,  ..., -6.9738e-01,
          -1.3103e+00, -3.6737e-01],
         [-1.4606e+00, -3.2886e-01,  4.1182e-01,  ..., -7.8795e-01,
          -1.0392e+00, -8.1147e-01],
         [ 4.4371e-01,  1.8860e-01, -7.9598e-01,  ..., -1.3357e+00,
          -4.8368e-01,  2.5915e-01],
         ...,
         [ 6.1511e-01, -2.4969e-04, -1.6428e-01,  ...,  1.5807e+00,
           2.6607e+00,  5.5483e-01],
         [ 6.4504e-01, -1.6256e+00, -1.4088e-01,  ...,  1.5224e+00,
           2.5851e+00,  5.1944e-01],
         [ 6.4625e-01, -1.6383e+00, -9.0536e-02,  ...,  1.5085e+00,
           2.5626e+00,  5.3021e-01]],

        [[-4.7039e-01, -8.0318e-01, -1.1159e+00,  ..., -6.8493e-01,
          -1.2943e+00, -3.6683e-01],
         [-6.6198e-01, -9.9660e-01,  2.0223e+00,  ...,  5.2900e-02,
           7.4259e-01, -3.0415e-01],
         [-3.2112e+00, -3.9775e-02, -9.5455e-01,  ...,  6.1165e-02,
           1.5811e-02, -9.0908e-01],
         ...,
         [ 6.3719e-02,  6

In [114]:
example_encoder_output.shape

torch.Size([2, 22, 512])

### Decoder

The input to the decoder is:
1. The output of the encoder
2. The previous tokens in the sequence up to that point
3. The source mask showing which source tokens the decoder can pay attention to
4. The target mask showing which target tokens the decoder can pay attention to

In [115]:
example_batch

{'src_input_ids': tensor([[   0,   45,  741,  131,  131,  167,  813, 2058,  112,  671, 8549,  435,
          5596, 1245,   18,    1,    2,    2,    2,    2,    2,    2],
         [   0, 4402, 1068,  250, 2665,  398,  202,  697, 2062,  768,   16,  136,
           112, 7402,  170,  583, 9528,   90,  246, 1028,   18,    1]]),
 'src_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'tgt_input_ids': tensor([[   0, 1070,   63,  195,  196,   53, 1577,   47, 1842,   11,   56, 9015,
           583,  326,  682, 1735,   18,    1,    2,    2,    2,    2],
         [   0, 2704, 1213, 2214,  432, 3405, 1984, 2936,   39, 7446,   81,  153,
          2502,  132,  161,   53,  558,   18,    2,    2,    2,    2]]),
 'tgt_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

In [116]:
example_tgt_embedding_output = Embeddings(512, VOCAB_SIZE)(example_batch['tgt_input_ids'])
example_tgt_pos_enc_output = PositionalEncoding(512,0.1)(example_tgt_embedding_output)
example_tgt_pos_enc_output

tensor([[[-13.4112,   7.1272,   4.7381,  ...,   0.0000,   0.0000,   5.6412],
         [-13.6782,   6.9464, -21.1748,  ..., -29.7911,  16.4421,  -1.3626],
         [  9.9126,   6.8113,  42.7099,  ...,  -8.9077,  25.2430, -25.2969],
         ...,
         [ 21.1316,  -0.0000, -12.8013,  ...,  -0.0000, -20.2269,   1.8317],
         [  0.0000, -36.5859, -11.7713,  ..., -11.2292, -20.2268,   1.8317],
         [ 21.8947, -37.6479, -11.1518,  ..., -11.2292, -20.2267,   1.8317]],

        [[-13.4112,   7.1272,   4.7381,  ...,  16.0348,   0.4709,   5.6412],
         [ -9.5638,  14.3176, -14.5953,  ...,   0.2070, -35.2907,  19.7233],
         [-14.9374, -49.6283,  10.3585,  ..., -31.4959,  12.3004,   0.0000],
         ...,
         [ 21.1316, -35.9407, -12.8013,  ..., -11.2292, -20.2269,   1.8317],
         [ 21.9795, -36.5859, -11.7713,  ..., -11.2292, -20.2268,   1.8317],
         [ 21.8947, -37.6479, -11.1518,  ..., -11.2292, -20.2267,   1.8317]]],
       grad_fn=<MulBackward0>)

In [117]:
example_tgt_pos_enc_output.shape

torch.Size([2, 22, 512])

#### Decoder Layer

In [118]:
MultiHeadedAttention(h=8, d_model=512)(
    example_tgt_pos_enc_output,
    example_tgt_pos_enc_output,
    example_tgt_pos_enc_output,
    example_batch['tgt_attention_mask']
)

tensor([[[ -0.2955, -10.7711,   2.6980,  ...,   3.7034,   8.0004,  -3.6077],
         [ 11.1394, -11.7099,  24.3704,  ...,   2.4531,   6.8912,  -6.0044],
         [  0.0283,   8.6027, -14.1181,  ...,   4.5193,  -2.7656,  12.5710],
         ...,
         [  2.5822,  -3.9382,   0.5822,  ...,  11.2627,   4.7259,  -5.3718],
         [  2.1448,  14.8411,  -3.4153,  ...,   7.0794,   0.6785,  -1.5732],
         [ 15.4648,  18.4373, -13.1424,  ...,  19.3747,   3.9657,  -5.6436]],

        [[  0.6216,  -5.1710,  -6.3400,  ...,   2.4746,   7.7962,  -7.3516],
         [ -2.3201,  -4.2860, -11.7811,  ...,  -2.4208,   0.0779,   0.2381],
         [ 17.7032, -11.5069,   2.7769,  ...,  -7.9180,  -7.9924,   9.1242],
         ...,
         [ 14.7434,  -1.3001,   6.6742,  ...,   4.6925,  -2.3997,   2.8855],
         [ 14.6501,  -3.4187,   6.9774,  ...,   6.5574, -10.0707,   4.2343],
         [  9.1968,  -7.7419,   4.7432,  ...,  -0.9139, -10.7522,   8.3900]]],
       grad_fn=<ViewBackward0>)

In [119]:
class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
 
    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

In [120]:
c = copy.deepcopy
attn = MultiHeadedAttention(h=8, d_model=512)
ff = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
position = PositionalEncoding(d_model=512, dropout=0.1)
DecoderLayer(512, c(attn), c(attn), c(ff), dropout=0.1)(
    x=example_tgt_pos_enc_output,
    memory=example_encoder_output,
    src_mask=example_batch['src_attention_mask'],
    tgt_mask=example_batch['tgt_attention_mask']
)

tensor([[[-13.3243,   7.2067,   4.7191,  ...,   0.1770,   0.4830,   6.2686],
         [-13.8562,   6.7913, -21.2996,  ..., -29.4380,  16.7292,  -1.2595],
         [  9.8920,   6.6326,  42.6286,  ...,  -8.6700,  25.4093, -25.3929],
         ...,
         [ 20.9830,   0.2361, -12.7844,  ...,   0.3443, -20.3158,   2.1160],
         [ -0.1887, -36.1978, -11.9729,  ..., -11.2234, -20.3709,   2.0473],
         [ 21.8522, -37.4589, -11.2534,  ..., -10.7406, -20.6673,   2.1370]],

        [[-13.0859,   7.1533,   4.8148,  ...,  16.5620,   0.5678,   6.6535],
         [ -9.1321,  14.4113, -14.1881,  ...,   0.5718, -36.0848,  19.8146],
         [-14.9012, -49.1435,  10.3008,  ..., -30.9242,  12.6328,   0.7469],
         ...,
         [ 21.4469, -35.7159, -13.0199,  ..., -10.8138, -20.5707,   2.1509],
         [ 21.9492, -36.1538, -11.5344,  ..., -10.9173, -20.5362,   2.2667],
         [ 21.9387, -37.5437, -11.0821,  ..., -10.9419, -20.4520,   2.0873]]],
       grad_fn=<AddBackward0>)

#### Decoder

In [121]:
class Decoder(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
        
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)

In [122]:
c = copy.deepcopy
attn = MultiHeadedAttention(h=8, d_model=512)
ff = PositionwiseFeedForward(d_model=512, d_ff=2048, dropout=0.1)
position = PositionalEncoding(d_model=512, dropout=0.1)
example_decoder_output = Decoder(
    layer=DecoderLayer(512, c(attn), c(attn), c(ff), dropout=0.1),
    N=2
)(x=example_tgt_pos_enc_output,
    memory=example_encoder_output,
    src_mask=example_batch['src_attention_mask'],
    tgt_mask=example_batch['tgt_attention_mask'])

In [123]:
example_decoder_output

tensor([[[-5.8735e-01,  2.3710e-01,  1.4858e-01,  ..., -2.6921e-02,
           1.1675e-02,  1.9538e-01],
         [-6.9158e-01,  1.9578e-01, -9.3966e-01,  ..., -1.2920e+00,
           5.9979e-01, -1.2577e-01],
         [ 4.3333e-01,  2.8341e-01,  1.7773e+00,  ..., -3.6972e-01,
           1.0646e+00, -1.0376e+00],
         ...,
         [ 7.8984e-01, -4.9802e-02, -6.4208e-01,  ..., -2.1265e-02,
          -9.3646e-01, -2.8173e-02],
         [-7.4588e-02, -1.6112e+00, -5.7532e-01,  ..., -4.8605e-01,
          -9.2021e-01, -2.5805e-02],
         [ 8.6689e-01, -1.6594e+00, -5.5446e-01,  ..., -5.1623e-01,
          -9.2020e-01, -2.1505e-03]],

        [[-5.9821e-01,  2.2719e-01,  1.5098e-01,  ...,  6.2943e-01,
           1.7150e-02,  1.9239e-01],
         [-3.8294e-01,  5.8034e-01, -5.6960e-01,  ..., -5.8507e-03,
          -1.4720e+00,  8.2239e-01],
         [-7.1198e-01, -2.1967e+00,  3.9935e-01,  ..., -1.4091e+00,
           4.7863e-01, -7.9248e-02],
         ...,
         [ 8.4954e-01, -1

In [124]:
example_decoder_output.shape

torch.Size([2, 22, 512])

#### Generator

The final step is to take the output of the decoder and predict the token.

In [125]:
class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

In [126]:
example_generator_output = Generator(d_model=512, vocab=VOCAB_SIZE)(example_decoder_output)

In [127]:
example_generator_output.shape

torch.Size([2, 22, 10000])

Note that the generator takes in the decoder output for each step, and maps the decoder vector representation for each token to a probability for *all* tokens in the sequence, not just the last one. For example, the first sequence, first token output log probabilities are:

In [128]:
example_generator_output[0, 0, :]

tensor([-9.5265, -9.7819, -9.7393,  ..., -8.5086, -9.4319, -7.8582],
       grad_fn=<SliceBackward0>)

The most likely token is therefore the one with the highest probability:

In [129]:
vocab_ind = example_generator_output[0, 0, :].argmax()
vocab_ind

tensor(8799)

In [130]:
welsh_tokenizer.decode(vocab_ind)

'chynrychioli'

### Encoder-Decoder

Putting the full model together gives:

In [131]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many 
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."
        return self.decode(self.encode(src, src_mask), src_mask,
                            tgt, tgt_mask)
    
    def encode(self, src, src_mask):
        return self.encoder(self.src_embed(src), src_mask)
    
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

In [132]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

In [133]:
example_model = make_model(VOCAB_SIZE, VOCAB_SIZE, 2)

  nn.init.xavier_uniform(p)


In [134]:
example_model_output = make_model(VOCAB_SIZE, VOCAB_SIZE, 2)(
    example_batch['src_input_ids'], example_batch['tgt_input_ids'], example_batch['src_attention_mask'], example_batch['tgt_attention_mask'])
example_model_output

  nn.init.xavier_uniform(p)


tensor([[[ 1.1169, -0.5819,  0.0130,  ..., -0.8328,  1.0758,  0.4566],
         [ 0.2189, -1.0932,  1.3029,  ..., -1.0174,  1.1675,  0.2970],
         [ 0.6341, -0.4515,  0.7189,  ..., -0.2266,  0.7938,  0.1137],
         ...,
         [ 1.1180, -0.5693,  0.7974,  ..., -0.9358,  0.9848,  0.0183],
         [ 0.4512,  0.0294, -0.0631,  ..., -1.0847,  1.5236,  0.4952],
         [ 1.8831, -1.0368,  1.4201,  ..., -0.0585,  0.3711, -0.3482]],

        [[ 0.5812,  0.4304, -0.5816,  ..., -1.6874,  0.9159,  0.1194],
         [ 1.1245, -0.3786, -0.2071,  ..., -1.2755,  0.7649, -0.2972],
         [ 1.8569, -0.4656,  0.8226,  ..., -0.8058,  1.1237, -0.2432],
         ...,
         [ 1.6085,  0.4306, -0.0887,  ..., -1.4484,  0.9135,  0.4617],
         [ 2.1419, -0.2611,  0.2290,  ..., -1.1957,  0.4064, -0.2649],
         [ 2.0685, -1.0459,  1.5858,  ..., -0.9481,  0.7386, -0.2194]]],
       grad_fn=<AddBackward0>)

In [135]:
example_model_output.shape

torch.Size([2, 22, 512])

The forward pass of the model doesn't use the Generator?

## Training

In [136]:
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
from torch.optim.lr_scheduler import LambdaLR
#import GPUtil
import time

In order to train the model we need to decide on a few training parameters.

### Loss Function

In [137]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist.clone().detach())

In [138]:
class SimpleLossCompute:
    "A simple loss compute and train function."

    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion

    def __call__(self, x, y, norm):
        x = self.generator(x)
        sloss = (
            self.criterion(
                x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
            )
            / norm
        )
        return sloss.data * norm, sloss

### Optimizer

In [139]:
example_optimizer = torch.optim.Adam(
        example_model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9
)

In [140]:
def rate(step, model_size, factor, warmup):
    """
    we have to default the step to 1 for LambdaLR function
    to avoid zero raising to negative power.
    """
    if step == 0:
        step = 1
    return factor * (
        model_size ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))
    )

In [141]:
example_lr_scheduler = LambdaLR(
    optimizer=example_optimizer,
    lr_lambda=lambda step: rate(
        step, 512, factor=1, warmup=100
    ),
)

In [142]:
class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None

In [143]:
class DummyScheduler:
    def step(self):
        None

### Epoch

In [147]:
train_dataloader = torch.utils.data.DataLoader(corpus['train'], batch_size=64, collate_fn=collate_batch)
val_dataloader = torch.utils.data.DataLoader(corpus['val'], batch_size=64, collate_fn=collate_batch)
test_dataloader = torch.utils.data.DataLoader(corpus['test'], batch_size=64, collate_fn=collate_batch)

In [148]:
class TrainState:
    """Track number of steps, examples, and tokens processed"""

    step: int = 0  # Steps in the current epoch
    accum_step: int = 0  # Number of gradient accumulation steps
    samples: int = 0  # total # of examples used
    tokens: int = 0  # total # of tokens processed

In [149]:
def run_epoch(
    data_iter,
    model,
    loss_compute,
    optimizer,
    scheduler,
    mode="train",
    accum_iter=1,
    train_state=TrainState(),
): 
    """Train a single epoch"""
    start = time.time()
    total_tokens = 0
    total_loss = 0
    tokens = 0
    n_accum = 0
    for i, batch in enumerate(data_iter):
        out = model.forward(
            batch['src_input_ids'],
            batch['tgt_input_ids'], 
            batch['src_attention_mask'],
            batch['tgt_attention_mask']
        )
        ntokens = (batch['tgt_output_ids'] != 2).data.sum()
        loss, loss_node = loss_compute(out, batch['tgt_output_ids'], ntokens)
        # loss_node = loss_node / accum_iter
        if mode == "train" or mode == "train+log":
            loss_node.backward()
            train_state.step += 1
            train_state.samples += batch['src_input_ids'].shape[0]
            train_state.tokens += ntokens
            if i % accum_iter == 0:
                optimizer.step()
                optimizer.zero_grad(set_to_none=True)
                n_accum += 1
                train_state.accum_step += 1
            scheduler.step()

        total_loss += loss
        total_tokens += ntokens
        tokens += ntokens
        if i % 40 == 1 and (mode == "train" or mode == "train+log"):
            lr = optimizer.param_groups[0]["lr"]
            elapsed = time.time() - start
            print(
                (
                    "Epoch Step: %6d | Accumulation Step: %3d | Loss: %6.2f "
                    + "| Tokens / Sec: %7.1f | Learning Rate: %6.1e"
                )
                % (i, n_accum, loss / ntokens, tokens / elapsed, lr)
            )
            start = time.time()
            tokens = 0
        del loss
        del loss_node
    return total_loss / total_tokens, train_state

In [150]:
run_epoch(
    data_iter=train_dataloader,
    model=example_model,
    loss_compute=SimpleLossCompute(example_model.generator, LabelSmoothing(
        size=VOCAB_SIZE, padding_idx=2, smoothing=0.1
    )),
    optimizer=example_optimizer,
    scheduler=example_lr_scheduler,
    mode="train"
)

Epoch Step:      1 | Accumulation Step:   2 | Loss:   6.23 | Tokens / Sec:   352.2 | Learning Rate: 9.6e-07
Epoch Step:     41 | Accumulation Step:  42 | Loss:   6.27 | Tokens / Sec:   490.1 | Learning Rate: 9.5e-07
Epoch Step:     81 | Accumulation Step:  82 | Loss:   6.33 | Tokens / Sec:   467.9 | Learning Rate: 9.4e-07
Epoch Step:    121 | Accumulation Step: 122 | Loss:   6.28 | Tokens / Sec:   514.9 | Learning Rate: 9.3e-07
Epoch Step:    161 | Accumulation Step: 162 | Loss:   6.17 | Tokens / Sec:   555.5 | Learning Rate: 9.2e-07
Epoch Step:    201 | Accumulation Step: 202 | Loss:   6.30 | Tokens / Sec:   535.7 | Learning Rate: 9.1e-07
Epoch Step:    241 | Accumulation Step: 242 | Loss:   6.30 | Tokens / Sec:   552.5 | Learning Rate: 9.1e-07
Epoch Step:    281 | Accumulation Step: 282 | Loss:   6.36 | Tokens / Sec:   543.7 | Learning Rate: 9.0e-07
Epoch Step:    321 | Accumulation Step: 322 | Loss:   6.21 | Tokens / Sec:   538.9 | Learning Rate: 8.9e-07
Epoch Step:    361 | Accumul

KeyboardInterrupt: 

### Training Loop

In [73]:
import torch

In [74]:
def train_worker(
    gpu,
    config
):
    print(f"Train worker process using GPU: {gpu} for training", flush=True)
    #torch.cuda.set_device(gpu)

    pad_idx = 2
    d_model = 512
    model = make_model(VOCAB_SIZE, VOCAB_SIZE, N=6)
    #model.cuda(gpu)
    module = model
    is_main_process = True

    criterion = LabelSmoothing(
        size=VOCAB_SIZE, padding_idx=pad_idx, smoothing=0.1
    )
    criterion.cuda(gpu)

    # train_dataloader, valid_dataloader = create_dataloaders(
    #     gpu,
    #     vocab_src,
    #     vocab_tgt,
    #     spacy_de,
    #     spacy_en,
    #     batch_size=config["batch_size"] // ngpus_per_node,
    #     max_padding=config["max_padding"],
    #     is_distributed=is_distributed,
    # )

    optimizer = torch.optim.Adam(
        model.parameters(), lr=config["base_lr"], betas=(0.9, 0.98), eps=1e-9
    )
    lr_scheduler = LambdaLR(
        optimizer=optimizer,
        lr_lambda=lambda step: rate(
            step, d_model, factor=1, warmup=config["warmup"]
        ),
    )
    train_state = TrainState()

    for epoch in range(config["num_epochs"]):
        model.train()
        print(f"[GPU{gpu}] Epoch {epoch} Training ====", flush=True)
        _, train_state = run_epoch(
            data_iter(),
            model,
            SimpleLossCompute(module.generator, criterion),
            optimizer,
            lr_scheduler,
            mode="train+log",
            accum_iter=config["accum_iter"],
            train_state=train_state,
        )

        #GPUtil.showUtilization()
        if is_main_process:
            file_path = "%s%.2d.pt" % (config["file_prefix"], epoch)
            torch.save(module.state_dict(), file_path)
        torch.cuda.empty_cache()

        print(f"[GPU{gpu}] Epoch {epoch} Validation ====", flush=True)
        model.eval()
        sloss = run_epoch(
            data_iter(),
            model,
            SimpleLossCompute(module.generator, criterion),
            DummyOptimizer(),
            DummyScheduler(),
            mode="eval",
        )
        print(sloss)
        torch.cuda.empty_cache()

    if is_main_process:
        file_path = "%sfinal.pt" % config["file_prefix"]
        torch.save(module.state_dict(), file_path)

In [75]:
def train_model(config):
    train_worker(
        0, config
    )

In [76]:
config = {
    "batch_size": 32,
    "distributed": False,
    "num_epochs": 1,
    "accum_iter": 10,
    "base_lr": 1.0,
    "max_padding": 72,
    "warmup": 3000,
    "file_prefix": "english_welsh_model_",
    }

In [77]:
torch.set_default_device("cuda:0" if torch.cuda.is_available() else "cpu")

In [78]:
train_model(config)

Train worker process using GPU: 0 for training


  nn.init.xavier_uniform(p)


[GPU0] Epoch 0 Training ====
Epoch Step:      1 | Accumulation Step:   1 | Loss:   7.98 | Tokens / Sec:   193.3 | Learning Rate: 5.4e-07
[GPU0] Epoch 0 Validation ====
(tensor(7.9762), <__main__.TrainState object at 0x7f97cdb98880>)
