In [1]:
import torch
import torch.nn as nn
import math
import numpy as np
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Special token IDs:
#   PAD_TKN - pads sequences to a common length
#   CLS_TKN - marks the start of a sequence
#   SEP_TKN - marks the end of a sequence
PAD_TKN, CLS_TKN, SEP_TKN = 0, 101, 102

In [3]:

# Load the pretrained `bert-base-uncased` tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode a sentence (special tokens included) into a PyTorch tensor of IDs,
# then decode the first (and only) row back into text as a round-trip check.
result = tokenizer.encode(
    "Hello world!",
    add_special_tokens=True,
    return_tensors="pt",
)
decoded_result = tokenizer.decode(result[0])
print(
    f"Encoded sentence: {result}",
    f"Decoded sentence: {decoded_result}",
    sep="\n",
)

Encoded sentence: tensor([[ 101, 7592, 2088,  999,  102]])
Decoded sentence: [CLS] hello world! [SEP]


In [4]:
# Decode a hand-written list of token IDs (the same IDs the encoder produced above).
hello_world_ids = [101, 7592, 2088, 999, 102]
tokenizer.decode(hello_world_ids)

'[CLS] hello world! [SEP]'

If we have a much more limited vocabulary, we can simply define a dictionary that assigns one token per word and maps each word to its token ID:

In [5]:
# Word -> token ID lookup tables for the toy translation task.
# In both vocabularies the special tokens <PAD>, <BOS>, <EOS> take IDs 0-2 and
# the ordinary words follow, so every entry's ID is its position in the list.
ENGLISH_TOKEN_MAPPING = {
    word: token_id
    for token_id, word in enumerate(
        [
            "<PAD>", "<BOS>", "<EOS>",
            "the", "quick", "brown", "fox",
            "jumps", "over", "lazy", "dog",
        ]
    )
}

FRENCH_TOKEN_MAPPING = {
    word: token_id
    for token_id, word in enumerate(
        [
            "<PAD>", "<BOS>", "<EOS>",
            "le", "renard", "brun", "rapide",
            "saute", "par-dessus", "chien", "paresseux",
        ]
    )
}
    

# From token IDs to token embeddings
Adapted from this [PyTorch tutorial](https://pytorch.org/tutorials/beginner/translation_transformer.html)

In [6]:
class TokenEmbedding(nn.Module):
    """Lookup table turning integer token IDs into dense vectors of length ``emb_dim``."""

    def __init__(self, vocab_size: int, emb_dim: int, padding_idx: int = 0):
        super().__init__()
        self._vocab_size = vocab_size
        self._embed_dim = emb_dim
        # Passing `padding_idx` stops gradients from flowing into the
        # embedding row used for padding tokens.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=padding_idx,
        )

    def forward(self, token_ids: torch.Tensor) -> torch.Tensor:
        """Return the embedding vector for every ID in ``token_ids``."""
        embedded = self.embedding(token_ids)
        return embedded

In [7]:
# Embed a single English word with a freshly initialised (untrained) table.
embedding_dim = 128
vocab_size = len(ENGLISH_TOKEN_MAPPING)
token_embedding = TokenEmbedding(vocab_size=vocab_size, emb_dim=embedding_dim)

# A one-element tensor holding the ID of "the".
tokens = torch.tensor([ENGLISH_TOKEN_MAPPING["the"]], dtype=torch.long)
embedding = token_embedding(tokens)
print(f"Embedding for 'the':\n{embedding}\nShape: {embedding.shape}")

Embedding for 'the':
tensor([[ 0.8037, -0.0358, -1.1829, -0.3455, -2.1089,  0.2363,  1.1595, -0.6717,
         -1.0007, -0.4557,  0.2388,  0.4289,  0.6512,  1.5092, -0.6770,  0.6057,
          1.1825, -0.0660,  0.4837, -1.7217, -0.7760, -0.2422,  0.4692,  0.6317,
         -2.8646,  2.1812,  0.2099, -0.3220,  1.7585, -1.0891,  2.0922, -2.9054,
         -0.5375,  0.5031,  0.1555,  1.7352,  1.4221, -0.7985, -0.3126, -0.5879,
         -0.1025, -0.4161,  0.3701, -0.2006,  1.4460, -0.8493, -2.2056,  0.1394,
          1.7231,  0.7155, -0.2800,  0.9610, -0.1751, -1.0879, -0.8913, -0.8063,
          0.4753, -0.0712, -1.3732,  1.1815, -1.4304,  1.2193, -0.9271, -0.1687,
         -1.9771, -0.5979,  1.1737, -0.3849,  0.4964,  1.2381, -0.6243, -0.7886,
         -0.2245, -0.0921, -0.8983,  0.1947,  1.7604, -2.6235,  0.5024, -0.0684,
          0.2427,  0.2252, -2.0481,  0.6361,  0.8009,  0.3931, -0.5136,  0.2945,
         -0.1167,  2.8122,  1.1558,  1.9226, -1.1016, -1.7632,  0.5140,  0.4155,
       

# Positional Encoding
We want to add information to each word embedding that encodes the word's position in the sequence.

Below is taken from the official [PyTorch transformers tutorial](https://pytorch.org/tutorials/beginner/transformer_tutorial.html)


In [8]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    A table of shape ``[max_len, 1, d_model]`` is precomputed once: even
    feature columns hold sines and odd feature columns hold cosines of the
    position scaled by geometrically decreasing frequencies. The table is
    registered as a buffer named ``pe`` (so it is not a trainable parameter).

    Arguments:
        d_model: size of the embedding dimension (may be even or odd).
        dropout: dropout probability applied after the encoding is added.
        max_len: number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        # One column of angles per even feature index: ceil(d_model / 2) columns.
        angles = position * div_term
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd feature columns, so for an odd
        # d_model the last angle column has no cosine slot and must be dropped;
        # without the slice the assignment fails with a shape mismatch.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``

        Adds positional encoding to a given sequential input tensor.

        Returns:
            Tensor of the same shape as ``x``: the input plus the encodings of
            its first ``seq_len`` positions (broadcast over the batch
            dimension), passed through dropout.
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [9]:
# Run a full sentence through embedding + positional encoding.
positional_encoder = PositionalEncoding(embedding_dim)
test_sentence = "the quick brown fox jumps over the lazy dog"
test_tokens = torch.tensor(
    [ENGLISH_TOKEN_MAPPING[word] for word in test_sentence.split()],
    dtype=torch.long,
)
# Insert a batch dimension of size one after the sequence dimension.
test_embedding = token_embedding(test_tokens).unsqueeze(1)
positional_encoder(test_embedding)

tensor([[[ 0.8931,  1.0713, -1.3144,  ...,  1.3008,  0.4784,  0.3049]],

        [[ 3.2182,  1.3569,  2.6505,  ..., -0.1593,  0.3387,  1.4701]],

        [[ 2.5950,  0.1774,  0.1516,  ...,  1.1909, -0.0000,  0.7388]],

        ...,

        [[ 0.5826,  1.0271, -2.2982,  ...,  1.3008,  0.0000,  0.3049]],

        [[-0.7360,  0.4591, -0.1274,  ...,  2.8325,  0.1148,  0.1064]],

        [[ 2.0990, -3.6456,  0.7737,  ...,  1.7929, -0.4480,  2.4312]]],
       grad_fn=<MulBackward0>)

# Preparing the input for the translation task
As discussed in this [blog post](https://kikaben.com/transformers-encoder-decoder/), we typically use teacher forcing to prepare the sequence of inputs in parallel.

For example, the first input to the decoder block would be the encoded `the quick brown fox jumps over the lazy dog` along with the `<BOS>` tag. The decoder would then hopefully output the first word of the translation `le`. The next sets of inputs to the decoder would be the same apart from `<BOS> le` etc.

Teacher forcing means we feed the ground-truth tokens at each step of the translation, rather than auto-regressively waiting for the decoder to generate each required token, which means that we can parallelise the input. 

This parallelisation is done using **masked attention** which sets the attention value to 0 for any token that the decoder is not "allowed" to see yet.

Our full target sequence is `<BOS> le renard brun rapide saute par-dessus le chien paresseux <EOS>`

The first attention mask is thus `[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]` as the decoder is only allowed to see the `<BOS>` when predicting the first output token.

The second attention mask is `[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]` etc.

We can group these together in a matrix:

```
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
```

This can be done quickly using an existing PyTorch function (which uses zeros for un-masked values and `-inf` for masked values):

In [10]:
# Build a 10 x 10 causal mask: 0. where attention is allowed, -inf where it is blocked.
mask_size = 10
nn.Transformer.generate_square_subsequent_mask(mask_size)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

# Constructing the Transformer Model

In [11]:
class TransformerEncoderDecoder(nn.Module):
    """Sequence-to-sequence model: token embeddings + positional encoding,
    an ``nn.Transformer`` encoder/decoder stack, and a linear projection onto
    the target vocabulary.

    Arguments:
        d_model: embedding / hidden size used throughout the model.
        n_head: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: number of decoder layers.
        dim_feedforward: hidden size of the feed-forward sub-layers.
        dropout: dropout probability (shared by the transformer and the
            positional encoding).
        activation: feed-forward activation name passed to ``nn.Transformer``.
        source_vocab_size: number of distinct source token IDs.
        target_vocab_size: number of distinct target token IDs; also the size
            of the output logits.
        max_seq_length: stored on the instance; not otherwise used by this class.
        device: stored on the instance for reference. The module is NOT moved
            to this device; call ``.to(device)`` on the model if required.
    """

    def __init__(self,
                 d_model: int,
                 n_head: int,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1,
                 activation: str = "relu",
                 source_vocab_size: int = 100,
                 target_vocab_size: int = 100,
                 max_seq_length: int = 100,
                 # Previously defaulted to the `torch` module itself, which is
                 # not a device and made `.to(...)` fail; None is a safe default.
                 device: torch.device = None) -> None:
        super(TransformerEncoderDecoder, self).__init__()
        self.transformer = nn.Transformer(d_model=d_model,
                                          nhead=n_head,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          activation=activation)
        self._target_vocab_size = target_vocab_size
        self._max_seq_length = max_seq_length
        self._device = device
        self.final_layer = nn.Linear(d_model, target_vocab_size)
        self.pe = PositionalEncoding(d_model, dropout)
        self.src_embedding = TokenEmbedding(source_vocab_size, d_model)
        self.tgt_embedding = TokenEmbedding(target_vocab_size, d_model)

    def forward(self,
                src: torch.Tensor,
                tgt: torch.Tensor,
                generate_mask: bool = True) -> torch.Tensor:
        """Compute target-vocabulary logits for every decoder position.

        Arguments:
            src: source token IDs, sequence-first (``[src_len, batch_size]``
                as used in this notebook).
            tgt: decoder-input token IDs, sequence-first
                (``[tgt_len, batch_size]`` as used in this notebook).
            generate_mask: when True, a causal mask is applied so each decoder
                position only attends to itself and earlier positions.

        Returns:
            Logits with a trailing dimension of ``target_vocab_size``, one
            vector per decoder position.
        """
        src = self.src_embedding(src)
        tgt = self.tgt_embedding(tgt)
        src = self.pe(src)
        tgt = self.pe(tgt)
        tgt_mask = None
        if generate_mask:
            # Build the mask on the same device as the tensors it is combined
            # with, so the model works wherever its inputs live (the stored
            # constructor `device` may differ from the actual tensor device).
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(0)).to(tgt.device)
        out = self.transformer(src, tgt, tgt_mask=tgt_mask)
        out = self.final_layer(out)
        return out

## Example usage


In [12]:
source_sentence = "the quick brown fox jumps over the lazy dog"
target_sentence = "<BOS> le renard brun rapide saute par-dessus le chien paresseux <EOS>"

# Split each sentence on whitespace and look every word up in its vocabulary.
source_tokens = list(map(ENGLISH_TOKEN_MAPPING.__getitem__, source_sentence.split()))
target_tokens = list(map(FRENCH_TOKEN_MAPPING.__getitem__, target_sentence.split()))

print(f"Source tokens: {source_tokens}")
print(f"Target tokens: {target_tokens}")

Source tokens: [3, 4, 5, 6, 7, 8, 3, 9, 10]
Target tokens: [1, 3, 4, 5, 6, 7, 8, 3, 9, 10, 2]


In [13]:
# Build an untrained model sized for the two 11-entry toy vocabularies.
transformer = TransformerEncoderDecoder(
    d_model=512,
    n_head=2,
    num_encoder_layers=6,
    num_decoder_layers=6,
    source_vocab_size=11,
    target_vocab_size=11,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Turn the token lists into column tensors; the added dimension is a batch of one.
source_tokens_tensor = torch.tensor(source_tokens, dtype=torch.long).unsqueeze(1)
target_tokens_tensor = torch.tensor(target_tokens, dtype=torch.long).unsqueeze(1)
print(f"Source tokens shape: {source_tokens_tensor.shape}")
print(f"Target tokens shape: {target_tokens_tensor.shape}")

# Forward pass, then drop the batch dimension again.
result = transformer(source_tokens_tensor, target_tokens_tensor).squeeze(1)
print(f"Output shape: {result.shape}")

# Softmax converts the logits into probabilities; argmax picks the likeliest token.
result = nn.functional.softmax(result, dim=-1)
predicted_token_ids = result.argmax(dim=-1)
print(f"Predicted token IDs: {predicted_token_ids}")

# Reverse lookup (ID -> word) through two parallel lists of the vocabulary.
french_words = list(FRENCH_TOKEN_MAPPING.keys())
french_ids = list(FRENCH_TOKEN_MAPPING.values())
predicted_words = [french_words[french_ids.index(token_id)] for token_id in predicted_token_ids]
print(f"Predicted Translation: {' '.join(predicted_words)}")



Source tokens shape: torch.Size([9, 1])
Target tokens shape: torch.Size([11, 1])
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Output shape: torch.Size([11, 11])
Predicted token IDs: tensor([3, 3, 3, 3, 5, 5, 5, 3, 3, 3, 3])
Predicted Translation: le le le le brun brun brun le le le le


# Training Loop

In [110]:
loss_function = nn.CrossEntropyLoss()
# Start from a freshly initialised model so training is independent of earlier cells.
transformer = TransformerEncoderDecoder(
    d_model=512,
    n_head=2,
    num_encoder_layers=6,
    num_decoder_layers=6,
    source_vocab_size=11,
    target_vocab_size=11,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

source_tokens_tensor = torch.tensor(source_tokens, dtype=torch.long).unsqueeze(1)
target_tokens_tensor = torch.tensor(target_tokens, dtype=torch.long).unsqueeze(1)
# Teacher forcing: the decoder is fed every target token except the last and
# is scored against every target token except the first (shifted by one).
input_target_tokens = target_tokens_tensor[:-1]
target_target_tokens = target_tokens_tensor[1:]
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001)
print_every = 10
for train_iter in range(100):
    optimizer.zero_grad()
    result = transformer(source_tokens_tensor, input_target_tokens).squeeze(1)
    # Flatten to (positions, vocab) logits and (positions,) labels for the loss.
    loss = loss_function(
        result.view(-1, transformer._target_vocab_size),
        target_target_tokens.view(-1),
    )
    loss.backward()
    optimizer.step()

    # Only report progress on every `print_every`-th iteration.
    if train_iter % print_every != 0:
        continue
    print(f"Iteration {train_iter}, Loss: {loss.item()}")
    result = nn.functional.softmax(result, dim=-1)
    predicted_token_ids = result.argmax(dim=-1)
    predicted_words = [
        list(FRENCH_TOKEN_MAPPING.keys())[list(FRENCH_TOKEN_MAPPING.values()).index(token_id)]
        for token_id in predicted_token_ids
    ]
    print(f"Predicted Translation: {' '.join(predicted_words)}")
        
    
    

Iteration 0, Loss: 2.6233906745910645
Predicted Translation: par-dessus <BOS> <BOS> <BOS> <BOS> <BOS> <EOS> <BOS> <BOS> <BOS>
Iteration 10, Loss: 2.194302558898926
Predicted Translation: rapide le saute le le le le le par-dessus brun
Iteration 20, Loss: 1.9355199337005615
Predicted Translation: renard le brun renard renard le le le <EOS> <EOS>
Iteration 30, Loss: 0.9529194831848145
Predicted Translation: le renard brun rapide saute par-dessus le chien par-dessus <EOS>
Iteration 40, Loss: 0.5133823156356812
Predicted Translation: le renard brun rapide saute par-dessus le chien paresseux <EOS>
Iteration 50, Loss: 0.15060295164585114
Predicted Translation: le renard brun rapide saute par-dessus le chien paresseux <EOS>
Iteration 60, Loss: 0.05734274536371231
Predicted Translation: le renard brun rapide saute par-dessus le chien paresseux <EOS>
Iteration 70, Loss: 0.022655215114355087
Predicted Translation: le renard brun rapide saute par-dessus le chien paresseux <EOS>
Iteration 80, Loss:

# Prediction
Outside of training time, we will not have the target sequence for translation. Thus, we need a method to autoregressively generate the predicted translation, appending each generated token to the decoder input as we go.

We do not need to apply any attention masking, as we can have our model pay attention to everything!


In [116]:
# Greedy autoregressive decoding: start from <BOS> and repeatedly append the
# most likely next token until <EOS> is produced or `max_length` steps are done.
trained_model = transformer
# Switch to evaluation mode so dropout is disabled; in training mode the
# predictions would be randomly perturbed on every forward pass.
trained_model.eval()
input_sequence = source_tokens_tensor
# Create the decoder input on the same device as the source sequence.
translated_input = torch.tensor([[FRENCH_TOKEN_MAPPING["<BOS>"]]],
                                dtype=torch.long,
                                device=input_sequence.device)
max_length = 20

# Reverse lookup (token ID -> word), built once instead of on every step.
french_id_to_word = {token_id: word for word, token_id in FRENCH_TOKEN_MAPPING.items()}

# No gradients are needed for generation, so skip building the autograd graph.
with torch.no_grad():
    for _ in range(max_length):
        output_logits = trained_model(input_sequence, translated_input, generate_mask=False)
        output_probs = nn.functional.softmax(output_logits, dim=-1)
        predicted_token_ids = torch.argmax(output_probs, dim=-1)
        # Only the prediction at the last position is new; append it.
        next_token = predicted_token_ids[-1]
        translated_input = torch.cat([translated_input, next_token.unsqueeze(0)])
        predicted_words = [french_id_to_word[token_id]
                           for token_id in translated_input.squeeze(1).tolist()]
        print(f"Predicted Translation: {' '.join(predicted_words)}")
        if next_token.item() == FRENCH_TOKEN_MAPPING["<EOS>"]:
            break
        
    
    
    

Predicted Translation: <BOS> le
Predicted Translation: <BOS> le renard
Predicted Translation: <BOS> le renard brun
Predicted Translation: <BOS> le renard brun rapide
Predicted Translation: <BOS> le renard brun rapide saute
Predicted Translation: <BOS> le renard brun rapide saute par-dessus
Predicted Translation: <BOS> le renard brun rapide saute par-dessus le
Predicted Translation: <BOS> le renard brun rapide saute par-dessus le chien
Predicted Translation: <BOS> le renard brun rapide saute par-dessus le chien paresseux
Predicted Translation: <BOS> le renard brun rapide saute par-dessus le chien paresseux <EOS>
