In [4]:
# utils
from utils import count_parameters
import torch

# data
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import torch.optim as optim
import tqdm

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
device

device(type='cpu')

## Data Preparation

In [7]:
# create data fields for source and target
source = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)
target = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)

In [8]:
# download the parallel corpus
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

In [9]:
# build the vocab
source.build_vocab(train)
target.build_vocab(train)

In [35]:
# create data loaders
BATCH_SIZE = 128
train_loader, val_loader, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True
)

In [36]:
batch =  next(iter(train_loader))
print(batch.src.shape, batch.trg.shape)

torch.Size([128, 28]) torch.Size([128, 32])


## Transformer Model

#### Transformer Encoder Model

In [12]:
class Encoder(nn.Module):
    """
        transformer encoder module returns a [batch_size, seq_len, out_dim] tensor
    """
    
    def __init__(self, vocab_size, embedding_dim, num_layers, n_heads, pf_dim, dropout=0.15, max_len=100):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.max_len = max_len
        
        # tok and pos embedding dim is same because we have to add them
        self.tok_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.pos_embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=embedding_dim)
        
        
        # encoder layers of transformer encoder module
        self.encoder_layers = nn.ModuleList([EncoderLayer(embedding_dim, n_heads, pf_dim, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(p=dropout)
        
        # scaling
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim])).to(device)
        
    
    def forward(self, src, src_mask):
        """
            src.shape -> [batch, src_len]
            src_mask -> [batch, src_len]
        """
        batch, src_len = src.shape[0], src.shape[1]
        
        # create position tensor, shape will be [batch, src_len] by dooing so batch_first will be True
        position  = torch.arange(start=0, end=src_len, device=device).unsqueeze(0).repeat(batch, 1)
        
        # embeddings
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(position)
        
        # scale the token embeddings by multiplyig it with srqt(d_model) where d_model is embedding_dim
        tok_scaled = tok_embedded * self.scale.to(device)
        
        # add the scaled_tok and position embedding and then apply dropout, that will be input to the encoder
        encoder_input = self.dropout(tok_scaled + pos_embedded)
        
        
        #feed the input to the encoder layers
        for layer in self.encoder_layers:
            src = layer(encoder_input, None)
        
        return src
        

#### Encoder Layer

In [13]:
class EncoderLayer(nn.Module):
    
    def __init__(self, embedding_dim, n_heads, pf_dim, dropout):
        super(EncoderLayer, self).__init__()
        
        # layer normalization
        self.layer_norm =  nn.LayerNorm(normalized_shape=embedding_dim)
        
        # multi-head attention (I love this layer)
        self.multihead_attention = MultiheadAttention(embedding_dim, n_heads, dropout)
                
        # feedforward layer
        self.positionwise_ff = PositionwiseFeedForwardLayer(embedding_dim, pf_dim, dropout)
        
        # dropout layer
        self.dropout = nn.Dropout(p=dropout)

        
    def forward(self, src, src_mask=None):
        
        # compute the attention values (query, key, value) -> (src, src, src)
        attn_out, _  = self.multihead_attention(src, src, src, src_mask)
        
        
        # Normalize the attention and build residual connection and then pass it to positionwise ff layer followed by LN
        attn_norm_out = self.layer_norm(src + self.dropout(attn_out))
        ff_out = self.positionwise_ff(attn_norm_out)
        ff_norm_out = self.layer_norm(attn_norm_out + self.dropout(ff_out)) 
        # this will be output of the Transformer's Encoder layer
        # ff_norm_out.shape [batch, seq_len, embedding_dim]
        
        return ff_norm_out     

#### Multi-head Attention Module

In [18]:
class MultiheadAttention(nn.Module):
    """
        Scaled dot product attention
    """
    def __init__(self, embedding_dim, n_heads, dropout):
        """
            n_heads > 0
        """
        super(MultiheadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.n_heads = n_heads
        self.head_dim = embedding_dim // n_heads
        
        # fc for key, query, values
        self.fc_k  = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.fc_q = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.fc_v = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.fc_o = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)

        self.dropout = nn.Dropout(p=dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim])).to(device)
    
    def forward(self, query, key, value, mask=None):
        """
            query.shape -> [batch, src_len, embedding_dim]
            key.shape -> [batch, src_len, embedding_dim]
            value.shape -> [batch, src_len, embedding_dim]
        """
        
        batch_size = query.shape[0]
        src_len = query.shape[1]
        
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        #Q = [batch size, query len, hid dim] K&V would have same dim
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        #Q = [batch size, n heads, query len, head dim] K&V have to have same dim
        
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        #energy = [batch size, n heads, query len, key len]
        
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1) 
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        #x = [batch size, n heads, query len, embedding_dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.embedding_dim)
        #x = [batch size, query len, embedding_dim]
        
        x = self.fc_o(x)
        #x = [batch size, query len, embedding_dim]
        
        return x, attention    

#### Positionwise Feedforad Network

In [19]:
class PositionwiseFeedForwardLayer(nn.Module):
    
    def __init__(self, embedding_dim, pf_dim, dropout):
        super(PositionwiseFeedForwardLayer, self).__init__()
        
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=pf_dim)
        self.fc2 = nn.Linear(in_features=pf_dim, out_features=embedding_dim)
        
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        """
            x.shape -> [batch, src_len, embedding_dim]
        """
        out = self.fc2(self.dropout(F.relu((self.fc1(x)))))
        
        return out

### Transformer's Decoder Model

In [62]:
class Decoder(nn.Module):
    """
        transformer encoder module returns a [batch_size, seq_len, out_dim] tensor
    """
    
    def __init__(self, vocab_size, embedding_dim, num_layers, n_heads, pf_dim, dropout=0.15, max_len=100):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.max_len = max_len
        
        # tok and pos embedding dim is same because we have to add them
        self.tok_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.pos_embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=embedding_dim)
        
        
        # encoder layers of transformer encoder module
        self.decoder_layers = nn.ModuleList([DecoderLayer(embedding_dim, n_heads, pf_dim, dropout) for _ in range(num_layers)])
        
        self.fc_out = nn.Linear(embedding_dim, out_features=vocab_size)
        
        self.dropout = nn.Dropout(p=dropout)
        
        # scaling
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim])).to(device)
        
    
    def forward(self, trg, enc_src, trg_mask=None, src_mask=None):

        """
            trg is target tokens and enc_src is encoder
            
            trg.shape -> [batch, trg_len]
            enc_src.shape -> [batch, src_len, embedding_dim] # This is why it is recommended to use same embedding dim
            
        """
        batch, trg_len = trg.shape[0], trg.shape[1]
        
        
        # create position tensor, shape will be [batch, src_len] by dooing so batch_first will be True
        position  = torch.arange(start=0, end=trg_len, device=device).unsqueeze(0).repeat(batch, 1)
        
        # embeddings
        tok_embedded = self.tok_embedding(trg)
        pos_embedded = self.pos_embedding(position)
        
        
        # scale the token embeddings by multiplyig it with srqt(d_model) where d_model is embedding_dim
        tok_scaled = tok_embedded * self.scale.to(device)
        
        # add the scaled_tok and position embedding and then apply dropout, that will be input to the encoder
        decoder_input = self.dropout(tok_scaled + pos_embedded)
        
        
        #feed the input to the encoder layers
        for layer in self.decoder_layers:
            trg, attention = layer(decoder_input, enc_src, None, None)
        
        
        outputs = self.fc_out(trg)
        
        return outputs
        

In [63]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 embedding_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 ):
        super(DecoderLayer, self).__init__()
        
        self.layer_norm = nn.LayerNorm(normalized_shape=embedding_dim)
        
        self.self_attention = MultiheadAttention(embedding_dim, n_heads, dropout)
        self.encoder_attention = MultiheadAttention(embedding_dim, n_heads, dropout)
        
        self.positionwise_feedforward = PositionwiseFeedForwardLayer(embedding_dim,  pf_dim,  dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        
        #trg = [batch size, trg len, hid dim]
        #enc_src = [batch size, src len, hid dim]
        #trg_mask = [batch size, trg len]
        #src_mask = [batch size, src len]
        
        #self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)
        
        #dropout, residual connection and layer norm
        trg = self.layer_norm(trg + self.dropout(_trg))
            
        #trg = [batch size, trg len, hid dim]
            
        #encoder attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)
        
        #dropout, residual connection and layer norm
        trg = self.layer_norm(trg + self.dropout(_trg))
                    
        #trg = [batch size, trg len, hid dim]
        
        #positionwise feedforward
        _trg = self.positionwise_feedforward(trg)
        
        #dropout, residual and layer norm
        trg = self.layer_norm(trg + self.dropout(_trg))
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        return trg, attention

In [64]:
import pytorch_lightning as pl

In [65]:
PAD_IDX = target.vocab.stoi[target.pad_token]

In [89]:
# combining all modules into a PyTorch-Lightining Tranformer Module

class Transformer(pl.LightningModule):
    
    def __init__(self, encoder, decoder):
        super(Transformer, self).__init__()
        self.encoder = encoder 
        self.decoder = decoder
    
    def forward(self, src, trg):
        
        src_encoded = self.encoder(src, None)
        outputs = self.decoder(trg, src_encoded, None, None)
        return outputs
    
    def configure_optimizers(self):
        return optim.Adam(params=self.parameters(), lr=1e-3)
    
    def train_dataloader(self):
        return train_loader
    
    def training_step(self, batch, batch_idx):
        src, trg = batch.src, batch.trg
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        outputs  = self(src, trg)
        outputs = outputs.view(batch_size*trg_len, -1)
        loss =  F.cross_entropy(outputs, trg.view(-1), ignore_index=PAD_IDX)
        ppl = torch.exp(loss)
        tensorboard_logs = {"loss":loss, "ppl":ppl}
        return {"loss":loss, "ppl":ppl, "log":tensorboard_logs}
    
    def val_dataloader(self):
        return val_loader
    
    def validation_step(self, batch, batch_idx):
        src, trg = batch.src, batch.trg
        batch_size, trg_len = trg.shape[0], trg.shape[1]
        outputs = self(src, trg)
        outputs = outputs.view(batch_size*trg_len, -1)
        loss =  F.cross_entropy(outputs, trg.view(-1), ignore_index=PAD_IDX)
        ppl = torch.exp(loss)
#         tensorboard_logs = {"val_loss":loss, "val_ppl":ppl}
        return {"val_loss":loss, "val_ppl":ppl}

    
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_ppl = torch.stack([x['val_ppl'] for x in outputs]).mean()
        tensorboard_logs = {'val_loss': avg_loss, 'val_ppl':avg_ppl}
        return {'val_loss': avg_loss,'val_ppl':avg_ppl, 'log': tensorboard_logs}
       

In [90]:
src_vocab = len(source.vocab)
trg_vocab = len(target.vocab)
embedding_dim = 256
n_heads = 8
num_layers = 1
pf_dim = 128

In [91]:
encoder = Encoder(vocab_size=src_vocab, embedding_dim=embedding_dim, num_layers=num_layers, n_heads=n_heads, pf_dim=pf_dim)
decoder = Decoder(vocab_size=trg_vocab, embedding_dim=embedding_dim, num_layers=num_layers, n_heads=n_heads, pf_dim=pf_dim)
transformer = Transformer(encoder, decoder)

In [92]:
# see gpus/tpu params if you want to train on gpus/tpu 
trainer = pl.Trainer(max_epochs=10)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [93]:
trainer.fit(transformer)


  | Name    | Type    | Params
------------------------------------
0 | encoder | Encoder | 5 M   
1 | decoder | Decoder | 5 M   


Epoch 1:  97%|█████████▋| 227/235 [02:39<00:05,  1.43it/s, loss=0.341, v_num=24]
Validating: 0it [00:00, ?it/s][A
Epoch 1:  97%|█████████▋| 228/235 [02:39<00:04,  1.43it/s, loss=0.341, v_num=24]
Epoch 1:  98%|█████████▊| 230/235 [02:39<00:03,  1.44it/s, loss=0.341, v_num=24]
Validating:  50%|█████     | 4/8 [00:00<00:00,  9.42it/s][A
Epoch 1:  99%|█████████▊| 232/235 [02:39<00:02,  1.45it/s, loss=0.341, v_num=24]
Validating:  75%|███████▌  | 6/8 [00:00<00:00,  8.58it/s][A
Epoch 1: 100%|█████████▉| 234/235 [02:39<00:00,  1.46it/s, loss=0.341, v_num=24]
Epoch 1: 100%|██████████| 235/235 [02:40<00:00,  1.47it/s, loss=0.341, v_num=24]
Epoch 2:  97%|█████████▋| 227/235 [03:04<00:06,  1.23it/s, loss=0.135, v_num=24]
Validating: 0it [00:00, ?it/s][A
Epoch 2:  97%|█████████▋| 228/235 [03:04<00:05,  1.24it/s, loss=0.135, v_num=24]
Validating:  25%|██▌       | 2/8 [00:00<00:00,  8.34it/s][A
Epoch 2:  98%|█████████▊| 230/235 [03:04<00:04,  1.25it/s, loss=0.135, v_num=24]
Validating:  50%|███



1

In [94]:
trainer.run_evaluation()


Validating: 0it [00:00, ?it/s][A
Validating:  12%|█▎        | 1/8 [00:00<00:00,  9.70it/s][A
Validating:  25%|██▌       | 2/8 [00:00<00:00,  9.74it/s][A
Validating:  38%|███▊      | 3/8 [00:00<00:00,  9.72it/s][A
Validating:  50%|█████     | 4/8 [00:00<00:00,  9.39it/s][A
Validating:  62%|██████▎   | 5/8 [00:00<00:00,  9.34it/s][A
Validating:  75%|███████▌  | 6/8 [00:00<00:00,  8.79it/s][A
Validating:  88%|████████▊ | 7/8 [00:00<00:00,  8.18it/s][A
Validating: 100%|██████████| 8/8 [00:00<00:00,  7.20it/s][A
                                                         [A

{'val_loss': tensor(0.1275), 'val_ppl': tensor(1.1361)}

#### Note: Some codes are directly taken from here: 
- https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb