In [2]:
# utils
from utils import count_parameters
import torch

# data
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

# model
import torch.nn as nn
import torch.nn.functional as F

# training
import torch.optim as optim
import tqdm

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cpu')

## Data Preparation

In [5]:
# create data fields for source and target
source = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)
target = Field(
    init_token="<sos>",
    eos_token="<eos>",
    lower=True,
    tokenize="spacy",
    tokenizer_language="de",
    batch_first=True
)

In [6]:
# download the parallel corpus
train, val, test = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(source, target)
)

In [7]:
# build the vocab
source.build_vocab(train)
target.build_vocab(train)

In [8]:
# create data loaders
BATCH_SIZE = 128
train_loader, val_laoder, test_loader = BucketIterator.splits(
    datasets=(train, val, test),
    batch_size=BATCH_SIZE,
    device=device,
    shuffle=True
)

In [9]:
batch =  next(iter(train_loader))
print(batch.src.shape, batch.trg.shape)

torch.Size([128, 27]) torch.Size([128, 30])


## Transformer Model

#### Transformer Encoder Model

In [11]:
class Encoder(nn.Module):
    """
        transformer encoder module returns a [batch_size, seq_len, out_dim] tensor
    """
    
    def __init__(self, vocab_size, embedding_dim, num_layers, n_heads, pf_dim, dropout=0.15, max_len=100):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.max_len = max_len
        
        # tok and pos embedding dim is same because we have to add them
        self.tok_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.pos_embedding = nn.Embedding(num_embeddings=max_len, embedding_dim=embedding_dim)
        
        
        # encoder layers of transformer encoder module
        self.encoder_layers = nn.ModuleList([EncoderLayer(embedding_dim, n_heads, pf_dim, dropout) for _ in range(num_layers)])
        self.dropout = nn.Dropout(p=dropout)
        
        # scaling
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim]))
        
    
    def forward(self, src, src_mask):
        """
            src.shape -> [batch, src_len]
            src_mask -> [batch, src_len]
        """
        batch, src_len = src.shape[0], src.shape[1]
        
        # create position tensor, shape will be [batch, src_len] by dooing so batch_first will be True
        position  = torch.arange(start=0, end=src_len, device=device).unsqueeze(0).repeat(batch, 1)
        
        # embeddings
        tok_embedded = self.tok_embedding(src)
        pos_embedded = self.pos_embedding(position)
        
        # scale the token embeddings by multiplyig it with srqt(d_model) where d_model is embedding_dim
        tok_scaled = tok_embedded * self.scale.to(device)
        
        # add the scaled_tok and position embedding and then apply dropout, that will be input to the encoder
        encoder_input = self.dropout(tok_scaled + pos_embedded)
        
        print(encoder_input.shape)
        
        #feed the input to the encoder layers
        for layer in self.encoder_layers:
            src = layer(encoder_input, None)
        
        return src
        

In [12]:
vocab_size=len(source.vocab)
embedding_dim=256
num_layers=1
n_heads=8
pf_dim=128

In [18]:
encoder = Encoder(
    vocab_size=vocab_size, 
    embedding_dim=embedding_dim, 
    num_layers=num_layers,
    n_heads=n_heads,
    pf_dim=128
)

In [19]:
outputs = encoder(batch.src, None)

torch.Size([128, 27, 256])
query.shpe torch.Size([128, 27, 256]) key.shape torch.Size([128, 27, 256])  value.shape torch.Size([128, 27, 256])
Q.shape torch.Size([128, 27, 256]) K.shape torch.Size([128, 27, 256]) V.shape torch.Size([128, 27, 256])


In [20]:
outputs.shape

torch.Size([128, 27, 256])

#### Encoder Layer

In [15]:
class EncoderLayer(nn.Module):
    
    def __init__(self, embedding_dim, n_heads, pf_dim, dropout):
        super(EncoderLayer, self).__init__()
        
        # layer normalization
        self.layer_norm =  nn.LayerNorm(normalized_shape=embedding_dim)
        
        # multi-head attention (I love this layer)
        self.multihead_attention = MultiheadAttention(embedding_dim, n_heads, dropout)
        
        
        self.dropout = nn.Dropout(p=dropout)
        
        # feedforward layer
        self.positionwise_ff = PositionwiseFeedForwardLayer(embedding_dim, pf_dim, dropout)
        
    def forward(self, src, src_mask=None):
        
        # compute the attention values (query, key, value) -> (src, src, src)
        
        attn_out, _  = self.multihead_attention(src, src, src, src_mask)
        
        
        # Normalize the attention and build residual connection and then pass it to positionwise ff layer followed by LN
        attn_norm_out = self.layer_norm(src + self.dropout(attn_out))
        ff_out = self.positionwise_ff(attn_norm_out)
        
        ff_norm_out = self.layer_norm(attn_norm_out + self.dropout(ff_out)) 
        # this will be output of the Transformer's Encoder layer
        # ff_norm_out.shape [batch, seq_len, embedding_dim]
        
        
        return attn_out
#         return ff_norm_out     

In [38]:
encoder_layer = EncoderLayer(embedding_dim=embedding_dim, n_heads=n_heads, pf_dim=pf_dim, dropout=0.2)

In [39]:
attn = encoder_layer(outputs)
print(attn.shape)

query.shpe torch.Size([128, 31, 256]) key.shape torch.Size([128, 31, 256])  value.shape torch.Size([128, 31, 256])
Q.shape torch.Size([128, 31, 256]) K.shape torch.Size([128, 31, 256]) V.shape torch.Size([128, 31, 256])
torch.Size([128, 31, 256])


#### Multi-head Attention Module

In [16]:
class MultiheadAttention(nn.Module):
    """
        Scaled dot product attention
    """
    def __init__(self, embedding_dim, n_heads, dropout):
        """
            n_heads > 0
        """
        super(MultiheadAttention, self).__init__()
        self.embedding_dim = embedding_dim
        self.n_heads = n_heads
        self.head_dim = embedding_dim // n_heads
        
        # fc for key, query, values
        self.fc_k  = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.fc_q = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        self.fc_v = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)
        
        self.fc_o = nn.Linear(in_features=embedding_dim, out_features=embedding_dim)

        self.dropout = nn.Dropout(p=dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([embedding_dim])).to(device)
    
    def forward(self, query, key, value, mask=None):
        """
            query.shape -> [batch, src_len, embedding_dim]
            key.shape -> [batch, src_len, embedding_dim]
            value.shape -> [batch, src_len, embedding_dim]
        """
        
        batch_size = query.shape[0]
        src_len = query.shape[1]
        
        print(f'query.shpe {query.shape} key.shape {key.shape}  value.shape {value.shape}')
        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)
        
        
        
        #Q = [batch size, query len, hid dim] K&V would have same dim
                
        print(f'Q.shape {Q.shape} K.shape {K.shape} V.shape {V.shape}')
        Q = Q.view(batch_size, src_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, src_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, src_len, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        #Q = [batch size, n heads, query len, head dim] K&V have to have same dim
        
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        #energy = [batch size, n heads, query len, key len]
        
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1) 
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        #x = [batch size, n heads, query len, embedding_dim]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        #x = [batch size, query len, n heads, head dim]
        
        x = x.view(batch_size, -1, self.embedding_dim)
        #x = [batch size, query len, embedding_dim]
        
        x = self.fc_o(x)
        #x = [batch size, query len, embedding_dim]
        
        return x, attention    

In [134]:
attention = MultiheadAttention(embedding_dim=embedding_dim, n_heads=n_heads, dropout=0.2)

In [137]:
x, attn = attention(outputs, outputs, outputs)

query.shpe torch.Size([128, 34, 256]) key.shape torch.Size([128, 34, 256])  value.shape torch.Size([128, 34, 256])
Q.shape torch.Size([128, 34, 256]) K.shape torch.Size([128, 34, 256]) V.shape torch.Size([128, 34, 256])


torch.Size([128, 8, 34, 34])

#### Positionwise Feedforad Network

In [17]:
class PositionwiseFeedForwardLayer(nn.Module):
    
    def __init__(self, embedding_dim, pf_dim, dropout):
        super(PositionwiseFeedForwardLayer, self).__init__()
        
        self.fc1 = nn.Linear(in_features=embedding_dim, out_features=pf_dim)
        self.fc2 = nn.Linear(in_features=pf_dim, out_features=embedding_dim)
        
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, x):
        """
            x.shape -> [batch, src_len, embedding_dim]
        """
        out = self.fc2(self.dropout(F.relu((self.fc1(x)))))
        
        return out