In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms
# Matplotlib is a useful plotting library for python 
import matplotlib.pyplot as plt
# This code is to make matplotlib figures appear inline in the

import cv2
import math
# notebook rather than in a new window.
%matplotlib inline

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#Set seed for reproducbility
import torch

### Establish Transformer architecture

The objectives of this excersie is to:

   * Know how to implement the transformer architecture by PyTorch
   * Know how to implement the visiaul transformer architecture by PyTorch, and use it for image classifcation task
   * Homework (Try to implement Masked Autoencoder by transformer architecture for image classification (MNIST))  <span style="color:red">(Deadline is 28th April, 2024 18:00)</span>  


In this course, you will need to implement Transformer architecture based on lecture given last time, as showed below:
    
<img src="./transformer.png" width="400" height="200">
   
   
And you need to get familar with the following four components of transformer architecture:
   * Word Embedding
   * Encoder Decoder Models 
   * Attentions 
   * Position encoding
   
   

Now, we need to define the basic building blocks: Embedding,Positional Encoding.  (Multi-Head Attention), and Encoder and Decoder layers. And in the end we combine Encoder and Decoder layers to create the complete Transformer model.

 
Still, if see your <span style="color:red">#### YOUR CODE ####</span>  headline, it is the part you need to implement

### Embeddings

Basic components

Create Word Embeddings

First of all we need to convert each word in the input sequence to an embedding vector. Embedding vectors will create a more semantic representation of each word.


<img src="./embedding.png" width="400" height="200">

Suppoese each embedding vector is of 512 dimension and suppose our vocab size is 100, then our embedding matrix will be of size 100x512. These marix will be learned on training and during inference each word will be mapped to corresponding 512 d vector. 

Suppose we have batch size of 2 and sequence length of 12(12 words). The the output will be 2x10x512.





In [112]:
##hint: you can use nn.Embedding function to create work embedding

class Embedding(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        """
        Args:
            vocab_size: size of vocabulary
            embed_dim: dimension of embeddings
        """
        super(Embedding, self).__init__()
        ###YOUR CODE###
        self.embed = nn.Embedding(vocab_size, embed_dim)
         ###YOUR CODE###
    def forward(self, x):
        """
        Args:
            x: input vector
        Returns:
            out: embedding vector
        """
        out = self.embed(x)
        return out

In [158]:
#create a sample with batch size of 2 , sequence length of 12
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1], 
                    [0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
ebed = Embedding(100, 512)
embeddings = ebed(src)
embeddings.shape

torch.Size([2, 12, 512])

### Position embedding

In order for the model to make sense of the sentence,we should not only know "what does the word mean" (by above embedding approach), but also need to know "what is the position of the word in the sentence"?

So, next we need to create position embedding, to include the word position information to the model.


In "attention is all you need paper" author used the following functions to create positional encoding. On odd time steps a cosine function is used and in even time steps a sine function is used.


<img src="./position_embedding.png" width="400" height="200">


Positinal embedding will generate a matrix of similar to embedding matrix. It will create a matrix of dimension sequence length x embedding dimension. For each token(word) in sequence, we will find the embedding vector which is of dimension 1 x 512 and it is added with the correspondng positional vector which is of dimension 1 x 512 to get 1 x 512 dim out for each word/token.

for eg: if we have batch size of 32 and seq length of 10 and let embedding dimension be 512. Then we will have embedding vector of dimension 2 x 12 x 512. Similarly we will have positional encoding vector of dimension 2 x 12 x 512. Then we add both.

<img src="./position_embedding2.png" width="400" height="200">

In [184]:

# If you have parameters in your model, which should be saved and restored in the state_dict,
# but not trained by the optimizer, you should register them as buffers.

class PositionalEmbedding(nn.Module):
    def __init__(self,max_seq_len,embed_model_dim):
        """
        Args:
            seq_len        : length of input sequence
            embed_model_dim: demension of embedding
        """
        super(PositionalEmbedding, self).__init__()
        self.embed_dim = embed_model_dim
        pe = torch.zeros(max_seq_len,self.embed_dim)
        for pos in range(max_seq_len):
            for i in range(0,self.embed_dim,2):
                ###YOUR CODE###
                pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
                pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim)))
                ###YOUR CODE###

        pe = pe.unsqueeze(0) 
    
        self.register_buffer('pe', pe)


    def forward(self, x):
        """
        Args:
            x: input vector
        Returns:
            x: output
        """
      
        # make embeddings relatively larger
        print("x",x.shape)
        x = x * math.sqrt(self.embed_dim)
        print("x after",x.shape)
        #add constant to embedding
        seq_len = x.size(1)
        x = x + torch.autograd.Variable(self.pe[:,:,:seq_len], requires_grad=False)
        return x

### Block 1:   (Multi-Head) Attention

To build self attention, the first step is to initial metrics WQ,WV,WK. Each of the dim is [embed_dim, head_dim].
Here, we’ve initialized three independent linear layers that apply matrix multiplication to the embedding vectors to produce tensors of shape [batch_size, seq_len, head_dim], where head_dim is the number of dimensions we are projecting into. Although head_dim does not have to be smaller than the number of embedding dimensions of the tokens (embed_dim), in practice it is chosen to be a multiple of embed_dim so that the computation across each head is constant.


<img src="./self_attention1.png" width="400" height="200">

Then Compute attention scores. We determine how much the query and key vectors relate to each other using a similarity function. As the name suggests, the similarity function for scaled dot-product attention is the dot product, computed efficiently using matrix multiplication of the embeddings. Queries and keys that are similar will have a large dot product, while those that don’t share much in common will have little to no overlap. The outputs from this step are called the attention scores, and for a sequence with n input tokens there is a corresponding n x n matrix of attention scores.

<img src="./self_attention2.png" width="400" height="200">

In [42]:
def scaled_dot_product_attention(query, key, value):
    dim_k = query.size(-1)
    # torch.bmm is batch matrix - matrix multiplication. 
    # Basically a dot product.
    ###YOUR CODE####
    scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k) 
    weights = F.softmax(scores, dim=-1)
    scaled_score = torch.bmm(weights, value)
    print(scaled_score.shape)
    ###YOUR CODE####
    return scaled_score

    
   

Above is the single attetion, now we need to impelment multi-head attetion:
    
<img src="./multi-head-attention.png" width="800" height="400">


<span style="color:red">#### Below, you only need to compute the attention scores  ####</span>

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim=512, n_heads=8):
        """
        Args:
            embed_dim: dimension of embeding vector output
            n_heads: number of self attention heads
        """
        super(MultiHeadAttention, self).__init__()

        self.embed_dim = embed_dim    #512 dim
        self.n_heads = n_heads        #8
        self.single_head_dim = int(self.embed_dim / self.n_heads)   #512/8 = 64  each key,query, value will be of 64d
       
        #key,query and value matrixes    #64 x 64   
        self.query_matrix = nn.Linear(self.single_head_dim , self.single_head_dim ,bias=False)  # single key matrix for all 8 keys #512x512
        self.key_matrix = nn.Linear(self.single_head_dim  , self.single_head_dim, bias=False)
        self.value_matrix = nn.Linear(self.single_head_dim ,self.single_head_dim , bias=False)
        self.out = nn.Linear(self.n_heads*self.single_head_dim ,self.embed_dim) 

    def forward(self,key,query,value,mask=None):    #batch_size x sequence_length x embedding_dim    # 2 x 12 x 512
        
        """
        Args:
           key : key vector
           query : query vector
           value : value vector
           mask: mask for decoder
        
        Returns:
           output vector from multihead attention
        """
        batch_size = key.size(0)
        seq_length = key.size(1)
        
        # query dimension can change in decoder during inference. 
        # so we cant take general seq_length
        seq_length_query = query.size(1)
        
        # 32x10x512
        key = key.view(batch_size, seq_length, self.n_heads, self.single_head_dim)  #batch_size x sequence_length x n_heads x single_head_dim = (2x12x8x64)
        query = query.view(batch_size, seq_length_query, self.n_heads, self.single_head_dim) #(2x12x8x64)
        value = value.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #(2x12x8x64)
       
        k = self.key_matrix(key)     # (32x10x8x64)
        q = self.query_matrix(query)   
        v = self.value_matrix(value)

        q = q.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim)    # (2 x 12 x 10 x 64)
        k = k.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim)
        v = v.transpose(1,2)  # (batch_size, n_heads, seq_len, single_head_dim)
       
    
        
        # computes attention
        # adjust key for matrix multiplication
        k_adjusted = k.transpose(-1,-2)  #(batch_size, n_heads, single_head_dim, seq_ken)  #(2 x 8 x 64 x 12)
       
        ###YOUR CODE#### 
        ##Your need to compute the attention score here
        product = torch.matmul(q, k_adjusted)  #(2 x 8 x 12 x 64) x (2 x 8 x 64 x 12) = #(2x8x10x12)
      
        # fill those positions of product matrix as (-1e20) where mask positions are 0
        if mask is not None:
             product = product.masked_fill(mask == 0, float("-1e20"))


        #divising by square root of key dimension
        product = product / math.sqrt(self.single_head_dim) # / sqrt(64)

        #applying softmax
        scores = F.softmax(product, dim=-1)
 
        #mutiply with value matrix
        scores = torch.matmul(scores, v)  ##(32x8x 10x 10) x (2 x 8 x 12 x 64) = (2 x 8 x 12 x 64) 

        ###YOUR CODE####
            
            
        #concatenated output
        concat = scores.transpose(1,2).contiguous().view(batch_size, seq_length_query, self.single_head_dim*self.n_heads)  # (2x8x12x64) -> (2x12x8x64)  -> (2,8,512)
        
        output = self.out(concat) #(2,12,512) -> (2,12,512)
       
        return output
    

    

### Encoder:

Now we need to establish the encoder part. There are four steps:

Step 1: First input(padded tokens corresponding to the sentence) get passes through embedding layer and positional encoding layer.

Step 2: As discussed above it will passed through the multihead attention layer and creates useful representational matrix as output.

Step 3: Next we have a normalization and residual connection. The output from multihead attention is added with its input and then normalized.

Step 4: Next we have a feed forward layer and a then normalization layer with residual connection from input(input of feed forward layer) where we passes the output after normalization though it and finally gets the output of encoder.


<img src="./encoder.png" width="300" height="150">

In [43]:

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
        super(TransformerBlock, self).__init__()
        
        """
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: fator ehich determines output dimension of linear layer
           n_heads: number of attention heads
        
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads)
        
        self.norm1 = nn.LayerNorm(embed_dim) 
        self.norm2 = nn.LayerNorm(embed_dim)
        
        self.feed_forward = nn.Sequential(
                          nn.Linear(embed_dim, expansion_factor*embed_dim),
                          nn.ReLU(),
                          nn.Linear(expansion_factor*embed_dim, embed_dim)
        )

        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)

    def forward(self,key,query,value):
        
        """
        Args:
           key: key vector
           query: query vector
           value: value vector
           norm2_out: output of transformer block
        
        """
        
        attention_out = self.attention(key,query,value)  #2x12x512
        attention_residual_out = attention_out + value  #2x12x512
        norm1_out = self.dropout1(self.norm1(attention_residual_out)) #2x12x512

        feed_fwd_out = self.feed_forward(norm1_out) #2x12x512 -> #2x12x2048 ->2x12x512
        feed_fwd_residual_out = feed_fwd_out + norm1_out #2x12x512
        norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) #2x12x512

        return norm2_out


class TransformerEncoder(nn.Module):
    """
    Args:
        seq_len : length of input sequence
        embed_dim: dimension of embedding
        num_layers: number of encoder layers
        expansion_factor: factor which determines number of linear layers in feed forward layer
        n_heads: number of heads in multihead attention
        
    Returns:
        out: output of the encoder
    """
    def __init__(self, seq_len, vocab_size, embed_dim, num_layers=2, expansion_factor=4, n_heads=8):
        super(TransformerEncoder, self).__init__()
        ###YOUR CODE####
        self.embedding_layer = Embedding(vocab_size, embed_dim)
        self.positional_encoder = PositionalEmbedding(seq_len, embed_dim)
        ###YOUR CODE####
        self.layers = nn.ModuleList([TransformerBlock(embed_dim, expansion_factor, n_heads) for i in range(num_layers)])
    
    def forward(self, x):
        embed_out = self.embedding_layer(x)
        out = self.positional_encoder(embed_out)
        for layer in self.layers:
            out = layer(out,out,out)
        return out  #32x10x512

###   Decoder 


Now, let's build the decoder of transformer. We will use the output of encoder to generate key and value vectors for the decoder.There are two kinds of multi head attention in the decoder.One is the decoder attention and other is the encoder decoder attention. 


<img src="./decoder.png" width="200" height="80">



In [15]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
        super(DecoderBlock, self).__init__()

        """
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: fator ehich determines output dimension of linear layer
           n_heads: number of attention heads
        
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads=8)
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(0.2)
        self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)
        
    
    def forward(self, key, query, x,mask):
        
        """
        Args:
           key: key vector
           query: query vector
           value: value vector
           mask: mask to be given for multi head attention 
        Returns:
           out: output of transformer block
    
        """
        
        #we need to pass mask mask only to fst attention
        ##YOUR CODE###
        attention = self.attention(x,x,x,mask=mask) #2x12x512
        value = self.dropout(self.norm(attention + x))
        out = self.transformer_block(key, query, value)
        ##YOUR CODE###
        
        return out

In [44]:
class TransformerDecoder(nn.Module):
    def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers=2, expansion_factor=4, n_heads=8):
        super(TransformerDecoder, self).__init__()
        """  
        Args:
           target_vocab_size: vocabulary size of taget
           embed_dim: dimension of embedding
           seq_len : length of input sequence
           num_layers: number of encoder layers
           expansion_factor: factor which determines number of linear layers in feed forward layer
           n_heads: number of heads in multihead attention
        
        """
        self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
        self.position_embedding = PositionalEmbedding(seq_len, embed_dim)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(embed_dim, expansion_factor=4, n_heads=8) 
                for _ in range(num_layers)
            ]

        )
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)
        self.dropout = nn.Dropout(0.2)
        
        
    def forward(self, x, enc_out, mask):
        
        """
        Args:
            x: input vector from target
            enc_out : output from encoder layer
            trg_mask: mask for decoder self attention
        Returns:
            out: output vector
        """
            
        
        x = self.word_embedding(x)  #2x12x512
        x = self.position_embedding(x) #2x12x512
        x = self.dropout(x)
     
        for layer in self.layers:
            x = layer(enc_out, x, enc_out, mask) 

        out = F.softmax(self.fc_out(x))

        return out

### Wrap it up, to buid the entire Transformer architecture

In [186]:
class Transformer(nn.Module):
    def __init__(self, embed_dim, src_vocab_size, target_vocab_size, seq_length,num_layers=2, expansion_factor=4, n_heads=8):
        super(Transformer, self).__init__()
        
        """  
        Args:
           embed_dim:  dimension of embedding 
           src_vocab_size: vocabulary size of source
           target_vocab_size: vocabulary size of target
           seq_length : length of input sequence
           num_layers: number of encoder layers
           expansion_factor: factor which determines number of linear layers in feed forward layer
           n_heads: number of heads in multihead attention
        
        """
        
        self.target_vocab_size = target_vocab_size

        self.encoder = TransformerEncoder(seq_length, src_vocab_size, embed_dim, num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads)
        self.decoder = TransformerDecoder(target_vocab_size, embed_dim, seq_length, num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads)
        
        
    def make_trg_mask(self, trg):
        """
        Args:
            trg: target sequence
        Returns:
            trg_mask: target mask
        """
        batch_size, trg_len = trg.shape
        # returns the lower triangular part of matrix filled with ones
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            batch_size, 1, trg_len, trg_len
        )
        return trg_mask    

    def decode(self,src,trg):
        """
        for inference
        Args:
            src: input to encoder 
            trg: input to decoder
        out:
            out_labels : returns final prediction of sequence
        """
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src)
        out_labels = []
        batch_size,seq_len = src.shape[0],src.shape[1]
        #outputs = torch.zeros(seq_len, batch_size, self.target_vocab_size)
        out = trg
        for i in range(seq_len): #12
            out = self.decoder(out,enc_out,trg_mask) #bs x seq_len x vocab_dim
            # taking the last token
            out = out[:,-1,:]
     
            out = out.argmax(-1)
            out_labels.append(out.item())
            out = torch.unsqueeze(out,axis=0)
          
        
        return out_labels

    def forward(self, src, trg):
        """
        Args:
            src: input to encoder 
            trg: input to decoder
        out:
            out: final vector which returns probabilities of each target word
        """
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src)
   
        outputs = self.decoder(trg, enc_out, trg_mask)
        return outputs

### Create some samples 


In [101]:
src_vocab_size = 11
target_vocab_size = 11
num_layers = 6
seq_length= 12


# let 0 be sos token and 1 be eos token
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1], 
                    [0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
target = torch.tensor([[0, 1, 7, 4, 3, 5, 9, 2, 8, 10, 9, 1], 
                       [0, 1, 5, 6, 2, 4, 7, 6, 2, 8, 10, 1]])

print(src.shape,target.shape)
model = Transformer(embed_dim=512, src_vocab_size=src_vocab_size, 
                    target_vocab_size=target_vocab_size, seq_length=seq_length,
                    num_layers=num_layers, expansion_factor=4, n_heads=8)
model


torch.Size([2, 12]) torch.Size([2, 12])


Transformer(
  (encoder): TransformerEncoder(
    (embedding_layer): Embedding(
      (embed): Embedding(11, 512)
    )
    (positional_encoder): PositionalEmbedding()
    (layers): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadAttention(
          (query_matrix): Linear(in_features=64, out_features=64, bias=False)
          (key_matrix): Linear(in_features=64, out_features=64, bias=False)
          (value_matrix): Linear(in_features=64, out_features=64, bias=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropou

In [104]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

model.train()

for epoch in range(100):
    optimizer.zero_grad()
    output = model(src, target)
    loss = criterion(output.contiguous().view(-1, target_vocab_size), target.contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")

  out = F.softmax(self.fc_out(x))


Epoch: 1, Loss: 2.4025018215179443
Epoch: 2, Loss: 2.388179063796997
Epoch: 3, Loss: 2.415064811706543
Epoch: 4, Loss: 2.4056813716888428
Epoch: 5, Loss: 2.400160789489746
Epoch: 6, Loss: 2.3532369136810303
Epoch: 7, Loss: 2.3556859493255615
Epoch: 8, Loss: 2.379387855529785
Epoch: 9, Loss: 2.350984573364258
Epoch: 10, Loss: 2.338085174560547
Epoch: 11, Loss: 2.378542423248291
Epoch: 12, Loss: 2.3276898860931396
Epoch: 13, Loss: 2.337876081466675
Epoch: 14, Loss: 2.3059282302856445
Epoch: 15, Loss: 2.288414239883423
Epoch: 16, Loss: 2.2943315505981445
Epoch: 17, Loss: 2.3021605014801025
Epoch: 18, Loss: 2.352037191390991
Epoch: 19, Loss: 2.2697043418884277
Epoch: 20, Loss: 2.2466962337493896
Epoch: 21, Loss: 2.1889684200286865
Epoch: 22, Loss: 2.25447678565979
Epoch: 23, Loss: 2.218231439590454
Epoch: 24, Loss: 2.278759241104126
Epoch: 25, Loss: 2.2357537746429443
Epoch: 26, Loss: 2.241239309310913
Epoch: 27, Loss: 2.220705509185791
Epoch: 28, Loss: 2.206367254257202
Epoch: 29, Loss: 2

KeyboardInterrupt: 

#### References

In [185]:
#https://www.kaggle.com/code/arunmohan003/transformer-from-scratch-using-pytorch