In [1]:
import torch.nn.functional as f
import torch
from math import sqrt
from torch import nn
import numpy as np

# Self Attention Implementation

In [2]:
# Toy input: three tokens with 4-dimensional features; unsqueeze adds the
# leading batch axis so the final shape is (1, 3, 4).
x=torch.tensor(
    [
        [1,0,1,0],
        [0,2,2,2],
        [1,1,1,1],
    ],
    dtype=torch.float32,
).unsqueeze(0)

# query_w=torch.tensor([
#     [0,0,1],
#     [1,1,0],
#     [0,1,0],
#     [1,1,0]
# ],dtype=torch.float32)

# key_w=torch.tensor([
#     [1,0,1],
#     [1,0,0],
#     [0,1,0],
#     [1,0,1]
# ],dtype=torch.float32)

# value_w=torch.tensor([
#     [1,0,1],
#     [1,1,0],
#     [0,1,1],
#     [0,0,1]
# ],dtype=torch.float32)



In [3]:
def scaled_dot_product(query,key,value):
    """Compute scaled dot-product attention for batched 3-D tensors.

    Args:
        query: tensor of shape (batch, seq_q, dim_k).
        key: tensor of shape (batch, seq_k, dim_k).
        value: tensor of shape (batch, seq_k, dim_v).

    Returns:
        Tensor of shape (batch, seq_q, dim_v): for every query position, the
        softmax-weighted average of the value vectors.
    """
    dim_k=query.size(-1)
    # Scale the logits by sqrt(dim_k) so their magnitude does not grow with the
    # head size and push the softmax into saturation.
    scores=torch.bmm(query,key.transpose(1,2))/sqrt(dim_k)
    # Normalise over the key axis so each query's weights sum to 1.
    weights=f.softmax(scores,dim=-1)
    return torch.bmm(weights,value)

In [4]:
class AttentionHead(nn.Module):
    """A single self-attention head.

    The input is projected by three independent linear layers (query, key and
    value) and the projections are combined by ``scaled_dot_product``.
    """

    def __init__(self,embed_dim,head_dim):
        """Create the projections.

        Args:
            embed_dim: size of the last dimension of the input.
            head_dim: size of the last dimension of the head's output.
        """
        super().__init__()
        self.embed_dim, self.head_dim = embed_dim, head_dim
        # Created in query, key, value order; each uses nn.Linear's default
        # (random) initialisation.
        self.to_query = nn.Linear(embed_dim, head_dim)
        self.to_key = nn.Linear(embed_dim, head_dim)
        self.to_value = nn.Linear(embed_dim, head_dim)

    def forward(self,inputs):
        """Run self-attention over ``inputs`` and return the attended values."""
        queries = self.to_query(inputs)
        keys = self.to_key(inputs)
        values = self.to_value(inputs)
        return scaled_dot_product(queries, keys, values)

In [5]:
# One head mapping the toy 4-d inputs to 3-d outputs.
attention=AttentionHead(embed_dim=4,head_dim=3)

In [6]:
# Run the toy batch through the head; the layers are randomly initialised, so the values differ between runs.
attention(x)

tensor([[[-0.4145,  0.2643, -0.8602],
         [-0.4193,  0.2986, -0.8482],
         [-0.4222,  0.2882, -0.8487]]], grad_fn=<BmmBackward0>)

# Multi-Head Attention

In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent ``AttentionHead`` modules.

    Each head maps ``embed_dim`` features to ``embed_dim // num_heads``
    features. The head outputs are concatenated along the last dimension and
    passed through a final ``embed_dim -> embed_dim`` linear layer.
    """

    def __init__(self,embed_dim,num_heads):
        """Build the heads and the output projection.

        Args:
            embed_dim: size of the last dimension of the input and the output.
            num_heads: number of attention heads; must divide ``embed_dim``.

        Raises:
            ValueError: if ``num_heads`` is not positive or does not divide
                ``embed_dim``. The concatenated heads would then not have
                ``embed_dim`` features and ``output_layer`` would fail later
                with a shape mismatch.
        """
        super().__init__()
        if num_heads<=0 or embed_dim%num_heads!=0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.embed_dim=embed_dim
        # Misspelled attribute name kept as an alias so existing code that reads it keeps working.
        self.emded_dim=embed_dim
        self.num_heads=num_heads
        self.head_dim=embed_dim//num_heads
        self.heads=nn.ModuleList([AttentionHead(embed_dim,self.head_dim) for _ in range(num_heads)])
        self.output_layer=nn.Linear(embed_dim,embed_dim)

    def forward(self,inputs):
        """Apply every head to ``inputs``, concatenate the results and project them."""
        x=torch.cat([calc_head(inputs) for calc_head in self.heads],dim=-1)
        x=self.output_layer(x)
        return x

In [8]:
# Smallest configuration: a single head spanning the full 4-d embedding.
multi_head=MultiHeadAttention(embed_dim=4,num_heads=1)

In [9]:
# The output keeps the input's last dimension (4) because of the final output projection.
multi_head(x)

tensor([[[-0.4922,  0.2438, -0.3506, -0.6668],
         [-0.4585,  0.2299, -0.3184, -0.6346],
         [-0.4613,  0.2292, -0.3187, -0.6343]]], grad_fn=<AddBackward0>)

# Trying Multi-Head Attention with the BERT Configuration

In [10]:
# Token embedding table sized like BERT: 30522 vocabulary entries, 768-d vectors.
token_emb=nn.Embedding(num_embeddings=30522,embedding_dim=768)

In [11]:
from transformers import AutoTokenizer

In [12]:
# Load the tokenizer that belongs to the bert-base-uncased checkpoint.
tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")

In [13]:
# add_special_tokens=False: encode only the five words, without the special tokens the tokenizer would otherwise add.
inputs=tokenizer("time flies like an arrow",return_tensors='pt',add_special_tokens=False)

In [14]:
# Inspect the tokenizer output: token ids, token type ids and attention mask.
inputs

{'input_ids': tensor([[ 2051, 10029,  2066,  2019,  8612]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [15]:
# input_ids are integer token indices (a sparse representation); the model needs dense embedding vectors instead.

In [16]:
# Look up the dense 768-d embedding vector for every token id.
inputs_embeds=token_emb(inputs.input_ids)

In [17]:
# Expect (batch, seq_len, embed_dim).
inputs_embeds.shape

torch.Size([1, 5, 768])

In [18]:
# BERT-base sizes: 768-d embeddings split across 12 heads (64 features per head).
multi_head=MultiHeadAttention(embed_dim=768,num_heads=12)

In [19]:
# Apply the BERT-sized multi-head attention to the token embeddings.
result=multi_head(inputs_embeds)

In [20]:
# Multi-head attention returns the same (batch, seq_len, embed_dim) shape as its input.
result.shape

torch.Size([1, 5, 768])

# FeedForward Layer

In [21]:
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout."""

    def __init__(self,embed_size,ff_size,dropout_prob):
        """Create the two linear layers, the activation and the dropout.

        Args:
            embed_size: size of the last dimension of the input and the output.
            ff_size: width of the hidden layer between the two linear layers.
            dropout_prob: dropout probability applied to the block's output.
        """
        super().__init__()
        # Expand to the hidden width, then project back to the embedding width.
        self.linear_1=nn.Linear(in_features=embed_size,out_features=ff_size)
        self.linear_2=nn.Linear(in_features=ff_size,out_features=embed_size)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(p=dropout_prob)

    def forward(self,inputs):
        """Transform ``inputs``; the last dimension stays ``embed_size``."""
        hidden=self.relu(self.linear_1(inputs))
        return self.dropout(self.linear_2(hidden))
        

In [22]:
# Feed-forward block for 768-d embeddings with a 200-unit hidden layer and 20% dropout.
ff_layer=FeedForward(embed_size=768,ff_size=200,dropout_prob=0.2)

In [23]:
# Apply the feed-forward block to every token embedding.
ff_outputs=ff_layer(inputs_embeds)

# Encoder Layer

In [24]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation (normalisation is applied after the residual sum).
    """

    def __init__(self,embed_dim,num_heads,ff_size,dropout_prob=0.2):
        """Build the sub-layers.

        Args:
            embed_dim: size of the last dimension of the input and the output.
            num_heads: number of attention heads.
            ff_size: hidden width of the feed-forward sub-layer.
            dropout_prob: dropout probability used by the feed-forward
                sub-layer. Defaults to 0.2, the value that was previously
                hard-coded.
        """
        super().__init__()
        self.multiheadAttention=MultiHeadAttention(embed_dim,num_heads)
        self.feedForward=FeedForward(embed_dim,ff_size,dropout_prob=dropout_prob)
        self.norm_layer1=nn.LayerNorm(embed_dim)
        self.norm_layer2=nn.LayerNorm(embed_dim)

    def forward(self,inputs):
        """Return the transformed sequence; the shape is the same as ``inputs``."""
        # Sub-layer 1: self-attention + residual, then normalise.
        attention_output=self.multiheadAttention(inputs)
        x=self.norm_layer1(attention_output+inputs)
        # Sub-layer 2: feed-forward + residual, then normalise.
        feed_output=self.feedForward(x)
        x=self.norm_layer2(feed_output+x)
        return x

In [25]:
# A single encoder block with BERT-sized attention and a small 100-unit feed-forward layer.
encoder_layer=EncoderLayer(embed_dim=768,num_heads=12,ff_size=100)

In [26]:
# Run the token embeddings through the encoder block.
outputs=encoder_layer(inputs_embeds)

In [27]:
# The encoder block preserves the (batch, seq_len, embed_dim) shape.
outputs.shape

torch.Size([1, 5, 768])

# Positional Embedding

In [28]:
class Embeddings(nn.Module):
    """Token embeddings plus learned position embeddings.

    The two embeddings are summed, layer-normalised and passed through dropout.
    """

    def __init__(self,embed_size,vocab_size,max_position_embedding,dropout_prob=0.5):
        """Create the embedding tables, layer norm and dropout.

        Args:
            embed_size: dimensionality of each embedding vector.
            vocab_size: number of rows in the token embedding table.
            max_position_embedding: longest sequence length that can be embedded.
            dropout_prob: dropout probability; defaults to 0.5, which is
                ``nn.Dropout``'s own default and the value used previously.
        """
        super().__init__()
        self.token_embeddings=nn.Embedding(vocab_size,embed_size)
        self.position_embedding=nn.Embedding(max_position_embedding,embed_size)
        self.layer_norm=nn.LayerNorm(embed_size)
        self.dropout=nn.Dropout(dropout_prob)

    def forward(self,inputs):
        """Embed a batch of token ids.

        Args:
            inputs: integer tensor of token ids with shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, embed_size).

        Raises:
            IndexError: if ``seq_len`` exceeds the size of the position table.
        """
        seq_length=inputs.size(1)
        max_positions=self.position_embedding.num_embeddings
        if seq_length>max_positions:
            raise IndexError(
                f"sequence length {seq_length} exceeds max_position_embedding {max_positions}"
            )
        # Positions 0..seq_len-1, created on the same device as the inputs so the
        # lookup also works when the model is not on the CPU. The leading axis of
        # size 1 broadcasts over the batch when added to the token embeddings.
        positions_id=torch.arange(seq_length,dtype=torch.long,device=inputs.device).unsqueeze(0)
        token_embeddings=self.token_embeddings(inputs)
        position_embedding=self.position_embedding(positions_id)
        embeddings=token_embeddings+position_embedding
        embeddings=self.layer_norm(embeddings)
        embeddings=self.dropout(embeddings)
        return embeddings

In [29]:
# BERT-sized embedding layer: 768-d vectors, 30522-token vocabulary, up to 512 positions.
emb=Embeddings(embed_size=768,vocab_size=30522,max_position_embedding=512)

In [30]:
# Token ids of shape (batch, seq_len) become embeddings of shape (batch, seq_len, embed_size).
emb(inputs.input_ids).shape

torch.Size([1, 5, 768])

# Complete Encoder

In [31]:
class TransformerEncoder(nn.Module):
    """Embedding layer followed by a stack of ``EncoderLayer`` blocks."""

    def __init__(self,embed_size,vocab_size,max_position_embedding,num_heads,ff_size,num_of_encoder_layers):
        """Build the embeddings and ``num_of_encoder_layers`` encoder blocks.

        Args:
            embed_size: embedding / hidden size used throughout the encoder.
            vocab_size: number of entries in the token embedding table.
            max_position_embedding: number of entries in the position table.
            num_heads: attention heads per encoder block.
            ff_size: hidden width of each block's feed-forward sub-layer.
            num_of_encoder_layers: how many encoder blocks to stack.
        """
        super().__init__()
        self.embeddings=Embeddings(embed_size,vocab_size,max_position_embedding)
        encoder_stack=[]
        for _ in range(num_of_encoder_layers):
            encoder_stack.append(EncoderLayer(embed_size,num_heads,ff_size))
        self.layers=nn.ModuleList(encoder_stack)

    def forward(self,inputs):
        """Embed ``inputs`` and pass the result through every block in order."""
        hidden=self.embeddings(inputs)
        for block in self.layers:
            hidden=block(hidden)
        return hidden

In [32]:
# vocab_size=30522 matches the bert-base-uncased vocabulary (same value as token_emb and emb).
encoder=TransformerEncoder(embed_size=768,vocab_size=30522,max_position_embedding=512,num_heads=12,ff_size=100,num_of_encoder_layers=10)

In [33]:
# Encode the token ids; the encoder applies its own embedding layer first.
encoder_output=encoder(inputs.input_ids)

In [34]:
# Final hidden states from the encoder stack, one vector per token.
encoder_output

tensor([[[-0.0795,  0.1168,  0.0550,  ...,  1.1022, -0.9078,  2.1494],
         [ 1.2178,  0.0405, -1.2665,  ..., -1.4237, -0.5697,  0.2349],
         [ 2.0484,  0.1901,  1.0917,  ..., -0.4768, -0.6110,  0.7941],
         [ 0.1756,  0.2771, -1.6968,  ..., -0.5076, -1.0719,  0.7918],
         [ 0.2331,  0.3498, -1.3248,  ..., -0.8824, -2.1113, -0.5023]]],
       grad_fn=<NativeLayerNormBackward0>)

# Adding Classification Head 

In [35]:
class TransformerForClassification(nn.Module):
    """``TransformerEncoder`` with a linear classification head on the first token."""

    def __init__(self,embed_size,vocab_size,max_position_embedding,num_heads,ff_size,num_of_encoder_layers,num_labels):
        """Build the encoder, dropout and classifier.

        The first six arguments are forwarded unchanged to ``TransformerEncoder``;
        ``num_labels`` is the number of output classes.
        """
        super().__init__()
        self.encoder=TransformerEncoder(
            embed_size,
            vocab_size,
            max_position_embedding,
            num_heads,
            ff_size,
            num_of_encoder_layers,
        )
        self.dropout=nn.Dropout()
        self.classifier=nn.Linear(embed_size,num_labels)

    def forward(self,inputs):
        """Return class probabilities of shape (batch, num_labels).

        The output is already softmax-normalised, so it should not be fed to a
        loss that expects raw logits.
        """
        hidden_states=self.encoder(inputs)
        # Classify from the hidden state at position 0. That is the [CLS] token
        # only if the inputs were tokenised with special tokens added.
        first_token=hidden_states[:,0,:]
        logits=self.classifier(self.dropout(first_token))
        return f.softmax(logits,dim=-1)

In [36]:
# vocab_size=30522 matches the bert-base-uncased vocabulary used by the tokenizer; 3 output classes.
classifier=TransformerForClassification(embed_size=768,vocab_size=30522,max_position_embedding=512,num_heads=12,ff_size=100,num_of_encoder_layers=10,num_labels=3)

In [37]:
# Re-tokenise with special tokens enabled (the tokenizer's default) so that
# position 0, which the classification head reads, holds the [CLS] token.
# AutoTokenizer and `tokenizer` are reused from the earlier cells instead of
# being imported and loaded a second time.
inputs=tokenizer("time flies like an arrow",return_tensors='pt')

In [38]:
# Class probabilities for the sentence; the model is untrained, so the values are arbitrary.
classifier(inputs.input_ids)

tensor([[0.4330, 0.3706, 0.1964]], grad_fn=<SoftmaxBackward0>)

# Decoder

In [39]:
# The decoder works the same way as the encoder, except that its self-attention is masked so each position can only attend to earlier positions.