In [219]:
import math
import torch.nn as nn
import torch

In [220]:
d_model = 512
num_heads = 8
drop_prob =  0.1
batch_size = 30
max_seq_len = 200
ffn_hidden = 2048
num_layers = 5


In [122]:
%%writefile MultiHeadAttention.py
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, max_seqlen:int, d_model:int, attention_heads:int):
        super().__init__()
        self.max_seq = max_seqlen
        self.d_model = d_model
        self.ah = attention_heads
        
        self.qkv = nn.Linear(d_model, d_model*3)
        
    @staticmethod
    def scaled_dot_product(q,k,v,mask=None):
        
        d_k = torch.tensor(k.shape[-1], dtype=torch.float)
        attention = (q @ k.transpose(-2,-1)) / torch.sqrt(d_k)        
        if mask is not None:
            attention+=mask
        attention = torch.softmax(attention, dim =-1)
        print(attention)
        scaled = attention @ v
        return scaled
        
    def forward(self,x, mask=None):
        print(mask)
        qkv = self.qkv(x)
        ad = self.d_model // self.ah
        qkv = qkv.reshape(x.shape[0],x.shape[1], self.ah, ad*3 )
        qkv = qkv.permute(0,2,1,3)
        q,k,v = qkv.chunk(chunks=3, dim=3)
        values = MultiHeadAttention.scaled_dot_product(q,k,v, mask =mask)
        print(values)
        batch, head, seq, dim = values.size()
        values = values.reshape(batch, seq, dim*head)
        
        return values 


Overwriting MultiHeadAttention.py


In [123]:
%%writefile transformer_heads/Mask.py
import torch.nn as nn
import torch
class Mask(nn.Module):
    def __init__(self, batch_size:int, attention_head:int, max_seq_len:int):
        super().__init__()
        self.batch = batch_size
        self.ah = attention_head
        self.maxseq = max_seq_len
    
    def forward(self):
        z = torch.zeros((self.batch,self.ah,self.maxseq,self.maxseq))
        z = torch.fill(z,value=torch.tensor(float('-inf')))
        z = torch.triu(z,diagonal=1)
        return z

Overwriting transformer_heads/Mask.py


In [120]:
input = torch.rand((30,10,512))

mask_ = Mask(batch_size=30, attention_head=8,max_seq_len=10)
mask = mask_()

model = MultiHeadAttention(max_seqlen=10, d_model=512, attention_heads=8)
y= model(input, mask)
y.shape

tensor([[[[0., -inf, -inf,  ..., -inf, -inf, -inf],
          [0., 0., -inf,  ..., -inf, -inf, -inf],
          [0., 0., 0.,  ..., -inf, -inf, -inf],
          ...,
          [0., 0., 0.,  ..., 0., -inf, -inf],
          [0., 0., 0.,  ..., 0., 0., -inf],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., -inf, -inf,  ..., -inf, -inf, -inf],
          [0., 0., -inf,  ..., -inf, -inf, -inf],
          [0., 0., 0.,  ..., -inf, -inf, -inf],
          ...,
          [0., 0., 0.,  ..., 0., -inf, -inf],
          [0., 0., 0.,  ..., 0., 0., -inf],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         [[0., -inf, -inf,  ..., -inf, -inf, -inf],
          [0., 0., -inf,  ..., -inf, -inf, -inf],
          [0., 0., 0.,  ..., -inf, -inf, -inf],
          ...,
          [0., 0., 0.,  ..., 0., -inf, -inf],
          [0., 0., 0.,  ..., 0., 0., -inf],
          [0., 0., 0.,  ..., 0., 0., 0.]],

         ...,

         [[0., -inf, -inf,  ..., -inf, -inf, -inf],
          [0., 0., -inf,  ..., -inf,

torch.Size([30, 10, 512])

In [111]:
input = torch.tensor([[[1,2,3],
                       [4,5,6]],
                      
                      [[7,8,9],
                       [10,11,12]]], dtype=torch.float)

model = MultiHeadAttention(max_seqlen=2,d_model=3, attention_heads=1)

y= model(input, mask)
print(f"shape of output{y.shape}")
print(f"shape of mask {mask.shape}")


tensor([[[[0., -inf],
          [0., 0.]]],


        [[[0., -inf],
          [0., 0.]]]])
shape of q = torch.Size([2, 1, 2, 3]), shape of k= torch.Size([2, 1, 2, 3])
attention shape : torch.Size([2, 1, 2, 2])
tensor([[[[ 0.7554,    -inf],
          [ 0.7076,  4.6328]]],


        [[[11.0638,    -inf],
          [13.5695, 20.0483]]]], grad_fn=<AddBackward0>)
tensor([[[[1.0000, 0.0000],
          [0.0194, 0.9806]]],


        [[[1.0000, 0.0000],
          [0.0015, 0.9985]]]], grad_fn=<SoftmaxBackward0>)
tensor([[[[-0.7736, -0.8567,  0.7082],
          [-1.0681, -1.3890,  1.2756]]],


        [[[-1.3741, -1.9422,  1.8655],
          [-1.6740, -2.4842,  2.4432]]]], grad_fn=<UnsafeViewBackward0>)
shape of outputtorch.Size([2, 2, 3])
shape of mask torch.Size([2, 1, 2, 2])


In [31]:
%%writefile LayerNormalization.py
import torch
import torch.nn as nn

class LayerNormalization(nn.Module):
    def __init__(self,d_model:int, dimension:list, eps:float = 1e-5):
        super().__init__()
        self.eps = eps
        self.dimension = dimension
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
    def forward(self, x):
        mean = torch.mean(x, dim = self.dimension, keepdim=True)
        var = torch.var(x, dim = self.dimension, keepdim=True)
        layer_norm = (x - mean) / torch.sqrt(var + self.eps)
        norm = self.gamma * layer_norm + self.bias
        return norm
    

Writing LayerNormalization.py


In [62]:
%%writefile PositionwiseFeedForward.py
import torch
import torch.nn as nn

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model:int, hidden_unit:int, drop_prob:float):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, hidden_unit),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_unit, d_model),
        )
    def forward(self,x):
        return self.fc(x)
        

Overwriting PositionwiseFeedForward.py


In [67]:
%%writefile PositionEncoding.py
import torch.nn as nn
import torch

class PositionEncoding(nn.Module):
    def __init__(self, max_seq_len:int, d_model:int):
        super().__init__()
        self.maxseq = max_seq_len
        self.d_model = d_model
        
    def forward(self, x):
        batch, seq, dim = x.size()
        pos = torch.arange(0, self.maxseq).reshape(self.maxseq, 1)
        print(f"pos = {pos}")
        even_denom = torch.pow(10000, torch.arange(0,self.d_model,2)/self.d_model )
        print(f"even deno = {even_denom}")
        odd_denom = torch.pow(10000, torch.arange(1,self.d_model,2)/self.d_model )
        print(f"odd denom = {odd_denom}")
        print(f"pos/even denom = {pos/even_denom}")
        print(f"pos/odd denom = {pos/odd_denom}")
        print(f"{(pos/even_denom).unsqueeze(1)}")
        z = x
        z[...,0::2] = (pos/even_denom).unsqueeze(1)
        z[...,1::2] = (pos/odd_denom).unsqueeze(1)
        return z
    
input= torch.tensor([[[1,2,3],
                       [4,5,6]],
                      
                      [[7,8,9],
                       [10,11,12]]], dtype=torch.float)
model = PositionEncoding(2,3)
y = model(input)
y

Overwriting PositionEncoding.py


In [10]:
%%writefile transformer_heads/MultiHeadCrossAttention.py
import torch
import torch.nn as nn

class MultiHeadAttentionCrossAttention(nn.Module):
    def __init__(self, max_seqlen:int, d_model:int, attention_heads:int):
        super().__init__()
        self.max_seq = max_seqlen
        self.d_model = d_model
        self.ah = attention_heads

        self.kv = nn.Linear(d_model, d_model*2)
        self.q = nn.Linear(d_model, d_model)

    @staticmethod
    def scaled_dot_product(q,k,v,mask=None):

        d_k = torch.tensor(k.shape[-1], dtype=torch.float)
        attention = (q @ k.transpose(-2,-1)) / torch.sqrt(d_k)
        if mask is not None:
            attention+=mask
        attention = torch.softmax(attention, dim =-1)
        scaled = attention @ v
        return scaled

    def forward(self,x,y, mask=None):
        batch_size, seq_len, d_model = x.size() # 30x200x512
        
        kv = self.kv(x)
        q = self.q(y)
        print(f"q.shape= {q.shape}")
        ad = self.d_model // self.ah
        kv = kv.reshape(batch_size,seq_len, self.ah, ad*2 ) # 30x8x200x128
        kv = kv.permute(0,2,1,3)
        k,v = kv.chunk(chunks=2, dim=3)
        print(f"k= {k.shape}, v={v.shape}")
        q = q.reshape(y.shape[0], y.shape[1], self.ah, ad ) # 30x8x200x64
        q = q.permute(0,2,1,3)
        
        values = MultiHeadAttentionCrossAttention.scaled_dot_product(q,k,v, mask=mask)
        batch, head, seq, dim = values.size()
        values = values.reshape(batch, seq, dim*head)

        return values 


Writing transformer_heads/MultiHeadCrossAttention.py


q.shape= torch.Size([2, 2, 3])
k= torch.Size([2, 1, 2, 3]), v=torch.Size([2, 1, 2, 3])
torch.Size([2, 2, 3])
