In [219]:
import math
import torch.nn as nn
import torch

In [220]:
d_model = 512
num_heads = 8
drop_prob =  0.1
batch_size = 30
max_seq_len = 200
ffn_hidden = 2048
num_layers = 5


In [65]:
%%writefile MultiHeadAttention.py
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, max_seqlen:int, d_model:int, attention_heads:int):
        super().__init__()
        self.max_seq = max_seqlen
        self.d_model = d_model
        self.ah = attention_heads
        
        self.qkv = nn.Linear(d_model, d_model*3)
        
    @staticmethod
    def scaled_dot_product(q,k,v,mask=None):
        
        d_k = torch.tensor(k.shape[-1], dtype=torch.float)
        attention = (q @ k.transpose(-2,-1)) / torch.sqrt(d_k)
        if mask is not None:
            attention+=mask
        attention = torch.softmax(attention, dim =-1)
        scaled = attention @ v
        return scaled, attention
        
    def forward(self,x):
        qkv = self.qkv(x)
        ad = self.d_model // self.ah
        qkv = qkv.reshape(x.shape[0],x.shape[1], self.ah, ad*3 )
        qkv = qkv.permute(0,2,1,3)
        q,k,v = qkv.chunk(chunks=3, dim=3)
        values, attention = MultiHeadAttention.scaled_dot_product(q,k,v)
        batch, head, seq, dim = values.size()
        values = values.reshape(batch, seq, dim*head)
        
        return values 


Overwriting MultiHeadAttention.py


In [63]:
%%writefile LayerNormalization.py
import torch
import torch.nn as nn

class LayerNormalization(nn.Module):
    def __init__(self,d_model:int, dimension:list, eps:float = 1e-5):
        super().__init__()
        self.eps = eps
        self.dimension = dimension
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.bias = nn.Parameter(torch.zeros(d_model))
    def forward(self, x):
        mean = torch.mean(x, dim = self.dimension, keepdim=True)
        var = torch.var(x, dim = self.dimension, keepdim=True)
        layer_norm = (x - mean) / torch.sqrt(var + self.eps)
        norm = self.gamma * layer_norm + self.bias
        return norm
    

Overwriting LayerNormalization.py


In [62]:
%%writefile PositionwiseFeedForward.py
import torch
import torch.nn as nn

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model:int, hidden_unit:int, drop_prob:float):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model, hidden_unit),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_unit, d_model),
        )
    def forward(self,x):
        return self.fc(x)
        

Overwriting PositionwiseFeedForward.py


In [67]:
%%writefile PositionEncoding.py
import torch.nn as nn
import torch

class PositionEncoding(nn.Module):
    def __init__(self, max_seq_len:int, d_model:int):
        super().__init__()
        self.maxseq = max_seq_len
        self.d_model = d_model
        
    def forward(self, x):
        batch, seq, dim = x.size()
        pos = torch.arange(0, self.maxseq).reshape(self.maxseq, 1)
        print(f"pos = {pos}")
        even_denom = torch.pow(10000, torch.arange(0,self.d_model,2)/self.d_model )
        print(f"even deno = {even_denom}")
        odd_denom = torch.pow(10000, torch.arange(1,self.d_model,2)/self.d_model )
        print(f"odd denom = {odd_denom}")
        print(f"pos/even denom = {pos/even_denom}")
        print(f"pos/odd denom = {pos/odd_denom}")
        print(f"{(pos/even_denom).unsqueeze(1)}")
        z = x
        z[...,0::2] = (pos/even_denom).unsqueeze(1)
        z[...,1::2] = (pos/odd_denom).unsqueeze(1)
        return z
    
input= torch.tensor([[[1,2,3],
                       [4,5,6]],
                      
                      [[7,8,9],
                       [10,11,12]]], dtype=torch.float)
model = PositionEncoding(2,3)
y = model(input)
y

Overwriting PositionEncoding.py


tensor([[[ 1.,  2.,  3.],
         [ 4.,  5.,  6.],
         [ 7.,  8.,  9.],
         [10., 11., 12.]]])
torch.Size([1, 4, 3])
pos = tensor([[0],
        [1],
        [2],
        [3]])
even deno = tensor([  1.0000, 464.1590])
odd denom = tensor([21.5443])
pos/even denom = tensor([[0.0000e+00, 0.0000e+00],
        [1.0000e+00, 2.1544e-03],
        [2.0000e+00, 4.3089e-03],
        [3.0000e+00, 6.4633e-03]])
pos/odd denom = tensor([[0.0000],
        [0.0464],
        [0.0928],
        [0.1392]])
tensor([[[0.0000e+00, 0.0000e+00]],

        [[1.0000e+00, 2.1544e-03]],

        [[2.0000e+00, 4.3089e-03]],

        [[3.0000e+00, 6.4633e-03]]])


RuntimeError: The expanded size of the tensor (1) must match the existing size (4) at non-singleton dimension 0.  Target sizes: [1, 4, 2].  Tensor sizes: [4, 1, 2]