In [1]:
import torch
import torch.nn as nn
import math


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\theju\anaconda3\envs\mlp\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\theju\anaconda3\envs\mlp\Lib\site-packages\traitlets\config\application.py", line 992, in launch_instance
    app.start()
  File "c:\Users\theju\anaconda3\envs\mlp\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start
    self.io_loop.start()
  File "c:\Users\the

In [2]:
class SelfAttention(nn.Module):
    def __init__(self,query_shape,key_shape,value_shape, model_size=512):
        super().__init__()
        self.d_q= query_shape[-1]
        self.d_k = key_shape[-1]
        self.d_v = value_shape[-1]
        self.model_size = model_size
       
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, attention_mask=None):
        
        query_key=torch.matmul(query, key.transpose(-2,-1))/math.sqrt(self.d_k)
        if attention_mask is not None:
            query_key = query_key.masked_fill(attention_mask.bool(), -torch.inf)
       
        attention = torch.matmul(self.softmax(query_key), value)
        return attention

In [None]:

attn = SelfAttention(query.shape, key.shape, value.shape)
output = attn(query, query, query)

In [35]:
output

tensor([[[ 2.0064,  0.2102,  0.8838,  ...,  0.0970,  0.6234, -1.1400],
         [-0.0885, -1.0831,  1.2557,  ..., -1.1865, -1.3762, -1.3190],
         [-0.1477,  0.5764, -0.0053,  ...,  1.2574,  1.0753, -0.3646]],

        [[ 1.7582,  0.0844, -1.0347,  ..., -1.9086,  0.8512,  0.2577],
         [ 1.1745,  0.5628,  0.5414,  ...,  1.1986, -0.5605,  0.3247],
         [-0.4274, -0.3267, -0.6601,  ...,  0.0339,  1.0798,  0.9191]]])

In [9]:
class Attention(nn.Module):
    def __init__(self,query_shape,key_shape,value_shape, model_size=512):
        super().__init__()
        self.d_q= query_shape[-1]
        self.d_k = key_shape[-1]
        self.d_v = value_shape[-1]
        self.model_size = model_size
        self.W_q= nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty((self.model_size, self.d_q))))
        self.W_k= nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty((self.model_size, self.d_k))))
        self.W_v= nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty((self.model_size, self.d_v))))
        
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, attention_mask=None):
        query_t = torch.matmul(query, self.W_q)
        key_t = torch.matmul(key, self.W_k)
        value_t = torch.matmul(value, self.W_v)
        
        print(query_t.shape, key_t.shape)
        query_key=torch.matmul(query_t, key_t.transpose(-2,-1))/math.sqrt(self.d_k)
        print(query_key.shape)
        if attention_mask is not None:
            query_key = query_key.masked_fill(attention_mask.bool(), -torch.inf)
       
        attention = torch.matmul(self.softmax(query_key), value_t)
        return attention

In [10]:
query = torch.randn(3, 64, 512)  # Example query tensor
key = torch.randn(3, 78, 512)    # Example key tensor 
value = torch.randn(3, 78, 512)  # Example value tensor

In [17]:
attn = Attention(query.shape, key.shape, value.shape)
output = attn(query, key, value,mask
              )

torch.Size([3, 64, 512]) torch.Size([3, 78, 512])
torch.Size([3, 64, 78])


In [18]:
output.shape

torch.Size([3, 64, 512])

In [16]:
mask = torch.triu(torch.ones((64,78)),diagonal=1)

In [13]:
mask.shape

torch.Size([3, 64, 78])

In [8]:
class MultiHeadAttention(nn.Module):
    def __init__(self, query_shape,key_shape,value_shape, head_count, model_size=512):
        super().__init__()
        self.head_count = head_count
        self.model_size = model_size
        self.query_shape = query_shape
        self.key_shape = key_shape
        
        self.value_shape = value_shape
        self.W_O = nn.Parameter(nn.init.xavier_uniform_(torch.empty(self.head_count*self.value_shape[-1],self.model_size)))

        self.heads = [ Attention(self.query_shape, self.key_shape, self.value_shape, self.model_size) for _ in range(self.head_count)]

    def forward(self, query, key, value):
        mh_p1=torch.cat([head(query, key, value) for head in self.heads],-1)
        mh_p2 = torch.matmul(mh_p1, self.W_O)
        return mh_p2

In [9]:
attn = MultiHeadAttention(query.shape, key.shape, value.shape, model_size=512, head_count=8)

In [10]:
multi_head=attn(query, key, value)

In [11]:
multi_head

tensor([[-1.3320,  1.0428, -0.4936,  ..., -1.4758,  0.2738, -0.5843],
        [-1.0061,  0.2543, -1.0541,  ..., -1.8228,  0.5517,  0.0964],
        [-1.1229,  1.9078, -1.9327,  ..., -0.3243,  1.1641, -0.6567]],
       grad_fn=<MmBackward0>)

In [12]:
multi_head.shape

torch.Size([3, 512])