In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention with optional masking.

    Queries, keys and values are produced from the same input by three
    bias-free ``d_model -> d_model`` linear layers.

    Args:
        d_model: Size of each token embedding; also the in/out size of the
            three projections and the value used to scale the scores.
        row_dim: Index of the token (row) dimension of the input.
        col_dim: Index of the embedding (column) dimension of the input.
            The attention scores are normalised along this dimension.
    """

    def __init__(self,
                 d_model=2,
                 row_dim=0,
                 col_dim=1):
        super().__init__()

        self.d_model = d_model
        self.row_dim = row_dim
        self.col_dim = col_dim

        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

    def forward(self, token_embeddings, mask=None):
        """Compute (optionally masked) self-attention for ``token_embeddings``.

        Args:
            token_embeddings: Tensor whose ``col_dim`` has size ``d_model``.
            mask: Optional boolean tensor passed to ``masked_fill``; positions
                that are ``True`` are set to ``-inf`` before the softmax and
                therefore receive zero attention weight.

        Returns:
            Tuple ``(attentions, Q, K, V)``: the attention output followed by
            the query, key and value projections it was computed from.
        """
        Q = self.W_q(token_embeddings)
        K = self.W_k(token_embeddings)
        V = self.W_v(token_embeddings)

        # Pairwise query/key dot products: Q @ K^T.
        similarities = Q @ K.transpose(dim0=self.row_dim, dim1=self.col_dim)

        similarities_scaled = similarities / (self.d_model ** 0.5)

        if mask is not None:
            similarities_scaled = similarities_scaled.masked_fill(mask=mask, value=-torch.inf)

        # Normalise each query's scores over the keys. `dim` is given
        # explicitly: the implicit choice is deprecated (it emits a warning)
        # and for 3-D input it picks dim 0, i.e. it would normalise across
        # the batch rather than across the keys.
        similarities_percentages = F.softmax(similarities_scaled, dim=self.col_dim)

        attentions = similarities_percentages @ V

        return attentions, Q, K, V

In [41]:
# Token encodings: one row per token (3 tokens), one column per embedding
# dimension (d_model = 2).
encodings_matrix = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])

## set the seed for the random number generator *before* the module is built,
## so its randomly initialised weights are identical on every run
torch.manual_seed(42)

## create a masked self-attention object
maskedSelfAttention = SelfAttention(d_model=2,
                            row_dim=0,
                            col_dim=1)

# `model` used to be a separate instance created before the seed was set, so
# its weights (and the result shown below) changed from run to run. It is kept
# only as a second name for the seeded module so references to `model` still work.
model = maskedSelfAttention

# Lower-triangular ones -> `== 0` is True strictly above the diagonal; those
# are the positions that get filled with -inf before the softmax.
mask = torch.tril(torch.ones(3, 3))
mask = mask == 0

# Run the forward pass on the seeded module.
maskedSelfAttention(encodings_matrix, mask=mask)

  similarities_percentages = F.softmax(similarities_scaled)


tensor([[ 0.6038,  0.7434],
        [-0.0062,  0.6072],
        [ 3.4989,  2.2427]], grad_fn=<MmBackward0>)

In [44]:
# Show the query weights transposed (dims 0 and 1 swapped).
torch.transpose(maskedSelfAttention.W_q.weight, dim0=0, dim1=1)

tensor([[ 0.5406, -0.1657],
        [ 0.5869,  0.6496]], grad_fn=<TransposeBackward0>)