# Attention in transformers: Concept and code in PyTorch
- __Self Attention:__ self attention works by seeing how similar each word is to all of the words in the sentence, including itself. 
  $$ Attention(Q,K,V)=SoftMax \bigg(\frac{QK^T}{\sqrt{d_k}}\bigg)V$$
- __Masked Self-Attention:__ $$ \text{Masked-Self-Attention}(Q,K,V,M)=SoftMax \bigg(\frac{QK^T}{\sqrt{d_k}}+ M\bigg)V$$
- __Multi-Head Attention:__ several attention heads are applied to the same inputs and their outputs are combined at the output of the attention layer.
- __Encoder-Decoder Attention / Cross-Attention:__ for example, in a Seq2Seq model.

In [None]:
## Import the modules that will do all the work
import torch
import torch.nn as nn 
import torch.nn.functional as F 

## Self Attention and masked-self-attention

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention without bias terms.

    The queries, keys and values are all derived from the same token
    encodings, each through its own bias-free ``nn.Linear`` layer.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """Create the query/key/value projection layers.

        Args:
            d_model: number of embedding values per token. The default of 2
                keeps the math small enough to do by hand; "Attention Is
                All You Need" uses d_model=512.
            row_dim: index of the dimension treated as rows.
            col_dim: index of the dimension treated as columns.
        """
        # Initialise nn.Module first so the layers below get registered.
        super().__init__()

        ## Weight matrices that turn token encodings into queries, keys
        ## and values. Like the original "Attention Is All You Need"
        ## manuscript, no bias terms are used. The layers are created in
        ## q, k, v order.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encodings):
        """Return the self-attention values for every token.

        Args:
            token_encodings: tensor of token encodings; with the default
                row_dim/col_dim this is assumed to be a 2-D
                (tokens, d_model) matrix, as in the examples in this file.

        Returns:
            Tensor of attention values, one row per token.
        """
        ## Project the encodings into queries, keys and values.
        queries = self.W_q(token_encodings)
        keys = self.W_k(token_encodings)
        values = self.W_v(token_encodings)

        ## Similarity scores (q * k^T), scaled by the square root of the
        ## size of the keys' column dimension.
        keys_t = keys.transpose(self.row_dim, self.col_dim)
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scaled_scores = (queries @ keys_t) / scale

        ## Softmax turns the scaled scores into the fraction of each
        ## token's value that goes into the final attention values.
        weights = F.softmax(scaled_scores, dim=self.col_dim)

        ## Weighted sum of the values.
        return weights @ values
        
class Attention(nn.Module):
    """Single-head scaled dot-product attention with an optional mask.

    Unlike self-attention, the encodings used for the queries, keys and
    values are passed in separately, so the same module can compute
    self-attention, masked self-attention or encoder-decoder attention.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        """Create the bias-free query/key/value projection layers.

        Args:
            d_model: number of embedding values per token.
            row_dim: index of the dimension treated as rows.
            col_dim: index of the dimension treated as columns.
        """
        super().__init__()

        ## Projections are created in q, k, v order.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Compute attention values from three sets of encodings.

        Args:
            encodings_for_q: encodings projected into the queries.
            encodings_for_k: encodings projected into the keys.
            encodings_for_v: encodings projected into the values.
            mask: optional boolean tensor; positions where it is True are
                filled with -1e9 before the softmax so they receive
                (almost) no attention.

        Returns:
            Tensor of attention values, one row per query.
        """
        ## Each set of encodings goes through its own weight matrix.
        queries = self.W_q(encodings_for_q)
        keys = self.W_k(encodings_for_k)
        values = self.W_v(encodings_for_v)

        ## Dot-product similarities, scaled as in the attention equation.
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scores = (queries @ keys.transpose(self.row_dim, self.col_dim)) / scale

        ## Masked attention: push the masked positions to a very large
        ## negative number so softmax drives them towards zero.
        if mask is not None:
            scores = scores.masked_fill(mask, -1e9)

        weights = F.softmax(scores, dim=self.col_dim)

        return weights @ values

## Encoder-Decoder Attention

In [None]:
## Build the token encodings. The query, key and value inputs hold the
## same numbers in this example, but each one is its own tensor.
encodings_for_q = torch.tensor([[1.16, 0.23],
                                [0.57, 1.36],
                                [4.41, -2.16]])
encodings_for_k = encodings_for_q.clone()
encodings_for_v = encodings_for_q.clone()

## Seed the random number generator so the randomly initialised
## weights are the same on every run.
torch.manual_seed(42)

## Instantiate the attention module.
attention = Attention(d_model=2, row_dim=0, col_dim=1)

## Compute encoder-decoder attention.
attention(encodings_for_q, encodings_for_k, encodings_for_v)

# Calculate Self-Attention
<a id="calculate"></a>

In [None]:
## Token encodings: one row per token, d_model=2 values per row.
encodings_matrix = torch.tensor([
    [1.16, 0.23],
    [0.57, 1.36],
    [4.41, -2.16],
])

## Seed the random number generator so the randomly initialised
## weights are the same on every run.
torch.manual_seed(42)

## Build a basic self-attention object.
selfAttention = SelfAttention(d_model=2, row_dim=0, col_dim=1)

## Compute basic self-attention for the token encodings.
selfAttention(encodings_matrix)

In [None]:
# Print Out Weights and Verify Calculations
## Only the last expression of a notebook cell is displayed, so every
## intermediate result is printed explicitly.

## nn.Linear stores its weight as (out_features, in_features); transposing
## shows each matrix in the orientation in which the encodings are
## multiplied by it.
## print out the weight matrix that creates the queries
print('W_q =', selfAttention.W_q.weight.transpose(0, 1))

## print out the weight matrix that creates the keys
print('W_k =', selfAttention.W_k.weight.transpose(0, 1))

## print out the weight matrix that creates the values
print('W_v =', selfAttention.W_v.weight.transpose(0, 1))

## calculate the queries, keys and values once and reuse them below
q = selfAttention.W_q(encodings_matrix)
print('q =',q)

k = selfAttention.W_k(encodings_matrix)
print('k = ',k)

v = selfAttention.W_v(encodings_matrix)
print('v = ',v)

# print similarity index
sims = torch.matmul(q, k.transpose(dim0=0, dim1=1))
print('similaritiess = ',sims)

# print scaled-similarity index; the scale is derived from the size of the
# keys' column dimension (as SelfAttention.forward does) rather than
# hard-coding 2
scaled_sims = sims / torch.tensor(k.size(1)**0.5)
print('Scaled  similaritiess = ',scaled_sims)

# percentage values after softmax
attention_percents = F.softmax(scaled_sims, dim=1)
print('attention percents = ',attention_percents) 

# Attention score after multiplying with values; left as the final bare
# expression so the notebook displays it as the cell's output
torch.matmul(attention_percents, v)

## Code Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Run several independent ``Attention`` heads and concatenate their outputs."""

    def __init__(self, 
                 d_model=2,  
                 row_dim=0, 
                 col_dim=1, 
                 num_heads=1):
        """Create ``num_heads`` attention heads.

        Args:
            d_model: number of embedding values per token, passed to every head.
            row_dim: index of the dimension treated as rows, passed to every head.
            col_dim: index of the dimension treated as columns; also the
                dimension along which the head outputs are concatenated.
            num_heads: number of attention heads to create.
        """
        super().__init__() # parent init method

        ## Create the attention heads with a list comprehension. They are
        ## kept in an nn.ModuleList, an indexable list of modules, rather
        ## than a plain Python list.
        self.heads = nn.ModuleList(
            [Attention(d_model, row_dim, col_dim) 
             for _ in range(num_heads)]
        )

        self.col_dim = col_dim
        
    def forward(self, 
                encodings_for_q, 
                encodings_for_k,
                encodings_for_v,
                mask=None):
        """Apply every head to the same inputs and concatenate the results.

        Args:
            encodings_for_q: encodings each head projects into queries.
            encodings_for_k: encodings each head projects into keys.
            encodings_for_v: encodings each head projects into values.
            mask: optional mask forwarded unchanged to every head; None
                (the default) means no masking, which matches calls that
                pass only the three encodings.

        Returns:
            The head outputs concatenated along ``col_dim``.
        """
        ## run the data through all of the attention heads, sharing the
        ## same (optional) mask, and join the outputs along col_dim
        return torch.cat([head(encodings_for_q, 
                               encodings_for_k,
                               encodings_for_v,
                               mask=mask) 
                          for head in self.heads], dim=self.col_dim)

## Calculate Multi-Head Attention

In [None]:
# First, verify that we can still correctly calculate attention with a single head...
# With one head, the same seed and the same dimensions, this should reproduce
# the output of the single Attention module computed earlier.

## set the seed for the random number generator
torch.manual_seed(42)

## create an attention object
multiHeadAttention = MultiHeadAttention(d_model=2,
                                        row_dim=0,
                                        col_dim=1,
                                        num_heads=1)

## calculate encoder-decoder attention
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)




In [None]:
# Second, calculate attention with multiple heads...
## Seed the random number generator so the heads' randomly initialised
## weights are the same on every run.
torch.manual_seed(42)

## Build a multi-head attention object with two heads.
multiHeadAttention = MultiHeadAttention(d_model=2, row_dim=0, col_dim=1, num_heads=2)

## Compute encoder-decoder attention; the two heads' outputs are
## concatenated along col_dim.
multiHeadAttention(encodings_for_q, encodings_for_k, encodings_for_v)