### Transformers
* word embedding: token -> numeric vector
* position encoding: keep track of token order
* attention:  SoftMax(QK^T/sqrt(dk))V

### Attention
These terms are borrowed from database terminology:
* Key: K, actual key in the data
* Query: Q, term to search the data 
* Value: V, results of the search

Each token attends to every token in the sentence, including itself.

## Self-attention in PyTorch

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F 

In [8]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` where Q, K and V are bias-free
    linear projections of the same token encodings.

    Args:
        d_model: Size of each token encoding. Every projection maps
            ``d_model`` input features to ``d_model`` output features.
        row_dim: Dimension swapped with ``col_dim`` when the keys are
            transposed (the token axis in this notebook's examples).
        col_dim: Dimension whose size is used for the ``sqrt(d_k)`` scaling
            and along which the softmax is taken.
    """

    def __init__(self, d_model=2, row_dim=0, col_dim=1):
        super().__init__()
        # The layers are created in query, key, value order; their random
        # initialisation draws from the global RNG in that same order.
        self.W_q = nn.Linear(d_model, d_model, bias=False)
        self.W_k = nn.Linear(d_model, d_model, bias=False)
        self.W_v = nn.Linear(d_model, d_model, bias=False)

        self.row_dim = row_dim
        self.col_dim = col_dim

    def forward(self, token_encoding):
        """Return the attention-weighted values for ``token_encoding``."""
        queries = self.W_q(token_encoding)
        keys = self.W_k(token_encoding)
        values = self.W_v(token_encoding)

        # Similarity of every query with every key, scaled by sqrt(d_k).
        keys_t = keys.transpose(self.row_dim, self.col_dim)
        scale = torch.tensor(keys.size(self.col_dim) ** 0.5)
        scores = (queries @ keys_t) / scale

        # Normalise the scores into weights, then mix the values with them.
        weights = F.softmax(scores, dim=self.col_dim)
        return weights @ values

In [14]:
## token encodings: one batch holding three tokens with two features each
encodings_matrix = torch.tensor(
    [
        [
            [1.16, 0.23],
            [0.57, 1.36],
            [4.41, -2.16],
        ]
    ]
)

## seed the random number generator so the layer weights are reproducible
torch.manual_seed(42)

## create a basic self-attention object; because of the leading batch
## dimension the tokens sit on dim 1 and their features on dim 2
selfAttention = SelfAttention(d_model=2, row_dim=1, col_dim=2)

## calculate basic attention for the token encodings
selfAttention(encodings_matrix)

tensor([[[1.0100, 1.0641],
         [0.2040, 0.7057],
         [3.4989, 2.2427]]], grad_fn=<UnsafeViewBackward0>)

In [13]:
encodings_matrix.shape

torch.Size([1, 3, 2])

In [35]:
## repeat the three projections from SelfAttention.forward by hand so the
## intermediate query, key and value tensors can be inspected
q = selfAttention.W_q(encodings_matrix)
k = selfAttention.W_k(encodings_matrix)
v = selfAttention.W_v(encodings_matrix)

In [42]:
## dot product of every query with every key; dims 1 and 2 of k are swapped
## so the result holds one 3x3 similarity matrix for the single batch entry
sims = torch.matmul(q, k.transpose(1,2))
sims

tensor([[[-0.0990,  0.0648, -0.6523],
         [-0.4022,  0.4078, -3.0024],
         [ 0.4842, -0.6683,  4.0461]]], grad_fn=<UnsafeViewBackward0>)

In [43]:
## scale by sqrt(2), where 2 is the d_model the module was built with
scaled_sims = sims/(torch.tensor(2)**0.5)
scaled_sims

tensor([[[-0.0700,  0.0458, -0.4612],
         [-0.2844,  0.2883, -2.1230],
         [ 0.3424, -0.4725,  2.8610]]], grad_fn=<DivBackward0>)

In [74]:
## softmax over dim 2 turns each row of scaled similarities into weights that sum to 1
F.softmax(scaled_sims, dim=2)

tensor([[[0.3573, 0.4011, 0.2416],
         [0.3410, 0.6047, 0.0542],
         [0.0722, 0.0320, 0.8959]]], grad_fn=<SoftmaxBackward0>)

In [78]:
## display the query projections computed from W_q above
q

tensor([[[ 0.7621, -0.0428],
         [ 1.1063,  0.7890],
         [ 1.1164, -2.1336]]], grad_fn=<UnsafeViewBackward0>)