## Attention mechanism toy tutorial
- set up
- self attention
- causal attention
- multi-headed attention

In [1]:
import torch
print(torch.__version__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

2.2.1
Using device: cpu


#### set up

In [25]:
seq = "This is simple toy tutorial on how attention mechanism works"

# import tiktoken
# tokenizer = tiktoken.get_encoding("gpt2")
# tokens = tokenizer.encode(seq, allowed_special={'|eos|'})
# print(tokens)

seq_list = seq.split(" ")
print(seq_list)
token_tensors = [torch.tensor(x) for x in range(len(seq_list))]
token_tensors

['This', 'is', 'simple', 'toy', 'tutorial', 'on', 'how', 'attention', 'mechanism', 'works']


[tensor(0),
 tensor(1),
 tensor(2),
 tensor(3),
 tensor(4),
 tensor(5),
 tensor(6),
 tensor(7),
 tensor(8),
 tensor(9)]

In [26]:
from torch import nn
# convert tokens into 3 dimensional embeddings
embed = nn.Embedding(len(tokens), 3)
input = torch.LongTensor(token_tensors)
embeddings = embed(input)
embeddings

tensor([[-0.9182,  0.0033,  0.9627],
        [-1.0063, -0.3444, -1.4247],
        [ 1.1410,  0.3782, -0.5953],
        [ 0.4299, -0.0343, -0.3688],
        [-1.2274, -0.6004, -0.1838],
        [ 0.4596, -0.0693, -1.5469],
        [ 2.7773,  0.3163, -0.4481],
        [-1.2793,  0.0345, -0.4968],
        [-1.6882, -0.5491, -0.9991],
        [-1.8605, -0.1361, -0.8169]], grad_fn=<EmbeddingBackward0>)

### self attention
- also called scaled dot-product attention
- 3 steps: 
    1. attention score
    2. normalization
    3. context vector 
- make it with trainable weights

In [29]:
print(seq_list)
print(seq_list[3])

['This', 'is', 'simple', 'toy', 'tutorial', 'on', 'how', 'attention', 'mechanism', 'works']
toy


In [39]:
# calculate attention score
# the dot product between each input token's embedding and each of the rest of the tokens' embeddings in the same sequence
# we refer to the token of focus as query - here take "toy" as example, which is the indexed by 3 in the seq_list 
query = embeddings[3]
attention_score_for_toy = torch.empty(embeddings.shape[0])
for x in range(len(embeddings)):
    attention_score_for_toy[x] = torch.dot(embeddings[x], query)
print(attention_score_for_toy)


tensor([-0.7499,  0.1047,  0.6971,  0.3220, -0.4393,  0.7705,  1.3484, -0.3680,
        -0.3385, -0.4939], grad_fn=<CopySlices>)


In [41]:
# attention scores' meaning:
# dot product mathematically combines two vectors to yield a scalar value
# dot product also measures similarity between the two vectors: the higher the more similar

# in practice, rather than loop through each token as the query, we do matmul
attention_scores = torch.matmul(embeddings, embeddings.T)
attention_scores

tensor([[ 1.7698, -0.4488, -1.6195, -0.7499,  0.9481, -1.9114, -2.9804,  0.6965,
          0.5864,  0.9214],
        [-0.4488,  3.1609, -0.4302,  0.1047,  1.7037,  1.7652, -2.2653,  1.9832,
          3.3113,  3.0828],
        [-1.6195, -0.4302,  1.7993,  0.6971, -1.5181,  1.4192,  3.5553, -1.1509,
         -1.5391, -1.6880],
        [-0.7499,  0.1047,  0.6971,  0.3220, -0.4393,  0.7705,  1.3484, -0.3680,
         -0.3385, -0.4939],
        [ 0.9481,  1.7037, -1.5181, -0.4393,  1.9008, -0.2383, -3.5165,  1.6408,
          2.5854,  2.5154],
        [-1.9114,  1.7652,  1.4192,  0.7705, -0.2383,  2.6089,  1.9478,  0.1780,
          0.8076,  0.4179],
        [-2.9804, -2.2653,  3.5553,  1.3484, -3.5165,  1.9478,  8.0142, -3.3196,
         -4.4146, -4.8441],
        [ 0.6965,  1.9832, -1.1509, -0.3680,  1.6408,  0.1780, -3.3196,  1.8847,
          2.6371,  2.7813],
        [ 0.5864,  3.3113, -1.5391, -0.3385,  2.5854,  0.8076, -4.4146,  2.6371,
          4.1497,  4.0317],
        [ 0.9214,  

In [46]:
# normalize attention scores into attention weights
# use torch.softmax() to avoid overflow/underflow and optimize compute
attention_weights = torch.softmax(attention_scores, dim=-1)
attention_weights

tensor([[3.6066e-01, 3.9226e-02, 1.2165e-02, 2.9027e-02, 1.5857e-01, 9.0853e-03,
         3.1196e-03, 1.2330e-01, 1.1045e-01, 1.5440e-01],
        [6.7964e-03, 2.5115e-01, 6.9234e-03, 1.1820e-02, 5.8488e-02, 6.2197e-02,
         1.1050e-03, 7.7349e-02, 2.9190e-01, 2.3227e-01],
        [4.0431e-03, 1.3281e-02, 1.2346e-01, 4.1004e-02, 4.4746e-03, 8.4413e-02,
         7.1471e-01, 6.4598e-03, 4.3818e-03, 3.7755e-03],
        [3.4629e-02, 8.1389e-02, 1.4719e-01, 1.0115e-01, 4.7241e-02, 1.5839e-01,
         2.8229e-01, 5.0735e-02, 5.2255e-02, 4.4732e-02],
        [5.4625e-02, 1.1629e-01, 4.6378e-03, 1.3641e-02, 1.4162e-01, 1.6677e-02,
         6.2869e-04, 1.0920e-01, 2.8083e-01, 2.6185e-01],
        [3.8281e-03, 1.5126e-01, 1.0702e-01, 5.5941e-02, 2.0399e-02, 3.5169e-01,
         1.8156e-01, 3.0933e-02, 5.8055e-02, 3.9317e-02],
        [1.6540e-05, 3.3814e-05, 1.1401e-02, 1.2545e-03, 9.6760e-06, 2.2846e-03,
         9.8498e-01, 1.1782e-05, 3.9412e-06, 2.5651e-06],
        [3.7602e-02, 1.3614

In [48]:
# check to make sure attention scores sum up to 1 for each row
attention_weights[3].sum()

tensor(1.0000, grad_fn=<SumBackward0>)

In [47]:
# calculate context vector bease on attention weights and input embedding 
all_context_vecs = torch.matmul(attention_weights, embeddings)
# all_context_vecs = attention_weights @ embeddings
all_context_vecs

tensor([[-1.1575, -0.1810, -0.0689],
        [-1.3100, -0.3126, -0.9870],
        [ 2.1370,  0.2556, -0.5660],
        [ 0.6602,  0.0412, -0.7017],
        [-1.4214, -0.3106, -0.7217],
        [ 0.4206, -0.0289, -1.0308],
        [ 2.7501,  0.3157, -0.4522],
        [-1.4282, -0.2850, -0.7887],
        [-1.5255, -0.3317, -0.8916],
        [-1.5547, -0.3089, -0.8561]], grad_fn=<MmBackward0>)

#### make it with trainable weights

### causal attention

### multi-headed attention