# Self Attention
- Reference
    - https://ratsgo.github.io/nlpbook/docs/language_model/tr_self_attention/

In [1]:
import torch
import numpy as np

# Toy inputs for the self-attention walkthrough.
#   x:       sequential input data (3 rows, 4 features per row)
#   w_key:   weights for making the key matrix (4 x 3)
#   w_query: weights for making the query matrix (4 x 3)
#   w_value: weights for making the value matrix (4 x 3)

x = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],
        [0.0, 2.0, 0.0, 2.0],
        [1.0, 1.0, 1.0, 1.0],
    ]
)

w_key = torch.tensor(
    [
        [0.0, 0.0, 1.0],
        [1.0, 1.0, 0.0],
        [0.0, 1.0, 0.0],
        [1.0, 1.0, 0.0],
    ]
)

w_query = torch.tensor(
    [
        [1.0, 0.0, 1.0],
        [1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0],
        [0.0, 1.0, 1.0],
    ]
)

w_value = torch.tensor(
    [
        [0.0, 2.0, 0.0],
        [0.0, 3.0, 0.0],
        [1.0, 0.0, 3.0],
        [1.0, 1.0, 0.0],
    ]
)

In [4]:
# Project the input onto the key, query and value weights, in that order.
# np.dot takes the torch tensors and returns plain NumPy arrays.
keys, querys, values = (
    np.dot(x, weight) for weight in (w_key, w_query, w_value)
)

# Each projection is 3 x 3; print them one after another.
print(keys, querys, values, sep="\n")

[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [6]:
# Raw attention scores: dot product of every query row with every key row.
attention_score = querys.dot(keys.T)
print(attention_score)

[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [22]:
# softmax
from torch.nn.functional import softmax

# keys.shape[1] is the key dimension (number of columns of `keys`), not the
# sequence length. The raw scores are divided by its square root and then
# normalised along the last axis, i.e. across each row.
scaled_score = torch.Tensor(attention_score) / np.sqrt(keys.shape[1])
attention_score_softmax = softmax(scaled_score, dim=-1)
print(attention_score_softmax)

tensor([[1.3613e-01, 4.3194e-01, 4.3194e-01],
        [8.9045e-04, 9.0884e-01, 9.0267e-02],
        [7.4449e-03, 7.5471e-01, 2.3785e-01]])


In [23]:
# weighted sum

# Example: hand-written attention weights (each row sums to 1), used here
# in place of the computed `attention_score_softmax`.
attn_scores_softmax = torch.tensor([
  [0.0, 0.5, 0.5],
  [0.0, 1.0, 0.0],
  [0.0, 0.9, 0.1]
])

# Each output row is the weighted sum of the rows of `values`, using the
# matching row of attention weights.
# NOTE(review): `values` was produced by np.dot, so this multiplies a torch
# tensor by a NumPy array — confirm the mixed-type `@` is intended.
weighted_values = attn_scores_softmax @ values
weighted_values

tensor([[2.0000, 7.0000, 1.5000],
        [2.0000, 8.0000, 0.0000],
        [2.0000, 7.8000, 0.3000]])