<a href="https://colab.research.google.com/github/muhajirakbarhsb/NLP_class_2023/blob/main/Self_Attention_for_Transformer_Neural_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Self Attention in Transformers

## Generate Data

In [1]:
import numpy as np
import math
# nama saya adalah muhajir
L, d_k, d_v = 4, 8, 8
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

In [None]:
print("Q\n", q)
print("K\n", k)
print("V\n", v)

Q
 [[ 1.11463077 -1.61118447  0.54147411 -0.74795067 -1.17261228 -0.17991552
   1.045193    1.08462789]
 [ 0.39346084 -0.0943694   0.54644756  0.16949109 -0.65849926  0.64650622
   0.426543   -1.12598942]
 [ 0.31880994  0.69748883  2.3498791   1.34022374  0.27939465 -0.12330068
   0.08064605 -0.41289694]
 [ 0.33571747 -2.68938734  0.20870225  0.09218937 -0.12195233  1.14108108
   0.56365024 -0.02904754]]
K
 [[-0.97934243 -0.3139709   0.61703621  0.77079775 -0.08300723  1.08536188
  -2.24613447  0.48858564]
 [ 0.97632654  1.82049063  0.1326423  -1.92552993 -0.70349899  1.26033579
  -0.39922419  0.67526354]
 [ 0.31373313  0.97305745  0.53850262  1.1908357  -0.91723031  0.98094577
  -1.16461067  1.58418976]
 [ 0.29072402  1.6008319  -1.34954098  0.24156335 -1.42140451 -0.73642022
  -0.30595885 -1.52221799]]
V
 [[ 0.79097351 -1.44719286 -0.80885359 -0.11991704 -1.2083264   1.90760162
   0.09660963  0.22398411]
 [ 1.5706123  -1.45513936 -0.26028603  0.33463494  1.73969334  2.49307489
  -0.7

## Self Attention

$$
\text{self attention} = softmax\bigg(\frac{Q.K^T}{\sqrt{d_k}}+M\bigg)
$$

$$
\text{new V} = \text{self attention}.V
$$

In [2]:
np.matmul(q, k.T)

array([[ 0.99968242,  2.48521869,  0.99022297,  0.23246704],
       [ 3.45624174,  0.41924737,  1.25002168,  1.33799021],
       [-1.21823541,  2.23287623, -3.37501046, -1.11386584],
       [ 1.59483301, -1.19508959,  3.52297155,  1.73691772]])

In [3]:
# Why we need sqrt(d_k) in denominator
q.var(), k.var(), np.matmul(q, k.T).var()

(0.9773561173119378, 0.552814817073815, 3.189739897316829)

In [4]:
scaled = np.matmul(q, k.T) / math.sqrt(d_k)
q.var(), k.var(), scaled.var()

(0.9773561173119378, 0.552814817073815, 0.3987174871646036)

In [5]:
q.var()

0.9773561173119378

In [6]:
q

array([[ 1.38905225,  1.19186346,  0.88148395, -0.32463032, -0.86934711,
         0.64318405,  0.59280408,  0.63209254],
       [-0.1240182 , -1.36702359,  0.39453158, -1.73559691, -2.05269885,
        -0.64386136, -0.25463646,  0.21926453],
       [-1.92515172,  0.55800565, -0.11161961,  0.5946847 ,  1.13986177,
         1.37843431,  1.73472423,  0.39020062],
       [ 0.28780968, -0.33984103,  0.42803332,  0.38788878, -1.92198321,
        -0.33599494, -0.48475889,  0.1357611 ]])

Notice the reduction in variance of the product

In [7]:
scaled

array([[ 0.35344111,  0.87865749,  0.35009669,  0.08218951],
       [ 1.22196599,  0.14822633,  0.4419494 ,  0.47305097],
       [-0.43071126,  0.78944096, -1.19324639, -0.39381105],
       [ 0.56385862, -0.42252798,  1.24555854,  0.61409315]])

## Masking

- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders, but required int he decoders

In [8]:
mask = np.tril(np.ones( (L, L) ))
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [9]:
mask[mask == 0] = -np.infty
mask[mask == 1] = 0

In [10]:
mask

array([[  0., -inf, -inf, -inf],
       [  0.,   0., -inf, -inf],
       [  0.,   0.,   0., -inf],
       [  0.,   0.,   0.,   0.]])

In [11]:
scaled + mask

array([[ 0.35344111,        -inf,        -inf,        -inf],
       [ 1.22196599,  0.14822633,        -inf,        -inf],
       [-0.43071126,  0.78944096, -1.19324639,        -inf],
       [ 0.56385862, -0.42252798,  1.24555854,  0.61409315]])

## Softmax

$$
\text{softmax} = \frac{e^{x_i}}{\sum_j e^x_j}
$$

In [12]:
def softmax(x):
  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

In [13]:
attention = softmax(scaled + mask)

In [None]:
attention

array([[1.        , 0.        , 0.        , 0.        ],
       [0.66175691, 0.33824309, 0.        , 0.        ],
       [0.24477442, 0.42487923, 0.33034636, 0.        ],
       [0.14346915, 0.43745742, 0.23915437, 0.17991907]])

In [15]:
new_v = np.matmul(attention, v)
new_v

array([[ 0.4518304 , -1.11781126, -0.6164827 , -1.0088047 , -1.35845577,
        -0.54871931, -0.6643454 , -0.84114448],
       [ 0.63016241, -0.95004634, -0.32841113, -0.79463558, -0.96910556,
        -0.23129338, -0.56308456, -0.93180638],
       [ 0.96544514, -0.44854885,  0.2698335 , -0.3930442 , -0.25937312,
         0.344086  , -0.31649545, -0.78575067],
       [ 0.31149247, -0.11422097,  0.03789597, -0.20867083, -0.38985792,
        -0.14512092,  0.18535288,  0.87024363]])

In [16]:
v

array([[ 0.4518304 , -1.11781126, -0.6164827 , -1.0088047 , -1.35845577,
        -0.54871931, -0.6643454 , -0.84114448],
       [ 1.15201581, -0.45911543,  0.5145734 , -0.16791197,  0.17025089,
         0.6975909 , -0.26676474, -1.19711051],
       [ 0.71156333,  1.06288856,  0.39247476, -0.70799863, -1.02329497,
        -0.30923866,  0.06803584,  2.3203882 ],
       [-0.87234105, -1.25087748, -0.17557706,  1.47672444,  1.5237339 ,
         0.24843624,  1.37436404,  0.5041741 ]])

# Function

In [17]:
def softmax(x):
  return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T

def scaled_dot_product_attention(q, k, v, mask=None):
  d_k = q.shape[-1]
  scaled = np.matmul(q, k.T) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [18]:
values, attention = scaled_dot_product_attention(q, k, v, mask=1)
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q
 [[ 1.38905225  1.19186346  0.88148395 -0.32463032 -0.86934711  0.64318405
   0.59280408  0.63209254]
 [-0.1240182  -1.36702359  0.39453158 -1.73559691 -2.05269885 -0.64386136
  -0.25463646  0.21926453]
 [-1.92515172  0.55800565 -0.11161961  0.5946847   1.13986177  1.37843431
   1.73472423  0.39020062]
 [ 0.28780968 -0.33984103  0.42803332  0.38788878 -1.92198321 -0.33599494
  -0.48475889  0.1357611 ]]
K
 [[ 0.36519457 -0.14110782 -1.10518068 -0.74009487 -1.49381809  0.23658004
   0.94368935 -0.97407051]
 [ 0.05709775  0.46349096  0.3676294  -1.10056692  0.07724166  1.15229172
   0.58136812  0.24288999]
 [ 0.51292077  0.602594   -1.02323547  0.92924403 -1.75931246 -0.9341624
   0.00332954 -0.26487062]
 [-0.36683261  0.58602813 -0.26363397 -0.04792898 -1.10491872  0.11954381
  -0.419089   -0.83631392]]
V
 [[ 0.4518304  -1.11781126 -0.6164827  -1.0088047  -1.35845577 -0.54871931
  -0.6643454  -0.84114448]
 [ 1.15201581 -0.45911543  0.5145734  -0.16791197  0.17025089  0.6975909
  -0.266

In [None]:
values

array([[ 0.79956317, -0.73564748, -0.26162336,  0.44419674,  0.65131838,
         1.19261664, -0.21407408,  0.11010225],
       [ 0.41067204, -0.60665665, -0.01281275,  0.32600419,  0.06311218,
         0.63413812, -0.03969421,  0.10818871],
       [ 0.44464447, -0.45774848, -0.36839081,  0.45117879, -0.0726794 ,
         0.66948065,  0.06045603,  0.30557085],
       [ 0.65615294, -0.92339818, -0.5288817 ,  0.2113847 , -0.40626596,
         1.28408196,  0.02249295,  0.23302734]])

In [None]:
d_k = q.shape[-1]

In [None]:
d_k

8