In [45]:
import numpy as np
import random
from scipy.special import softmax

<div>
<img src="../images/transformer_full.png" width="800"/>
</div>

#### Encoder representations of three different words

In [46]:
# Toy 8-dimensional encoder representations, one row per word.
# Unpacking the stacked matrix yields one 1-D vector for each word.
word_1, word_2, word_3 = np.array(
    [
        [1, 0, 0, 1, 0, 1, 0, 0],
        [0, 1, 0, 0, 1, 1, 0, 1],
        [1, 1, 0, 0, 1, 1, 1, 0],
    ]
)

#### Generating the weight matrices <br> These are the matrices to train <br> They are trained in order to build an inquiry system: what is 'key' for the word 'query'?

In [47]:
# Seed both RNGs. The matrices below are drawn from NumPy's global generator,
# which the standard-library `random.seed` call does not affect; without
# `np.random.seed` every run produces different weights and outputs.
random.seed(42)
np.random.seed(42)

# Random stand-ins for the projection matrices: shape (8, 3), maps an
# 8-dimensional word vector to a 3-dimensional query/key/value.
# Entries are integers drawn uniformly from [0, 3), i.e. {0, 1, 2}.
W_Q = np.random.randint(low=0, high=3, size=(8, 3))
W_K = np.random.randint(low=0, high=3, size=(8, 3))
W_V = np.random.randint(low=0, high=3, size=(8, 3))

#### Generating the queries, keys and values

In [48]:
# Project every word through the three weight matrices, in the order
# (W_Q, W_K, W_V), to obtain its query, key and value vectors.
query_1, key_1, value_1 = (word_1 @ weight for weight in (W_Q, W_K, W_V))

query_2, key_2, value_2 = (word_2 @ weight for weight in (W_Q, W_K, W_V))

query_3, key_3, value_3 = (word_3 @ weight for weight in (W_Q, W_K, W_V))

#### Dot product is a similarity score between queries and keys <br> In reality we use a fully connected layer

In [49]:
# Row i holds the dot product of query i with each of the three keys,
# in the order (key_1, key_2, key_3).
scores_1 = np.array([query_1 @ key for key in (key_1, key_2, key_3)])
scores_2 = np.array([query_2 @ key for key in (key_1, key_2, key_3)])
scores_3 = np.array([query_3 @ key for key in (key_1, key_2, key_3)])

#### Computing the weights by a softmax operation (can be thought of as a probability vector)

In [50]:
# Scale each score vector by the square root of the key length, then
# normalise it with softmax so the entries sum to one.
weights_1 = softmax(np.divide(scores_1, len(key_1) ** 0.5))
weights_2 = softmax(np.divide(scores_2, len(key_2) ** 0.5))
weights_3 = softmax(np.divide(scores_3, len(key_3) ** 0.5))

# One weight vector per line.
print(weights_1, weights_2, weights_3, sep="\n")

[0.49754595 0.00490809 0.49754595]
[4.9975518e-01 4.8963998e-04 4.9975518e-01]
[9.03393582e-02 8.85108617e-05 9.09572131e-01]


#### Computing the attention by a weighted sum of the value vectors <br> Can be thought of as doing 'proportional retrieval' according to the probability vector

In [51]:
# Each attention vector is the weighted sum of the three value vectors,
# accumulated left to right in the order (value_1, value_2, value_3).
attention_1 = sum(w * v for w, v in zip(weights_1, (value_1, value_2, value_3)))
attention_2 = sum(w * v for w, v in zip(weights_2, (value_1, value_2, value_3)))
attention_3 = sum(w * v for w, v in zip(weights_3, (value_1, value_2, value_3)))

#### Correlation between words

In [52]:
# Show the three attention output vectors, one per line.
print(attention_1, attention_2, attention_3, sep="\n")

[6.49754595 0.50245405 6.50245405]
[6.49975518 0.50024482 6.50024482]
[6.90957213 0.90966064 7.72889341]


#### To summarize:

#### Input: $\mathbf{X}\in\mathbf{R}^{seq \times d_{model}}$ (one row per word)

#### Trainable weight matrices: $\mathbf{W}_i^{Q},\mathbf{W}_i^{K},\mathbf{W}_i^{V}\in\mathbf{R}^{d_{model}\times d_k}$

#### We create three different representations: $\mathbf{Q} = \mathbf{X}\mathbf{W}^{Q}, \mathbf{K} = \mathbf{X}\mathbf{W}^{K}, \mathbf{V} = \mathbf{X}\mathbf{W}^{V}$

#### All these operations can be summarized into this formula: <br> $\text{Attention}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) = \text{softmax}(\frac{\mathbf{Q}\mathbf{K}^\top}{\sqrt{d_k}})\mathbf{V}$

#### The attention operation can be applied multiple times in parallel in this way: <br> $\begin{aligned} \text{MultiHead}(\mathbf{Q}, \mathbf{K}, \mathbf{V}) &= [\text{head}_1; \dots; \text{head}_h]\mathbf{W}^O \\ \text{where head}_i &= \text{Attention}(\mathbf{Q}\mathbf{W}^Q_i, \mathbf{K}\mathbf{W}^K_i, \mathbf{V}\mathbf{W}^V_i) \end{aligned} $

<div>
<img src="../images/multi-head-attention-peltarion.png" width="500"/>
</div>