# Operaciones que realiza un mecanismo de atención

Primero se importan las librerías en el cabecero

In [None]:
import random as rd
import numpy as np
import scipy.special as sci
import pandas as pd

El corpus en este caso es inicializado de forma aleatoria y representa a los embeddings ya con codificación posicional.
En la práctica se pueden tomar estos embeddings de una librería como word2vec.

In [1]:
corpus = np.around(np.random.rand(3,3),3)
corpus.shape
print(f"Corpus shape: {corpus.shape}\n\n")
print(f"Corpus:\n{corpus}")

NameError: name 'np' is not defined

A partir de los embeddings iniciales se obtienen 3 matrices, Q, K y V, donde cada una es una copia de dichos embeddings.

In [None]:
Q = corpus.copy()
K = corpus.copy()
V = corpus.copy()

print(f"Q shape: {Q.shape}\n\n")
print(f"Q:\n{Q}")

Q shape: (3, 3)


Q:
[[0.952 0.541 0.262]
 [0.563 0.289 0.299]
 [0.039 0.092 0.575]]


En este caso la matriz de pesos inicia de forma aleatoria, en la práctica estas matrices pueden ya estar dadas gracias a entrenamientos previos para ahorrar tiempo y entrenamiento o bien inicializarse de forma aleatoria.

In [None]:
W_Q = np.around(np.random.rand(3,3),3)
W_K = np.around(np.random.rand(3,3),3)
W_V = np.around(np.random.rand(3,3),3)

print(f"Key weights: {W_K}\n\n")
print(f"Query weights: {W_Q}\n\n")
print(f"Values weights: {W_V}\n\n")

Key weights: [[0.794 0.186 0.549]
 [0.957 0.987 0.647]
 [0.147 0.598 0.322]]


Query weights: [[0.594 0.956 0.248]
 [0.053 0.511 0.227]
 [0.581 0.742 0.666]]


Values weights: [[0.993 0.009 0.158]
 [0.633 0.589 0.641]
 [0.519 0.158 0.874]]




Se multiplica cada matriz (Q,K,V) con su respectiva matriz de pesos.

In [None]:
Q = np.dot(Q,W_Q)
K = np.dot(K,W_K)
V = np.dot(V,W_V)

print(f"Key (Post adding weights): \n{K}\n\n")
print(f"Query (Post adding weights): \n{Q}\n\n")
print(f"Values (Post adding weights): \n{V}\n\n")

Key (Post adding weights): 
[[1.312139 0.867715 0.957039]
 [0.767548 0.568763 0.592348]
 [0.203535 0.441908 0.266085]]


Query (Post adding weights): 
[[0.746383 1.380967 0.533395]
 [0.523458 0.907765 0.404361]
 [0.362117 0.510946 0.413506]]


Values (Post adding weights): 
[[1.423767 0.368613 0.726185]
 [0.897177 0.22253  0.535529]
 [0.395388 0.145389 0.567684]]




Se tiene que realizar la siguiente operación: 
$$\text{Attention(Q,K,V)}=\text{softmax}(\frac{QK^T}{\sqrt{d_k}})$$

In [None]:
res = np.dot(Q,K.T)
res

array([[2.68812384, 1.67428317, 0.90410384],
       [1.86152021, 1.15760472, 0.61528504],
       [1.31424472, 0.81348881, 0.40952235]])

In [None]:
res = res/np.sqrt(K.shape[0]+K.shape[1])
res

array([[1.09742196, 0.68352324, 0.36909885],
       [0.75996244, 0.47259015, 0.25118906],
       [0.53653816, 0.33210542, 0.1671868 ]])

In [None]:
res = sci.softmax(res,axis=0)
print(f"Sum = {np.sum(res,axis=0)}")
# e_x = np.exp(res[:1] - np.max(res))
# res = e_x / e_x.sum()
# print(f"Sum = {np.sum(res)}")

Sum = [1. 1. 1.]


In [None]:
res = res*V
res

array([[0.27108538, 0.04639636, 0.06674331],
       [0.12189573, 0.0226827 , 0.04374575],
       [0.04296372, 0.01287733, 0.04263613]])

In [None]:
res = np.around(res,4)
res

array([[0.2711, 0.0464, 0.0667],
       [0.1219, 0.0227, 0.0437],
       [0.043 , 0.0129, 0.0426]])

In [None]:
res_df = pd.DataFrame(res)
res_df.columns=["Amo","el","queso"]
res_df.index=["Amo","el","queso"]
res_df

Unnamed: 0,Amo,el,queso
Amo,0.2711,0.0464,0.0667
el,0.1219,0.0227,0.0437
queso,0.043,0.0129,0.0426
