In [1]:
import numpy as np
import pandas as pd

### Data

In [2]:
sentence = "love apple phones"

### Tokenization

In [3]:
def tokenizer(text: str):
    tokens = text.split()
    tokens_id = range(len(tokens))
    return tokens, tokens_id
    
vocab, tokens_id = tokenizer(text=sentence)
tokens_map = dict(zip(tokens_id, vocab))

tokens_map

{0: 'love', 1: 'apple', 2: 'phones'}

### Parameters

In [15]:
d_model = 8 # length of the embedding
d_k = 4  # keys and quaries dimension
d_v = 4  # values dimension

### Embeddings matrix

In [5]:
id_embedding_map = {token: np.random.rand(d_model) for token in tokens_id}

embeddings_matrix = np.stack(list(id_embedding_map.values()))

display(embeddings_matrix)

print(embeddings_matrix.shape) # len vocal * d_model

array([[0.78225753, 0.9884757 , 0.41644721, 0.71749692, 0.50666314,
        0.84125382, 0.88550623, 0.17975176],
       [0.51745202, 0.38937903, 0.17271229, 0.23723197, 0.03387767,
        0.18519248, 0.75803969, 0.26412614],
       [0.32629386, 0.63747084, 0.31457801, 0.26486692, 0.92467969,
        0.49783837, 0.42522985, 0.78500256]])

(3, 8)


### Positional encoding

In [7]:
def sinusoidal_positional_encoding(seq_len, d_model):
    """
    Returns a numpy array of shape (seq_len, d_model)
    containing sinusoidal positional encodings.
    """
    # positions: [0, 1, 2, ..., seq_len-1]
    positions = np.arange(seq_len)[:, np.newaxis]  # (seq_len, 1)
    
    # dimensions: [0, 1, 2, ..., d_model-1]
    dims = np.arange(d_model)[np.newaxis, :]       # (1, d_model)
    
    # Calculate the 'div_term' = 10000^(2i / d_model) for even and odd indices
    # We only need to handle the factor for the even dimension indices
    # but we can do so by dividing dims//2 when forming the exponent:
    div_term = np.power(10000.0, (dims // 2) * 2.0 / d_model)
    
    # Create an empty matrix for storing the encoding
    pe = np.zeros((seq_len, d_model))
    
    # Even dims: use sine
    pe[:, 0::2] = np.sin(positions / div_term[:, 0::2])
    
    # Odd dims: use cosine
    pe[:, 1::2] = np.cos(positions / div_term[:, 1::2])
    
    return pe

In [10]:
pos_matrix = sinusoidal_positional_encoding(seq_len=len(vocab), d_model=d_model)


### Adding Embeddings to Positional enc

In [14]:
embeddings = np.add(embeddings_matrix, pos_matrix)

print(embeddings)
print(embeddings.shape)

[[0.78225753 1.9884757  0.41644721 1.71749692 0.50666314 1.84125382
  0.88550623 1.17975176]
 [1.358923   0.92968134 0.2725457  1.23223613 0.04387751 1.18514248
  0.75903969 1.26412564]
 [1.23559129 0.22132401 0.51324734 1.2449335  0.94467835 1.49763838
  0.42722984 1.78500056]]
(3, 8)


### Initialize W_k, W_q and W_v matrices

In [22]:
# d_model * d_k
# key[query] = value

W_k = np.random.rand(d_model, d_k)
W_q = np.random.rand(d_model, d_k)
W_v = np.random.rand(d_model, d_k)

print(W_k)
print(W_k.shape)

[[0.00581358 0.74509774 0.22289167 0.67983892]
 [0.67510815 0.89754159 0.95443399 0.79108334]
 [0.91539601 0.52155654 0.66244052 0.23199175]
 [0.92179022 0.74348323 0.09589523 0.83251177]
 [0.38519726 0.56646805 0.44867178 0.13805532]
 [0.1812258  0.0049433  0.43057946 0.79737343]
 [0.57006183 0.74437079 0.2164188  0.63903859]
 [0.18315223 0.35270595 0.62486562 0.91580085]]
(8, 4)


### Create K, Q and V linear transformations

In [26]:
# embeddings -> (3, 8), W_k -> (8, 4) = K -> (3, 4)

K = embeddings @ W_k
Q = embeddings @ W_q
V = embeddings @ W_v

print(K)
print(K.shape)

[[4.5610853  5.23308967 4.46175677 6.81571258]
 [2.91679301 3.94683834 2.97308667 5.34218524]
 [2.97976545 3.80268957 3.22257006 5.40289603]]
(3, 4)


### Similarity score via dot product

In [29]:
# similarty = Q * K.T
Q.shape, K.shape, K.T.shape

similarity_matrix = (Q @ K.T) / np.sqrt(d_k)
print(similarity_matrix)
print(similarity_matrix.shape)

[[51.19151604 37.18700331 37.36968   ]
 [35.92473711 26.066561   26.21786946]
 [39.08212982 28.19236516 28.46137931]]
(3, 3)


In [31]:
df_similarity = pd.DataFrame(sim_matrix, index=[f"K_{token}" for token in vocab], columns=[f"Q_{token}" for token in vocab])
df_similarity

Unnamed: 0,Q_love,Q_apple,Q_phones
K_love,51.191516,37.187003,37.36968
K_apple,35.924737,26.066561,26.217869
K_phones,39.08213,28.192365,28.461379


### Convert is probabilities via softmax

In [34]:
softmax = lambda x: np.exp(x) / sum(np.exp(x))

similarity_prob_matrix = softmax(similarity_matrix)

df_similarity_prob = pd.DataFrame(similarity_prob_matrix, index=[f"K_{token}" for token in vocab], columns=[f"Q_{token}" for token in vocab])
df_similarity_prob

Unnamed: 0,Q_love,Q_apple,Q_phones
K_love,0.9999943,0.999861,0.99985
K_apple,2.342716e-07,1.5e-05,1.4e-05
K_phones,5.507543e-06,0.000124,0.000135


### Attention formula
The attention mechanism can be expressed as:
$$
\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{Q \cdot K^\top}{\sqrt{d_k}}\right) \cdot V
$$


In [38]:
# Attention -> softmax( (Q * K.T) / sqrt(d_k) ) * V
attention = similarity_prob_matrix @ V

print(attention)
print(attention.shape) # number of vords * d_k

[[1.48622846e+01 1.28706430e+01 1.24085241e+01 1.40977763e+01]
 [1.30750430e-04 1.12927411e-04 1.16775130e-04 1.31763963e-04]
 [1.18125578e-03 1.02119570e-03 1.05899477e-03 1.19402658e-03]]
(3, 4)


Each row is a new vector rapresentation of each word

In [39]:
for t, e in zip(vocab, attention):
    print(f"new {t}: {e}")

new love: [14.86228465 12.87064303 12.40852415 14.09777628]
new apple: [0.00013075 0.00011293 0.00011678 0.00013176]
new phones: [0.00118126 0.0010212  0.00105899 0.00119403]


### Create W_o marrix for multi head computation

In [42]:
W_o = np.random.rand(d_model, d_model)

print(W_o)
print(W_o.shape)

[[5.81876408e-02 1.47190614e-01 2.54777067e-01 8.31229393e-01
  2.72155422e-01 7.73214221e-01 1.74398622e-01 4.00250267e-01]
 [1.04187043e-01 2.15469075e-01 6.02959357e-01 2.26095797e-01
  8.23316733e-01 6.15700419e-01 2.69758278e-01 8.64536500e-01]
 [6.17497537e-01 4.04103518e-01 8.82961718e-01 3.03137457e-01
  2.63691187e-01 8.80932353e-01 4.64046835e-01 6.28353976e-01]
 [6.16391016e-01 6.52444226e-01 4.23397142e-01 1.25330180e-01
  3.09311960e-01 4.21122019e-01 7.86015737e-01 3.62926700e-04]
 [5.76615409e-01 8.44129911e-01 2.88604904e-01 8.80840913e-01
  7.83247161e-01 5.97249404e-01 5.06151501e-01 3.09687300e-01]
 [3.26017136e-01 6.68449682e-01 8.38887965e-01 3.94920585e-01
  3.90236305e-01 9.83827641e-01 8.28383320e-01 3.51111890e-01]
 [2.90245148e-01 5.15568155e-01 9.70275721e-01 8.29719220e-01
  1.76189210e-01 6.31311639e-01 5.65782863e-01 4.60448641e-01]
 [9.85958149e-01 4.65051727e-01 5.78449038e-01 9.04220429e-01
  6.86995671e-01 7.89685412e-01 4.61571607e-01 5.38128121e-01]]

### Ensamble all togheter

In [58]:
def multi_head_attention(
    embeddings: np.array,
    d_k: int,
    heads_number: int):

    embedding_size = embeddings.shape[1]
    weights_size = (embedding_size, d_k)
    
    attention_res = []
    for i in range(heads_number):
        #np.random.seed(np.random.randint(1,1e4)+i)
    
        W_k = np.random.rand(weights_size[0], weights_size[1])
        W_q = np.random.rand(weights_size[0], weights_size[1])
        W_v = np.random.rand(weights_size[0], weights_size[1])

        K = embeddings @ W_k
        Q = embeddings @ W_q
        V = embeddings @ W_v

        sim_matrix = (Q @ K.T) / np.sqrt(d_k)
    
        softmax = lambda x: np.exp(x) / sum(np.exp(x))
    
        attention = softmax(sim_matrix) @ V
        attention_res.append(attention)
        
    Z = np.concatenate(attention_res, axis=1)

    if heads_number == 1:
        return Z
    else:
        W_o = np.random.rand(embeddings.shape[1], embeddings.shape[1])
        return Z @ W_o

    

In [60]:
res = multi_head_attention(
    embeddings=embeddings,
    d_k=4, 
    heads_number=2
)

res, res.shape

(array([[6.32699339e+01, 6.88208273e+01, 4.55828563e+01, 7.12309575e+01,
         5.96603174e+01, 6.65213586e+01, 6.83997707e+01, 6.16278543e+01],
        [3.62286484e-02, 4.82190764e-02, 3.40402108e-02, 3.09759029e-02,
         2.71392674e-02, 3.43392635e-02, 4.84358002e-02, 4.18827041e-02],
        [7.81874780e-01, 1.03953504e+00, 7.31190183e-01, 6.75879329e-01,
         5.90705018e-01, 7.45611965e-01, 1.04374052e+00, 9.04007321e-01]]),
 (3, 8))