In [1]:
import numpy as np

In [2]:
# Toy input sequence: one already-tokenized question, six word-level tokens.
tokens = [
    "What",
    "are",
    "the",
    "symptoms",
    "of",
    "diabetes",
]

In [3]:
# Embedding matrix: one row per token, each row a 4-dimensional embedding vector.
# The values are random placeholders for illustration, not learned embeddings.
#
# Seed NumPy's global RNG before the first draw so that "Restart Kernel -> Run All"
# reproduces the same numbers every time. The weight matrices created later in the
# notebook also draw from np.random, so they become reproducible as well.
np.random.seed(42)
E = np.random.rand(len(tokens), 4)

In [4]:
# Sinusoidal positional encoding. For column pair i, the even column holds
# sin(pos / 10000^(2i / d_model)) and the following odd column holds the matching cos.
d_model = E.shape[1]  # Follow the embedding width so that E + pe always lines up
pos = np.arange(len(tokens))[:, np.newaxis]  # Column vector of positions 0 .. len(tokens) - 1
# One divisor per sin/cos pair; np.arange(0, d_model, 2) enumerates the "2i" exponents.
div_term = np.power(10000, np.arange(0, d_model, 2) / d_model)
angles = pos / div_term  # Broadcasts to (len(tokens), number of pairs); computed once, used twice
pe = np.zeros((len(tokens), d_model))
pe[:, 0::2] = np.sin(angles)
# With an odd d_model there is one fewer odd column than even columns, so trim to fit.
pe[:, 1::2] = np.cos(angles[:, : d_model // 2])

In [5]:
# Position-aware input: element-wise sum of the token embeddings and their positional encodings.
X = np.add(E, pe)

In [6]:
# Projection weights for self-attention: a separate random 4x4 matrix each for
# queries, keys and values. They are drawn in the order W_q, W_k, W_v.
W_q, W_k, W_v = (np.random.rand(4, 4) for _ in range(3))

In [7]:
# Project the inputs into query, key and value spaces.
# Each product has one row per row of X and 4 columns (the width of the weight matrices).
Q = np.matmul(X, W_q)
K = np.matmul(X, W_k)
V = np.matmul(X, W_v)

In [8]:
# Scaled dot-product attention weights: row-wise softmax(Q K^T / sqrt(d_k)).
d_k = K.shape[-1]  # Key dimension, read from K so the scaling tracks the actual width
scores = Q @ K.T / np.sqrt(d_k)  # Raw scores: one row per query, one column per key
# Numerically stable softmax: subtract each row's maximum before exponentiating.
# Softmax is invariant to a per-row shift, so the result is mathematically unchanged,
# but the largest exponent becomes exp(0) = 1 and np.exp can no longer overflow to inf.
exp_scores = np.exp(scores - scores.max(axis=-1, keepdims=True))
# Despite the name, these are the normalized attention weights: each row sums to 1.
attention_scores = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

In [9]:
# Self-attention output: every row is a weighted combination of the rows of V,
# using that token's attention weights as the coefficients.
output = np.matmul(attention_scores, V)

In [10]:
# Echo the input tokens so the per-token rows printed further down can be matched to words.
print("Tokens:", tokens)

Tokens: ['What', 'are', 'the', 'symptoms', 'of', 'diabetes']


In [11]:
# Print two per-token reports in the same layout: first the attention weights,
# then the attended output vectors. Each report is a heading followed by one
# "<token>: <row>" line per token.
for heading, matrix in (
    ("\nAttention Scores (How much each token attends to others):", attention_scores),
    ("\nFinal Output (Transformed embeddings after self-attention):", output),
):
    print(heading)
    for idx in range(len(tokens)):
        print(f"{tokens[idx]}: {matrix[idx]}")


Attention Scores (How much each token attends to others):
What: [0.02078255 0.52564312 0.42337144 0.0163475  0.00063632 0.01321908]
are: [1.78313185e-03 6.06323504e-01 3.90046040e-01 1.09050205e-03
 3.10924968e-06 7.53712988e-04]
the: [2.01660088e-03 6.27837882e-01 3.68013961e-01 1.26235104e-03
 4.27275064e-06 8.64932236e-04]
symptoms: [1.19509379e-02 5.78954529e-01 3.93834980e-01 8.47454171e-03
 1.81151993e-04 6.60385910e-03]
of: [0.06927859 0.45303167 0.36070465 0.05713279 0.00899744 0.05085486]
diabetes: [1.90622868e-02 5.48823360e-01 4.05499061e-01 1.44820353e-02
 5.14804683e-04 1.16184523e-02]

Final Output (Transformed embeddings after self-attention):
What: [2.06233027 1.20035602 2.68788587 2.23433294]
are: [2.12983593 1.22516987 2.76021217 2.23400822]
the: [2.13751841 1.22947997 2.76563822 2.22390223]
symptoms: [2.10147949 1.21590447 2.72803719 2.2281548 ]
of: [1.91991187 1.15470768 2.51682524 2.16290184]
diabetes: [2.07543653 1.20628411 2.70004631 2.22776361]
