In [5]:
import torch

from nanollm.token import BPETokenizer

## Setting up

In [8]:
sentence = "In absolute time what goes round comes around."

bpe = BPETokenizer()
tokens = bpe.tokenize(sentence)

print(tokens)
print(f"# tokens = {len(tokens)}")

[818, 4112, 640, 644, 2925, 2835, 2058, 1088, 13]
# tokens = 9


In [12]:
# this simplified example assumes that the vocabulary is the set of tokens
# (vocabulary has only 9 possible tokens)
# each token is embedded into 3 dimensions
# embeddings are initially randomized

embedding = torch.nn.Embedding(len(tokens), 3)
print(f"created embedding layer: {embedding}")
print(embedding.weight.data)

created embedding layer: Embedding(9, 3)
tensor([[-1.1068,  1.0614, -0.1729],
        [ 1.0466, -0.7614,  0.7629],
        [-0.4582,  0.4299,  0.6685],
        [-0.0931,  0.9515,  0.3471],
        [ 0.8221, -0.9725, -0.6850],
        [ 0.8352, -0.0795, -1.3230],
        [-0.6154, -1.5507,  0.0692],
        [ 0.0125, -0.6393,  0.5143],
        [ 0.3528,  0.1270,  0.3728]])


In [17]:
# map tokens to initial embeddings

inputs = embedding.weight.data

## Context vector example

Calculating context vector for single token

In [25]:
x_2 = inputs[1]
print(f"token embedding x_2 at index 1: {x_2}")

# this is the dimensionality of the input embeddings determined by the embedding layer
d_in = inputs.shape[1]
print(f"dimensionality of input embeddings: {d_in}")

# this is the dimensionality of context vectors - a constant
# typically this should be the same as the dimensionality of the input embeddings d_in
# here we want it to be different to illustrate the concept
d_out = 2
print(f"dimensionality of context vectors: {d_in}")

token embedding x_2 at index 1: tensor([ 1.0466, -0.7614,  0.7629])
dimensionality of input embeddings: 3
dimensionality of context vectors: 3


In [26]:
# learnable matrices used by self-attention are randomly initialized

torch.manual_seed(0)
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

In [37]:
# calculate context vector c_2 for token x_2

query_2 = x_2 @ W_query
key_2 = x_2 @ W_key
value_2 = x_2 @ W_value

print(f"query_2: {query_2}")
print(f"key_2: {key_2}")
print(f"value_2: {value_2}")

# we will need key and value vectors for all tokens
keys = inputs @ W_key # (9,3) @ (3,2) = (9,2)
values = inputs @ W_value # (9,3) @ (3,2) = (9,2)
print(f"calculated query_2: {query_2.shape}")
print(f"calculated keys: {keys.shape}")
print(f"calculated values: {values.shape}")

# calculate attention scores for token x_2
attn_scores_2 = query_2 @ keys.T # (2,) @ (9,2).T = (9,)
print(f"calculated attn scores for x_2: {attn_scores_2.shape}")
print(f"attn scores for x_2: {attn_scores_2}")

# use softmax to convert attn scores into weights
d_k = keys.shape[-1] # output dimensionality of key vectors
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)
print(f"calculated attn weights for x_2: {attn_weights_2.shape}")
print(f"attn weights for x_2: {attn_weights_2}")
assert torch.allclose(attn_weights_2.sum(), torch.tensor(1.0)) # softmax sums up to 1

# calculate context vector c_2
context_2 = attn_weights_2 @ values # attn weights used to weigh values of all tokens surrounding x_2
print(f"calculated context vector for x_2: {context_2.shape}") # (9,) @ (9,2) = (2,)
print(f"context vector for x_2: {context_2}")

query_2: tensor([0.6865, 1.1872])
key_2: tensor([0.4322, 0.7633])
value_2: tensor([0.3319, 0.3923])
calculated query_2: torch.Size([2])
calculated keys: torch.Size([9, 2])
calculated values: torch.Size([9, 2])
calculated attn scores for x_2: torch.Size([9])
attn scores for x_2: tensor([-0.5454,  1.2029,  0.2943,  1.1301, -0.3735,  0.1375, -2.4616, -0.2940,
         0.8964])
calculated attn weights for x_2: torch.Size([9])
attn weights for x_2: tensor([0.0606, 0.2087, 0.1098, 0.1982, 0.0684, 0.0982, 0.0156, 0.0724, 0.1680])
calculated context vector for x_2: torch.Size([2])
context vector for x_2: tensor([0.1613, 0.2270])


In [43]:
# self-attention via dedicated layer

from nanollm.attn import SelfAttention

torch.manual_seed(0)
attn = SelfAttention(3, 2)

out = attn(inputs)
out

tensor([[-0.1516, -0.2833],
        [ 0.1613,  0.2270],
        [ 0.0130,  0.0065],
        [ 0.0400,  0.0489],
        [ 0.0165,  0.0106],
        [-0.0241, -0.0569],
        [-0.1195, -0.2228],
        [ 0.0342,  0.0401],
        [ 0.0776,  0.1064]], grad_fn=<MmBackward0>)