# Position-independent embeddings

In [1]:
import torch

In [3]:
# toy vocabulary: two special tokens followed by the letters a-h
vocab = ["<pad>", "<unk>", *"abcdefgh"]
vocab_size = len(vocab)
embedding_dim = 4  # every token is represented by a 4-dimensional vector

In [4]:
# seed the RNG first so the randomly initialized lookup table is reproducible
torch.manual_seed(0)
embedding = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=embedding_dim
)

In [6]:
# inspect the weights the layer was created with:
# one 4-dimensional row per vocabulary entry -> 10 tokens -> a 10x4 matrix

print(embedding.weight.shape, embedding.weight, sep="\n")

torch.Size([10, 4])
Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473],
        [-1.3527, -1.6959,  0.5667,  0.7935],
        [ 0.5988, -1.5551, -0.3414,  1.8530],
        [-0.2159, -0.7425,  0.5627,  0.2596],
        [-0.1740, -0.6787,  0.9383,  0.4889],
        [ 1.2032,  0.0845, -1.2001, -0.0048],
        [-0.5181, -0.3067, -1.5810,  1.7066]], requires_grad=True)


In [10]:
# a plain lookup table is position-independent: both "a" tokens map to the
# same row, no matter where they appear in the sequence

sequence = ["a", "c", "a"]
sequence_indices = torch.tensor(list(map(vocab.index, sequence)))
print(sequence_indices)
print(embedding(sequence_indices))


tensor([2, 4, 2])
tensor([[ 0.3223, -1.2633,  0.3500,  0.3081],
        [-1.3527, -1.6959,  0.5667,  0.7935],
        [ 0.3223, -1.2633,  0.3500,  0.3081]], grad_fn=<EmbeddingBackward0>)


# Position-aware embeddings

In [93]:
# read the whole training text into memory as a single string
with open("../assets/corpus_01.txt") as file:  # "r" (text mode) is the default
    corpus = file.read()

# splitting on whitespace only approximates the number of words
print(
    f"corpus length (# of characters): {len(corpus)}",
    f"corpus length (estimated # of words): {len(corpus.split())}",
    sep="\n",
)

corpus length (# of characters): 20479
corpus length (estimated # of words): 3634


In [17]:
from nanollm.token import BPETokenizer

tokenizer = BPETokenizer()

# the gpt-2 vocabulary is predetermined: 50257 entries
# NOTE(review): this reaches into the private `_engine` attribute; prefer a
# public accessor if BPETokenizer offers one - confirm
n_vocab = tokenizer._engine.n_vocab
print(f"gpt-2 tokenizer vocab length: {n_vocab}")

gpt-2 tokenizer vocab length: 50257


In [23]:
from nanollm.data import create_dataloader

# stride equals max_length, so consecutive windows presumably do not overlap
# (TODO confirm against create_dataloader); shuffle=False keeps corpus order
loader = create_dataloader(
    corpus,
    batch_size=4,
    max_length=4,
    stride=4,
    shuffle=False,
)

In [68]:
# from here on the vocabulary is the tokenizer's, not the 10-token toy list
vocab_size = tokenizer._engine.n_vocab
embedding_dim = 256

# token embedding table: one 256-dimensional row per token id
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=embedding_dim
)

In [95]:
# flatten the first element of every batch into one long list of plain-int token ids
# (batch[0] presumably holds the input windows - confirm against create_dataloader)
tokenized_corpus = [
    token_id
    for batch in loader
    for sample in batch[0]
    for token_id in sample.tolist()
]

# the tokenized corpus has fewer tokens than the text has characters,
# but more tokens than the text has whitespace-separated words;
# this is typical for bpe tokenization
print(f"original corpus length (# chars): {len(corpus)}")
print(f"original corpus length (est # words): {len(corpus.split())}")
print(f"tokenized corpus length: {len(tokenized_corpus)}")

original corpus length (# chars): 20479
original corpus length (est # words): 3634
tokenized corpus length: 5136


In [96]:
# collect every position at which token 40 (the word "I") occurs

indices = [
    position
    for position, token_id in enumerate(tokenized_corpus)
    if token_id == 40
]
print(f"# occurences of word I = {len(indices)}")
print(f"first 5 occurences = {indices[:5]}")

# occurences of word I = 29
first 5 occurences = [0, 379, 736, 882, 895]


In [97]:
# embed a single token without any context information:
# the id for the word "I" (40) maps to one 256-dimensional row, i.e. a (1, 256) tensor

print(embedding_layer(torch.tensor([40])).size())

torch.Size([1, 256])


In [81]:
# embed the whole tokenized corpus, 256 dimensions per token

corpus_as_tensor = torch.cat(
    [window for batch in loader for window in batch[0]]
)
# plain table lookup: these embeddings are position-agnostic
embedding_0 = embedding_layer(corpus_as_tensor)
print(f"size of embedded corpus: {embedding_0.shape}")

size of embedded corpus: torch.Size([5136, 256])


In [101]:
# compare the embeddings of two different occurrences of the word "I"

# take the positions from `indices` instead of assuming the corpus starts with "I"
first_i, second_i = indices[0], indices[1]
print(f"token #{first_i} = {corpus_as_tensor[first_i]}, token #{second_i} = {corpus_as_tensor[second_i]}")

# only the first 5 of the 256 components are printed below, so check the full rows:
# the lookup depends on the token id alone - no positional information is encoded
assert torch.equal(embedding_0[first_i], embedding_0[second_i])
print(f"embedding for token #{first_i} = {embedding_0[first_i][:5]}")
print(f"embedding for token #{second_i} = {embedding_0[second_i][:5]}")

token #0 = 40, token #379 = 40
embedding for token #0 = tensor([-0.1018, -0.6005, -0.6997,  0.4333, -0.4055], grad_fn=<SliceBackward0>)
embedding for token #379 = tensor([-0.1018, -0.6005, -0.6997,  0.4333, -0.4055], grad_fn=<SliceBackward0>)


In [73]:
# absolute positional embeddings: choose a context window length (4 here, the
# same value as the max_length passed to create_dataloader) and assign one
# 256-dimensional vector to every position inside that window

context_length = 4
positional_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length, embedding_dim=embedding_dim
)

# looking up positions 0..3 yields the (4, 256) positional embedding matrix
positional_embeddings = positional_embedding_layer(torch.arange(0, context_length))

In [74]:
# display the positional embedding matrix (one row per position in the context window)
positional_embeddings

tensor([[-0.4794, -0.7480, -0.0834,  ...,  0.8273,  1.7678, -0.8788],
        [-0.4280, -1.7475, -0.7723,  ..., -3.2978, -0.1759,  1.7628],
        [ 0.7874,  1.3287,  1.1307,  ..., -0.4192,  0.7647, -0.3317],
        [ 0.7321, -1.6347, -0.0794,  ...,  1.1676, -0.0695, -1.4688]],
       grad_fn=<EmbeddingBackward0>)

In [107]:
# first element of the first dataset item (presumably its input token ids - confirm)
sample = loader.dataset[0][0]

print(f"a sample tokenized sequence: {sample}")

# a position-aware embedding is the element-wise sum of each token's embedding
# and the embedding of that token's position inside the context window

base_embedding = embedding_layer(sample)
positional_embedding = positional_embedding_layer(torch.arange(context_length))
# guard against silent broadcasting if the sample length ever differs from context_length
assert base_embedding.shape == positional_embedding.shape, (
    f"shape mismatch: {base_embedding.shape} vs {positional_embedding.shape}"
)
sample_embedding = base_embedding + positional_embedding
print(f"base embedding: {base_embedding}")
# this value is the sum (token + position), not the bare positional embedding
print(f"position-aware embedding: {sample_embedding}")

a sample tokenized sequence: tensor([  40,  367, 2885, 1464])
base embedding: tensor([[-0.1018, -0.6005, -0.6997,  ..., -0.1395, -2.3091,  1.4107],
        [-0.2515, -0.8558,  1.0292,  ...,  1.3501, -0.3530, -0.4898],
        [-0.5669,  0.4898, -0.6491,  ..., -1.3058,  1.2083, -1.6056],
        [-1.2992,  0.7592,  0.5555,  ...,  1.4055,  0.9644, -0.5949]],
       grad_fn=<EmbeddingBackward0>)
positional embedding: tensor([[-0.5812, -1.3485, -0.7831,  ...,  0.6878, -0.5414,  0.5319],
        [-0.6795, -2.6033,  0.2569,  ..., -1.9477, -0.5289,  1.2730],
        [ 0.2206,  1.8185,  0.4815,  ..., -1.7250,  1.9730, -1.9373],
        [-0.5671, -0.8755,  0.4761,  ...,  2.5731,  0.8949, -2.0637]],
       grad_fn=<AddBackward0>)
