1. Loading the data

In [9]:
import os
import urllib.request

# Plain-text file hosted in the LLMs-from-scratch repository; it is the text
# that the rest of the notebook tokenizes.
url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt")
filepath = "the-verdict.txt"

# Download only when the file is not already on disk, so re-running the
# notebook (Restart & Run All) does not hit the network every time.
if not os.path.exists(filepath):
    urllib.request.urlretrieve(url, filepath)

('the-verdict.txt', <http.client.HTTPMessage at 0x7f99a1acaaf0>)

In [10]:
# Read the whole text file into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity check: total size and a preview of the first 500 characters.
print(f"Total number of characters: {len(raw_text)}")
print(raw_text[:500])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it'


2. Preprocessing - tokenization

In [11]:
import re

# Split on punctuation, double dashes and whitespace. The capturing group
# makes re.split keep the delimiters, so punctuation becomes its own token;
# whitespace-only and empty pieces are dropped.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))
print(preprocessed[:30])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


3. Creating vocab

In [12]:
# Unique tokens in sorted order, followed by the two special tokens:
# an end-of-text marker and a placeholder for unknown words.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]
print(f"Vocabulary size: {len(all_tokens)}")

Vocabulary size: 1132


In [13]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

# Show the last five entries; the two special tokens sit at the very end.
for item in list(vocab.items())[-5:]:
    print(item)



('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


4. Simple tokenizer

In [14]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary items to ``<|unk|>``.

    Args:
        vocab: dict mapping token string -> integer id. It must contain the
            ``<|unk|>`` token, because every unknown item is replaced by it
            before the id lookup.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping (id -> token string) used by decode().
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids."""
        # Use the same delimiter set as the pattern that built the vocabulary
        # (including ':' and ';'); otherwise "word:" stays glued together and
        # is wrongly mapped to <|unk|>.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # Replace anything that is not in the vocabulary by the unknown token.
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]

        ids = [self.str_to_int[word] for word in preprocessed]

        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join put in front of punctuation marks.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    

# Round-trip a short passage through the word-level tokenizer.
tokenizer = SimpleTokenizerV2(vocab=vocab)

text = """
        "It's the last he painted, you know, "
        Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)
# decode() joins tokens with single spaces, so the original whitespace and
# line break are not restored exactly.
print(tokenizer.decode(ids))

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [15]:
# Two unrelated sentences separated by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = text1 + " <|endoftext|> " + text2
print(text)

# Words that are missing from the vocabulary are encoded as <|unk|>.
ids = tokenizer.encode(text)
print(ids, tokenizer.decode(ids), sep="\n")

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


5. Byte pair encoding


In [16]:
from importlib.metadata import version
import tiktoken

# Print the installed tiktoken version.
print(f"tiktoken version: {version('tiktoken')}")

tiktoken version: 0.8.0


In [17]:
# From here on `tokenizer` refers to the tiktoken "gpt2" encoding and
# replaces the SimpleTokenizerV2 instance.
tokenizer = tiktoken.get_encoding("gpt2")   # TODO: implement own tokenizer
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)
# allowed_special lets "<|endoftext|>" be encoded as a single special token.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})    # GPT-2 vocabulary size: 50257
print(integers)
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 20562, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [18]:
# Made-up words are split into several smaller tokens rather than mapped to
# an unknown token, and decoding gives back the original string.
test_text = "Ekretss ddddxxx thae je"
integers = tokenizer.encode(test_text)
print(integers)
strings = tokenizer.decode(integers)
print(strings)

[36, 74, 1186, 824, 288, 1860, 67, 31811, 294, 3609, 11223]
Ekretss ddddxxx thae je


6. DataLoader

In [19]:
# Encode the whole text with the tiktoken tokenizer and report the token count.
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))
# Drop the first 50 tokens; the examples below use the remainder.
enc_sample = enc_text[50:]

5145


In [20]:
# Inputs are a window of `context_size` tokens; targets are the same window
# shifted one position to the right (the next token for every input position).
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x:    {x}")
print(f"y:         {y}")

x:    [290, 4920, 2241, 287]
y:         [4920, 2241, 287, 257]


In [21]:
# Each growing prefix of the window is a context; the token right after the
# prefix is the value to predict.
for i, desired in enumerate(enc_sample[1:context_size + 1], start=1):
    context = enc_sample[:i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [22]:
# Same context/target pairs as before, decoded back to text for readability.
for i, desired in enumerate(enc_sample[1:context_size + 1], start=1):
    context = enc_sample[:i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [23]:
import torch
# Print the PyTorch version and whether a CUDA device is available.
print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu124
True


In [31]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Each sample is a window of ``max_lenght``
    token ids; its target is the same window shifted one token to the right.

    Args:
        txt: text to tokenize.
        tokenizer: object with an ``encode(txt)`` method returning token ids.
        max_lenght: number of tokens per sample (parameter name kept as
            spelled so existing callers keep working).
        stride: step between the start positions of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_lenght, stride):
        token_ids = tokenizer.encode(txt)

        # Stop early enough that every target window still has a full
        # `max_lenght` tokens available.
        window_starts = range(0, len(token_ids) - max_lenght, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_lenght])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_lenght + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows extracted from the text."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [32]:
def create_dataloader_v1(txt, batch_size=4, max_lenght=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Create a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: text to tokenize.
        batch_size: number of samples per batch.
        max_lenght: tokens per sample, forwarded to ``GPTDatasetV1``.
        stride: step between consecutive windows, forwarded to ``GPTDatasetV1``.
        shuffle: forwarded to ``DataLoader``.
        drop_last: drop the last batch if it is shorter than ``batch_size``,
            to prevent loss spikes during training.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDatasetV1(txt, gpt2_tokenizer, max_lenght, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [48]:
# stride == max_lenght, so consecutive input windows do not overlap;
# shuffle=False keeps the batches in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_lenght=4, stride=4, shuffle=False
)
data_iter = iter(dataloader)
# Each batch is a pair: input tensor and target tensor.
first_batch = next(data_iter)
print(first_batch)
# second_batch = next(data_iter)
# print(second_batch)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]


7. Token embeddings

In [50]:
# Toy example: four token ids, a vocabulary of 6 entries, 3-dimensional embeddings.
input_ids = torch.tensor([2,3,5,1])
vocab_size = 6
output_dim = 3

In [53]:
torch.manual_seed(123)
# An embedding layer is preferred over one-hot encoding: it is a more
# efficient equivalent of the matrix multiplication in a fully connected layer.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)   # weights the embedding layer was initialised with

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [54]:
# Looking up id 3 returns row 3 of the weight matrix.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [55]:
# Looking up several ids returns the matching weight rows, in the order of the ids.
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


8. Positional token embeddings

In [57]:
# Position-independent token embeddings are good for reproducibility.
# However, the self-attention mechanism is also position-agnostic (it treats
# all tokens in a sequence equally regardless of their order), so it is
# helpful to inject positional information into the LLM.

vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [59]:
# Non-overlapping windows (stride == max_lenght), in text order.
max_lenght = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_lenght=max_lenght, stride=max_lenght, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token ID: \n", inputs)
print("\nInput shape: \n", inputs.shape)    # the first batch consists of 8 text examples with 4 tokens each

Token ID: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Input shape: 
 torch.Size([8, 4])


In [60]:
# Embed every token id of the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

# 8   - examples in the batch
# 4   - tokens in each example
# 256 - embedding values for each token

torch.Size([8, 4, 256])


In [61]:
# For absolute positional embeddings we add a second embedding layer that is
# indexed by position (0 .. context_lenght - 1) instead of by token id.

context_lenght = max_lenght
pos_embedding_layer = torch.nn.Embedding(context_lenght, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_lenght))
print(pos_embeddings.shape)

# 4   - positions (tokens) in an example
# 256 - values of each positional embedding


torch.Size([4, 256])


In [64]:
# The input embeddings are simply the sum of the two embeddings.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

# When adding positional embeddings to token embeddings, PyTorch automatically
# broadcasts the positional embeddings across the batch dimension.
# Broadcasting treats the 4×256 positional embeddings as 8×4×256 to match the
# shape of the token embeddings.
# Duplicating the positional embeddings for each batch example by hand would
# be redundant and inefficient in terms of memory.

torch.Size([8, 4, 256])


9. Simple self attention mechanism without trainable weights

In [65]:
# Six tokens, each represented by a 3-dimensional embedding vector.
# Note: this rebinds `inputs`, which previously held a batch of token ids.
inputs = torch.tensor(
    [
        [0.43, 0.15, 0.89], # Your
        [0.55, 0.87, 0.66], # journey
        [0.57, 0.85, 0.64], # starts
        [0.22, 0.58, 0.33], # with
        [0.77, 0.25, 0.10], # one
        [0.05, 0.80, 0.55], # step

    ]
)

In [66]:
# 1. Take the dot product of the query token with every token in the
#    sequence; each dot product is a single float (the attention score).
query = inputs[1] # attention scores are computed for the word 'journey'
attn_scores_2 = torch.empty(inputs.shape[0])
for i, x_i in enumerate(inputs):
    # The dot product multiplies two vectors element-wise and sums the result;
    # it measures how similar the query is to each token in the sequence.
    attn_scores_2[i] = torch.dot(x_i, query)
print(attn_scores_2)

tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])


In [73]:
# torch.dot(x_i, query) is essentially the same as summing the element-wise
# products by hand (shown here for the first token, inputs[0]):
res = 0

for idx, element in enumerate(inputs[0]):
    res += element * query[idx]

print(res)

tensor(0.9544)


In [78]:
# 2. Normalize the scores so that they sum to 1.
# Dividing by the sum works here, but softmax is the advisable way to normalize.
attn_weights_2_tmp = attn_scores_2 / attn_scores_2.sum()
print(f"Attention weights normalized: {attn_weights_2_tmp}")
print(f"Sum: {attn_weights_2_tmp.sum()}")

Attention weights normalized: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: 1.0000001192092896


In [80]:
def softmax_naive(x):
    """Softmax computed directly from its definition, normalised along dim 0.

    Exponentiation guarantees that no output value is negative. This naive
    form can overflow or underflow for large-magnitude inputs, which is why
    the built-in torch.softmax is used instead in practice.
    """
    exp_x = torch.exp(x)
    return exp_x / exp_x.sum(dim=0)
# Apply the naive softmax to the scores of the query token; the results are
# attention weights that sum to 1.
attn_weights_2_naive = softmax_naive(attn_scores_2)
print(f"Attention scores naive: {attn_weights_2_naive}")
print(f"Sum: {attn_weights_2_naive.sum()}")

Attention scores naive: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: 1.0


In [82]:
# Same normalization with the built-in softmax.
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print(f"Attention scores torch.softmax: {attn_weights_2}")
print(f"Sum: {attn_weights_2.sum()}")

Attention scores torch.softmax: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: 1.0


In [85]:
# 3. The context vector is the weighted sum of all token vectors, each one
#    multiplied by its corresponding attention weight.
query = inputs[1]
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)

tensor([0.4419, 0.6515, 0.5683])


In [109]:
# To do that for all tokens:
# 1. Compute attention scores
# 2. Compute attention weights
# 3. Compute context vectors

# Size the score matrix from the number of input tokens instead of
# hard-coding 6, so the cell keeps working if `inputs` changes.
n_tokens = inputs.shape[0]
attn_scores = torch.empty(n_tokens, n_tokens)
for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        # Entry (i, j) is the dot product of token i (query) with token j.
        attn_scores[i, j] = torch.dot(x_i, x_j)
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [110]:
# A more efficient way is to use matrix multiplication; it computes all
# pairwise dot products at once.
attn_scores = inputs @ inputs.T
print(attn_scores)

tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
        [0.4753, 0.8434, 0.8296, 0.4937, 0.3474, 0.6565],
        [0.4576, 0.7070, 0.7154, 0.3474, 0.6654, 0.2935],
        [0.6310, 1.0865, 1.0605, 0.6565, 0.2935, 0.9450]])


In [111]:
# Normalize every row of the score matrix.
attn_weights = torch.softmax(attn_scores, dim=-1) # dim=-1 applies softmax along the last dimension of attn_scores
print(attn_weights)

# dim=-1: commonly used for normalizing attention scores, so that each query's weights over the keys sum to 1.
# dim=-2: less common, but could be used in scenarios where you want to normalize attention scores across queries.


tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
        [0.1435, 0.2074, 0.2046, 0.1462, 0.1263, 0.1720],
        [0.1526, 0.1958, 0.1975, 0.1367, 0.1879, 0.1295],
        [0.1385, 0.2184, 0.2128, 0.1420, 0.0988, 0.1896]])


In [112]:
# Check that the weights in every row sum to 1 (index 1 is the second row).
row_2_sum = attn_weights[1].sum()
print(f"Row 2 sum: {row_2_sum}")
print(f"All rows sums across columns: {attn_weights.sum(dim=-1)}")

Row 2 sum: 1.0
All rows sums across columns: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [113]:
# Compute all context vectors at once: each row is the attention-weighted
# sum of the input vectors.
all_context_vecs = attn_weights @ inputs
all_context_vecs


tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],
        [0.4304, 0.6298, 0.5510],
        [0.4671, 0.5910, 0.5266],
        [0.4177, 0.6503, 0.5645]])

In [114]:
all_context_vecs[1] # same as context_vec_2, which was computed step by step for the second token

tensor([0.4419, 0.6515, 0.5683])