In [None]:
!pip install uv

Collecting uv
  Downloading uv-0.6.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading uv-0.6.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.2/16.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: uv
Successfully installed uv-0.6.6


In [None]:
!uv pip install -r https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/requirements.txt

In [None]:
import urllib.request
# Raw-file URL of the sample text, written as adjacent string literals that
# Python concatenates into one string.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
# Local filename the download is saved under (relative to the working directory).
file_path = "the-verdict.txt"
# Download the file; urlretrieve's return value (a (filename, headers) tuple)
# is the last expression and therefore the cell's displayed output.
urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x7edbdb6deb10>)

In [None]:
# Load the complete sample text into memory as a single string.
# The encoding is stated explicitly so that decoding does not silently depend
# on the platform's default locale encoding.
with open('./the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [None]:
len(raw_text)

20479

In [None]:
import re

# Split on punctuation, double dashes and whitespace. The capturing group makes
# re.split keep the delimiters themselves as items in the result.
split_pattern = r'([,.:;?_!"()\']|--|\s)'
stripped_pieces = (piece.strip() for piece in re.split(split_pattern, raw_text))
# Discard the empty strings left over from whitespace and adjacent delimiters.
preprocessed = [piece for piece in stripped_pieces if piece]
print(len(preprocessed))

4690


In [None]:
# The unique tokens, in sorted order, are the vocabulary entries.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [None]:
# Map every token to its position in the sorted vocabulary list.
vocab = {token: index for index, token in enumerate(all_words)}

In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, double dashes and a fixed set of punctuation
    characters. Every resulting token must already exist in the vocabulary;
    there is no fallback for unknown tokens.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if any token of ``text`` is missing from the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Whitespace-only and empty pieces carry no token and are dropped.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):
        """Return the text obtained by joining the tokens for ``ids`` with spaces.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[token_id] for token_id in ids])

        # Remove the space the join placed in front of punctuation. ':' and ';'
        # are included because encode() splits them off as separate tokens too;
        # without them a round trip would produce "word ; word".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

        return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)

In [None]:
# Sample passage used to exercise the tokenizer. The literal spans two lines,
# so it contains a newline and the second line's leading spaces; encode()
# treats both as whitespace separators.
text = """"It's the last he painted, you know,"
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)

print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [None]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [None]:
# SimpleTokenizerV1 has no fallback for out-of-vocabulary tokens, so encoding
# a word that is not in the vocabulary raises KeyError. The error is caught and
# printed so the demonstration does not abort a full top-to-bottom run.
try:
    print(tokenizer.encode("Hello"))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

In [None]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# words, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>", "<|unk|>"]
vocab = {token: index for index, token in enumerate(all_tokens)}

print(len(vocab))

1132


In [None]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``<|unk|>``.

    Splitting works on whitespace, double dashes and a fixed set of punctuation
    characters. Tokens that are missing from the vocabulary are replaced by
    the ``<|unk|>`` entry, which the vocabulary is expected to contain.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Raises:
            KeyError: if an unknown token is encountered and the vocabulary
                has no ``<|unk|>`` entry.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Whitespace-only and empty pieces carry no token and are dropped.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # Substitute the unknown-token marker for anything outside the vocabulary.
        return [
            self.str_to_int[item if item in self.str_to_int else "<|unk|>"]
            for item in preprocessed
        ]

    def decode(self, ids):
        """Return the text obtained by joining the tokens for ``ids`` with spaces.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[token_id] for token_id in ids])

        # Remove the space the join placed in front of punctuation. ':' and ';'
        # are included because encode() splits them off as separate tokens too;
        # without them a round trip would produce "word ; word".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

        return text

In [None]:
tokenizer = SimpleTokenizerV2(vocab)

In [None]:
tokenizer.encode("Hello")

[1131]

In [None]:
# Two separate sample sentences ...
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
# ... joined into one string with the "<|endoftext|>" marker (padded by spaces)
# between them.
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [None]:
import tiktoken

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# NOTE(review): the two adjacent string literals below are concatenated with no
# separating space, so the text handed to the tokenizer contains "terracesof".
# If "terraces of" was intended, the second literal needs a leading space.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)
# Explicitly allow the "<|endoftext|>" marker to appear in the input text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
# Decode the ids back to text to inspect the round trip.
print(tokenizer.decode(integers))

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [None]:
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [None]:
enc_sample = enc_text[:10]

In [None]:
len(enc_sample)

10

In [None]:
print(enc_sample)

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


In [None]:
# Number of tokens in each input window.
context_size = 4

# Slide a window over the sample and print each input (x) together with its
# target (y), which is the same window shifted one token to the right.
# The last valid start index is len(enc_sample) - context_size - 1: starting
# any later would leave fewer than context_size tokens for the shifted target.
for i in range(len(enc_sample) - context_size):
    x = enc_sample[i:i + context_size]
    y = enc_sample[i + 1:i + context_size + 1]
    print(x)
    print(y)
    print()

[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]

[367, 2885, 1464, 1807]
[2885, 1464, 1807, 3619]

[2885, 1464, 1807, 3619]
[1464, 1807, 3619, 402]

[1464, 1807, 3619, 402]
[1807, 3619, 402, 271]

[1807, 3619, 402, 271]
[3619, 402, 271, 10899]

[3619, 402, 271, 10899]
[402, 271, 10899, 2138]

[402, 271, 10899, 2138]
[271, 10899, 2138]



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
torch.__version__

'2.6.0+cu124'

In [None]:
class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-id windows cut from one text.

    The text is tokenized once. Each sample is a window of ``max_length``
    token ids paired with the same window shifted one position to the right.
    Consecutive windows start ``stride`` tokens apart.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and materialize all input/target tensors.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token ids.
            max_length: Number of token ids per window.
            stride: Distance between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that every shifted target window is full length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [None]:
# Seed the RNG first so the randomly initialized weights are reproducible.
torch.manual_seed(123)
# Bias-free linear layer with 4 input features and 5 output features.
linear = torch.nn.Linear(4, 5, bias=False)
# Display the weight matrix (last expression of the cell).
linear.weight

Parameter containing:
tensor([[-0.2039,  0.0166, -0.2483,  0.1886],
        [-0.4260,  0.3665, -0.3634, -0.3975],
        [-0.3159,  0.2264, -0.1847,  0.1871],
        [-0.4244, -0.3034, -0.1836, -0.0983],
        [-0.3814,  0.3274, -0.1179,  0.1605]], requires_grad=True)

In [None]:
# Re-seed so the embedding initialization is reproducible on its own.
torch.manual_seed(123)

# Embedding table with 4 entries, each a 5-dimensional vector.
embedding = torch.nn.Embedding(4, 5)
# Display the embedding weight matrix (last expression of the cell).
embedding.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  1.5810],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015],
        [ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953]], requires_grad=True)

In [None]:
# One-hot encode the indices 2, 3 and 1. num_classes is passed explicitly so
# the width is always 4 (matching the 4 input features of the linear layer)
# instead of being inferred from the largest index that happens to be present.
onehot = torch.nn.functional.one_hot(torch.tensor([2, 3, 1]), num_classes=4)
onehot

tensor([[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0]])

In [None]:
linear(onehot.float())

tensor([[-0.2483, -0.3634, -0.1847, -0.1836, -0.1179],
        [ 0.1886, -0.3975,  0.1871, -0.0983,  0.1605],
        [ 0.0166,  0.3665,  0.2264, -0.3034,  0.3274]], grad_fn=<MmBackward0>)

In [None]:
# Replace the linear layer's weights with the transposed embedding matrix:
# the embedding weight is 4x5 while the linear weight is 5x4, hence the .T.
linear.weight = torch.nn.Parameter(embedding.weight.T)

In [None]:
linear.weight

Parameter containing:
tensor([[ 0.3374,  1.3010,  0.6957, -2.8400],
        [-0.1778,  1.2753, -1.8061, -0.7849],
        [-0.3035, -0.2010, -1.1589, -1.4096],
        [-0.5880, -0.1606,  0.3255, -0.4076],
        [ 1.5810, -0.4015, -0.6315,  0.7953]], requires_grad=True)

In [None]:
linear(onehot.float())

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]], grad_fn=<MmBackward0>)

In [None]:
embedding(torch.tensor([2,3,1]))

tensor([[ 0.6957, -1.8061, -1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096, -0.4076,  0.7953],
        [ 1.3010,  1.2753, -0.2010, -0.1606, -0.4015]],
       grad_fn=<EmbeddingBackward0>)