In [123]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [124]:
from importlib.metadata import version

In [125]:
import tiktoken

In [126]:
import os
os.chdir("/media/mistertandon/DATA/git_repos/ai/01-llm-rsbt/settings")

In [127]:
with open("the-verdict.txt", "r", encoding = "utf-8") as f:
    raw_text = f.read()
print(f"length of raw text: {len(raw_text)} characters")

length of raw text: 20479 characters


In [128]:
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
print(f"Length of encoded text: {len(enc_text)} tokens")

Length of encoded text: 5145 tokens


In [129]:
enc_sample = enc_text[:10]
print(f"First 10 tokens: {enc_sample}")


First 10 tokens: [40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]


One of the easiest and most intuitive ways to create the inputâ€“target pairs for the next-word prediction task is to create two variables, x and y, where x contains the input tokens and y contains the targets, which are the inputs shifted by 1:

In [130]:
context_size = 8
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:     {y}")

x: [40, 367, 2885, 1464, 1807, 3619, 402, 271]
y:     [367, 2885, 1464, 1807, 3619, 402, 271, 10899]


In [131]:
for i in range(1, context_size):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(f"context: {context} -> target {target}")

context: [40] -> target 367
context: [40, 367] -> target 2885
context: [40, 367, 2885] -> target 1464
context: [40, 367, 2885, 1464] -> target 1807
context: [40, 367, 2885, 1464, 1807] -> target 3619
context: [40, 367, 2885, 1464, 1807, 3619] -> target 402
context: [40, 367, 2885, 1464, 1807, 3619, 402] -> target 271


In [132]:
for i in range(1, context_size):
    context = enc_sample[:i]
    target = enc_sample[i]
    print(f"context: {tokenizer.decode(context)} ----> {tokenizer.decode([target])}")

context: I ---->  H
context: I H ----> AD
context: I HAD ---->  always
context: I HAD always ---->  thought
context: I HAD always thought ---->  Jack
context: I HAD always thought Jack ---->  G
context: I HAD always thought Jack G ----> is


Identify the installed PyTorch version

In [133]:
from importlib.metadata import version

In [134]:
import torch
from torch.utils.data import Dataset, DataLoader
version("torch")

'2.9.1+cu126'

In [135]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1: i+max_length + 1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

The following code uses the GPTDatasetV1 to load the inputs in batches via a PyTorch DataLoader.

In [136]:
def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle, drop_last, num_workers):

    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt=txt, tokenizer=tokenizer, max_length=max_length, stride=stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [137]:
dataloader_v1 = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False, drop_last=True, num_workers=2)
data_iter_v1 = iter(dataloader_v1)
first_batch_v1 = next(data_iter_v1)
print(first_batch_v1)


[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [138]:
second_batch_v1 = next(data_iter_v1)
print(second_batch_v1)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [139]:
dataloader_v2 = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False, drop_last=True, num_workers=2)
data_iter_v2 = iter(dataloader_v2)
first_batch_v2 = next(data_iter_v2)
print(first_batch_v2)

[tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]]), tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])]
