## Creating a GPT-2-like decoder-only transformer using "The Verdict" by Edith Wharton

### https://en.wikisource.org/wiki/The_Verdict

### https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt

In [1]:
from importlib.metadata import version

# Report the installed versions of the two third-party packages used below.
print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.3.0+cu121
tiktoken version: 0.7.0


In [2]:
import os
import urllib.request

# Download the short story once; later runs reuse the local copy.
# The path is bound before the check so the existence test and the download
# target cannot drift apart, and so `file_path` is defined even when the file
# is already present and no download happens.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
# Read the whole story into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
# Character count plus the first 99 characters as a quick sanity check.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [4]:
# Import the submodule explicitly: a bare `import importlib` does not
# guarantee that `importlib.metadata` is loaded, so the attribute access
# below would only work if some other code had already imported it.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.7.0


In [5]:
# Load tiktoken's "gpt2" encoding; the cells below tokenize through this object.
tokenizer = tiktoken.get_encoding("gpt2")

In [6]:
# NOTE: the two adjacent string literals are concatenated with no space
# between them, so the text that gets encoded contains "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special lets the literal "<|endoftext|>" marker through the encoder
# (presumably as a single special-token id; confirm against the printed ids).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [7]:
# Decode the ids back to text to check the encode/decode round trip.
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [8]:
# Re-read the story and tokenize it in full.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
# Number of tokens in the whole story.
print(len(enc_text))

5145


In [9]:
# Drop the first 50 tokens; the remainder feeds the sliding-window examples below.
enc_sample = enc_text[50:]

In [10]:
# Number of tokens in one input window.
context_size = 4

# x is the first window; y is the same window shifted one token to the right,
# so y[k] is the token that follows x[k].
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [11]:
# For every prefix length 1..context_size, show the prefix (context) and the
# single token that follows it (the prediction target), as ids and as text.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)
    # decode() takes a list of ids, hence the one-element list for the target.
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

[290] ----> 4920
 and ---->  established
[290, 4920] ----> 2241
 and established ---->  himself
[290, 4920, 2241] ----> 287
 and established himself ---->  in
[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a


In [12]:
import torch
# Show which PyTorch build is active in this session.
print("PyTorch version:", torch.__version__)

PyTorch version: 2.3.0+cu121


In [35]:
from torch.utils.data import Dataset, DataLoader

# Window length and step between consecutive window starts; both are passed
# to GPTDatasetV1 below.
max_length = 4
stride=1

class GPTDatasetV1(Dataset):
    """Debugging variant of the sliding-window dataset.

    While building, every input window and its one-token-shifted target
    window is printed (ids and decoded text). Collection stops right after
    a window whose start index is greater than 10 has been stored.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        # Exclusive upper bound on window starts, so that every stored
        # window still has a full-length target one position to the right.
        start_limit = len(token_ids) - max_length
        for start in range(0, start_limit, stride):
            stop = start + max_length

            input_chunk = token_ids[start:stop]
            print(f"input_chunk {input_chunk} - {tokenizer.decode(input_chunk)}")

            target_chunk = token_ids[start + 1:stop + 1]
            print(f"target_chunk {target_chunk} - {tokenizer.decode(target_chunk)}")

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

            # Debug cap: the current window is kept, then building stops.
            if start > 10:
                break

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
# Twelve-word sample so the windows printed by the dataset are easy to follow.
text = "One Two Three Four Five Six seven eight nine ten eleven twelve"
dataset = GPTDatasetV1(text, tokenizer, max_length, stride)
print(type(dataset))

input_chunk [3198, 4930, 7683, 6675] - One Two Three Four
target_chunk [4930, 7683, 6675, 10579] -  Two Three Four Five
input_chunk [4930, 7683, 6675, 10579] -  Two Three Four Five
target_chunk [7683, 6675, 10579, 9699] -  Three Four Five Six
input_chunk [7683, 6675, 10579, 9699] -  Three Four Five Six
target_chunk [6675, 10579, 9699, 3598] -  Four Five Six seven
input_chunk [6675, 10579, 9699, 3598] -  Four Five Six seven
target_chunk [10579, 9699, 3598, 3624] -  Five Six seven eight
input_chunk [10579, 9699, 3598, 3624] -  Five Six seven eight
target_chunk [9699, 3598, 3624, 5193] -  Six seven eight nine
input_chunk [9699, 3598, 3624, 5193] -  Six seven eight nine
target_chunk [3598, 3624, 5193, 3478] -  seven eight nine ten
input_chunk [3598, 3624, 5193, 3478] -  seven eight nine ten
target_chunk [3624, 5193, 3478, 22216] -  eight nine ten eleven
input_chunk [3624, 5193, 3478, 22216] -  eight nine ten eleven
target_chunk [5193, 3478, 22216, 14104] -  nine ten eleven twelve
<class '__main__.GPTDatasetV1'>

In [41]:
batch_size = 2  # Adjust the batch size as needed
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
print(data_loader)

# Walk the loader in order and show every batch both as token-id tensors and
# as the decoded text of each row.
for batch_idx, (input_ids, target_ids) in enumerate(data_loader):
    print(f"Batch {batch_idx}:")

    # Each row of a batch tensor is one window; decode row by row.
    input_chunk_texts = [tokenizer.decode(row.tolist()) for row in input_ids]
    print(f"Input IDs: {input_ids} - {input_chunk_texts}")

    target_chunk_texts = [tokenizer.decode(row.tolist()) for row in target_ids]
    print(f"Target IDs: {target_ids} - {target_chunk_texts}")

<torch.utils.data.dataloader.DataLoader object at 0x0000025725C66070>
Batch 0:
Input IDs: tensor([[ 3198,  4930,  7683,  6675],
        [ 4930,  7683,  6675, 10579]]) - ['One Two Three Four', ' Two Three Four Five']
Target IDs: tensor([[ 4930,  7683,  6675, 10579],
        [ 7683,  6675, 10579,  9699]]) - [' Two Three Four Five', ' Three Four Five Six']
Batch 1:
Input IDs: tensor([[ 7683,  6675, 10579,  9699],
        [ 6675, 10579,  9699,  3598]]) - [' Three Four Five Six', ' Four Five Six seven']
Target IDs: tensor([[ 6675, 10579,  9699,  3598],
        [10579,  9699,  3598,  3624]]) - [' Four Five Six seven', ' Five Six seven eight']
Batch 2:
Input IDs: tensor([[10579,  9699,  3598,  3624],
        [ 9699,  3598,  3624,  5193]]) - [' Five Six seven eight', ' Six seven eight nine']
Target IDs: tensor([[9699, 3598, 3624, 5193],
        [3598, 3624, 5193, 3478]]) - [' Six seven eight nine', ' seven eight nine ten']
Batch 3:
Input IDs: tensor([[ 3598,  3624,  5193,  3478],
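        [ 3624,  5193,  3478, 22216]]) - [' seven eight nine ten', ' eight nine ten eleven']
Target IDs: tensor([[ 3624,  5193,  3478, 22216],
        [ 5193,  3478, 22216, 14104]]) - [' eight nine ten eleven', ' nine ten eleven twelve']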

In [43]:
from torch.utils.data import Dataset, DataLoader

# Window length and step between consecutive window starts; both are passed
# to GPTDatasetV1 below.
max_length = 4
stride = 1

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` ids is then moved
    over the ids in steps of ``stride``; each sample pairs that window with
    the same window shifted one position to the right (next-token targets).
    Only windows with a full-length target are kept, so a text of
    ``max_length`` tokens or fewer yields an empty dataset.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sliceable sequence
            of integer token ids.
        max_length: Number of tokens per window; must be >= 1.
        stride: Step between consecutive window starts; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject sizes that would otherwise silently build empty or
        # meaningless windows (or an empty dataset for a negative stride).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.tokenizer = tokenizer
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)
        # Stop max_length before the end so the shifted target is never short.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self) -> int:
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``index``."""
        return self.input_ids[index], self.target_ids[index]

# Build the dataset from the full story text this time.
dataset = GPTDatasetV1(raw_text, tokenizer, max_length, stride)

In [59]:
batch_size = 1  # Adjust the batch size as needed
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
data_iter = iter(data_loader)

# Show the first 6 samples, pulling one batch per step from the iterator.
for i in range(6):
    input_ids, target_ids = next(data_iter)

    # batch_size is 1, so row 0 is the only sample in each batch tensor.
    input_id, target_id = input_ids[0], target_ids[0]

    # decode() wants plain Python ints, hence tolist().
    input_chunk_text = tokenizer.decode(input_id.tolist())
    target_chunk_text = tokenizer.decode(target_id.tolist())

    print(f"Input ID {i}: {input_id} - {input_chunk_text}")
    print(f"Target ID {i}: {target_id} - {target_chunk_text}")

Input ID 0: tensor([  40,  367, 2885, 1464]) - I HAD always
Target ID 0: tensor([ 367, 2885, 1464, 1807]) -  HAD always thought
Input ID 1: tensor([ 367, 2885, 1464, 1807]) -  HAD always thought
Target ID 1: tensor([2885, 1464, 1807, 3619]) - AD always thought Jack
Input ID 2: tensor([2885, 1464, 1807, 3619]) - AD always thought Jack
Target ID 2: tensor([1464, 1807, 3619,  402]) -  always thought Jack G
Input ID 3: tensor([1464, 1807, 3619,  402]) -  always thought Jack G
Target ID 3: tensor([1807, 3619,  402,  271]) -  thought Jack Gis
Input ID 4: tensor([1807, 3619,  402,  271]) -  thought Jack Gis
Target ID 4: tensor([ 3619,   402,   271, 10899]) -  Jack Gisburn
Input ID 5: tensor([ 3619,   402,   271, 10899]) -  Jack Gisburn
Target ID 5: tensor([  402,   271, 10899,  2138]) -  Gisburn rather


In [67]:
# Token embedding table: one output_dim-sized vector per id in a
# vocab_size-entry vocabulary.
output_dim = 256
vocab_size = 50257
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# stride equals max_length here, so consecutive windows do not overlap.
batch_size = 8
max_length = 4
stride = 4
dataset = GPTDatasetV1(raw_text, tokenizer, max_length, stride)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
data_iter = iter(data_loader)
# Only the first batch is used for the embedding demonstration.
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)
# Look up an embedding vector for every token id in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])
torch.Size([8, 4, 256])


In [74]:
# Positional embedding table: one output_dim-sized vector per position
# inside a window (positions 0 .. context_length - 1).
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
print(torch.arange(context_length))
# Look up the vector for every position index at once.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)
#print(pos_embeddings[0])

tensor([0, 1, 2, 3])
torch.Size([4, 256])


In [72]:
# pos_embeddings has no batch dimension, so the addition broadcasts the same
# positional vectors across every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
