In [1]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
import requests
from model import *
from my_tokenizer.regex import *

In [2]:
# Download the Tiny Shakespeare corpus into data/input.txt.
# The download is skipped when a non-empty copy already exists, so
# Restart & Run All works offline after the first run.
os.makedirs("data", exist_ok=True)

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
data_path = "data/input.txt"

if os.path.exists(data_path) and os.path.getsize(data_path) > 0:
    print(f"Using cached copy: {data_path}")
else:
    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of writing an error page to disk and
    # silently training on it.
    response.raise_for_status()

    with open(data_path, "w", encoding="utf-8") as f:
        f.write(response.text)

    print(f"Download complete: {data_path}")

Download complete: data/input.txt


In [3]:
class BPEDataset(Dataset):
    """Map-style dataset of overlapping next-token-prediction windows.

    The whole text is encoded once with ``tokenizer.encode`` and stored as a
    1-D ``torch.long`` tensor in ``self.data``. Item ``i`` is the window that
    starts at token ``i``:

        x = data[i     : i + seq_len]
        y = data[i + 1 : i + seq_len + 1]   # x shifted left by one token

    Args:
        text: Raw text to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence
            of integer token IDs.
        seq_len: Number of tokens in each input (and target) sequence.
    """

    def __init__(self, text, tokenizer, seq_len):
        self.tokenizer = tokenizer
        self.seq_len = seq_len

        print("Encoding text...")
        # Encode the entire text into integers using the trained tokenizer
        self.data = torch.tensor(self.tokenizer.encode(text), dtype=torch.long)
        print(f"Total tokens: {len(self.data)}")

    def __len__(self):
        """Return the number of complete (x, y) windows available.

        A window needs ``seq_len + 1`` tokens, so valid start positions are
        ``0 .. len(data) - seq_len - 1`` inclusive. Clamped at zero so a text
        that is too short yields an empty dataset rather than a negative
        length.
        """
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair for the window starting at token ``idx``.

        The index is honoured (rather than drawing a random start) so that
        samplers, ``shuffle=True`` and repeated lookups behave as expected
        for a map-style dataset; randomness is left to the DataLoader.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_windows = len(self)
        if idx < 0:
            # Support negative indices like a regular Python sequence.
            idx += num_windows
        if not 0 <= idx < num_windows:
            raise IndexError(f"index {idx} out of range for {num_windows} windows")

        chunk = self.data[idx : idx + self.seq_len + 1]
        x = chunk[:-1]
        y = chunk[1:]
        return x, y

In [4]:
# Load the downloaded corpus into memory as one string.
with open('data/input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Number of characters in the loaded corpus (quick check that the read worked).
len(text)

1115394

In [6]:
# Fit the BPE tokenizer on the full corpus.
print("Training Tokenizer (this may take a moment)...")
tokenizer = RegexTokenizer()
# A small vocabulary of 512 keeps training fast; GPT-4 uses ~100k.
train_options = {"vocab_size": 512, "verbose": False}
tokenizer.train(text, **train_options)
print("Tokenizer trained!")

Training Tokenizer (this may take a moment)...


Training Tokenizer...: 100%|█| 256/256 [01:56<00:00,  2.20it/s, last_merge=(262,

Tokenizer trained!





In [7]:
# Encode the corpus into 128-token training windows and serve them in
# shuffled batches of 32.
dataset = BPEDataset(
    text,
    tokenizer,
    seq_len=128,
)
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

Encoding text...
Total tokens: 658028


In [8]:
# Pull one batch and report the shapes of its input and target tensors.
batch = next(iter(train_loader))
inputs_shape = batch[0].shape
targets_shape = batch[1].shape
print(f"x: {inputs_shape}\ny: {targets_shape}")

x: torch.Size([32, 128])
y: torch.Size([32, 128])


In [9]:
# Peek at the input token IDs of the batch fetched above.
# NOTE(review): this slices the first (batch) dimension. The batch printed
# above has 32 rows, so [:50] returns all of them. If the first 50 tokens of
# a single sequence were intended, a column slice is needed — confirm intent.
batch[0][:50]

tensor([[262, 296, 114,  ...,  58, 291, 109],
        [256, 271, 110,  ..., 318, 333, 110],
        [385, 274,  44,  ..., 121, 269, 114],
        ...,
        [ 77, 121, 306,  ...,  99, 121,  59],
        [384,  10,  89,  ..., 102, 276, 333],
        [267, 365, 332,  ..., 263, 114, 276]])

In [13]:
# Model hyperparameters. vocab_size repeats the 512 used to train the
# tokenizer and seq_len repeats the 128 used for the dataset windows; keep
# them in sync if either changes.
config = Config(
    vocab_size=512,
    embed_size=256,
    seq_len=128,
    n_layer=2,
    h=2,
    d_ff=128,
    total_epochs=1,
    lr=1e-3,
    dropout=0.0,
)
print(f"Running on: {config.device}")

# Instantiate the model from the configuration above.
model = GPT.build_gpt(config)
print("Model built successfully.")

Running on: cpu
Model built successfully.


In [14]:
# Sanity check: every token ID in the training data must be smaller than the
# model's vocab_size.
# Scan the whole encoded corpus (dataset.data) rather than one sampled batch:
# a single 32x128 batch covers only a tiny fraction of the tokens and can
# miss an out-of-range ID that appears elsewhere.
max_token_id = int(dataset.data.max())
model_vocab_size = model.config.vocab_size

print(f"Max Token ID in Data: {max_token_id}")
print(f"Model Vocab Size: {model_vocab_size}")

if max_token_id >= model_vocab_size:
    print("\nCRITICAL ERROR FOUND:")
    print(f"Your data contains token ID {max_token_id}, but your model only knows up to {model_vocab_size - 1}.")
    print("FIX: You need to re-initialize the model with the correct vocab_size.")
else:
    print("\nStatus: Vocabulary size looks consistent.")

Max Token ID in Data: 511
Model Vocab Size: 512

Status: Vocabulary size looks consistent.


In [15]:
# Build a model from `config` and train it on `train_loader`.
# This rebinds `model`, so the instance created in the earlier cell is no
# longer referenced.
# NOTE(review): presumably build_gpt returns freshly initialised weights, so
# re-running this cell restarts training from scratch — confirm in model.py.
# The recorded run below took about two hours for one epoch on CPU.
model = GPT.build_gpt(config)
print(f"Training on {config.device}...")
model.train_gpt(train_loader)

Training on cpu...


0/1: 100%|███████████████████| 20560/20560 [1:59:59<00:00,  2.86it/s, loss=1.97]

Epoch: 0/1 Loss: 2.1431859313745907





In [22]:
# Sample a continuation of a short prompt from the trained model.
print("\nGenerating text\n")
model.eval()

# Single source of truth for the prompt, so the echoed "input" line cannot
# drift from what is actually encoded.
prompt = "The"
start_tokens = torch.tensor([tokenizer.encode(prompt)], device=config.device)

# Generation needs no gradients; disabling autograd avoids building a graph
# across all 200 sampling steps.
with torch.no_grad():
    generated = model.generate(start_tokens, max_new_token=200, top_k=5)

# Decode back to text
decoded = tokenizer.decode(generated[0].tolist())
print(f"input: {prompt}\n")
print(f"output: \n\n{decoded}")


Generating text

input: The

output: 

The King Edward's judge,
And he shall be my souls of my fellow,
Whose friends, and Warwick, he is an enemy
Shall be true that I did a pupil.
Ah, cousin York, or I say it, that
My presumble at that we may durst.

FRIAR LAURENCE:
I am encounterfeit is.

GRUMIO:
Alack, madam, and my gage.

MARIANA:
It is a
