# 01: Train Decoder-Only LLM (GPT-style)
This notebook demonstrates how to train a decoder-only transformer model from scratch using PyTorch.

In [None]:
# Install dependencies (if needed)
!pip install torch transformers tqdm

In [1]:
import sys
import os

# Make the parent directory importable so that the `models` package used by
# the import cell can be resolved. The membership check keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from models.decoder_only import GPTStyleDecoder
from models.rotary_embeddings import build_rope_cache
from tqdm import tqdm

## Load and tokenize dataset

In [3]:
# Fetch the Tiny Shakespeare corpus once; later runs reuse the local copy.
DATA_PATH = "../data/tiny_shakespeare.txt"
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

if not os.path.exists(DATA_PATH):
    from urllib.request import urlretrieve
    os.makedirs(os.path.dirname(DATA_PATH), exist_ok=True)
    # Download under a temporary name and rename only after the transfer
    # finishes, so an interrupted download never leaves a truncated file
    # that the existence check above would accept on the next run.
    partial_path = DATA_PATH + ".part"
    try:
        urlretrieve(DATA_URL, partial_path)
        os.replace(partial_path, DATA_PATH)
    finally:
        # Only present if the download or rename failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [4]:
# Encode the whole corpus in one pass: no batching, padding or truncation.
from transformers import AutoTokenizer

# Read the raw text first; the tokenizer is only needed once it is in memory.
with open("../data/tiny_shakespeare.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Flat list of token ids (no return_tensors, no truncation).
# The recorded output shows the tokenizer warning that this sequence exceeds
# the model's 1024-token maximum; the ids are not fed to that model as a
# single sequence here, they are split into short blocks by the dataset cell.
tokens = tokenizer.encode(text, add_special_tokens=False)
print(f"Total tokens: {len(tokens)}")  # Just for sanity check

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors


Total tokens: 338025


## Prepare Dataset

In [5]:
class TextDataset(Dataset):
    """Fixed-length next-token-prediction examples cut from a flat token list.

    The token stream is split into consecutive, non-overlapping chunks of
    ``block_size`` tokens. A trailing remainder shorter than ``block_size``
    is dropped. Each chunk yields an ``(input, target)`` pair of length
    ``block_size - 1`` in which the target is the input shifted by one token.

    Args:
        tokens: Sequence of integer token ids.
        block_size: Number of tokens per chunk. Must be at least 2 so that
            every chunk produces a non-empty input/target pair.

    Raises:
        ValueError: If ``block_size`` is smaller than 2.
    """

    def __init__(self, tokens, block_size=128):
        if block_size < 2:
            raise ValueError(f"block_size must be >= 2, got {block_size}")
        # Last index at which a full chunk can still start. The "+ 1" in the
        # range bound keeps the final chunk when len(tokens) is an exact
        # multiple of block_size (it was silently dropped before).
        last_start = len(tokens) - block_size
        self.examples = [
            tokens[start:start + block_size]
            for start in range(0, last_start + 1, block_size)
        ]

    def __len__(self):
        """Return the number of full chunks."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return ``(input_ids, target_ids)`` for chunk ``idx`` as long tensors."""
        chunk = self.examples[idx]
        # Input drops the last token, target drops the first: position t of
        # the input is paired with token t + 1 of the chunk.
        inputs = torch.tensor(chunk[:-1], dtype=torch.long)
        targets = torch.tensor(chunk[1:], dtype=torch.long)
        return inputs, targets

# Run on the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Cut the token stream into 128-token chunks.
dataset = TextDataset(tokens, block_size=128)

# Shuffled mini-batches of 8; pinned memory is requested only when the
# selected device is CUDA.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    pin_memory=(device == "cuda"),
)

## Initialize Model

In [6]:
# Select the device again so this cell does not rely on the dataset cell
# for `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Hyperparameters handed to the project's GPTStyleDecoder constructor.
# NOTE(review): max_len presumably has to cover the longest input sequence
# (the dataset cell uses block_size=128) - confirm against models.decoder_only.
decoder_kwargs = dict(
    vocab_size=tokenizer.vocab_size,
    embed_dim=768,
    depth=6,
    heads=12,
    ff_dim=2048,
    max_len=128,
)
model = GPTStyleDecoder(**decoder_kwargs).to(device)

## Training Loop

In [7]:
# Single source of truth for the epoch count (used by the loop and the
# progress-bar label).
NUM_EPOCHS = 3

optimizer = optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0.0  # sum of per-batch losses, for the running average
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
    for step, (x, y) in enumerate(loop, start=1):
        x, y = x.to(device), y.to(device)
        logits = model(x)
        # Disabled alternative kept from the original: pass explicit RoPE
        # sin/cos caches to the model.
        # NOTE(review): presumably only needed if the model does not build
        # them internally - confirm against models.decoder_only.
        #sin, cos = build_rope_cache(x.size(1), model.embed_dim // model.num_heads, device)
        #logits = model(x, sin=sin, cos=cos)

        # Collapse every leading dimension so each row of logits is scored
        # against one target id.
        loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Show the current batch loss plus the epoch's running mean, which
        # is far less noisy than the last batch alone.
        batch_loss = loss.item()
        total_loss += batch_loss
        loop.set_postfix(loss=batch_loss, avg_loss=total_loss / step)

# Persist only the weights (state_dict), not the full module object.
torch.save(model.state_dict(), "gpt_decoder_trained.pt")
print("Model saved.")

Epoch 1/3: 100%|██████████████████████████████████████████████████████████| 330/330 [17:40<00:00,  3.21s/it, loss=4.79]
Epoch 2/3: 100%|██████████████████████████████████████████████████████████| 330/330 [18:23<00:00,  3.34s/it, loss=4.02]
Epoch 3/3: 100%|██████████████████████████████████████████████████████████| 330/330 [17:48<00:00,  3.24s/it, loss=3.69]


Model saved.
