In [1]:
"""
================================================================================
MINI-GPT: CHARACTER-LEVEL LANGUAGE MODEL
================================================================================

This notebook builds a tiny GPT from scratch and trains it on Shakespeare.

WHAT WE'RE BUILDING:
    A decoder-only transformer that learns to predict the next character.
    Train it on Shakespeare, and it generates Shakespeare-ish text!

    Training: "to be or not to b" → predict "e"
    Generation: Start with "The " → generates "The king hath..."

WHY CHARACTER-LEVEL?
    - Simple vocabulary (just ~65 characters: a-z, A-Z, space, punctuation)
    - No tokenizer needed
    - Same architecture as GPT, just smaller
    - Trains in minutes, not days

PREREQUISITES:
    - Understanding of transformer architecture (see 0B.ipynb)
    - PyTorch installed: pip install torch

DON'T WORRY if the generated text is gibberish at first!
    Watch the loss decrease and the output improve over training.

Let's build a mini-GPT!
"""

print("=" * 70)
print("MINI-GPT: CHARACTER-LEVEL LANGUAGE MODEL")
print("=" * 70)


MINI-GPT: CHARACTER-LEVEL LANGUAGE MODEL


In [2]:
# =============================================================================
# STEP 1: IMPORTS AND SETUP
# =============================================================================

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os

# Check if GPU is available (will use CPU if not)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")
if device.type == 'cpu':
    print("(Training on CPU — will take ~10-20 minutes)")
else:
    print("(Training on GPU — will be much faster!)")



Using device: cpu
(Training on CPU — will take ~10-20 minutes)


In [3]:
# =============================================================================
# STEP 2: LOAD AND PREPARE DATA
# =============================================================================

print("-" * 70)
print("STEP 2: Loading Shakespeare dataset...")
print("-" * 70)

# Load the text (Shakespeare's complete works, ~1.1MB)
# Source: https://github.com/karpathy/char-rnn/tree/master/data/tinyshakespeare
data_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "data", "shakespeare.txt")
with open(data_path, 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Dataset size: {len(text):,} characters")
print(f"First 200 characters:\n{text[:200]}")


----------------------------------------------------------------------
STEP 2: Loading Shakespeare dataset...
----------------------------------------------------------------------
Dataset size: 1,115,394 characters
First 200 characters:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
# =============================================================================
# STEP 3: BUILD VOCABULARY
# =============================================================================
# Map each unique character to an integer (and vice versa)

print("-" * 70)
print("STEP 3: Building vocabulary...")
print("-" * 70)

# Get all unique characters
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"Vocabulary size: {vocab_size} unique characters")
print(f"Characters: {''.join(chars)}")

# Create mappings
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode/decode functions
def encode(s):
    """Convert string to list of integers"""
    return [char_to_idx[c] for c in s]

def decode(indices):
    """Convert list of integers back to string"""
    return ''.join([idx_to_char[i] for i in indices])

# Test encoding/decoding
test_str = "Hello!"
encoded = encode(test_str)
decoded = decode(encoded)
print(f"\nTest: '{test_str}' → {encoded} → '{decoded}'")


----------------------------------------------------------------------
STEP 3: Building vocabulary...
----------------------------------------------------------------------
Vocabulary size: 65 unique characters
Characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz

Test: 'Hello!' → [20, 43, 50, 50, 53, 2] → 'Hello!'


In [5]:
# =============================================================================
# STEP 4: CREATE TRAINING DATA
# =============================================================================
# Split text into sequences for training

print("-" * 70)
print("STEP 4: Creating training sequences...")
print("-" * 70)

# Hyperparameters
block_size = 64     # Context length (how many characters to look at)
batch_size = 32     # How many sequences to process in parallel

# Encode entire text
data = torch.tensor(encode(text), dtype=torch.long)
print(f"Encoded data shape: {data.shape}")

# Train/val split (90/10)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
print(f"Training tokens: {len(train_data):,}")
print(f"Validation tokens: {len(val_data):,}")

def get_batch(split):
    """
    Get a random batch of training/validation data.
    
    Returns:
        x: Input sequences, shape (batch_size, block_size)
        y: Target sequences (shifted by 1), shape (batch_size, block_size)
    """
    data_split = train_data if split == 'train' else val_data
    
    # Random starting positions
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    
    # Extract sequences
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    
    return x.to(device), y.to(device)

# Test batch creation
x, y = get_batch('train')
print(f"\nBatch shapes: x={x.shape}, y={y.shape}")
print(f"Example input:  {decode(x[0].tolist()[:30])}...")
print(f"Example target: {decode(y[0].tolist()[:30])}...")


----------------------------------------------------------------------
STEP 4: Creating training sequences...
----------------------------------------------------------------------
Encoded data shape: torch.Size([1115394])
Training tokens: 1,003,854
Validation tokens: 111,540

Batch shapes: x=torch.Size([32, 64]), y=torch.Size([32, 64])
Example input:  
That Romeo should, upon recei...
Example target: That Romeo should, upon receip...


In [6]:
# =============================================================================
# STEP 5: DEFINE THE TRANSFORMER MODEL
# =============================================================================
# This is a decoder-only transformer (like GPT)
# Uses masked self-attention so it can only look at previous characters

print("-" * 70)
print("STEP 5: Defining the Transformer Model...")
print("-" * 70)

# MODEL HYPERPARAMETERS
# These are much smaller than real GPT, but same architecture
n_embd = 64     # Embedding dimension (GPT-3: 12288)
n_head = 4      # Number of attention heads (GPT-3: 96)
n_layer = 4     # Number of transformer blocks (GPT-3: 96)
dropout = 0.1   # Dropout rate


class Head(nn.Module):
    """
    Single head of self-attention.
    
    This is the PyTorch equivalent of what we built with NumPy in 0B.ipynb,
    but with masked attention for autoregressive generation.
    """
    
    def __init__(self, head_size):
        super().__init__()
        # Linear projections for Q, K, V
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        
        # Causal mask: lower triangular matrix
        # This prevents attending to future positions
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        B, T, C = x.shape  # Batch, Time (sequence), Channels (embedding)
        
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)
        
        # Attention scores
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)  # (B, T, T)
        
        # Mask future positions (causal attention)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        
        # Softmax
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        
        # Weighted sum of values
        out = wei @ v  # (B, T, head_size)
        return out


class MultiHeadAttention(nn.Module):
    """
    Multiple heads of self-attention in parallel.
    Same concept as our multi_head_attention function, but in PyTorch.
    """
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        # Run all heads in parallel, concatenate outputs
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """
    Position-wise feed-forward network.
    Same as our feed_forward function: expand → ReLU → compress.
    """
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),  # Expand by 4x
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),  # Compress back
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """
    Transformer block: attention + feed-forward with residuals.
    This is exactly what our TransformerLayer does!
    """
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)  # Self-attention
        self.ffwd = FeedForward(n_embd)                   # Feed-forward
        self.ln1 = nn.LayerNorm(n_embd)                   # Layer norm 1
        self.ln2 = nn.LayerNorm(n_embd)                   # Layer norm 2
    
    def forward(self, x):
        # Attention with residual connection
        x = x + self.sa(self.ln1(x))
        # Feed-forward with residual connection
        x = x + self.ffwd(self.ln2(x))
        return x


class MiniGPT(nn.Module):
    """
    A tiny GPT-style language model.
    
    Architecture:
        Token Embedding + Position Embedding
        → N Transformer Blocks
        → Layer Norm
        → Linear (project to vocabulary)
    
    This is the same architecture as GPT-2/3, just much smaller!
    """
    
    def __init__(self):
        super().__init__()
        
        # Token embeddings: vocab_size → n_embd
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        
        # Position embeddings: block_size → n_embd
        self.position_embedding = nn.Embedding(block_size, n_embd)
        
        # Transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        
        # Final layer norm
        self.ln_f = nn.LayerNorm(n_embd)
        
        # Output projection: n_embd → vocab_size
        self.lm_head = nn.Linear(n_embd, vocab_size)
    
    def forward(self, idx, targets=None):
        """
        Forward pass.
        
        Parameters:
            idx: Input token indices, shape (B, T)
            targets: Target token indices, shape (B, T), optional
        
        Returns:
            logits: Predictions, shape (B, T, vocab_size)
            loss: Cross-entropy loss (if targets provided)
        """
        B, T = idx.shape
        
        # Get token and position embeddings
        tok_emb = self.token_embedding(idx)  # (B, T, n_embd)
        pos_emb = self.position_embedding(torch.arange(T, device=device))  # (T, n_embd)
        
        # Add them together (broadcasting handles batch dimension)
        x = tok_emb + pos_emb  # (B, T, n_embd)
        
        # Pass through transformer blocks
        x = self.blocks(x)  # (B, T, n_embd)
        
        # Final layer norm
        x = self.ln_f(x)  # (B, T, n_embd)
        
        # Project to vocabulary
        logits = self.lm_head(x)  # (B, T, vocab_size)
        
        # Compute loss if targets provided
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits_flat = logits.view(B*T, C)
            targets_flat = targets.view(B*T)
            loss = F.cross_entropy(logits_flat, targets_flat)
        
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        """
        Generate new tokens autoregressively.
        
        Parameters:
            idx: Starting context, shape (B, T)
            max_new_tokens: How many new tokens to generate
        
        Returns:
            idx: Extended sequence, shape (B, T + max_new_tokens)
        """
        for _ in range(max_new_tokens):
            # Crop to last block_size tokens (context window limit)
            idx_cond = idx[:, -block_size:]
            
            # Get predictions
            logits, _ = self(idx_cond)
            
            # Focus on last time step
            logits = logits[:, -1, :]  # (B, vocab_size)
            
            # Apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            
            # Sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            
            # Append to sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        
        return idx


# Create model
model = MiniGPT().to(device)

# Print configuration
print(f"""
Model Configuration:
  Vocabulary size: {vocab_size}
  Context length:  {block_size} characters
  Embedding dim:   {n_embd}
  Attention heads: {n_head}
  Layers:          {n_layer}
  Dropout:         {dropout}
""")

# Count parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {n_params:,}")
print(f"(For comparison: GPT-3 has 175 billion parameters!)")


----------------------------------------------------------------------
STEP 5: Defining the Transformer Model...
----------------------------------------------------------------------

Model Configuration:
  Vocabulary size: 65
  Context length:  64 characters
  Embedding dim:   64
  Attention heads: 4
  Layers:          4
  Dropout:         0.1

Total parameters: 211,777
(For comparison: GPT-3 has 175 billion parameters!)


In [7]:
# =============================================================================
# STEP 6: TRAINING LOOP
# =============================================================================
# Train the model to predict the next character

print("-" * 70)
print("STEP 6: Training the model...")
print("-" * 70)

# Training hyperparameters
learning_rate = 3e-4  # Standard for transformers
max_iters = 3000      # Number of training steps (increase for better results)
eval_interval = 500   # How often to print progress
eval_iters = 100      # How many batches to average for evaluation

# Optimizer (AdamW - the standard for transformers)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    """Estimate loss on train and val sets."""
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

def generate_sample(prompt="", max_tokens=100):
    """Generate a sample from the model."""
    model.eval()
    if prompt:
        context = torch.tensor([encode(prompt)], dtype=torch.long, device=device)
    else:
        context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated = model.generate(context, max_new_tokens=max_tokens)
    model.train()
    return decode(generated[0].tolist())

# Show untrained generation
print("\nBEFORE TRAINING (random gibberish):")
print("-" * 40)
print(generate_sample("The ", 100))
print("-" * 40)

# Training loop
print(f"\nTraining for {max_iters} iterations...")
print("(Watch the loss decrease and generated text improve!)\n")

for iter in range(max_iters):
    
    # Evaluate periodically
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iter:4d}: train loss = {losses['train']:.4f}, val loss = {losses['val']:.4f}")
        
        # Show a sample every evaluation
        if iter > 0:
            print("\nSample generation:")
            print(generate_sample("KING:", 100))
            print()
    
    # Get batch
    xb, yb = get_batch('train')
    
    # Forward pass
    logits, loss = model(xb, yb)
    
    # Backward pass
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("\n" + "=" * 70)
print("TRAINING COMPLETE!")
print("=" * 70)


----------------------------------------------------------------------
STEP 6: Training the model...
----------------------------------------------------------------------

BEFORE TRAINING (random gibberish):
----------------------------------------
The LMkv:kth-K-DjQxNeSUot3
R$vNScA'ml: pxKMvXc'B!YJu!:W3FEy&3MeykE3WvbThtk?dRRjjn:PDpxQkZyenzMrWkmohXA--
----------------------------------------

Training for 3000 iterations...
(Watch the loss decrease and generated text improve!)

Step    0: train loss = 4.2756, val loss = 4.2758
Step  500: train loss = 2.4938, val loss = 2.4988

Sample generation:
KING:
CEKOr gooeteirwe hEManbp I owG tata sthife breere eteteste rere fuuthaethar mad s s 
I e fay wicire

Step 1000: train loss = 2.3537, val loss = 2.3633

Sample generation:
KING:
I nellatr, lif olta it shs cat sthen'
busgichif, s besanenTher p to?
Cro beab es tin solo oneewe my

Step 1500: train loss = 2.2441, val loss = 2.2628

Sample generation:
KING:
Rourme too whing dad ind mwith here,

In [None]:
# =============================================================================
# STEP 7: EXAMPLE GENERATIONS
# =============================================================================
# See what the trained model can do!

print("-" * 70)
print("STEP 7: Example Generations")
print("-" * 70)

print("""
Your mini-GPT is now trained! Here are some example generations:
""")

# Example prompts
prompts = [
    "ROMEO:",
    "To be or not to be",
    "The king",
    "O, ",
    "Love is",
]

for prompt in prompts:
    print(f"Prompt: '{prompt}'")
    print("-" * 40)
    generated = generate_sample(prompt, max_tokens=150)
    print(generated)
    print("\n")


In [None]:
# =============================================================================
# STEP 8: INTERACTIVE MODE
# =============================================================================
# Try your own prompts!

print("=" * 70)
print("TRY IT YOURSELF!")
print("=" * 70)
print("""
Enter your own prompts below to see what the model generates.
The model has learned from Shakespeare, so try things like:
  - Character names: "HAMLET:", "KING:", "JULIET:"
  - Dramatic openings: "O, what", "Alas,", "Hark!"
  - Questions: "What is", "Why do"
  
Note: This is a tiny model trained briefly — output won't be perfect,
but you should see recognizable patterns and structure!
""")

def interactive_generate(prompt, length=200):
    """
    Generate text from a prompt.
    
    Parameters:
        prompt: Starting text
        length: Number of characters to generate
    """
    print(f"\nPrompt: '{prompt}'")
    print("-" * 50)
    result = generate_sample(prompt, max_tokens=length)
    print(result)
    print("-" * 50)
    return result

# Example interactive call - modify the prompt below!
_ = interactive_generate("BRUTUS:", length=200)

print("""
To try your own prompts, run:
    interactive_generate("Your prompt here", length=200)
    
Experiment with different prompts and lengths!
""")


In [None]:
# =============================================================================
# SUMMARY: WHAT WE BUILT
# =============================================================================

print("""
================================================================================
CONGRATULATIONS! YOU JUST BUILT A LANGUAGE MODEL!
================================================================================

WHAT YOU ACCOMPLISHED:
    
    1. LOADED DATA: Shakespeare's complete works (~1.1 million characters)
    
    2. BUILT VOCABULARY: Mapped 65 unique characters to numbers
    
    3. CREATED A MINI-GPT:
       - Token embeddings (characters → vectors)
       - Position embeddings (location information)
       - 4 Transformer layers (self-attention + feed-forward)
       - Output projection (vectors → character probabilities)
    
    4. TRAINED IT: The model learned patterns in Shakespeare's writing
       - Character combinations (th, qu, ing, tion, etc.)
       - Word patterns
       - Dialogue structure (CHARACTER_NAME:)
       - Poetic rhythm
    
    5. GENERATED TEXT: The model can now continue any prompt!

THIS IS THE SAME ARCHITECTURE AS:
    - GPT-2 (1.5 billion parameters)
    - GPT-3 (175 billion parameters)
    - GPT-4 (rumored trillions)
    
    We just used:
    - 65 vocabulary vs 50,000+ tokens
    - 4 layers vs 96+ layers
    - 64 embedding dim vs 12,288
    - ~100K parameters vs 175+ billion

KEY INSIGHT:
    The magic isn't in the size — it's in the architecture.
    Self-attention allows every position to look at every other position,
    enabling the model to learn complex patterns and dependencies.
    
    Scale it up, and you get ChatGPT.

================================================================================
NEXT UP: Part 1 covers training dynamics, optimization, and how to scale up!
================================================================================
""")
