# Small GPT Demo

This notebook demonstrates how to use the Small GPT model for training and inference.

## Setup

In [None]:
import sys
import os
import torch
import matplotlib.pyplot as plt

# Add src to path
sys.path.insert(0, os.path.join(os.getcwd(), '..', 'src'))

from models.gpt import create_small_gpt
from training.dataset import prepare_data, create_dataloaders, SimpleTokenizer
from training.trainer import GPTTrainer
from utils.inference import GPTInference
from utils.helpers import set_seed, print_model_info, get_device

In [None]:
# Set random seed
# Fix the seed through the project helper so repeated runs are comparable
# (which RNGs it seeds is decided in utils.helpers, not shown here).
set_seed(42)

# Get device
# The value returned here is reused below: it is handed to GPTTrainer and
# GPTInference and used to move the probe tensor in the analysis section.
device = get_device()
print(f"Using device: {device}")

## Data Preparation

In [None]:
# Create sample text data: five sentences, repeated 10 times so the tiny demo
# corpus is long enough to be cut into training and validation samples.
sample_text = """
The quick brown fox jumps over the lazy dog. This sentence contains all letters of the alphabet.
Machine learning is a subset of artificial intelligence that enables computers to learn and improve from experience.
Natural language processing involves the interaction between computers and human language.
Deep learning uses neural networks with multiple layers to model and understand complex patterns.
Transformers have revolutionized the field of natural language processing with their attention mechanisms.
""" * 10  # Repeat for more data

# Save to file
# The encoding is stated explicitly so the bytes on disk do not depend on the
# platform's default locale encoding.
os.makedirs('../data', exist_ok=True)
with open('../data/demo_text.txt', 'w', encoding='utf-8') as f:
    f.write(sample_text)

print(f"Sample text length: {len(sample_text)} characters")

In [None]:
# Prepare datasets
# Build the train/validation datasets and the tokenizer from the demo text
# file written in the previous cell.
# NOTE(review): block_size is presumably the sequence length of one sample and
# train_split the fraction of data used for training; confirm in training.dataset.
train_dataset, val_dataset, tokenizer = prepare_data(
    '../data/demo_text.txt', 
    block_size=64, 
    train_split=0.8
)

# tokenizer.vocab_size is reused below as the model's 'vocab_size' setting.
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Create data loaders
# Both datasets are wrapped with the same batch_size of 8; the resulting
# loaders are what GPTTrainer receives in the training section.
train_loader, val_loader = create_dataloaders(
    train_dataset, val_dataset, batch_size=8
)

## Model Creation

In [None]:
# Model configuration
# This dict is passed as-is to create_small_gpt.
# NOTE(review): the meaning of each key is inferred from its name; confirm in models.gpt.
config = {
    'vocab_size': tokenizer.vocab_size,  # taken from the tokenizer built during data preparation
    'd_model': 128,
    'n_heads': 4,  # 128 / 4 = 32 per head, assuming d_model is split evenly across heads
    'n_layers': 3,
    'd_ff': 512,  # 4 x d_model
    'max_seq_len': 128,  # twice the block_size=64 passed to prepare_data
    'dropout': 0.1,
}

# Create model
# print_model_info is a project helper; what it reports is defined in utils.helpers.
model = create_small_gpt(config)
print_model_info(model)

## Training

In [None]:
# Create trainer
# Hand the model, both data loaders and the target device to the project
# trainer. lr is presumably the learning rate; the optimizer itself is chosen
# inside training.trainer (not shown here).
trainer = GPTTrainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    lr=1e-3,
    device=device
)

# Train for a few epochs
# train() returns two sequences that are indexed here and plotted in the next cell.
# NOTE(review): presumably one loss value per epoch; confirm in GPTTrainer.train.
train_losses, val_losses = trainer.train(epochs=3)

# [-1] reads the last recorded value of each sequence.
print(f"Final train loss: {train_losses[-1]:.4f}")
print(f"Final validation loss: {val_losses[-1]:.4f}")

In [None]:
# Plot training curves: the same two loss series are drawn twice, on a linear
# y-axis (left panel) and on a logarithmic y-axis (right panel).
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# One entry per panel: (axes, y-scale, y-label, title). The panels differ only
# in these three settings, so the drawing code is written once.
panels = [
    (axes[0], 'linear', 'Loss', 'Training and Validation Loss'),
    (axes[1], 'log', 'Loss (log scale)', 'Training and Validation Loss (Log Scale)'),
]

for ax, yscale, ylabel, title in panels:
    ax.plot(train_losses, label='Train Loss')
    ax.plot(val_losses, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_yscale(yscale)
    ax.legend()
    ax.grid(True)

fig.tight_layout()
plt.show()

## Text Generation

In [None]:
# Create inference object
# Bundles the model with its tokenizer and the target device; its generate()
# method is called in the next cell.
inference = GPTInference(model, tokenizer, device)

# Test prompts
# Each prompt is the opening words of a sentence in the demo corpus written
# during data preparation.
prompts = [
    "The quick",
    "Machine learning",
    "Deep learning",
    "Natural language"
]

In [None]:
# Generate a continuation for every prompt at each of three temperatures,
# printing the results grouped under the prompt they belong to.
temperatures = (0.1, 0.7, 1.0)
separator = "-" * 50

for prompt in prompts:
    print(f"\nPrompt: '{prompt}'")
    print(separator)

    for temp in temperatures:
        generated = inference.generate(prompt, max_new_tokens=30, temperature=temp)
        print(f"Temp {temp}: {generated}")

    # Blank line between prompts
    print()

## Model Analysis

In [None]:
# Analyze attention patterns (simplified): a short probe string is passed
# through the embedding layer and the first transformer block only.
model.eval()

# A dedicated name is used for the probe so the training corpus held in
# `sample_text` is not overwritten in the kernel namespace.
probe_text = "The quick brown fox"
tokens = tokenizer.encode(probe_text)
# unsqueeze(0) adds a leading axis of size 1 in front of the token sequence.
tokens_tensor = torch.tensor(tokens).unsqueeze(0).to(device)

with torch.no_grad():
    # Get embeddings
    embeddings = model.embedding(tokens_tensor)

    # Forward through first transformer block to get attention. The call is
    # unpacked as (block output, attention weights); only the weights are
    # inspected below.
    block_output, attention_weights = model.transformer_blocks[0](embeddings)

print(f"Input tokens: {tokens}")
print(f"Attention weights shape: {attention_weights.shape}")
print("Sample attention weights (first head):")
# NOTE(review): [0, 0] is assumed to select batch item 0, head 0; confirm the
# axis order returned by the transformer block.
print(attention_weights[0, 0].cpu().numpy())

In [None]:
# Tally parameter counts: every parameter, and the subset with requires_grad set.
total_params = 0
trainable_params = 0
for p in model.parameters():
    total_params += p.numel()
    if p.requires_grad:
        trainable_params += p.numel()

print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
# 4 bytes per parameter, converted from bytes to MiB by dividing by 1024 twice.
print(f"Model size: {total_params * 4 / 1024 / 1024:.2f} MB (float32)")

# Group parameter counts by the first dotted component of each parameter name.
param_breakdown = {}
for name, param in model.named_parameters():
    layer_type = name.partition('.')[0]
    param_breakdown[layer_type] = param_breakdown.get(layer_type, 0) + param.numel()

print("\nParameter breakdown:")
# Keys are unique, so sorting the keys gives the same order as sorting the items.
for layer_type in sorted(param_breakdown):
    count = param_breakdown[layer_type]
    percentage = (count / total_params) * 100
    print(f"{layer_type}: {count:,} ({percentage:.1f}%)")