# Speedrun: TinyStories pretraining

Minimal notebook for **timed** or **full** pretraining on TinyStories on dual T4 (e.g. Kaggle).
Run cells in order: Cell 1 installs dependencies, Cell 2 prepares the data, Cell 3 runs DDP training (choose timed or by epoch), and Cell 4 samples from the saved checkpoint.

In [None]:
# Install (run once). On Kaggle: GPU T4 x 2, then run this cell.
import sys
from pathlib import Path

ROOT = Path.cwd()
if not (ROOT / "cs336_systems").exists() and (ROOT.parent / "cs336_systems").exists():
    ROOT = ROOT.parent
sys.path.insert(0, str(ROOT))
import os
os.chdir(ROOT)

!pip install -q -r requirements.txt datasets transformers
%cd cuda 2>/dev/null && !pip install -q -e . && %cd .. || true

In [None]:
# Prepare TinyStories: download, tokenize, save .pt (T4-safe: seq_len=256)
from pathlib import Path
from cs336_systems.tinystories_data import build_tinystories_pt

SEQ_LEN = 256

# Output goes to /kaggle/working when a /kaggle directory is present, else to ROOT.
OUT_DIR = str(ROOT)
if Path("/kaggle").exists():
    OUT_DIR = "/kaggle/working"
data_path = Path(OUT_DIR, "tinystories.pt")

if data_path.exists():
    # A dataset file is already present at the target path; skip rebuilding it.
    print(f"Using existing {data_path}")
else:
    # Set max_samples=50000 for a quicker test; None = full dataset
    build_tinystories_pt(
        output_path=data_path, seq_len=SEQ_LEN, max_samples=None, vocab_size=10000
    )
print(f"Data path: {data_path}")

In [None]:
# Train: pick one.
# Option A — Timed speedrun (e.g. 20 min on 2x T4)
MAX_MINUTES = 20
# Option B — Full pretraining (1 epoch over TinyStories)
EPOCHS = 1
USE_TIMED = True  # set False for full epoch run

save_path = Path(OUT_DIR) / "speedrun_model.pt"
cmd = (
    f"torchrun --nproc_per_node=2 train.py --ddp "
    f"--config small --batch_size 4 --seq_len {SEQ_LEN} "
    f"--data_path {data_path} --mixed_precision "
    f"--save_path {save_path} "
)
if USE_TIMED:
    cmd += f" --max_minutes {MAX_MINUTES}"
else:
    cmd += f" --epochs {EPOCHS}"

!{cmd}

In [None]:
# Quick generation (single GPU, load saved checkpoint)
import torch
from cs336_systems.model import CONFIGS, TransformerLM

if not save_path.exists():
    print("No checkpoint found; run training cell first.")
else:
    # Rebuild the "small" model and restore the saved weights on CPU first.
    model = TransformerLM.from_config("small", use_flash=False)
    model.load_state_dict(
        torch.load(save_path, map_location="cpu", weights_only=True)
    )
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Start from 10 random token ids in [0, 10000) and append 50 more, each
    # chosen as the argmax at the last sequence position (greedy decoding).
    prompt = torch.randint(0, 10000, (1, 10), device=device)
    with torch.no_grad():
        for _ in range(50):
            logits = model(prompt)
            next_id = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            prompt = torch.cat((prompt, next_id), dim=1)
    # Only the first 30 of the 60 ids are shown.
    print("Sample output (token ids):", prompt[0, :30].tolist())