# Part 6: Beautified — Modular Name Generation

This notebook uses:
- **`data_loading`** — dataset download, normalization, vocabularies, and validation
- **`model`** — `CategoryConditionedNameModel` (torch.nn.Module) for category-conditioned next-character prediction

The notebook itself defines no layers inline: the model architecture and the data pipeline live in the `part6` modules listed above, for readability and reuse.

In [None]:
import sys
from pathlib import Path

# Put the repository root on sys.path so the `part6` package is importable
# whether the kernel was started in the repo root or inside the part6/ directory.
cwd = Path.cwd()
root = cwd.parent if cwd.name == "part6" else cwd
root_entry = str(root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from part6.data_loading import (
    load_words_and_categories,
    build_character_vocabulary,
    build_category_vocabulary,
    get_train_val_test_splits,
    validate_dataset,
)
from part6.model import CategoryConditionedNameModel, ModelConfig

%matplotlib inline

# Choose the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(f"Using device: {device}")
if cuda_available:
    # Report the name of GPU 0 and release unoccupied cached allocator memory
    # that may be left over from earlier work in this kernel.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    torch.cuda.empty_cache()

In [None]:
import kagglehub

# Fetch the Kaggle dataset through kagglehub and keep the value it returns;
# `path` is handed to load_words_and_categories in the loading cell below.
path = kagglehub.dataset_download("isaacbenge/fantasy-for-markov-generator")
print(f"Dataset path: {path}")

In [None]:
# Load words and categories.
# An optional extra CSV is looked up under data/ relative to the current
# directory first and its parent second; the first existing file is passed to
# the loader as a string, and None is passed when neither exists.
candidate_csvs = (
    Path("data/forgotten_realms_inhabitants_by_race.csv"),
    Path("../data/forgotten_realms_inhabitants_by_race.csv"),
)
extra_csv = next((str(p) for p in candidate_csvs if p.is_file()), None)

words, categories = load_words_and_categories(
    path,
    extra_csv_path=extra_csv,
    extra_name_column="name",
    extra_category_column="source_category",
)
print(f"Loaded {len(words)} words with {len(categories)} categories")

In [None]:
# Build vocabularies
# Both builders are imported from part6.data_loading. The objects they return
# expose `.size` and `.itos`, which later cells read (model config, generation).
char_vocab = build_character_vocabulary(words)
cat_vocab = build_category_vocabulary(categories)

print(f"Character vocab size: {char_vocab.size}")
print(f"Category vocab size: {cat_vocab.size}")
# Despite the "Sample" label, this prints the complete char_vocab.itos mapping.
print(f"Sample chars: {char_vocab.itos}")

In [None]:
# Train/val/test splits and dataset building
block_size = 24  # shared by the dataset builder here and ModelConfig later

train_split, val_split, test_split = get_train_val_test_splits(
    words,
    cat_vocab.normalized_categories,
    char_vocab,
    cat_vocab,
    block_size,
    train_frac=0.95,
    val_frac=0.03,
    seed=42,
)

# Each split unpacks into three tensors: X (model inputs), Y (targets) and
# C (passed to the model as its second argument).
Xtr, Ytr, Ctr = train_split
Xdev, Ydev, Cdev = val_split
Xte, Yte, Cte = test_split

print(f"Train: {Xtr.shape[0]}, Val: {Xdev.shape[0]}, Test: {Xte.shape[0]}")

In [None]:
# Validate datasets
# Only the training split is passed through validate_dataset here.
validate_dataset(Xtr, Ytr, Ctr, char_vocab, cat_vocab)

# Sanity checks before training
# Every target must be a valid character index: 0 <= y < char_vocab.size.
assert Ytr.max() < char_vocab.size and Ytr.min() >= 0
# NOTE(review): NaN can only occur in floating-point tensors; if Ytr holds
# integer indices this assertion can never fail — confirm the dtype.
assert not torch.isnan(Ytr).any()

In [None]:
# Model and optimizer
# Seed PyTorch's global RNG before the model is built. Without this only the
# batch sampler below is seeded, so weight initialisation (and anything else
# drawn from the global RNG, such as dropout) differs on every
# Restart & Run All. GPU kernels may still add nondeterminism of their own.
torch.manual_seed(2147483647)

# Dedicated generator used by the training loop to sample mini-batch indices.
g = torch.Generator().manual_seed(2147483647)

config = ModelConfig(
    vocab_size=char_vocab.size,
    cat_vocab_size=cat_vocab.size,
    block_size=block_size,
    n_embd=64,
    n_hidden=512,
    num_heads=8,
    num_attention_blocks=5,
    num_mlp_layers=1,
    cat_emb_dim=32,
    dropout=0.1,
    last_layer_scale=0.1,
)
model = CategoryConditionedNameModel(config).to(device)

# Total number of parameter elements, reported for reference.
n_params = sum(p.numel() for p in model.parameters())
print(f"Parameters: {n_params:,}")

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
# Training loop
max_steps = 300_000
batch_size = 256
lossi = []  # log10(loss) of every step; read by the plotting cell

for i in range(max_steps):
    # Sample batch indices with the seeded generator `g`, then move the
    # selected rows of each training tensor to the training device.
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb, Cb = (tensor[ix].to(device) for tensor in (Xtr, Ytr, Ctr))

    # One optimisation step: forward, backward, parameter update.
    model.train()
    optimizer.zero_grad()
    logits, loss = model(Xb, Cb, targets=Yb)
    loss.backward()
    optimizer.step()

    lossi.append(loss.log10().item())

    # Progress report every 10k steps, including allocated memory on GPU 0.
    if (i + 1) % 10_000 == 0:
        if torch.cuda.is_available():
            mem = torch.cuda.memory_allocated(0) / 1e9
        else:
            mem = 0
        print(f"{i+1:7d}/{max_steps:7d}: loss={loss.item():.4f}  GPU={mem:.2f}GB")

print("Training done.")

In [None]:
# Plot the training curve as the mean log10(loss) over consecutive 1000-step
# windows. Only complete windows are averaged (a trailing partial window is
# dropped), so the cell still works when training was interrupted or when
# max_steps is not a multiple of 1000; a bare view(-1, 1000) raises then.
window = 1000
loss_history = torch.tensor(lossi)
n_complete = (loss_history.numel() // window) * window
if n_complete > 0:
    curve = loss_history[:n_complete].view(-1, window).mean(1)
    x_label = "Step (1k)"
else:
    # Fewer than one full window recorded: show the raw per-step values.
    curve = loss_history
    x_label = "Step"
plt.plot(curve)
plt.xlabel(x_label)
plt.ylabel("log10(loss)")
plt.title("Training loss")
plt.show()

In [None]:
# Evaluation: train and val loss
@torch.no_grad()
def split_loss(split: str, max_examples: int = 2048) -> float:
    """Return the cross-entropy loss of the global `model` on the head of a split.

    Args:
        split: One of "train", "val" or "test".
        max_examples: Positive upper bound on how many leading examples of the
            split are evaluated in a single forward pass. Defaults to 2048,
            the value that was previously hard-coded.

    Returns:
        The mean cross-entropy over the evaluated examples as a Python float.

    Raises:
        KeyError: If `split` is not one of the known split names.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    splits = {
        "train": (Xtr, Ytr, Ctr),
        "val": (Xdev, Ydev, Cdev),
        "test": (Xte, Yte, Cte),
    }
    if split not in splits:
        raise KeyError(f"unknown split {split!r}; expected one of {sorted(splits)}")
    X, Y, C = splits[split]
    n = min(max_examples, X.shape[0])
    Xb = X[:n].to(device)
    Yb = Y[:n].to(device)
    Cb = C[:n].to(device)
    # Only the logits are used; the loss is recomputed explicitly below.
    logits, _ = model(Xb, Cb, targets=Yb)
    loss = F.cross_entropy(logits, Yb)
    return loss.item()

# print(f"Train loss: {split_loss('train'):.4f}")
print(f"Val loss:   {split_loss('val'):.4f}")

In [None]:
# Generate names for a given category
gen = torch.Generator(device=device).manual_seed(2147483647 + 1)

cat_name = "dwarves"
# Case-insensitive lookup of the category index in the category vocabulary.
cat_idx = next(
    (k for k, v in cat_vocab.itos.items() if v.lower() == cat_name.lower()),
    None,
)
if cat_idx is None:
    # Keep the best-effort fallback to index 0, but announce it and label the
    # output with the category that is really being sampled instead of the
    # requested one.
    cat_idx = 0
    fallback_name = cat_vocab.itos[cat_idx]
    print(f"Warning: category '{cat_name}' not found; using '{fallback_name}' instead")
    cat_name = fallback_name

# Put the model in eval mode explicitly so sampling does not depend on the
# evaluation cell having run first (in case model.generate does not switch
# modes itself, dropout would otherwise stay active).
model.eval()
for _ in range(15):
    name = model.generate(cat_idx, char_vocab.itos, generator=gen)
    print(f"[{cat_name}] {name}")

In [None]:
# Random category sampling
import random

gen = torch.Generator(device=device).manual_seed(2147483647 + 2)
# Cell-local, seeded Python RNG: the category picks repeat on every run and do
# not depend on (or disturb) the state of the global `random` module.
category_rng = random.Random(2147483647 + 2)
cat_indices = list(cat_vocab.itos.keys())

# Put the model in eval mode explicitly so sampling does not depend on which
# earlier cells were run (in case model.generate does not switch modes itself).
model.eval()
for _ in range(10):
    c = category_rng.choice(cat_indices)
    cat_label = cat_vocab.itos[c]
    name = model.generate(c, char_vocab.itos, generator=gen)
    print(f"[{cat_label}] {name}")