## Load a tiny text dataset

In [2]:
from datasets import load_dataset
ds = load_dataset("wikitext", "wikitext-2-raw-v1")
text = "\n".join(ds["train"]["text"])
# quick tokenizer to small vocab (byte-level or simple char-level):
vocab = sorted(set(list(text)))
stoi = {ch:i for i,ch in enumerate(vocab)}
data_ids = torch.tensor([stoi.get(ch, 0) for ch in text], dtype=torch.long)
vocab_size = len(vocab); vocab_size


1013

## Train both models briefly, log ppl

In [3]:
from spectral_attention.train_eval import make_model, train_tiny_lm
logs_spec = train_tiny_lm(make_model("spectral", depth=2), data_ids, vocab_size, steps=200, T=512)
logs_van  = train_tiny_lm(make_model("vanilla", depth=2), data_ids, vocab_size, steps=200, T=512)


AttributeError: 'SpectralEncoder' object has no attribute 'layers'

## Plot ppl curves

In [None]:
import matplotlib.pyplot as plt
plt.plot([l["step"] for l in logs_spec], [l["ppl"] for l in logs_spec], label="spectral")
plt.plot([l["step"] for l in logs_van],  [l["ppl"] for l in logs_van],  label="vanilla")
plt.legend(); plt.ylabel("perplexity"); plt.xlabel("step")


## Long-range toy (copy task)

In [None]:
import torch, torch.nn as nn
def make_copy_batch(bsz=32, T=1024, vocab_size=32, device="cuda" if torch.cuda.is_available() else "cpu"):
    x = torch.randint(1, vocab_size, (bsz, T), device=device)
    y = x.clone()
    D = 512
    E = torch.nn.functional.one_hot(x, num_classes=vocab_size).float() @ torch.randn(vocab_size, D, device=device)*0.02
    return E, y, vocab_size, D
def train_copy(kind, steps=300, T=2048):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    E, y, V, D = make_copy_batch(T=T, device=device)
    model = make_model(kind, depth=2, d_model=D, n_heads=8).to(device)
    head = nn.Linear(D, V, bias=False).to(device)
    opt = torch.optim.AdamW(list(model.parameters())+list(head.parameters()), lr=3e-4)
    loss_fn = nn.CrossEntropyLoss()
    logs=[]
    for s in range(1, steps+1):
        opt.zero_grad(set_to_none=True)
        h = model(E)
        loss = loss_fn(head(h).reshape(-1, V), y.reshape(-1))
        loss.backward(); nn.utils.clip_grad_norm_(model.parameters(), 1.0); opt.step()
        if s%20==0: logs.append((s, loss.item()))
    return logs
logs_s = train_copy("spectral", T=4096)
logs_v = train_copy("vanilla",  T=4096)
plt.plot(*zip(*logs_s), label="spectral")
plt.plot(*zip(*logs_v), label="vanilla")
plt.legend(); plt.ylabel("loss"); plt.xlabel("step")


NameError: name 'make_model' is not defined