In [5]:
import torch
import torch.nn as nn
from standard_LLM import TinyTransformer
from dataset import get_batch_split, vocab_size, stoi, decode

In [6]:
# Choose the compute backend, in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: mps


In [7]:
# --- Model / batching hyperparameters -------------------------------------
# block_size and batch_size are also forwarded to get_batch_split when sampling.
block_size = 64   # presumably the context length in tokens -- confirm in TinyTransformer
batch_size = 32
n_embd = 128
n_heads = 4
n_layers = 4

# --- Training schedule -----------------------------------------------------
steps = 5000         # number of optimisation iterations in the training loop
eval_interval = 500  # run estimate_loss() every this many steps (and on the last step)
eval_steps = 50      # batches averaged per split inside estimate_loss()

# --- Model, optimiser and loss ---------------------------------------------
model = TinyTransformer(
    vocab_size,
    n_embd=n_embd,
    block_size=block_size,
    n_heads=n_heads,
    n_layers=n_layers,
)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, ``eval_steps`` batches are drawn with ``get_batch_split``,
    run through the module-level ``model`` with gradients disabled, and scored
    with ``loss_fn``; the per-batch losses are then averaged.

    The model is switched to eval mode for the duration of the call and is put
    back into whatever mode (train or eval) it was in beforehand, even if
    evaluation raises.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}`` where each value is a
        0-dim ``torch.Tensor``.
    """
    # Remember the caller's mode so we restore it instead of forcing train().
    was_training = model.training
    model.eval()
    try:
        out = {}
        for split in ['train', 'val']:
            losses = torch.zeros(eval_steps)
            for k in range(eval_steps):
                x, y = get_batch_split(split, block_size=block_size, batch_size=batch_size, device=device)
                logits = model(x)
                B, T, C = logits.shape
                # CrossEntropyLoss wants (N, C) logits against (N,) targets.
                loss = loss_fn(logits.view(B*T, C), y.view(B*T))
                losses[k] = loss.item()
            out[split] = losses.mean()
        return out
    finally:
        model.train(was_training)

# Main optimisation loop: one freshly sampled training batch per step.
final_step = steps - 1
for step in range(steps):
    # Clear gradients left over from the previous iteration before doing any work.
    optimizer.zero_grad()

    x, y = get_batch_split('train', block_size=block_size, batch_size=batch_size, device=device)

    # Forward pass; collapse the leading (B, T) dims so the loss sees one row per position.
    logits = model(x)
    B, T, C = logits.shape
    n_positions = B * T
    loss = loss_fn(logits.view(n_positions, C), y.view(n_positions))

    # Backward pass with the global gradient norm clipped to 1.0, then the parameter update.
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

    # Periodic progress report, plus one on the very last step.
    if step == final_step or step % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

# Persist only the weights (state_dict), not the full module object.
torch.save(model.state_dict(), 'tiny_shakespeare_model.pth')
print("Model saved to tiny_shakespeare_model.pth")


Step 0: train loss 3.8858, val loss 3.8961
Step 500: train loss 2.1981, val loss 2.2304
Step 1000: train loss 1.9259, val loss 2.0086
Step 1500: train loss 1.7543, val loss 1.8949
Step 2000: train loss 1.6566, val loss 1.8181
Step 2500: train loss 1.5756, val loss 1.7449
Step 3000: train loss 1.5302, val loss 1.7086
Step 3500: train loss 1.4901, val loss 1.6896
Step 4000: train loss 1.4715, val loss 1.6802
Step 4500: train loss 1.4435, val loss 1.6265
Step 4999: train loss 1.4210, val loss 1.6294
Model saved to tiny_shakespeare_model.pth


In [20]:
def generate(model, start, length=100, temperature=0.8, top_k=40):
    """Sample text from ``model`` starting from the prompt ``start``.

    Each element of ``start`` is mapped to a token id through the module-level
    ``stoi`` table, the ids are handed to ``model.generate`` as a batch of one,
    and the first row of the result is turned back into text with ``decode``.

    Side effect: ``model`` is put into eval mode and left there.

    Args:
        model: object exposing ``eval()`` and
            ``generate(x, max_new_tokens=..., temperature=..., top_k=...)``.
        start: non-empty prompt; every element must be a key of ``stoi``.
        length: forwarded to ``model.generate`` as ``max_new_tokens``.
        temperature: forwarded unchanged to ``model.generate``.
        top_k: forwarded unchanged to ``model.generate``.

    Returns:
        Whatever ``decode`` returns for the generated id list (presumably the
        prompt followed by the sampled continuation -- confirm in the model).

    Raises:
        ValueError: if ``start`` is empty.
        KeyError: if ``start`` contains a character missing from ``stoi``.
    """
    # Validate the prompt before touching the model: an empty prompt would
    # otherwise become a (1, 0) tensor and fail somewhere inside the model.
    if len(start) == 0:
        raise ValueError("start must contain at least one character")
    try:
        prompt_ids = [stoi[s] for s in start]
    except KeyError as exc:
        raise KeyError(f"character {exc.args[0]!r} in start is not in the vocabulary") from exc

    model.eval()
    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension: shape (1, len(start)).
        x = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(device)
        idx = model.generate(x, max_new_tokens=length, temperature=temperature, top_k=top_k)
        return decode(idx[0].tolist())

# Sample from the trained model with the prompt "RO", asking for 500 new tokens.
sample_text = generate(model, "RO", length=500)
print(sample_text)

ROMEO:
With all my soul loving should my franch would I read?

QUEEN ELIZABETH:
Banished; the worlds of the woman hangs to thee
colless in your compons and dishonour,
For therefore is the please is langues,
And conclured, the sea in my burnences;
I certain tied which corsuls follow his suits,
Or with the child; if thou distraight of your hate
He shall had and rule with my beausion.

CORIOLANUS:
I me you with my bey-gone.

ROMEO:
Though in this own wish'd, let's an of my chamber:
And in the obes of
