# makemore: becoming a backprop ninja

In [2]:
import torch
import torch.nn.functional as F

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [5]:
len(words)

32033

In [6]:
chars = ['.'] + sorted(set(''.join(words)))
stoi = {s:i for i, s in enumerate(chars)}
itos = chars
print(chars)
vocab_size = len(chars)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
from collections import deque

In [8]:
block_size = 3

def build_dataset(words):
    X, Y = [], []

    for w in words:
        # print(w)
        context = deque([0] * block_size)
        for ch in w + '.':
            ix = stoi[ch]
            X.append(list(context))
            Y.append(ix)
            # print(''.join(itos[i] for i in context), '---->', itos[ix])
            context.popleft()
            context.append(ix)
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

g = torch.Generator().manual_seed(2147483647)
train_words, dev_words, test_words = torch.utils.data.random_split(words, [0.8, 0.1, 0.1], generator=g)

In [9]:
len(train_words), len(dev_words), len(test_words)

(25627, 3203, 3203)

In [10]:
Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xtest, Ytest = build_dataset(test_words)

In [16]:
n_embed = 10
n_hidden = 64

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embed),           generator=g)
# Layer 1
W1 = torch.randn((n_embed*block_size, n_hidden), generator=g) * 0.2
b1 = torch.randn(n_hidden,                       generator=g) * 0.1
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),         generator=g) * 0.1
b2 = torch.randn(vocab_size,                     generator=g) * 0.1
# BatchNorm params
bngain = torch.randn((1, n_hidden), generator=g) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden), generator=g) * 0.1

bnmean_running = torch.ones((1, n_hidden))
bnstd_running = torch.zeros((1, n_hidden))

parameters = [C, W1, W2, b2, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

In [12]:
sum(p.nelement() for p in parameters)

12097

In [15]:
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    # minibatch
    idx = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g)
    Xb, Yb = Xtr[idx], Ytr[idx]

    # forward pass
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    
    h = torch.tanh(hpreact)  # Why tanh here?
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)
    
    # backward pass
    for p in parameters:
        p.grad = None  # Warn! typo like `p.gard` doesn't raise an error!!
    loss.backward()

    # update
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/ 200000: 1.8528
  10000/ 200000: 2.4387
  20000/ 200000: 2.2693
  30000/ 200000: 2.2244
  40000/ 200000: 2.0032
  50000/ 200000: 2.1466
  60000/ 200000: 1.9634
  70000/ 200000: 2.2301
  80000/ 200000: 2.1237
  90000/ 200000: 1.9560
 100000/ 200000: 1.9631
 110000/ 200000: 1.8676
 120000/ 200000: 2.1283
 130000/ 200000: 2.3299
 140000/ 200000: 1.9709
 150000/ 200000: 1.9470
 160000/ 200000: 2.5972
 170000/ 200000: 2.1323
 180000/ 200000: 2.0372
 190000/ 200000: 1.9299


In [14]:
def evaluate(X, Y):
    with torch.no_grad():
        emb = C[X]
        embcat = emb.view(emb.shape[0], -1)
        hpreact = embcat @ W1 # + b1
        hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
        h = torch.tanh(hpreact) # Why tanh here?
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Y)
        return loss.item()

print(f'Train loss: {evaluate(Xtr, Ytr)}')
print(f'Test loss: {evaluate(Xtest, Ytest)}')
print(f'Dev loss: {evaluate(Xdev, Ydev)}')

Train loss: 2.062927722930908
Test loss: 2.1066877841949463
Dev loss: 2.1133031845092773


In [42]:
g = torch.Generator().manual_seed(12345)
block_size = 3
for _ in range(20):
    out = []
    context = deque([0] * block_size)
    while True:
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()

        context.popleft()
        context.append(ix)
        
        out.append(ix)
        if ix == 0:
            break
    print(''.join(itos[i] for i in out))

surepian.
nat.
nail.
zerik.
jayley.
mad.
rodyanni.
ter.
carssorer.
chan.
sharlaya.
arahlylan.
gena.
sem.
hendrie.
aden.
jazya.
dalliellarn.
kashlyrion.
maliikott.
