<h1 style="text-align: center; font-weight: bold; font-size: 36px;">Makemore Part 4: Backprop Ninja</h1>

# Introduction

Manual backpropagation through an **MLP** model, with the gradients derived by pen and paper.

Inspired by Andrej Karpathy's [Neural Networks: Zero to Hero](https://github.com/karpathy/nn-zero-to-hero).
We are using the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) as in Zero to Hero so we can compare results.

# Imports

In [1]:
import random
import time

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

# Build the Dataset

In [2]:
# Load the names corpus, one name per line. The encoding is given explicitly
# so the read does not depend on the platform's default locale.
with open('../data/names.txt', 'r', encoding='utf-8') as f:
    names = f.read().splitlines()
print("Num names:", len(names))
print("Example names:", names[:10])
print("Min length:", min(len(name) for name in names))
print("Max length:", max(len(name) for name in names))

Num names: 32033
Example names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Min length: 2
Max length: 15


In [3]:
# Vocabulary: every distinct character of the corpus in sorted order,
# with '.' placed first so that it receives index 0 (start/stop token).
letters = ['.'] + sorted(set(''.join(names)))
n_vocab = len(letters)
print(letters)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
class Tokenizer:
    """Character-level tokenizer: one integer id per single-character token.

    Ids are the positions of the characters in ``vocab``.
    """

    def __init__(self, vocab):
        """Build the char->id and id->char tables from a list of 1-char strings."""
        assert isinstance(vocab, list)
        assert all(isinstance(v, str) for v in vocab)
        assert all(len(v) == 1 for v in vocab)
        self.stoi = {}
        self.itos = {}
        for idx, ch in enumerate(vocab):
            self.stoi[ch] = idx
            self.itos[idx] = ch

    def encode(self, text):
        """Return the list of ids for the characters of ``text``.

        A character that is not in the vocabulary raises ``KeyError``.
        """
        table = self.stoi
        return [table[ch] for ch in text]

    def decode(self, sequence):
        """Convert ids back to a string.

        ``sequence`` may be a list of ids, a 1-d tensor of ids, or a 0-d
        tensor holding a single id. Any other type raises ``ValueError``.
        """
        if isinstance(sequence, torch.Tensor):
            assert sequence.ndim in [0, 1]
            if sequence.ndim == 0:
                return self.itos[sequence.item()]  # single character
            # unwrap the tensor elements so the list path below can be shared
            sequence = [elem.item() for elem in sequence]
        elif not isinstance(sequence, list):
            raise ValueError(f"Type {type(sequence)} not supported")
        return ''.join(self.itos[idx] for idx in sequence)

In [5]:
def build_dataset(tok, block_size, names):
    """Slide a fixed-size context window over every name.

    Each name is prefixed with ``block_size`` '.' characters and suffixed with
    one '.', e.g. '...emma.' for ``block_size=3``. Every window of
    ``block_size`` characters becomes one input row and the character right
    after the window becomes its target.

    Returns ``(X, Y)`` built with ``torch.tensor``; both shapes are printed.
    """
    contexts = []
    targets = []
    for raw_name in names:
        padded = '.' * block_size + raw_name + '.'
        n_windows = len(padded) - block_size
        for start in range(n_windows):
            stop = start + block_size
            contexts.append(tok.encode(padded[start:stop]))
            # encode() returns a list; keep only its single id so Y stays 1-d
            targets.append(tok.encode(padded[stop])[0])
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

In [6]:
block_size = 3  # context length: how many previous characters the model sees
tok = Tokenizer(vocab=letters)

# Shuffle a copy rather than `names` itself. With the same seed this gives the
# same order as an in-place shuffle, but re-running the cell now reproduces the
# split instead of re-shuffling an already shuffled list.
random.seed(42)
names_shuffled = list(names)
random.shuffle(names_shuffled)

# 80% train / 10% validation / 10% test, split by name
n1 = int(0.8*len(names_shuffled))
n2 = int(0.9*len(names_shuffled))

Xtr, Ytr = build_dataset(tok, block_size, names_shuffled[:n1])
Xval, Yval = build_dataset(tok, block_size, names_shuffled[n1:n2])
Xtest, Ytest = build_dataset(tok, block_size, names_shuffled[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:
def cmp(s, at, bt):
    """Print how closely tensor ``at`` matches the reference tensor ``bt``.

    s  : label printed at the start of the line.
    at : tensor under test (e.g. a manually derived gradient).
    bt : reference tensor (e.g. an autograd ``.grad``); may be ``None``.

    Reports exact element-wise equality, ``torch.allclose`` agreement and the
    largest absolute element-wise difference. When either tensor is ``None``
    a note is printed instead of raising.
    """
    if at is None or bt is None:
        # A parameter's .grad stays None until a backward pass has filled it in
        print(f'{s:18s} | nothing to compare: got None')
        return
    ex = torch.all(at == bt).item()
    app = torch.allclose(at, bt)
    maxdiff = (at - bt).abs().max().item()
    print(f'{s:18s} | exact: {str(ex):5s} | approx: {str(app):5s} | maxdiff: {maxdiff}')

# Train the Model

In [8]:
# Hyperparameters
n_batch = 32    # examples per mini-batch
n_embd = 10     # embedding width per character
n_hid = 200     # hidden-layer width

# Batch-norm running statistics (plain buffers, never given a gradient)
bnmean1_running = torch.zeros(1, n_hid)
bnvar1_running = torch.ones(1, n_hid)

# Seed, then draw the parameters in a fixed order so every run starts identically
torch.manual_seed(0)

# Embedding table: (n_vocab, n_embd)
C = torch.randn(n_vocab, n_embd)
# Hidden layer: W1 is (n_embd*block_size, n_hid), b1 is (n_hid,)
W1_kaiming_init = (5/3)/((n_embd*block_size)**0.5)  # tanh gain / sqrt(fan_in)
W1 = torch.randn(n_embd*block_size, n_hid) * W1_kaiming_init
b1 = torch.randn(n_hid) * 0.1
# Batch-norm scale and shift: both (1, n_hid); the gain starts near 1
bngain = torch.randn(1, n_hid) * 0.1 + 1.0
bnbias = torch.randn(1, n_hid) * 0.1
# Output layer: W2 is (n_hid, n_vocab), b2 is (n_vocab,)
W2 = torch.randn(n_hid, n_vocab) * 0.1
b2 = torch.randn(n_vocab) * 0.1

# Trainable parameters, in the order the gradient list is later zipped against
params = [C, W1, b1, bngain, bnbias, W2, b2]
for p in params:
    p.requires_grad = True

In [9]:
# Bookkeeping filled in by the training loop: step index and loss at that step
iters, losses = [], []

# Piecewise-constant learning rate: 100k steps at 0.1, then 100k steps at 0.01
lr_schedule = [0.1]*100_000 + [0.01]*100_000
num_epochs = len(lr_schedule)  # NOTE: despite the name this counts mini-batch steps
batch_size = n_batch           # derived from the hyperparameter so the two cannot drift apart
i = 0                          # global step counter, advanced by the training loop

In [10]:
time_start = time.time()
# Iterate from the current step `i` so that re-running this cell resumes where
# it stopped (and does nothing once the schedule is exhausted) instead of
# indexing past the end of lr_schedule.
for _ in range(i, num_epochs):

    # Random mini batch
    batch_indices = torch.randint(0, Xtr.shape[0], (batch_size,))
    x_batch = Xtr[batch_indices]
    y_batch = Ytr[batch_indices]
    n = x_batch.shape[0]  # rows in this batch; the manual backward pass normalises by it

    # Forward Pass

    # Embedding
    emb = C[x_batch]                                  # n, block_size, n_embd
    embcat = emb.view(-1, n_embd*block_size)          # n, n_embd*block_size
    # Linear 1
    z1 = embcat @ W1 + b1                             # n, n_hid
    # Batchnorm 1 (training mode: statistics of the current batch)
    z1_mean = z1.mean(0, keepdim=True)
    z1_var = z1.var(0, keepdim=True, unbiased=True)   # Bessel-corrected: divides by n-1
    z1_std_inv = (z1_var + 1e-5)**-0.5
    zx = (z1 - z1_mean) * z1_std_inv
    zz = bngain * zx + bnbias
    with torch.no_grad():
        # Moving averages of the batch statistics, updated outside autograd
        bnmean1_running = 0.999 * bnmean1_running + 0.001 * z1_mean
        bnvar1_running = 0.999 * bnvar1_running + 0.001 * z1_var
    # Tanh 1
    h1 = torch.tanh(zz)
    # Linear 2
    logits = h1 @ W2 + b2                             # n, n_vocab
    # Cross Entropy Loss
    loss = F.cross_entropy(logits, y_batch)

    # Backward Pass - Torch
    #loss.backward()

    # Backward Pass - Manual
    with torch.no_grad():
        # Cross Entropy: d_loss/d_logits = (softmax - one_hot(target)) / n
        d_logits = F.softmax(logits, dim=1)
        d_logits[range(n), y_batch] -= 1
        d_logits /= n
        # Linear 2
        d_h1 = d_logits @ W2.T
        d_W2 = h1.T @ d_logits
        d_b2 = d_logits.sum(dim=0)
        # Tanh 1: d/dx tanh(x) = 1 - tanh(x)^2, and h1 already holds tanh(zz)
        d_zz = (1 - h1**2) * d_h1
        # Batch Norm: gain/bias gradients, then the fused gradient w.r.t. z1.
        # The n/(n-1) factor comes from the Bessel-corrected variance above.
        d_bngain = (zx * d_zz).sum(dim=0, keepdim=True)
        d_bnbias = d_zz.sum(dim=0, keepdim=True)
        d_z1 = bngain * z1_std_inv / n * (
            n * d_zz
            - d_zz.sum(0)
            - n/(n-1) * zx * (d_zz * zx).sum(0)
        )
        # Linear 1
        d_embcat = d_z1 @ W1.T
        d_W1 = embcat.T @ d_z1
        d_b1 = d_z1.sum(dim=0)
        # Embedding: a character can occur several times in a batch, so the
        # per-position gradients are accumulated into its row of d_C
        d_emb = d_embcat.view(-1, block_size, n_embd)
        d_C = torch.zeros_like(C)
        d_C.index_add_(0, x_batch.view(-1), d_emb.view(-1, n_embd))
        # Same order as `params`
        grads = [d_C, d_W1, d_b1, d_bngain, d_bnbias, d_W2, d_b2]

    # Update
    lr = lr_schedule[i]
    for p, grad in zip(params, grads):
        # p.data += -lr * p.grad   # Torch
        p.data += -lr * grad      # Manual

    # Stats
    if i % 10000 == 0:
        time_taken = time.time() - time_start
        time_start = time.time()
        print(f"{time_taken:.2f}  {i}  {loss.item()}")
    iters.append(i)
    losses.append(loss.item())
    i += 1


0.06  0  3.466693639755249
24.71  10000  2.5049779415130615
41.14  20000  1.9582557678222656
43.67  30000  1.9478200674057007
42.93  40000  2.1745622158050537
42.67  50000  2.1868133544921875
43.68  60000  1.9040783643722534
40.21  70000  1.9509446620941162
45.13  80000  2.00536847114563
41.18  90000  2.4943363666534424
38.10  100000  2.0863988399505615
41.10  110000  2.2291364669799805
43.78  120000  2.1264493465423584
43.08  130000  2.389422655105591
41.33  140000  2.2911500930786133
44.44  150000  2.2828786373138428
43.70  160000  2.3709001541137695
37.07  170000  1.9904111623764038
50.55  180000  1.9067087173461914
38.19  190000  1.9187291860580444


In [None]:
# Compare every manual gradient with the autograd gradient of the same parameter.
# Autograd gradients exist only when `loss.backward()` is enabled in the training
# loop; otherwise `param.grad` is None and the parameter is reported as skipped.
# NOTE: autograd accumulates into `.grad`, so clear the gradients before the
# backward pass of the step being compared.
# The label list is not called `names`, which already holds the dataset.
grad_names = ["d_C", "d_W1", "d_b1", "d_bngain", "d_bnbias", "d_W2", "d_b2"]
for param, grad, grad_name in zip(params, grads, grad_names):
    if param.grad is None:
        print(f"{grad_name:18s} | skipped: no autograd gradient (enable loss.backward())")
        continue
    # cmp prints the label itself, so no separate print of the name is needed
    cmp(grad_name, grad, param.grad)

In [19]:
@torch.no_grad()
def evaluate(Xset, Yset):
    """Return the cross-entropy loss of the current model on ``(Xset, Yset)``.

    Batch norm runs in inference mode: it normalises with the running mean and
    variance rather than with statistics computed from ``Xset``.
    """
    # Embedding lookup, then one flat row per context
    flat = C[Xset].view(-1, n_embd*block_size)
    # Linear 1
    pre_bn = flat @ W1 + b1
    # Batchnorm 1 with the running statistics
    inv_std = (bnvar1_running + 1e-5)**-0.5
    normed = (pre_bn - bnmean1_running) * inv_std
    # Scale and shift, then Tanh 1
    hidden = torch.tanh(bngain * normed + bnbias)
    # Linear 2
    logits = hidden @ W2 + b2
    # Cross Entropy Loss, returned as a Python float
    return F.cross_entropy(logits, Yset).item()

# Report the loss on the training and validation splits
for label, (Xsplit, Ysplit) in [("train = ", (Xtr, Ytr)), ("eval =  ", (Xval, Yval))]:
    print(label, evaluate(Xsplit, Ysplit))

train =  2.072300910949707
eval =   2.1154792308807373


In [20]:
@torch.no_grad()
def sample_name():
    """Sample one name from the model, one character at a time.

    Starts from a context of ``block_size`` '.' tokens. At every step the last
    ``block_size`` tokens are fed through the network (batch norm uses the
    running statistics) and the next token is drawn from the softmax
    distribution. Sampling stops when the stop token (id 0) is drawn.

    Returns the generated string, including the trailing '.'.
    """
    context = tok.encode('.'*block_size)
    while True:
        # Window size follows block_size instead of a hard-coded 3, so the
        # function stays correct if the context length is changed.
        x = torch.tensor(context[-block_size:]).view(1, -1)  # 1, block_size
        # Forward Pass
        emb = C[x]                                         # 1, block_size, n_embd
        embcat = emb.view(-1, n_embd*block_size)           # 1, n_embd*block_size
        # Linear 1
        z1 = embcat @ W1 + b1                              # 1, n_hid
        # Batchnorm 1 (running statistics; a batch of one has no usable variance)
        z1_std_inv = (bnvar1_running + 1e-5)**-0.5
        zx = (z1 - bnmean1_running) * z1_std_inv
        zz = bngain * zx + bnbias
        # Tanh 1
        h1 = torch.tanh(zz)
        # Linear 2
        logits = h1 @ W2 + b2                              # 1, n_vocab
        probs = torch.softmax(logits, dim=1)
        # Sample
        sample = torch.multinomial(probs, 1).item()
        context.append(sample)
        # Break
        if sample == 0:  # stop token
            break

    # Drop the leading padding tokens
    return tok.decode(context)[block_size:]

In [21]:
torch.manual_seed(42)  # fixed seed so the printed samples are reproducible

# `_` rather than `i`: `i` is the training loop's global step counter and
# overwriting it here would corrupt a later re-run of the training cell.
for _ in range(10):
    print(sample_name())

anuellyn.
jamar.
idushan.
shan.
silaylen.
kemarce.
man.
emiah.
nasildie.
kani.
