<h1 style="text-align: center; font-weight: bold; font-size: 36px;">Character Level MLP - Activations, Gradients and BatchNorm</h1>

# Introduction

Let's create an **MLP** model and explore training and debugging techniques.

Inspired by Karpathy's [Neural Networks: Zero-to-Hero](https://github.com/karpathy/nn-zero-to-hero).
We are using the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) as in Zero to Hero so we can compare results.

# Imports

In [None]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Print tensors with 4 decimal places and without scientific notation
torch.set_printoptions(precision=4, sci_mode=False)

# Build the Dataset

In [2]:
# Read the dataset: one name per line
with open('../data/names.txt', 'r') as f:
    names = f.read().splitlines()

# Quick summary of the raw data
name_lengths = [len(name) for name in names]
print("Num names:", len(names))
print("Example names:", names[:10])
print("Min length:", min(name_lengths))
print("Max length:", max(name_lengths))

Num names: 32033
Example names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Min length: 2
Max length: 15


In [3]:
# Vocabulary: '.' at index 0, followed by every distinct character
# that occurs in the names, in sorted order
letters = ['.'] + sorted(set(''.join(names)))
n_vocab = len(letters)
print(letters)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
class Tokenizer:
    """Character-level tokenizer.

    Maps each single-character string in the vocabulary to an integer id
    (its position in the vocabulary list) and back.
    """

    def __init__(self, vocab):
        """Build the char->id and id->char lookup tables.

        Args:
            vocab: list of single-character strings; the index of a character
                in this list becomes its token id.

        Raises:
            AssertionError: if `vocab` is not a list of one-character strings.
        """
        assert isinstance(vocab, list)
        assert all(isinstance(v, str) for v in vocab)
        assert all(len(v) == 1 for v in vocab)
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for i, ch in enumerate(vocab)}

    def encode(self, text):
        """Return the list of token ids for the characters of `text`.

        Raises:
            KeyError: if `text` contains a character that is not in the vocabulary.
        """
        return [self.stoi[s] for s in text]

    def decode(self, sequence):
        """Turn token ids back into a string.

        Args:
            sequence: a list or tuple of ids, a single int id, or a 0-d / 1-d
                torch.Tensor of ids.

        Returns:
            The decoded string (a single character for an int or 0-d tensor).

        Raises:
            ValueError: if `sequence` has an unsupported type.
            AssertionError: if a tensor has more than one dimension.
            KeyError: if an id is not in the vocabulary.
        """
        if isinstance(sequence, torch.Tensor):
            assert sequence.ndim in [0, 1]
            if sequence.ndim == 0:
                return self.itos[sequence.item()]  # one char
            # One bulk conversion instead of calling .item() per element
            sequence = sequence.tolist()
        elif isinstance(sequence, int):
            return self.itos[sequence]  # one char
        elif not isinstance(sequence, (list, tuple)):
            raise ValueError(f"Type {type(sequence)} not supported")
        return ''.join(self.itos[i] for i in sequence)

In [5]:
def build_dataset(tok, block_size, names):
    """Build next-character training examples from a list of names.

    Each name is padded with `block_size` leading '.' tokens and one trailing
    '.' token; every window of `block_size` consecutive tokens becomes an
    input row and the token that follows it becomes the target.

    Args:
        tok: tokenizer whose `encode(text)` returns one id per character.
        block_size: context length (number of input tokens per example).
        names: iterable of name strings.

    Returns:
        (X, Y): int64 tensors of shape (N, block_size) and (N,). The shapes
        and dtype hold even when `names` is empty (N == 0).

    Side effects:
        Prints the shapes of X and Y.
    """
    X, Y = [], []  # inputs and targets
    for name in names:
        # Encode the padded name once, e.g. '...emma.', then slide a window over the ids
        ids = tok.encode('.' * block_size + name + '.')
        for i in range(len(ids) - block_size):
            X.append(ids[i:i + block_size])
            Y.append(ids[i + block_size])
    n_examples = len(X)
    # Explicit dtype/shape: torch.tensor([]) alone would be a float tensor of shape (0,)
    X = torch.tensor(X, dtype=torch.long).reshape(n_examples, block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [79]:
block_size = 3  # context length
tok = Tokenizer(vocab=letters)

# Shuffle a copy so `names` keeps its original order: re-running this cell
# then always produces the same split instead of reshuffling an already
# shuffled list.
random.seed(42)
names_shuffled = list(names)
random.shuffle(names_shuffled)

# 80% train / 10% validation / 10% test
n1 = int(0.8*len(names_shuffled))
n2 = int(0.9*len(names_shuffled))

Xtr, Ytr = build_dataset(tok, block_size, names_shuffled[:n1])
Xval, Yval = build_dataset(tok, block_size, names_shuffled[n1:n2])
Xtest, Ytest = build_dataset(tok, block_size, names_shuffled[n2:])

torch.Size([182437, 3]) torch.Size([182437])
torch.Size([22781, 3]) torch.Size([22781])
torch.Size([22928, 3]) torch.Size([22928])


In [159]:
# Cross-entropy of a model that gives every token the same probability:
# -log(1 / n_vocab)
uniform_prob = torch.tensor(1 / n_vocab)
expected_initial_loss = -torch.log(uniform_prob)
print(expected_initial_loss)

tensor(3.2958)


In [269]:
# Init Layers
# Seed first: the order of the torch.randn calls below determines the initial weights.
torch.manual_seed(42)

n_embd = 10   # embedding size per token
n_hid = 200   # hidden layer width
C = torch.randn((n_vocab, n_embd))                       # n_vocab, n_emb (embeddings)
# Scale for W1: tanh gain (5/3) divided by sqrt(fan_in), fan_in = n_embd*block_size
W1_kaiming_init = (5/3)/ ((n_embd*block_size)**0.5)      # tanh_gain / sqrt(fan_in)
W1 = torch.randn((n_embd*block_size, n_hid)) * W1_kaiming_init  # shape: n_embd*block_size, n_hid
# No hidden-layer bias: the batchnorm below subtracts the batch mean and adds bnbias1 instead.
# b1 = torch.randn((1, n_hid))                 * 0.01     # 1, n_hid1
bngain1 = torch.ones((1, n_hid))    # batchnorm scale, 1 x n_hid
bnbias1 = torch.zeros((1, n_hid))   # batchnorm shift, 1 x n_hid
W2 = torch.randn((n_hid, n_vocab))           * 0.01     # n_hid, n_vocab (small initial logits)
b2 = torch.randn((1, n_vocab))               * 0.00     # 1, n_vocab (multiplied by 0 -> all zeros)
# params = [C, W1, b1, bngain1, bnbias1, W2, b2]
params = [C, W1, bngain1, bnbias1, W2, b2]

# Every tensor in params is trained by gradient descent
for p in params:
    p.requires_grad = True

# No gradient calculation
# Running batchnorm statistics; not in params, so they never get requires_grad
bnmean1_running = torch.zeros((1, n_hid))
bnstd1_running = torch.ones((1, n_hid))

In [270]:
# Bookkeeping for the loss curve
iters = []
losses = []

# Step-wise learning rate: 0.1 for the first 100k steps, 0.01 for the next 100k
lr_schedule = 100_000 * [0.1] + 100_000 * [0.01]
num_epochs = len(lr_schedule)  # one schedule entry per training step
batch_size = 32
i = 0  # step counter

In [271]:
# Mini-batch SGD: one randomly drawn batch per iteration, num_epochs iterations in total.
# NOTE(review): the step counter `i` is not reset in this cell and it indexes lr_schedule,
# so re-running this cell without resetting `i` first steps past the end of the schedule.
for _ in range(num_epochs):

    # Random mini batch
    batch_indices = torch.randint(0, Xtr.shape[0], (batch_size,))
    x_batch = Xtr[batch_indices]
    y_batch = Ytr[batch_indices]

    # Forward Pass
    emb = C[x_batch]                            # n_batch, n_seq, n_emb
    embcat = emb.view(-1, n_embd*block_size)    # n_batch, n_embd*block_size
    z1 = embcat @ W1  # + b1                    # n_batch, n_hid1

    # Batchnorm: normalize every hidden unit over the batch dimension (dim=0),
    # then apply the trainable scale (bngain1) and shift (bnbias1)
    z1_mean = torch.mean(z1, dim=0, keepdim=True)  # 1, n_hid1
    z1_std = torch.std(z1, dim=0, keepdim=True)    # 1, n_hid1
    zx = (z1 - z1_mean) / (z1_std + 1e-5)          # n_batch, n_hid1 (1e-5 keeps the denominator above zero)
    zz = zx * bngain1 + bnbias1                    # n_batch, n_hid1

    # Exponential moving averages of the batch statistics (weight 0.001 on the new batch);
    # evaluate() and sample_name() normalize with these instead of batch statistics
    with torch.no_grad():
        bnmean1_running = 0.999 * bnmean1_running + 0.001 * z1_mean
        bnstd1_running = 0.999 * bnstd1_running + 0.001 * z1_std

    h1 = torch.tanh(zz)                         # n_batch, n_hid1
    logits = h1 @ W2 + b2                       # n_batch, n_vocab
    loss = F.cross_entropy(logits, y_batch)

    # Backward Pass: clear the old gradients, then backpropagate the batch loss
    for p in params:
        p.grad = None
    loss.backward()
    
    # SGD step with the learning rate scheduled for step i
    lr = lr_schedule[i]
    for p in params:
        p.data += -lr * p.grad

    # Progress log every 10000 steps
    if i % 10000 == 0:
        print(i, loss.item())

    # Record the per-batch loss for the loss curve
    iters.append(i)
    losses.append(loss.item())
    i += 1

# Step count reached and the loss of the last mini batch
print(i, loss.item())

0 3.3060147762298584
10000 2.6263699531555176
20000 2.3344156742095947
30000 2.264537811279297
40000 2.169726848602295
50000 2.227142810821533
60000 2.2064599990844727
70000 1.878106713294983
80000 2.375546455383301
90000 2.1862945556640625
100000 2.4630401134490967
110000 2.091844320297241
120000 1.9077339172363281
130000 2.129446506500244
140000 2.0926647186279297
150000 1.826656699180603
160000 1.8776742219924927
170000 2.1841251850128174
180000 2.052818536758423
190000 2.019075632095337
200000 2.0592923164367676


In [None]:
# Diagnostics on z1 and h1 as left by the training cell.
# Each figure gets a title and axis labels so it can be read on its own.

# hidden layer linear output, before batchnorm
plt.hist(z1.view(-1).tolist(), bins=100)
plt.title("z1: hidden linear output (before batchnorm)")
plt.xlabel("value")
plt.ylabel("count")
plt.show()

# hidden layer outputs (after tanh)
plt.hist(h1.view(-1).tolist(), bins=100)
plt.title("h1: hidden layer output (after tanh)")
plt.xlabel("value")
plt.ylabel("count")
plt.show()

# neurons in tanh flat region: white where |h1| > 0.99
plt.imshow(h1.abs() > 0.99, cmap='gray', interpolation='nearest')
plt.title("tanh saturation: white where |h1| > 0.99")
plt.xlabel("hidden unit")
plt.ylabel("example in batch")
plt.show()

In [None]:
# Loss curve: log10 of the per-batch training loss against the step index
plt.plot(iters, torch.tensor(losses).log10())
plt.title("Training loss per mini batch")
plt.xlabel("training step")
plt.ylabel("log10(loss)")
plt.show()

In [272]:
@torch.no_grad()
def calc_batch_norm_params_on_train_set():
    """Compute batchnorm statistics over the whole training set in one pass.

    Runs the embedding lookup and the first linear layer on all of Xtr and
    reduces the result over the example dimension.

    Returns:
        (std, mean): two tensors of shape (1, n_hid), in that order.
    """
    # Embed and flatten every training example
    embedded = C[Xtr]                                       # n_examples, n_seq, n_emb
    flattened = embedded.view(-1, n_embd*block_size)        # n_examples, n_embd*block_size
    pre_activation = flattened @ W1                         # n_examples, n_hid1

    # Per-unit statistics across all examples
    mean = pre_activation.mean(dim=0, keepdim=True)         # 1, n_hid1
    std = pre_activation.std(dim=0, keepdim=True)           # 1, n_hid1
    return std, mean

bnstd1, bnmean1 = calc_batch_norm_params_on_train_set()

# Largest absolute gap between the full-train-set statistics and the
# running estimates collected during training
print((bnstd1 - bnstd1_running).abs().max())
print((bnmean1 - bnmean1_running).abs().max())

tensor(0.0438)
tensor(0.0280)


In [273]:
@torch.no_grad()
def evaluate(Xset, Yset):
    """Return the mean cross-entropy loss of the model on a dataset.

    Batchnorm uses the running mean/std collected during training, so the
    result does not depend on the statistics of `Xset` itself.

    Args:
        Xset: integer tensor of token contexts, shape (n, block_size).
        Yset: integer tensor of target token ids, shape (n,).

    Returns:
        The loss as a Python float.
    """
    # Forward Pass
    emb = C[Xset]                             # n_batch, n_seq, n_emb
    embcat = emb.view(-1, n_embd*block_size)  # n_batch, n_embd*block_size
    z1 = embcat @ W1  # + b1                  # n_batch, n_hid1

    # Batchnorm with running statistics; same `std + 1e-5` denominator as the
    # training forward pass, so train and inference normalize identically
    zx = (z1 - bnmean1_running) / (bnstd1_running + 1e-5)   # n_batch, n_hid1
    zz = zx * bngain1 + bnbias1                             # n_batch, n_hid1

    h1 = torch.tanh(zz)                         # n_batch, n_hid1
    logits = h1 @ W2 + b2                       # n_batch, n_vocab
    loss = F.cross_entropy(logits, Yset)
    return loss.item()

print("train = ", evaluate(Xtr, Ytr))    # ~2.07
print("eval =  ", evaluate(Xval, Yval))  # ~2.12

train =  2.0653347969055176
eval =   2.121156930923462


In [None]:
# Base:
# train =  2.1214349269866943
# eval =   2.1567015647888184

# Fixed W2*0.01 and b2*0.0 init:
# train =  2.064913511276245
# eval =   2.129284143447876

# Fixed W1*0.2 and b1*0.01 init:
# train =  2.0375447273254395
# eval =   2.104278326034546

# Kaiming init
# same as above, we went 0.2 -> 0.3 on W1, so not much difference
# train =  2.0372910499572754
# eval =   2.1173431873321533

# Initial batchnorm (same, no gains expected, NN probably context limited)
# train =  2.067298650741577
# eval =   2.1195051670074463

# Proper batchnorm (running mean/std, no linear layer bias)
# train =  2.0653347969055176
# eval =   2.121156930923462

# Sampling

In [274]:
@torch.no_grad()
def sample_name():
    """Sample one name from the model, one character at a time.

    Starts from a context of `block_size` '.' tokens, repeatedly feeds the
    last `block_size` tokens through the network, samples the next token
    from the softmax distribution, and stops once token 0 ('.') is drawn.

    Returns:
        The generated name as a string, including the trailing '.'.
    """
    context = tok.encode('.'*block_size)
    while True:
        # Use block_size rather than a hard-coded 3 so the window always
        # matches the context length the model was built with
        x = torch.tensor(context[-block_size:]).view(1, -1)   # n_batch=1, n_seq
        # Forward Pass
        emb = C[x]                                # n_batch, n_seq, n_emb
        embcat = emb.view(-1, n_embd*block_size)  # n_batch, n_embd*block_size
        z1 = embcat @ W1  # + b1                  # n_batch, n_hid1

        # Batchnorm with running statistics; same `std + 1e-5` denominator
        # as the training forward pass
        zx = (z1 - bnmean1_running) / (bnstd1_running + 1e-5)   # n_batch, n_hid1
        zz = zx * bngain1 + bnbias1                             # n_batch, n_hid1

        h1 = torch.tanh(zz)                         # n_batch, n_hid1
        logits = h1 @ W2 + b2                       # n_batch, n_vocab
        probs = torch.softmax(logits, dim=1)
        # Sample
        sample = torch.multinomial(probs, 1).item()
        context.append(sample)
        # Break
        if sample == 0:  # stop token
            break

    # Drop the leading padding tokens
    return tok.decode(context)[block_size:]

In [275]:
# Fixed seed so the sampled names are reproducible
torch.manual_seed(42)

# Loop variable is deliberately not `i`: `i` is the global training-step
# counter and must not be overwritten here
for sample_idx in range(10):
    print(sample_name())

anuelen.
tis.
mari.
sedyn.
shan.
silaylen.
kemah.
lucin.
epiachaleilani.
sana.
