<h1 style="text-align: center; font-weight: bold; font-size: 36px;">Character Level MLP - Activations, Gradients and BatchNorm</h1>

# Introduction

Let's create an **MLP** model and explore training and debugging techniques.

Inspired by Karpathy's [Neural Networks: Zero to Hero](https://github.com/karpathy/nn-zero-to-hero).
We are using the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) as in Zero to Hero so we can compare results.

# Imports

In [1]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Build the Dataset

In [2]:
# Read the corpus: one name per line.
with open('../data/names.txt', 'r') as names_file:
    names = names_file.read().splitlines()

# Quick sanity summary of what was loaded.
name_lengths = [len(name) for name in names]
print("Num names:", len(names))
print("Example names:", names[:10])
print("Min length:", min(name_lengths))
print("Max length:", max(name_lengths))

Num names: 32033
Example names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Min length: 2
Max length: 15


In [3]:
# Vocabulary: every distinct character seen in the names, in sorted order,
# with '.' placed first so the start/stop token gets index 0.
letters = ['.', *sorted(set(''.join(names)))]
n_vocab = len(letters)
print(letters)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
class Tokenizer:
    """Character-level tokenizer mapping single characters to integer ids.

    Ids are the positions of the characters in ``vocab``.

    Args:
        vocab: list of single-character strings.

    Raises:
        AssertionError: if ``vocab`` is not a list of one-character strings.
    """

    def __init__(self, vocab):
        assert isinstance(vocab, list)
        assert all(isinstance(v, str) for v in vocab)
        assert all(len(v) == 1 for v in vocab)
        self.stoi = {ch: i for i, ch in enumerate(vocab)}  # char -> id
        self.itos = {i: ch for i, ch in enumerate(vocab)}  # id -> char

    def encode(self, text):
        """Return the list of ids for ``text``, one id per character.

        Raises:
            KeyError: if a character is not in the vocabulary.
        """
        return [self.stoi[s] for s in text]

    def decode(self, sequence):
        """Turn ids back into a string.

        Args:
            sequence: a list or tuple of ids, a single int id, or a
                ``torch.Tensor`` with 0 dimensions (one id) or 1 dimension
                (a sequence of ids).

        Returns:
            The decoded string (a single character for a single id).

        Raises:
            AssertionError: if a tensor has more than one dimension.
            ValueError: if ``sequence`` is of an unsupported type.
            KeyError: if an id is not in the vocabulary.
        """
        if isinstance(sequence, torch.Tensor):
            assert sequence.ndim in [0, 1]
            ids = sequence.tolist()  # Python scalar for 0-d, list for 1-d
            if sequence.ndim == 0:
                return self.itos[ids]  # one char
            return ''.join(self.itos[i] for i in ids)
        if isinstance(sequence, int):
            # Plain Python id, e.g. the result of ``tensor.item()``.
            return self.itos[sequence]
        if isinstance(sequence, (list, tuple)):
            return ''.join(self.itos[i] for i in sequence)
        raise ValueError(f"Type {type(sequence)} not supported")

In [5]:
def build_dataset(tok, block_size, names):
    """Build (context, next-character) training pairs from a list of names.

    Each name is padded with ``block_size`` leading '.' characters and one
    trailing '.', then a window of ``block_size`` characters slides over it;
    the character following each window is the target.

    Args:
        tok: tokenizer whose ``encode(text)`` returns one id per character.
        block_size: number of context characters per example.
        names: iterable of strings.

    Returns:
        ``(X, Y)``: ``X`` is an int64 tensor of shape (n_examples, block_size)
        and ``Y`` an int64 tensor of shape (n_examples,). With no names the
        tensors are empty but keep that dtype and rank.

    Side effects:
        Prints the shapes of ``X`` and ``Y``.
    """
    X, Y = [], []  # inputs and targets
    for name in names:
        # Encode the padded name once ('...emma.') and slice ids per window.
        ids = tok.encode('.' * block_size + name + '.')
        for i in range(len(ids) - block_size):
            X.append(ids[i:i + block_size])
            Y.append(ids[i + block_size])
    # Explicit dtype/shape so an empty input does not degrade to a float
    # tensor of shape (0,).
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [6]:
block_size = 3  # context length
tok = Tokenizer(vocab=letters)

# Shuffle a copy rather than `names` itself: an in-place shuffle would make
# this cell non-idempotent (re-running it would reshuffle an already shuffled
# list and silently change the split). The permutation on a fresh run is the
# same as shuffling `names` directly with this seed.
names_shuffled = list(names)
random.seed(42)
random.shuffle(names_shuffled)

# 80% train / 10% validation / 10% test
n1 = int(0.8*len(names_shuffled))
n2 = int(0.9*len(names_shuffled))

Xtr, Ytr = build_dataset(tok, block_size, names_shuffled[:n1])
Xval, Yval = build_dataset(tok, block_size, names_shuffled[n1:n2])
Xtest, Ytest = build_dataset(tok, block_size, names_shuffled[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [7]:
# Initialise the model parameters
torch.manual_seed(42)

n_embd = 10     # embedding size per character
n_hidden = 200  # hidden layer width

# One shape per parameter. The order matters: the tensors are drawn from the
# global RNG in exactly this sequence.
param_shapes = [
    (n_vocab, n_embd),              # C:  n_vocab, n_emb (embeddings)
    (n_embd*block_size, n_hidden),  # W1: n_seq*n_emb, n_hid1
    (1, n_hidden),                  # b1: 1, n_hid1
    (n_hidden, n_vocab),            # W2: n_hid1, n_out
    (1, n_vocab),                   # b2: 1, n_out
]
C, W1, b1, W2, b2 = params = [
    torch.randn(shape, requires_grad=True) for shape in param_shapes
]

In [8]:
# Piecewise-constant learning rate: one entry per training step.
n_steps_per_phase = 100_000
lr_schedule = [0.1] * n_steps_per_phase + [0.01] * n_steps_per_phase
num_epochs = len(lr_schedule)  # number of mini-batch steps
batch_size = 32

# Training progress: step counter plus the per-step history used for plotting.
i = 0
iters = []
losses = []

In [9]:
# Mini-batch SGD. The step counter `i` and the `iters`/`losses` histories are
# defined outside this cell, so running the cell again continues training.
for _ in range(num_epochs):

    # Random mini batch
    batch_indices = torch.randint(0, Xtr.shape[0], (batch_size,))
    x_batch = Xtr[batch_indices]
    y_batch = Ytr[batch_indices]

    # Forward Pass
    emb = C[x_batch]                          # n_batch, n_seq, n_emb
    embcat = emb.view(-1, n_embd*block_size)  # n_batch, n_embd*block_size
    z1 = embcat @ W1 + b1                     # n_batch, n_hid1
    h1 = torch.tanh(z1)                       # n_batch, n_hid1
    logits = h1 @ W2 + b2                     # n_batch, n_vocab
    loss = F.cross_entropy(logits, y_batch)

    # Backward Pass
    for p in params:
        p.grad = None
    loss.backward()

    # Clamp the index so that continuing past the end of the schedule keeps
    # using the final learning rate instead of raising IndexError.
    lr = lr_schedule[min(i, len(lr_schedule) - 1)]
    # Update in place under no_grad rather than through `.data`, which
    # bypasses autograd's checks on in-place modification.
    with torch.no_grad():
        for p in params:
            p += -lr * p.grad

    if i % 10000 == 0:
        print(i, loss.item())

    iters.append(i)
    losses.append(loss.item())
    i += 1
print(i, loss.item())

0 28.3853759765625
10000 2.928753137588501
20000 2.7920820713043213
30000 2.2142562866210938
40000 2.274606704711914
50000 1.9432498216629028
60000 2.0952043533325195
70000 2.558513641357422
80000 2.2729926109313965
90000 1.9342801570892334
100000 2.281275987625122
110000 2.261439085006714
120000 1.9926286935806274
130000 2.179287910461426
140000 1.9996957778930664
150000 2.20851993560791
160000 2.2130329608917236
170000 1.991726279258728
180000 2.318720817565918
190000 2.176032543182373
200000 2.0051398277282715


In [None]:
# Per-step training loss on a log10 scale.
fig, ax = plt.subplots()
ax.plot(iters, torch.tensor(losses).log10())
ax.set(title="Training loss per mini-batch", xlabel="iteration", ylabel="log10(loss)")
plt.show()

In [None]:
@torch.no_grad()
def evaluate(Xset, Yset):
    """Return the cross-entropy loss of the current model on one data split.

    Runs the same forward pass as training over all of ``Xset`` at once,
    with gradient tracking disabled, using the global parameters
    ``C``, ``W1``, ``b1``, ``W2`` and ``b2``.

    Args:
        Xset: context indices used to index the embedding table ``C``.
        Yset: target indices passed to ``F.cross_entropy``.

    Returns:
        The loss as a Python float.
    """
    flat_emb = C[Xset].view(-1, n_embd*block_size)  # n_batch, n_embd*block_size
    hidden = (flat_emb @ W1 + b1).tanh()            # n_batch, n_hid1
    scores = hidden @ W2 + b2                       # n_batch, n_vocab
    return F.cross_entropy(scores, Yset).item()

# Report the loss on the training and validation splits (~2.12 and ~2.15).
for split_label, X_split, Y_split in (("train = ", Xtr, Ytr), ("eval =  ", Xval, Yval)):
    print(split_label, evaluate(X_split, Y_split))

train =  2.1214349269866943
eval =   2.1567015647888184


# Sampling

In [15]:
@torch.no_grad()
def sample_name():
    """Sample one name from the model, one character at a time.

    Starts from a context of ``block_size`` '.' tokens, repeatedly runs the
    forward pass on the last ``block_size`` tokens and samples the next token
    from the softmax distribution, stopping once the '.' token is drawn.
    Uses the global ``tok``, ``block_size``, ``n_embd`` and model parameters.

    Returns:
        The sampled name as a string, including the trailing '.'.
    """
    stop_id = tok.encode('.')[0]  # id of the start/stop token
    context = tok.encode('.'*block_size)
    while True:
        # Context window sized by block_size rather than a hard-coded 3.
        x = torch.tensor(context[-block_size:]).view(1, -1)  # n_batch=1, n_seq
        # Forward Pass
        emb = C[x]                                           # n_batch, n_seq, n_emb
        embcat = emb.view(-1, n_embd*block_size)             # n_batch, n_embd*block_size
        h1 = torch.tanh(embcat @ W1 + b1)                    # n_batch, n_hid1
        logits = h1 @ W2 + b2                                # n_batch, n_vocab
        probs = torch.softmax(logits, dim=1)
        # Sample
        sample = torch.multinomial(probs, 1).item()
        context.append(sample)
        # Break
        if sample == stop_id:  # stop token
            break

    # Drop the leading padding tokens before decoding the result.
    return tok.decode(context)[block_size:]

In [None]:
torch.manual_seed(42)

# Use `_` as the loop variable: `i` is the global training step counter and
# must not be overwritten here.
for _ in range(10):
    print(sample_name())

anuelen.
tia.
marian.
dan.
shawnika.
yana.
kemah.
lani.
sepiaciolaniam.
mik.
