<h1 style="text-align: center; font-weight: bold; font-size: 36px;">Character Level MLP - Torch Autograd</h1>

# Introduction

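Let's create a character-level **MLP** language model trained by **gradient descent**: an embedding table, one tanh hidden layer, and a softmax output that predicts the next character from the previous `block_size` characters. Gradients are computed with PyTorch autograd.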

Inspired by Karpathy's [Neural Networks: Zero-to-Hero](https://github.com/karpathy/nn-zero-to-hero).
We use the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) as Zero to Hero so that the results can be compared.

References:

- [Bengio et al. 2003 MLP language model paper](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

# Imports

In [12]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Build the Dataset

In [2]:
# Load the corpus; each line of the file becomes one entry of `names`.
with open('../data/names.txt', 'r') as f:
    raw_text = f.read()
names = raw_text.splitlines()

# Quick summary of the corpus: size, a few samples, and the length range.
name_lengths = [len(name) for name in names]
print("Num names:", len(names))
print("Example names:", names[:10])
print("Min length:", min(name_lengths))
print("Max length:", max(name_lengths))

Num names: 32033
Example names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Min length: 2
Max length: 15


In [4]:
# Vocabulary: every distinct character appearing in the names, sorted,
# with '.' placed first so that it receives index 0.
unique_chars = set(''.join(names))
letters = ['.', *sorted(unique_chars)]
print(letters)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [65]:
class Tokenizer:
    """Character-level tokenizer mapping single characters to integer ids.

    The id of a character is its position in ``vocab``.

    Attributes are plain dicts:
      stoi -- character -> id
      itos -- id -> character
    """

    def __init__(self, vocab):
        """Build the lookup tables.

        Args:
            vocab: list of single-character strings; list position is the token id.

        Raises:
            TypeError: if ``vocab`` is not a list or contains a non-string entry.
            ValueError: if an entry is not exactly one character long.
        """
        # Explicit raises instead of asserts: asserts are removed under
        # `python -O`, which would silently disable this validation.
        if not isinstance(vocab, list):
            raise TypeError(f"vocab must be a list, got {type(vocab)}")
        if not all(isinstance(v, str) for v in vocab):
            raise TypeError("every vocab entry must be a str")
        if not all(len(v) == 1 for v in vocab):
            raise ValueError("every vocab entry must be a single character")
        self.stoi = {ch: i for i, ch in enumerate(vocab)}
        self.itos = {i: ch for i, ch in enumerate(vocab)}

    def encode(self, text):
        """Return the list of token ids for ``text``, one id per character.

        Raises:
            KeyError: if a character is not in the vocabulary.
        """
        return [self.stoi[s] for s in text]

    def decode(self, sequence):
        """Turn token ids back into a string.

        Args:
            sequence: a list or tuple of ids, a 1-d tensor of ids, or a 0-d
                tensor holding a single id.

        Returns:
            The decoded string (a single character for a 0-d tensor).

        Raises:
            ValueError: for tensors with more than one dimension or for
                unsupported input types.
            KeyError: if an id is not in the vocabulary.
        """
        if isinstance(sequence, torch.Tensor):
            if sequence.ndim == 0:
                return self.itos[sequence.item()]  # one char
            if sequence.ndim == 1:
                # tolist() converts all elements to Python numbers in one call.
                return ''.join(self.itos[i] for i in sequence.tolist())
            raise ValueError(
                f"Expected a 0-d or 1-d tensor, got {sequence.ndim} dimensions"
            )
        if isinstance(sequence, (list, tuple)):
            return ''.join(self.itos[i] for i in sequence)
        raise ValueError(f"Type {type(sequence)} not supported")

In [43]:
def build_dataset(tok, block_size, names):
    """Turn a list of names into (context, next-character) training pairs.

    Each name is padded with ``block_size`` leading '.' characters and one
    trailing '.', e.g. '..emma.' for block_size=2. A window of ``block_size``
    characters slides over the padded name; the window is the input and the
    character right after it is the target.

    Args:
        tok: object with an ``encode(text)`` method returning a list of ids.
        block_size: number of context characters per example.
        names: iterable of strings.

    Returns:
        (X, Y): X is an int64 tensor of shape (num_examples, block_size) and
        Y an int64 tensor of shape (num_examples,). Both shapes are printed.
    """
    X, Y = [], []  # inputs and targets
    for name in names:
        padded = '.'*block_size + name + '.'  # add start/stop tokens '..emma.'
        for i in range(len(padded) - block_size):
            X.append(tok.encode(padded[i:i+block_size]))
            Y.append(tok.encode(padded[i+block_size])[0])  # [0] to keep Y 1d tensor
    # Pin dtype and shape explicitly: with no examples torch.tensor([]) would
    # otherwise yield a float tensor of shape (0,), which cannot index an
    # embedding table and has lost the block_size dimension.
    X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    print(X.shape, Y.shape)
    return X, Y

In [69]:
block_size = 3  # context length: number of previous characters fed to the model
tok = Tokenizer(vocab=letters)

# Shuffle with a fixed seed, then cut into 80% train / 10% validation / 10% test.
# NOTE: the shuffle is in place, so re-running this cell shuffles the
# already-shuffled list and produces a different split.
random.seed(42)
random.shuffle(names)
num_names = len(names)
n1 = int(0.8*num_names)
n2 = int(0.9*num_names)

train_names, val_names, test_names = names[:n1], names[n1:n2], names[n2:]
Xtr, Ytr = build_dataset(tok, block_size, train_names)
Xval, Yval = build_dataset(tok, block_size, val_names)
Xtest, Ytest = build_dataset(tok, block_size, test_names)

torch.Size([182437, 3]) torch.Size([182437])
torch.Size([22781, 3]) torch.Size([22781])
torch.Size([22928, 3]) torch.Size([22928])


In [14]:
# Init Layers
# Sizes are spelled out once; n_vocab and n_seq have to agree with the
# vocabulary size and the context length used to build the dataset.
torch.manual_seed(42)
n_vocab, n_seq, n_emb, n_hid1 = 27, 3, 10, 200

C = torch.randn(n_vocab, n_emb, requires_grad=True)          # n_vocab, n_emb (embeddings)
W1 = torch.randn(n_seq * n_emb, n_hid1, requires_grad=True)  # n_seq*n_emb, n_hid1
b1 = torch.randn(1, n_hid1, requires_grad=True)              # 1, n_hid1
W2 = torch.randn(n_hid1, n_vocab, requires_grad=True)        # n_hid1, n_out
b2 = torch.randn(1, n_vocab, requires_grad=True)             # 1, n_out
params = [C, W1, b1, W2, b2]

In [15]:
# Per-step history collected during training.
iters = []
losses = []
lrs = []

# Step-wise learning-rate schedule: 50000 steps at 0.1 followed by 50000 at 0.01.
lr_schedule = [
    rate
    for rate, n_steps in ((0.1, 50000), (0.01, 50000))
    for _ in range(n_steps)
]
# Despite the name, this is the number of mini-batch steps (one per schedule entry).
num_epochs = len(lr_schedule)
i = 0  # global step counter, advanced by the training loop

In [16]:
# Mini-batch SGD: one parameter update per iteration, with the step counter `i`
# and the history lists kept outside the loop so they persist across cell runs.
for _ in range(num_epochs):

    # Random mini batch (indices drawn independently, so repeats are possible)
    batch_indices = torch.randint(0, Xtr.shape[0], (32,))
    x_batch = Xtr[batch_indices]
    y_batch = Ytr[batch_indices]

    # Forward Pass
    emb = C[x_batch]                                        # n_batch, n_seq, n_emb
    # Flatten per example instead of hard-coding the width (n_seq*n_emb).
    h1 = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)   # n_batch, n_hid1
    logits = h1 @ W2 + b2                                   # n_batch, n_vocab
    loss = F.cross_entropy(logits, y_batch)
    loss_value = loss.item()

    if i % 10000 == 0: print(i, loss_value)

    # Backward Pass
    for p in params:
        p.grad = None
    loss.backward()

    # `i` outlives this loop, so re-running the cell continues past the end of
    # the schedule; clamp to the last entry instead of raising IndexError.
    lr = lr_schedule[min(i, len(lr_schedule) - 1)]
    # Update in place under no_grad rather than through `.data`, so the update
    # is excluded from the graph without bypassing autograd's safety checks.
    with torch.no_grad():
        for p in params:
            p += -lr * p.grad

    iters.append(i)
    losses.append(loss_value)
    lrs.append(lr)
    i += 1


0 28.3853759765625
10000 2.928753137588501
20000 2.7920820713043213
30000 2.2142562866210938
40000 2.274606704711914
50000 1.9432498216629028
60000 2.1025607585906982
70000 2.414307117462158
80000 2.2305052280426025
90000 1.911867380142212


In [None]:
# Plot the recorded mini-batch loss against the training step.
# Alternative view on a log scale:
#plt.plot(iters, torch.log(torch.tensor(losses)))
plt.plot(iters, losses)
plt.show()

In [None]:
# Train Set: loss over the whole training split in a single forward pass.
# Autograd is still enabled here, which is why the printed tensor has a grad_fn.
emb = C[Xtr]                                     # n_train, n_seq, n_emb
pre_act = emb.view(emb.shape[0], -1) @ W1 + b1   # n_train, n_hid1
h1 = pre_act.tanh()                              # n_train, n_hid1
logits = torch.matmul(h1, W2) + b2               # n_train, n_vocab
loss = F.cross_entropy(logits, Ytr)
print(loss)  # ~2.18

tensor(2.1783, grad_fn=<NllLossBackward0>)


In [None]:
# Validation Set: loss over the whole validation split in a single forward pass.
# Autograd is still enabled here, which is why the printed tensor has a grad_fn.
emb = C[Xval]                                    # n_val, n_seq, n_emb
pre_act = emb.view(emb.shape[0], -1) @ W1 + b1   # n_val, n_hid1
h1 = pre_act.tanh()                              # n_val, n_hid1
logits = torch.matmul(h1, W2) + b2               # n_val, n_vocab
loss = F.cross_entropy(logits, Yval)
print(loss)  # ~2.20

tensor(2.2007, grad_fn=<NllLossBackward0>)


# Sampling

In [96]:
def sample_name():
    """Sample one name from the model, one character at a time.

    Starts from a context of ``block_size`` '.' tokens. At each step the last
    ``block_size`` token ids are run through the network (module-level
    C, W1, b1, W2, b2), the next token is drawn from the softmax distribution,
    and generation stops as soon as token 0 (the '.' stop token) is drawn.

    Returns:
        str: the generated characters, including the trailing '.'.
    """
    context = tok.encode('.'*block_size)
    # Inference only: skip building an autograd graph for every step.
    with torch.no_grad():
        while True:
            # Use block_size rather than a literal 3 so the window always
            # matches the context length the dataset was built with.
            x = torch.tensor(context[-block_size:]).view(1, -1)  # n_batch=1, n_seq
            # Forward Pass
            emb = C[x]                                   # n_batch, n_seq, n_emb
            h1 = torch.tanh(emb.view(1, -1) @ W1 + b1)   # n_batch, n_hid1
            logits = h1 @ W2 + b2                        # n_batch, n_vocab
            probs = torch.softmax(logits, dim=1)
            # Sample
            sample = torch.multinomial(probs, 1).item()
            context.append(sample)
            # Break
            if sample == 0:  # stop token
                break

    # Drop the leading padding tokens; keep everything generated.
    return tok.decode(context)[block_size:]

In [97]:
# Fix the RNG seed so the sampled names are reproducible.
torch.manual_seed(42)

# Throwaway loop variable: `i` is the global training-step counter used by the
# training loop, and reusing it here would reset that counter to 9.
for _ in range(10):
    print(sample_name())

anuelen.
tia.
aabidushante.
nariel.
aley.
kemah.
lanie.
epiacenden.
daze.
angon.


# Next Steps

- tune optimization: `lr_schedule`, `batch_size`
- tune model: `emb_size`, `hid_size`
- larger context