<h1 style="text-align: center; font-weight: bold; font-size: 36px;">Character Level MLP - Torch Autograd</h1>

# Introduction

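Let's create a character-level **MLP** language model trained by **gradient descent** - an embedding table, one tanh hidden layer, and a linear output layer that predicts the next character from the previous three.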

Inspired by Karpathy's [Neural Networks: Zero to Hero](https://github.com/karpathy/nn-zero-to-hero).
We use the same [names.txt](https://github.com/karpathy/makemore/blob/master/names.txt) as Zero to Hero so we can compare results.

References:

- [Bengio et al. 2003 MLP language model paper](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

# Imports

In [202]:
import torch
import torch.nn.functional as F

# Build the Dataset

In [3]:
# Load the names dataset: one name per line.
# An explicit encoding keeps the read independent of the platform's default locale.
with open('../data/names.txt', 'r', encoding='utf-8') as f:
    names = f.read().splitlines()

# Quick summary of what was loaded
print("Num names:", len(names))
print("Example names:", names[:10])
print("Min length:", min(len(name) for name in names))
print("Max length:", max(len(name) for name in names))

Num names: 32033
Example names: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']
Min length: 2
Max length: 15


In [20]:
# Collect every distinct character used across the names; printing the result
# below lets us check by eye that the vocabulary is plain lowercase ASCII.
letters = sorted(set(''.join(names)))

# Prepend '.', the single token used for start, stop, and padding
letters = ['.'] + letters
print(letters)

['.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [122]:
class Tokenizer:
    """Character-level tokenizer: maps single characters to integer ids and back.

    A character's id is its position in the ``vocab`` list.
    """

    def __init__(self, vocab):
        """Build the lookup tables from ``vocab``, a list of one-character strings."""
        assert isinstance(vocab, list)
        assert all(isinstance(v, str) for v in vocab)
        assert all(len(v) == 1 for v in vocab)
        self.stoi = {ch: i for i, ch in enumerate(vocab)}  # char -> id
        self.itos = dict(enumerate(vocab))                 # id -> char

    def encode(self, text):
        """Return the list of ids for the characters of ``text``.

        Raises KeyError if a character is not in the vocabulary.
        """
        return [self.stoi[s] for s in text]

    def decode(self, sequence):
        """Turn ids back into text.

        ``sequence`` may be a 0-d or 1-d tensor, or anything ``torch.as_tensor``
        can convert to one (a plain int, or the list returned by ``encode``).
        A 0-d input yields a single character; a 1-d input yields the joined
        string. Raises KeyError for an id that is not in the vocabulary.
        """
        if not isinstance(sequence, torch.Tensor):
            # Accept encode()'s output directly so decode(encode(text)) round-trips
            sequence = torch.as_tensor(sequence)
        assert sequence.ndim in [0, 1]
        if sequence.ndim == 0:
            return self.itos[sequence.item()]  # one char
        # tolist() converts all ids in one call instead of one .item() per element
        return ''.join(self.itos[i] for i in sequence.tolist())

In [123]:
# Instantiate the tokenizer over the character vocabulary
tok = Tokenizer(vocab=letters)

# Show the first ten entries of each lookup table (char -> id, then id -> char)
for table in (tok.stoi, tok.itos):
    print(list(table.items())[:10])

[('.', 0), ('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5), ('f', 6), ('g', 7), ('h', 8), ('i', 9)]
[(0, '.'), (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e'), (6, 'f'), (7, 'g'), (8, 'h'), (9, 'i')]


In [124]:
blok_size = 3  # context length

X, Y = [], []  # inputs and targets

for name in names:
    # Pad with blok_size start tokens and one stop token, e.g. '...emma.' for blok_size = 3
    padded = '.' * blok_size + name + '.'
    ids = tok.encode(padded)  # encode once per name, then slice the windows out of the ids
    for i in range(len(ids) - blok_size):
        X.append(ids[i:i + blok_size])  # context window
        Y.append(ids[i + blok_size])    # the id that follows it (a plain int keeps Y 1d)

X = torch.tensor(X)
Y = torch.tensor(Y)

print("Num examples:", len(X))
print(f"X {X.shape},{X.dtype}:")
print(X[:10])
print(f"Y {Y.shape},{Y.dtype}:")
print(Y[:10])

Num examples: 228146
X torch.Size([228146, 3]),torch.int64:
tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22]])
Y torch.Size([228146]),torch.int64:
tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9])


In [252]:
# Take a small fixed batch from the front of the dataset
x_batch, y_batch = X[:32], Y[:32]

# Print the first ten (context, target) pairs, both as raw ids and as decoded text
for x, y in zip(x_batch[:10], y_batch[:10]):
    ids_col = f"{str(x):<20} -> {str(y):<16}"
    text_col = f"{tok.decode(x)} -> {tok.decode(y)}"
    print(f"{ids_col}     {text_col}")

tensor([0, 0, 0])    -> tensor(5)            ... -> e
tensor([0, 0, 5])    -> tensor(13)           ..e -> m
tensor([ 0,  5, 13]) -> tensor(13)           .em -> m
tensor([ 5, 13, 13]) -> tensor(1)            emm -> a
tensor([13, 13,  1]) -> tensor(0)            mma -> .
tensor([0, 0, 0])    -> tensor(15)           ... -> o
tensor([ 0,  0, 15]) -> tensor(12)           ..o -> l
tensor([ 0, 15, 12]) -> tensor(9)            .ol -> i
tensor([15, 12,  9]) -> tensor(22)           oli -> v
tensor([12,  9, 22]) -> tensor(9)            liv -> i


In [285]:
# Init Layers
torch.manual_seed(42)

# Layer sizes. n_seq must match the context length used to build X.
n_vocab = 27   # '.' plus the 26 letters
n_seq = 3      # characters of context per example
n_emb = 2      # embedding dimensions per character
n_hid1 = 100   # hidden units

# NOTE: creation order decides which random values each tensor receives under the fixed seed
C = torch.randn((n_vocab, n_emb), requires_grad=True)          # n_vocab, n_emb (embeddings)
W1 = torch.randn((n_seq * n_emb, n_hid1), requires_grad=True)  # n_seq*n_emb, n_hid1
b1 = torch.randn((1, n_hid1), requires_grad=True)              # 1, n_hid1
W2 = torch.randn((n_hid1, n_vocab), requires_grad=True)        # n_hid1, n_out
b2 = torch.randn((1, n_vocab), requires_grad=True)             # 1, n_out
params = [C, W1, b1, W2, b2]

In [286]:
# Mini-batch SGD: 1000 steps, 32 random examples per step, learning rate 0.1
for i in range(1000):

    # Random mini batch
    batch_indices = torch.randint(0, X.shape[0], (32,))
    x_batch = X[batch_indices]
    y_batch = Y[batch_indices]

    # Forward Pass
    emb = C[x_batch]                                       # n_batch, n_seq, n_emb
    h1 = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)  # n_batch, n_hid1
    logits = h1 @ W2 + b2                                  # n_batch, n_vocab
    loss = F.cross_entropy(logits, y_batch)

    if i % 100 == 0:
        print(loss.item())

    # Backward Pass
    for p in params:
        p.grad = None  # clear gradients left over from the previous step
    loss.backward()

    # SGD update. no_grad() keeps the in-place step from being tracked by
    # autograd (the supported alternative to writing through p.data).
    with torch.no_grad():
        for p in params:
            p += -0.1 * p.grad

# Expect the mini-batch loss to drop to ~2.5

14.903022766113281
3.795100688934326
3.4425148963928223
2.797386646270752
2.9738969802856445
2.947486400604248
2.6208128929138184
2.5312087535858154
2.565312623977661
2.579007387161255


In [None]:
# Loss over the entire dataset (every example in a single batch)
emb = C[X]                               # n_examples, n_seq, n_emb
hidden_pre = emb.view(-1, 6) @ W1 + b1   # n_examples, n_hid1 (pre-activation)
h1 = hidden_pre.tanh()                   # n_examples, n_hid1
logits = h1 @ W2 + b2                    # n_examples, n_vocab
loss = F.cross_entropy(input=logits, target=Y)
print(loss)  # ~2.71

tensor(2.7095, grad_fn=<NllLossBackward0>)


# Appendix

In [240]:
def verify_softmax_and_cross_entropy():
    """Check hand-written softmax / cross-entropy against PyTorch's built-ins.

    Runs one forward pass of a freshly initialised (seed 42) MLP on the first
    12 examples of the global X / Y, asserts that the hand-written results
    match torch.softmax and F.cross_entropy, then prints the loss and a
    success message. Raises AssertionError if any comparison fails.
    """

    def softmax(logits):
        """Numerically stable softmax"""
        max_ = torch.max(logits, dim=-1, keepdim=True)[0]
        exp = torch.exp(logits - max_)
        exp_sum = torch.sum(exp, dim=-1, keepdim=True)
        return exp / exp_sum

    def only_cross_entropy(y_hat, correct_target_idx):
        """Per-example cross-entropy from probabilities. Equivalent to neg log likelihood."""
        target_class_prob = y_hat[torch.arange(len(y_hat)), correct_target_idx]    # n_batch
        ce_loss = -1 * torch.log(target_class_prob)
        return ce_loss

    def fused_cross_entropy(logits, correct_target_idx):
        """Softmax fused with cross_entropy via log-sum-exp. Matches F.cross_entropy.

        Stays in log space throughout, so a target probability that underflows
        to 0 cannot turn the loss into inf, unlike log(softmax(logits)).
        """
        shifted = logits - torch.max(logits, dim=-1, keepdim=True)[0]
        log_norm = torch.log(torch.sum(torch.exp(shifted), dim=-1, keepdim=True))
        log_probs = shifted - log_norm                                             # n_batch, n_vocab
        target_log_prob = log_probs[torch.arange(len(log_probs)), correct_target_idx]  # n_batch
        return (-1 * target_log_prob).mean()

    # Rand init
    torch.manual_seed(42)

    # Init Layers
    C = torch.randn((27, 2), requires_grad=True)     # n_vocab, n_emb (embeddings)
    W1 = torch.randn((6, 100), requires_grad=True)   # n_seq*n_emb, n_hid1
    b1 = torch.randn((1, 100), requires_grad=True)   # 1, n_hid1
    W2 = torch.randn((100, 27), requires_grad=True)  # n_hid1, n_out
    b2 = torch.randn((1, 27), requires_grad=True)    # 1, n_out

    # Mini batch:
    x_batch = X[:12]
    y_batch = Y[:12]

    # Embed inputs
    emb = C[x_batch]  # n_batch, n_seq, n_emb

    # First layer
    z1 = emb.view(-1, 6) @ W1 + b1  # n_batch, n_hid1
    h1 = torch.tanh(z1)             # n_batch, n_hid1

    # Output layer
    logits = h1 @ W2 + b2   # n_batch, n_vocab

    probs = softmax(logits)                # n_batch, n_vocab
    probs_2 = torch.softmax(logits, -1)    # Equivalently
    assert torch.allclose(probs, probs_2)

    loss_2 = F.cross_entropy(logits, y_batch)    # PyTorch reference, scalar

    # Two-step route: explicit probabilities, then mean negative log likelihood
    loss_unfused = only_cross_entropy(probs, y_batch).mean()
    assert torch.allclose(loss_unfused, loss_2)

    # Fused route: log-sum-exp straight from the logits
    loss = fused_cross_entropy(logits, y_batch)  # scalar
    assert torch.allclose(loss, loss_2)

    print(loss)

    print("PyTorch softmax/cross_entropy seem correct! :)")

verify_softmax_and_cross_entropy()

tensor(19.1366, grad_fn=<MeanBackward0>)
PyTorch softmax/cross_entropy seem correct! :)
