# makemore 2
Following a tutorial from [Andrej Karpathy](https://karpathy.ai/):
- [The spelled-out intro to language modeling: Building makemore Part 2: MLP](https://youtu.be/TCH_1BHY58I?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ)

Implement a multilayer perceptron (MLP) character-level language model. Given an N-gram of characters, predict the next character.

# Setup

In [None]:
# !pip install torch matplotlib graphviz

# Load the data

In [1]:
# Read the dataset: one entry per line of names.txt.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until garbage collection.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# Implement the model

In [11]:
import torch
import torch.nn.functional as F

In [3]:
# Build the vocabulary and the two lookup tables.
# chars: every distinct character that appears in the words, in sorted order.
chars = sorted(set(''.join(words)))
# stoi: character -> integer index; real characters are numbered from 1 ...
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
# ... because index 0 is reserved for the special '.' token
stoi['.'] = 0
# itos: the inverse mapping, integer index -> character
itos = dict(zip(stoi.values(), stoi.keys()))

In [8]:
# Build the training set: each example pairs a fixed-size window of
# preceding character indices (X) with the index of the character that
# follows it (Y).
block_size = 3  # the N-gram context size
X = []
Y = []
for w in words[:5]:  # only the first five words are used here
    # every word starts from a window filled with index 0 (the '.' token)
    context = block_size * [0]
    for ch in w + '.':  # a trailing '.' is appended to each word
        ix = stoi[ch]
        print(f'given context: {"".join(itos[i] for i in context)} expected next char: {itos[ix]}')
        X.append(context)
        Y.append(ix)
        # slide the window: drop the oldest index, append the newest one
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

given context: ... expected next char: e
given context: ..e expected next char: m
given context: .em expected next char: m
given context: emm expected next char: a
given context: mma expected next char: .
given context: ... expected next char: o
given context: ..o expected next char: l
given context: .ol expected next char: i
given context: oli expected next char: v
given context: liv expected next char: i
given context: ivi expected next char: a
given context: via expected next char: .
given context: ... expected next char: a
given context: ..a expected next char: v
given context: .av expected next char: a
given context: ava expected next char: .
given context: ... expected next char: i
given context: ..i expected next char: s
given context: .is expected next char: a
given context: isa expected next char: b
given context: sab expected next char: e
given context: abe expected next char: l
given context: bel expected next char: l
given context: ell expected next char: a
given context: l

## Working through the model

In [33]:
# Embedding layer:
# cram the data (vocabulary) into a 2-dimensional space --
# one randomly initialised row of 2 values per vocabulary entry
C = torch.randn(len(stoi), 2)

In [46]:
# One way to compute the embedding of vocabulary element 5 is to multiply
# the one-hot vector for 5 by the embedding matrix C.
# The one-hot tensor is cast with .float() before being multiplied with C.
inp = F.one_hot(torch.tensor(5), num_classes=len(itos)).float()
print(f'{inp @ C=}')
# this is equivalent to retrieving row 5 from the embedding matrix
print(f'{C[5]=}')

# there is syntactic sugar to retrieve multiple rows at once
print(f'{C[[1, 2, 3]]=}')
# or even to compute the embeddings for an entire tensor of indices
print(f'{C[X]=}')

# C[X][13, 2] equals C[1] because X[13, 2] == 1
print(f'{torch.equal(C[X][13, 2], C[1])=}')


inp @ C=tensor([-0.4171,  1.2442])
C[5]=tensor([-0.4171,  1.2442])
C[[1, 2, 3]]=tensor([[-0.0769, -0.7655],
        [-0.1885,  0.9772],
        [ 1.0304,  1.8648]])
C[X]=tensor([[[-0.5755, -1.3022],
         [-0.5755, -1.3022],
         [-0.5755, -1.3022]],

        [[-0.5755, -1.3022],
         [-0.5755, -1.3022],
         [-0.4171,  1.2442]],

        [[-0.5755, -1.3022],
         [-0.4171,  1.2442],
         [ 0.7413,  0.4988]],

        [[-0.4171,  1.2442],
         [ 0.7413,  0.4988],
         [ 0.7413,  0.4988]],

        [[ 0.7413,  0.4988],
         [ 0.7413,  0.4988],
         [-0.0769, -0.7655]],

        [[-0.5755, -1.3022],
         [-0.5755, -1.3022],
         [-0.5755, -1.3022]],

        [[-0.5755, -1.3022],
         [-0.5755, -1.3022],
         [-0.6066, -1.0406]],

        [[-0.5755, -1.3022],
         [-0.6066, -1.0406],
         [ 0.2635,  1.4497]],

        [[-0.6066, -1.0406],
         [ 0.2635,  1.4497],
         [-0.9866, -1.4730]],

        [[ 0.2635,  1.4497],


In [48]:
# Look up the embedding of every index in X in a single indexing operation.
emb = C[X]
# Notebook display of the result's shape: X's shape with one extra trailing
# axis holding the 2 embedding values of each index.
emb.shape

torch.Size([32, 3, 2])

In [71]:
# Hidden layer.
# 6 inputs because we have 3 embeddings (we use N-grams of size N=3) and
# each embedding is 2-dimensional.
# 100 is the (arbitrary) size of the hidden layer.
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

# emb.view(-1, 6) flattens each example's three 2-value embeddings into a
# single row of 6 values; -1 lets PyTorch infer the number of rows.
# b1 is broadcast across all rows when added.
h = (emb.view(-1, 6) @ W1 + b1).tanh()
h.shape

torch.Size([32, 100])

In [90]:
# Output layer: maps the 100 hidden activations to one logit per
# vocabulary entry.
W2 = torch.randn(100, len(itos))
b2 = torch.randn(len(itos))

logits = h @ W2 + b2

# Compute the loss by hand: softmax over each row of logits, then the mean
# negative log-probability assigned to the expected next character.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)
# for every example (row), pick the probability of its target index in Y
picked = probs[torch.arange(len(X)), Y]
loss = -picked.log().mean()
print(f'{loss=}')

# which is equivalent to
loss2 = F.cross_entropy(logits, Y)
print(f'{loss2=}')

loss=tensor(17.7309)
loss2=tensor(17.7309)


## Refactored model

In [113]:
# Re-create every parameter from a single seeded generator so that the
# initial values are the same on every run.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(len(stoi), 2, generator=g)      # embedding table
W1 = torch.randn(6, 100, generator=g)           # hidden layer weights
b1 = torch.randn(100, generator=g)              # hidden layer bias
W2 = torch.randn(100, len(itos), generator=g)   # output layer weights
b2 = torch.randn(len(itos), generator=g)        # output layer bias
# collected in one list so they can be handled uniformly
parameters = [C, W1, b1, W2, b2]

In [119]:
# hyperparameters
epoch = 100  # number of iterations of the training loop
learning_rate = 0.1  # factor applied to each gradient in the parameter update

In [115]:
# Turn on gradient tracking for every parameter so that backward() fills
# in their .grad attributes.
for p in parameters:
    p.requires_grad_(True)

In [132]:
# Gradient-descent training loop over the whole of X on every iteration.
for _ in range(epoch):
    # forward pass: look up the embeddings, flatten each example's
    # embeddings into one row of 6 values, apply the tanh hidden layer,
    # then the output layer
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(f'{loss.item()=}')

    # backward pass: reset the gradients first so that values from the
    # previous iteration do not accumulate, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # update the parameters: step against the gradient.
    # The in-place update runs under torch.no_grad() so autograd does not
    # record it; this replaces the legacy `.data` attribute, which hides
    # in-place modifications from autograd's correctness checks.
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

loss.item()=0.25492164492607117
loss.item()=0.2549186944961548
loss.item()=0.254915714263916
loss.item()=0.25491267442703247
loss.item()=0.2549097239971161
loss.item()=0.2549067735671997
loss.item()=0.25490376353263855
loss.item()=0.25490084290504456
loss.item()=0.2548978328704834
loss.item()=0.2548949122428894
loss.item()=0.25489193201065063
loss.item()=0.25488898158073425
loss.item()=0.25488612055778503
loss.item()=0.25488314032554626
loss.item()=0.25488021969795227
loss.item()=0.2548772692680359
loss.item()=0.2548743784427643
loss.item()=0.2548714578151703
loss.item()=0.2548685669898987
loss.item()=0.2548656761646271
loss.item()=0.25486278533935547
loss.item()=0.2548598349094391
loss.item()=0.25485697388648987
loss.item()=0.2548540532588959
loss.item()=0.25485122203826904
loss.item()=0.25484833121299744
loss.item()=0.25484541058540344
loss.item()=0.2548425793647766
loss.item()=0.2548397183418274
loss.item()=0.2548368573188782
loss.item()=0.25483402609825134
loss.item()=0.25483116507