# Build makemore MLP

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the names corpus, one name per line. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open.
with open('../names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# Number of names in the corpus.
len(words)

32033

## Build the dataset

In [None]:
# Vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1; index 0 is reserved for the '.' token.
stoi = {char: idx for idx, char in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: index -> character.
itos = {idx: char for char, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


#|hide
03/23/23 Stop at 10:22 / 1:15:39

In [None]:
# Number of preceding characters used as context for predicting the next one.
block_size = 3

X = []  # context windows, each a list of block_size character indices
Y = []  # index of the character that follows each window
for w in words[:5]:
    print(w)
    # Every name starts from a window filled with index 0 (the '.' token).
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join([itos[i] for i in context]))
        # Slide the window: drop the oldest index, append the newest.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

emma
...
..e
.em
emm
mma
olivia
...
..o
.ol
oli
liv
ivi
via
ava
...
..a
.av
ava
isabella
...
..i
.is
isa
sab
abe
bel
ell
lla
sophia
...
..s
.so
sop
oph
phi
hia


In [None]:
# Shapes and dtypes of the context windows (X) and their target indices (Y).
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [None]:
# Embedding table: one row per character (27 rows), each a 2-d vector.
C = torch.randn(27, 2)

In [None]:
# Look up the embedding row for character index 5.
C[5]

tensor([0.8425, 0.3145])

In [None]:
#|hide
# Multiplying a one-hot row vector by C selects a single row of C,
# which is identical to indexing into the embedding table directly.
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.8425, 0.3145])

In [None]:
#|hide
# Indexing with a Python list fetches several rows at once; an index may repeat.
C[[5,6,7,7]]

tensor([[ 0.8425,  0.3145],
        [ 0.8930,  1.2062],
        [-1.6647, -1.2650],
        [-1.6647, -1.2650]])

In [None]:
#|hide
# Indexing with an integer tensor gives the same rows as indexing with a list.
C[torch.tensor([5,6,7,7])]

tensor([[ 0.8425,  0.3145],
        [ 0.8930,  1.2062],
        [-1.6647, -1.2650],
        [-1.6647, -1.2650]])

In [None]:
#|hide
# One row per training example, one column per context position.
X.shape

torch.Size([32, 3])

In [None]:
#|hide
# Indexing C with all of X looks up an embedding for every entry of X:
# the result has X's shape plus a trailing embedding dimension.
C[X].shape

torch.Size([32, 3, 2])

In [None]:
# Embed every character index in X via the lookup table C.
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [None]:
# Hidden layer: 100 neurons, each taking 6 inputs.
W1 = torch.randn(6, 100)
# One bias per hidden neuron.
b1 = torch.randn(100)

In [None]:
#|hide
# Pluck out the embeddings of the first character
# in every context window.
emb[:,0,:].shape

torch.Size([32, 2])

In [None]:
#|hide
# Pluck out the embeddings of the first, second and
# third character of each context and concatenate them along dim 1.
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]], dim=1).shape

torch.Size([32, 6])

In [None]:
#|hide
# unbind splits emb along dimension 1 into a sequence of tensors,
# equivalent to our manual plucking of embeddings in the
# previous cell.
len(torch.unbind(emb,1)), len(torch.unbind(emb,1)[0]), torch.unbind(emb,1)[0][0]

(3, 32, tensor([ 0.1410, -0.9693]))

In [None]:
#|hide
# Concatenate the unbound pieces along dim 1 and check the resulting shape.
torch.cat(torch.unbind(emb,dim=1), dim=1).shape

torch.Size([32, 6])

In [None]:
#|hide
# Small 1-d example tensor (values 0..17) for experimenting with .view.
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [None]:
#|hide
# a is one-dimensional with 18 elements.
a.shape

torch.Size([18])

In [None]:
#|hide
# .view is extremely efficient; here the 18 elements
# are presented as 2 rows of 9.
a.view(2,9)

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [None]:
#|hide
# The same 18 elements presented as a 2 x 3 x 3 tensor.
a.view(2,3,3)

tensor([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8]],

        [[ 9, 10, 11],
         [12, 13, 14],
         [15, 16, 17]]])

In [None]:
#|hide
# Show the storage behind a: a flat run of its 18 values.
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [None]:
#|hide
# Element-wise check that viewing emb as (32, 6) matches the
# unbind-and-concatenate result.
# NOTE(review): this prints the full boolean matrix and hard-codes 32 rows.
emb.view((32,6)) == torch.cat(torch.unbind(emb,dim=1), dim=1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [None]:
#|hide
# Hidden-layer activations: view each context's embeddings as 6 values
# (-1 lets view infer the number of rows), apply W1 and b1, then tanh.
h = torch.tanh(emb.view((-1,6)) @ W1 + b1)
h

tensor([[ 0.2321,  0.8842, -0.3949,  ..., -0.9303,  0.9055,  0.9973],
        [ 0.8935,  0.9244,  0.4600,  ..., -0.9758,  0.6007,  0.3516],
        [-0.1606, -0.2722,  0.9683,  ..., -0.9979,  0.9871,  0.5120],
        ...,
        [ 0.5043, -0.9989,  0.3366,  ..., -0.9225, -0.9290, -0.8909],
        [-0.1081, -0.7365,  0.2289,  ...,  0.7690, -0.9949, -1.0000],
        [-0.9999, -0.9561,  0.9999,  ..., -0.9973,  1.0000, -0.9909]])

In [None]:
#|hide
# One row of 100 activations per training example.
h.shape

torch.Size([32, 100])

#|hide
03/27/23 Stop at 28:33 / 1:15:39

In [None]:
# Output layer: 27 neurons, each taking the 100 hidden activations as input.
W2 = torch.randn(100, 27)
# One bias per output neuron.
b2 = torch.randn(27)

In [None]:
# Output layer: one score (logit) per character for every example.
logits = h @ W2 + b2

In [None]:
#|hide
# One row of 27 logits per training example.
logits.shape

torch.Size([32, 27])

In [None]:
#|hide
# Inspect the raw logits.
logits

tensor([[ 4.0534e+00,  2.8155e+00,  8.3932e+00, -8.5655e+00, -1.0642e+01,
         -3.7789e+00,  1.1983e+01,  1.4416e+01, -1.7913e+00,  1.0491e+01,
         -3.8777e+00,  6.2393e+00,  1.4216e+01, -6.1704e+00, -1.2285e+00,
          1.7222e+01,  1.3303e+01,  6.6962e+00, -2.1041e+00, -5.3312e+00,
         -2.5310e+00,  1.5954e+01,  7.9101e+00,  2.1804e+01, -9.4494e+00,
         -5.1376e+00, -3.0385e+00],
        [-3.9419e+00, -2.0030e+00,  9.6213e+00, -1.3761e+01, -1.8332e+00,
         -4.9217e+00,  1.1379e+01,  8.4836e+00, -1.1214e+01,  9.1936e+00,
          1.1317e+00,  1.3088e+00,  1.0536e+01, -2.4659e-01,  2.4150e+00,
          3.3822e+00,  2.2016e+00,  7.7727e+00,  9.0946e-01, -4.3787e+00,
         -7.3824e+00,  6.2250e+00,  1.2186e+01,  1.4452e+01, -2.0017e+00,
         -3.0360e+00, -6.6501e+00],
        [-8.2188e-01, -1.7039e+00, -2.0108e+00, -1.1875e+01, -6.8269e+00,
          3.0404e+00,  8.1591e-01,  5.7254e+00,  1.3021e+00,  5.8640e+00,
          6.3562e+00, -4.3840e+00,  8.08

In [None]:
# Exponentiate the logits to get positive, count-like values.
counts = logits.exp()

In [None]:
# Normalise each row to sum to 1, giving one probability distribution per example.
prob = counts/counts.sum(dim=1,keepdim=True)

In [None]:
#|hide
# Same shape as logits: one probability per character per example.
prob.shape

torch.Size([32, 27])

In [None]:
#|hide
# Sanity check: every row of prob should sum to 1.
prob.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [None]:
#|hide
# Row indices 0..31, one per training example.
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [None]:
#|hide
# Target character index for each training example.
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [None]:
# Average negative log-likelihood of the correct next character.
# Row i of prob is paired with its label Y[i]; the row indices are derived
# from Y so the expression is not tied to a 32-example batch.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.9215)

In [None]:
#|hide
# Re-create every parameter from a seeded generator so the draws are reproducible.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)  # embedding table

W1 = torch.randn(6, 100, generator=g)  # 100 neurons each taking 6 inputs
b1 = torch.randn(100, generator=g)  # the bias for each of the 100 neurons

W2 = torch.randn(100, 27, generator=g)  # 27 neurons each taking 100 inputs
b2 = torch.randn(27, generator=g)  # the biases for these neurons

parameters = [C, W1, b1, W2, b2]

In [None]:
#|hide
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

3481

In [None]:
#|hide
# Manual forward pass and loss, spelled out step by step.
emb = C[X]  # (N, 3, 2): an embedding for every character of every context
h = torch.tanh(emb.view((-1, 6)) @ W1 + b1)  # (N, 100) hidden activations
logits = h @ W2 + b2  # (N, 27) unnormalised scores
counts = logits.exp()
prob = counts / counts.sum(dim=1, keepdim=True)  # each row sums to 1
# Pick the probability of the correct label in every row; the row indices
# come from Y so the cell is not tied to a 32-example batch.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.7697)

In [None]:
#|hide
# F.cross_entropy is to be preferred over the manual computation: it is
# numerically better behaved (uses logsumexp)
# and
# the forward and backward passes are more efficient
# (uses fused kernels).
F.cross_entropy(logits, Y)

tensor(17.7697)

In [None]:
#|hide
# Large positive logits get us into
# trouble: exponentiating 100 overflows to inf.
logits = torch.tensor([-100,-3,0,100])
counts = logits.exp()
counts

tensor([3.7835e-44, 4.9787e-02, 1.0000e+00,        inf])

In [None]:
#|hide
# Normalising with an inf among the counts yields nan instead of a probability.
probs = counts/counts.sum()
probs

tensor([0., 0., 0., nan])

#|hide

Adding or subtracting the same constant to every logit
doesn't change the resulting probabilities. Hence we can subtract the maximum
of the logits and get an outcome that is numerically stable.

In [None]:
#|hide
# Shift the logits by their maximum before exponentiating. The offset is
# computed from the data instead of being hard-coded, so the largest
# exponent is always exp(0) = 1 and nothing overflows.
logits = torch.tensor([-100, -3, 0, 100])
logits = logits - logits.max()
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0.0000e+00, 1.4013e-45, 3.7835e-44, 1.0000e+00])

In [None]:
# Seeded generator so the parameter draws are reproducible.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn(27, 2, generator=g)  # embedding table

W1 = torch.randn(6, 100, generator=g)  # 100 neurons each taking 6 inputs
b1 = torch.randn(100, generator=g)  # the bias for each of the 100 neurons

W2 = torch.randn(100, 27, generator=g)  # 27 neurons each taking 100 inputs
b2 = torch.randn(27, generator=g)  # the biases for these neurons

parameters = [C, W1, b1, W2, b2]

In [None]:
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

3481

In [None]:
    # Turn on gradient tracking for every parameter tensor.
    # NOTE(review): this cell is indented one level. IPython appears to strip
    # the common leading indent before running a cell, but the code is not
    # valid as plain Python (e.g. after export to .py) - consider dedenting.
    for p in parameters:
        p.requires_grad = True

In [None]:
# Plain gradient descent: 1000 full-batch steps with learning rate 0.1.
for _ in range(1000):
    # forward pass
    emb = C[X]  # (N, 3, 2)
    h = torch.tanh(emb.view((-1, 6)) @ W1 + b1)  # (N, 100)
    logits = h @ W2 + b2  # (N, 27)
    loss = F.cross_entropy(logits, Y)

    # backward pass: clear stale gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # parameter update. The in-place step runs under torch.no_grad() so
    # autograd does not record it; this replaces writing through the legacy
    # `.data` attribute while keeping the arithmetic unchanged.
    with torch.no_grad():
        for p in parameters:
            p += -0.1 * p.grad

print(loss.item())

0.25614601373672485


#|hide
03/28/23 Stop at 39:39 / 1:15:39