<a href="https://colab.research.google.com/github/martush/martush_notebooks/blob/develop/Char_Level_Language_Model_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#from google.colab import drive
#drive.mount('/content/gdrive')
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

Mounted at /content/gdrive


In [20]:
#words = open('gdrive/My Drive/names.txt', 'r').read().splitlines()
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
len(words)

32033

In [5]:
#build vocabulary of all chars and mapping to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
#build dataset
block_size = 3 #context length - how many input chars to predict next
X, Y = [], []

for w in words[:5]:
  print(w)
  context = [0] * block_size
  print(f'Context: {context}')
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    print(''.join(itos[i] for i in context), '---->', itos[ix])
    context = context[1:] + [ix] #crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

emma
Context: [0, 0, 0]
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
Context: [0, 0, 0]
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
Context: [0, 0, 0]
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
Context: [0, 0, 0]
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
Context: [0, 0, 0]
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [7]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [8]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

## Construct embeddings

Now to create the lookup table. In the reference paper they have 17,000 words crammed into 30 dimensions. We only have 27 characters and we can start  with only 2 dimensions.

In [9]:
C = torch.randn((27,2))

In [10]:
# Lets check a single embedding
C[5]

tensor([0.8768, 0.5885])

In [11]:
C[torch.tensor([5,6,7,7,7])]

tensor([[ 0.8768,  0.5885],
        [ 1.1330, -0.8091],
        [ 1.0338, -0.6338],
        [ 1.0338, -0.6338],
        [ 1.0338, -0.6338]])

In [21]:
# We can also index with multiple dimension
C[X]

tensor([[[-0.8678, -0.5760],
         [-0.8678, -0.5760],
         [-0.8678, -0.5760]],

        [[-0.8678, -0.5760],
         [-0.8678, -0.5760],
         [ 0.8768,  0.5885]],

        [[-0.8678, -0.5760],
         [ 0.8768,  0.5885],
         [-1.3930, -0.7881]],

        [[ 0.8768,  0.5885],
         [-1.3930, -0.7881],
         [-1.3930, -0.7881]],

        [[-1.3930, -0.7881],
         [-1.3930, -0.7881],
         [-0.9813, -1.9754]],

        [[-0.8678, -0.5760],
         [-0.8678, -0.5760],
         [-0.8678, -0.5760]],

        [[-0.8678, -0.5760],
         [-0.8678, -0.5760],
         [ 0.5870, -2.7339]],

        [[-0.8678, -0.5760],
         [ 0.5870, -2.7339],
         [-0.4194,  0.1204]],

        [[ 0.5870, -2.7339],
         [-0.4194,  0.1204],
         [-0.9771,  0.7770]],

        [[-0.4194,  0.1204],
         [-0.9771,  0.7770],
         [ 0.7082, -1.2748]],

        [[-0.9771,  0.7770],
         [ 0.7082, -1.2748],
         [-0.9771,  0.7770]],

        [[ 0.7082, -1

In [13]:
C[X].shape

torch.Size([32, 3, 2])

In [14]:
X[13,2]

tensor(1)

In [15]:
C[X][13,2]

tensor([-0.9813, -1.9754])

In [16]:
C[1]

tensor([-0.9813, -1.9754])

In [17]:
encoded = F.one_hot(torch.tensor(5), num_classes=27)
encoded

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [22]:
# one-hot are int64 and C is float - pytorch doesn't know how to multiply them
encoded @ C

RuntimeError: expected m1 and m2 to have the same dtype, but got: long int != float

In [None]:
# identical output
encoded.float() @ C

2 approaches with identical results. Will simply index (C[5]) since much faster

In [None]:
# Pytorch indexing is quite flexible and powerful
# We have a X with size [32, 3] which we want to embed
# We can index 3 things at the same time
C[[5,6,7]]

In [25]:
# This is our embedding
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

## Construct the hidden layer

In [42]:
# torch.Size([32, 3, 2])
# Inputs: 3 x 2 - 3 two-dimensional embeddings
# Outputs we pick - pick 100 neurons
W1 = torch.randn((6, 100))
# biases - initialized randomly, need 100 of them
b1 = torch.randn(100)

In [None]:
# Normally we'd take the input and multiply it by the weights
emb @ W1 + b1
# Problem is the embeddings are stacked up in the dimensions in the input tensor - 32 x 3 x 1 can't multiply by 6 x 100
# We need to concatenate the inputs

There are multiple ways to concatenate the dimensions.

In [26]:
# First way - use torch.concat
emb[:, 0, :].shape
# These are the embeddings of the first word

torch.Size([32, 2])

In [28]:
# These are the dimensions we want to concatenate
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape

torch.Size([32, 6])

In [30]:
# The above way is not scalable since directly indexing into the 3 dimensions - unbind (removes a tensor dimension and returns slices of it)
len(torch.unbind(emb, 1))
# exactly equal to the list above

3

In [31]:
# This is inefficient because it creates all kinds of new memory - 2 tensors cannot be manipulated like that without new memory being created
torch.cat(torch.unbind(emb, 1), 1).shape

torch.Size([32, 6])

In [34]:
# Better way
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [35]:
a.shape

torch.Size([18])

In [36]:
a.view(9, 2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

As long as the total number of elements multiply to be the number of the original vector, you can pick any dimensions.
This is extremely efficient - each tensor has underlying storage which is just the numbers in a 1-dimensional vector.
No memory is changed when calling .view - the underlying storage is the same, only the view of this tensor is changed -  storage offset, strides and shape

In [37]:
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [38]:
emb.shape

torch.Size([32, 3, 2])

In [39]:
emb.view(32, 6)

tensor([[-0.8678, -0.5760, -0.8678, -0.5760, -0.8678, -0.5760],
        [-0.8678, -0.5760, -0.8678, -0.5760,  0.8768,  0.5885],
        [-0.8678, -0.5760,  0.8768,  0.5885, -1.3930, -0.7881],
        [ 0.8768,  0.5885, -1.3930, -0.7881, -1.3930, -0.7881],
        [-1.3930, -0.7881, -1.3930, -0.7881, -0.9813, -1.9754],
        [-0.8678, -0.5760, -0.8678, -0.5760, -0.8678, -0.5760],
        [-0.8678, -0.5760, -0.8678, -0.5760,  0.5870, -2.7339],
        [-0.8678, -0.5760,  0.5870, -2.7339, -0.4194,  0.1204],
        [ 0.5870, -2.7339, -0.4194,  0.1204, -0.9771,  0.7770],
        [-0.4194,  0.1204, -0.9771,  0.7770,  0.7082, -1.2748],
        [-0.9771,  0.7770,  0.7082, -1.2748, -0.9771,  0.7770],
        [ 0.7082, -1.2748, -0.9771,  0.7770, -0.9813, -1.9754],
        [-0.8678, -0.5760, -0.8678, -0.5760, -0.8678, -0.5760],
        [-0.8678, -0.5760, -0.8678, -0.5760, -0.9813, -1.9754],
        [-0.8678, -0.5760, -0.9813, -1.9754,  0.7082, -1.2748],
        [-0.9813, -1.9754,  0.7082, -1.2

In [40]:
# Verify result same as before
emb.view(32, 6) == torch.cat(torch.unbind(emb, 1), 1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [43]:
# Now multiply to create the hiden layer
h = emb.view(32, 6) @ W1 + b1

#Improve - remove harcoding of the 32
h = emb.view(emb.shape[0], 6) @ W1 + b1
#can also do -1 - since the number of elements needs to be the same and we have the other as 6, pytorch will derive this to be 32
h = emb.view(-1, 6) @ W1 + b1
# Also add tanh the whole thing
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)

In [46]:
# Need to be careful with the broadcasting rules for adding b1
# We have 32, 100 added to 100
# 32, 100
#     100
#pytorch will align them on the right, create  a fake dimension (1x100 row vector) and then copy vertically for every one of the 32 rows and do an element-wise addition
# in this case the correct thing will be happening

In [44]:
h.shape
# 32 x 100: the 100 activations for our 32 examples

torch.Size([32, 100])

## Output Layer

In [47]:
# Input is 100 (outputs from hidden layer), outputs are 27  (the 27 characters)
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [48]:
logits = h @ W2 + b2

In [49]:
logits.shape

torch.Size([32, 27])

In [50]:
logits

tensor([[-5.1836e+00, -7.5340e-01, -1.4372e+00,  6.6722e+00, -6.7186e+00,
          2.0786e+00, -5.9823e+00,  9.3230e+00, -1.7241e+01, -7.8906e+00,
          2.9034e+00, -7.1685e+00, -2.4494e+00,  2.4386e+00, -1.4416e+00,
         -1.0815e+01,  1.1374e+01, -1.0126e+01,  1.1194e+01,  3.0792e+00,
          5.8258e+00,  1.0673e+01,  3.8797e+00,  7.6634e+00, -3.5406e+00,
         -1.4204e+00, -7.2076e+00],
        [-2.3593e+00, -5.2011e+00,  3.5675e+00,  9.2174e+00, -3.6281e+00,
         -5.4712e+00,  9.2209e-01, -2.4453e+00, -1.5070e+01,  1.0010e+00,
          8.6779e+00, -7.1191e+00, -2.4086e+00,  4.5717e+00, -8.4149e-01,
         -3.7803e+00,  6.1163e+00, -1.4999e+00, -2.7804e+00,  6.0377e+00,
          5.5989e+00,  1.4721e+01,  1.0056e+01,  1.0892e+01,  6.2097e+00,
         -5.3787e+00, -6.6482e+00],
        [ 2.9832e-01, -2.0079e+00, -5.1163e-01,  9.4941e+00,  2.0517e+00,
          7.3904e+00,  1.0407e+01,  3.2891e+00,  1.2824e+01,  3.2527e+00,
          9.1522e+00, -6.4659e+00,  3.66

In [51]:
counts = logits.exp()

In [54]:
prob = counts / counts.sum(1, keepdims=True)

In [55]:
prob.shape

torch.Size([32, 27])

In [56]:
# Every row sums up to 1
prob[0].sum()

tensor(1.)

## Loss function

In [57]:
# Index into the rows of prob and from each row pluck out the probability assigned to the correct character
# iterator
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [58]:
# Indices of correct characters
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [59]:
prob[torch.arange(32), Y]

tensor([3.6747e-05, 3.7659e-05, 4.9466e-12, 1.2362e-04, 6.9568e-12, 9.2345e-11,
        1.4742e-12, 3.1718e-06, 3.2212e-12, 1.8734e-12, 1.0331e-05, 3.5257e-08,
        2.1641e-06, 2.1563e-07, 4.6641e-07, 1.1791e-13, 1.7204e-09, 8.8416e-03,
        1.6172e-08, 1.6094e-10, 4.0914e-08, 1.3155e-06, 6.8367e-06, 1.0493e-03,
        1.7483e-07, 9.9944e-05, 6.3317e-08, 2.5939e-07, 1.1865e-09, 7.1756e-10,
        1.2057e-07, 1.7494e-05])

In [61]:
# This is what we need to minimize to get the correct character
loss = - prob[torch.arange(32), Y].log().mean()
loss

tensor(16.8338)

## Full flow

In [65]:
#dataset
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [66]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [67]:
# number of params
sum(p.nelement() for p in parameters)

3481

In [69]:
# Layer 1: embedding
emb = C[X] #(32, 3, 2) 32 examples with 3 chars each and 2 dimensions to them
# Layer 2: hidden layer
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # outputs (32, 100)
# Layer 3: output
logits = h @ W2 + b2
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True)
loss = -prob[torch.arange(32), Y].log().mean()
loss

tensor(17.7697)

In [70]:
# Calculating the loss can be done directly from pytorch
F.cross_entropy(logits, Y)

tensor(17.7697)

In [71]:
# Layer 1: embedding
emb = C[X] #(32, 3, 2) 32 examples with 3 chars each and 2 dimensions to them
# Layer 2: hidden layer
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # outputs (32, 100)
# Layer 3: output
logits = h @ W2 + b2
#counts = logits.exp()
#prob = counts / counts.sum(1, keepdims=True)
#loss = -prob[torch.arange(32), Y].log().mean()
loss = F.cross_entropy(logits, Y)
loss

tensor(17.7697)

Many reasons to prefer F.cross_entropy over own implementation
1. F.cross_entropy won't create all the intermediate tensors. Pytorch uses fused kernels to more efficienctly calculate
The backwards pass would be much more efficient
2. F.cross_entropy is more numerically well-behaved - example below

In [72]:
logits = torch.tensor([-2, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([9.0466e-04, 3.3281e-04, 6.6846e-03, 9.9208e-01])

In [73]:
# Supposed we have more extreme numbers
logits = torch.tensor([-100, -3, 0, 5])
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0.0000e+00, 3.3311e-04, 6.6906e-03, 9.9298e-01])

In [74]:
# Supposed we have more extreme positive
logits = torch.tensor([-100, -3, 0, 100])
counts = logits.exp()
probs = counts / counts.sum()
probs

tensor([0., 0., 0., nan])

In [75]:
counts

tensor([3.7835e-44, 4.9787e-02, 1.0000e+00,        inf])

If you pass a very positive number to .exp() we run out of range in our floating point number that represents the count(e to the power of 100).

You can offset logits by any arbitrary number you want and get the same result.
F.cross_entropy calculates the max value in the logits and subtracts it

In [76]:
for p in parameters:
    p.requires_grad = True

In [80]:
for _ in range(100):
    # forward pass
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    #print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None

    loss.backward()

    #update
    for p in parameters:
        p.data += -0.1 * p.grad

print(loss.item())

0.31377431750297546


We're achieving extremely low loss - overfitting. We are fitting 32 examples using almost 4k parameters - very easy to make the NN fit them.
We can't achieve 0 - ... is supposed to predict 5 different first letters

In [81]:
logits.max(1)

torch.return_types.max(
values=tensor([11.4639, 13.4778, 19.0661, 17.9120, 13.2064, 11.4639, 13.2552, 11.8626,
        13.6934, 15.6432, 12.8634, 17.9044, 11.4639, 13.2158, 14.3344, 17.2696,
        11.4639, 14.0626, 11.7470, 13.5321, 15.9663, 12.5515,  8.1474,  8.1505,
        14.0189, 11.4639, 13.5286, 13.8694, 11.3024, 14.4007, 15.8964, 12.3963],
       grad_fn=<MaxBackward0>),
indices=tensor([ 1, 13, 13,  1,  0,  1, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  1, 19,
         1,  2,  5, 12, 12,  1,  0,  1, 15, 16,  8,  9,  1,  0]))

## Run on full dataset

In [82]:
#build dataset
block_size = 3 #context length - how many input chars to predict next
X, Y = [], []

for w in words:
  #print(w)
  context = [0] * block_size
  #print(f'Context: {context}')
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    #print(''.join(itos[i] for i in context), '---->', itos[ix])
    context = context[1:] + [ix] #crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

In [83]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [84]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [85]:
# number of params
sum(p.nelement() for p in parameters)

3481

In [86]:
for p in parameters:
    p.requires_grad = True

In [88]:
for _ in range(10):
    # forward pass
    emb = C[X]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None

    loss.backward()

    #update
    for p in parameters:
        p.data += -0.1 * p.grad

print(loss.item())

10.709586143493652
10.407632827758789
10.127808570861816
9.864365577697754
9.614503860473633
9.376440048217773
9.148944854736328
8.931111335754395
8.7222318649292
8.521750450134277
8.521750450134277


In practice forward and backwards passes on batches of the data, not the full dataset

In [89]:
torch.randint(0, 5, (32,))

tensor([1, 1, 1, 3, 2, 1, 3, 3, 2, 0, 3, 3, 3, 4, 2, 2, 1, 3, 0, 0, 4, 2, 2, 1,
        2, 2, 3, 3, 1, 0, 1, 3])