In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn.functional as F

In [2]:
# download the names.txt file from github
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2024-08-08 16:01:08--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.2’


2024-08-08 16:01:08 (6.09 MB/s) - ‘names.txt.2’ saved [228145/228145]



In [3]:
# read in all the words
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [5]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  X, Y = [], []

  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)    # shuffling the order of words before splitting
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [None]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [None]:
sum(p.nelement() for p in parameters) # number of parameters in total

11897

In [None]:
for p in parameters:
  p.requires_grad = True

In [None]:
for i in range(20000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  lr = 0.1 if i < 10000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

print(loss.item())

2.498176336288452


In [None]:
# validation loss
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
print('validation loss',loss.data)

validation loss tensor(2.3188)


### E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [None]:
# Increase the hidden layers from 200 to 400

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 400), generator=g)
b1 = torch.randn(400, generator=g)
W2 = torch.randn((400, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

In [None]:
for i in range(300000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  #lr = 0.1 if i < 10000 else 0.01
  lr = 0.001
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

print(loss.item())

1.771917462348938


In [None]:
# validation loss
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.3468, grad_fn=<NllLossBackward0>)

#### Comments/Insights: Increasing number of neurons in a single hidden layer, reduces the training loss but does not reduce the validation loss, which means our model is overfitting. Next will try with multilayer neural networks.

In [None]:
# Include an additional hidden layers

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 200), generator=g)
b2 = torch.randn(200, generator=g)
W3 = torch.randn((200, 200), generator=g)
b3 = torch.randn(200, generator=g)
W4 = torch.randn((200, 27), generator=g)
b4 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2, W3, b3, W4, b4]

for p in parameters:
  p.requires_grad = True

In [None]:
for i in range(50000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 10)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # 32x30 @ 30x300 -> 32 x 300
  logits1 = torch.tanh(h @ W2 + b2) # 32x300 @ 300x300 -> 32 x 300
  logits2 = torch.tanh(logits1 @ W3 + b3) # 32x300 @ 300x300 -> 32 x 300
  logits = logits2 @ W4 + b4 # 32x300 @ 300x27 -> 32 x 27
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  lr = 0.15 if i < 30000 else 0.001
  #lr = 0.001
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

print(loss.item())

2.0119032859802246


In [None]:
# validation loss
emb = C[Xdev]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # 32x30 @ 30x300 -> 32 x 300
logits1 = torch.tanh(h @ W2 + b2) # 32x300 @ 300x300 -> 32 x 300
logits2 = torch.tanh(logits1 @ W3 + b3) # 32x300 @ 300x300 -> 32 x 300
logits = logits2 @ W4 + b4 # 32x300 @ 300x27 -> 32 x 27
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.1920, grad_fn=<NllLossBackward0>)

#### Comments: Creating multilayer perceptrons and substantially decreasing learning rate reduces loss of validation set to 2.19

### E02: I was not careful with the intialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

In [None]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
W2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [None]:
for p in parameters:
  p.requires_grad = True

In [None]:
for i in range(30000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  print(loss.item())

  break

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  #lr = 0.1 if i < 10000 else 0.01
  lr = 0.001
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

#print(loss.item())

22.844247817993164


#### Initial loss is very high 22.84, trying to reduce initial loss by adjusting the weights to approximately uniform.

In [6]:
### Making the neuron weights to approximately uniform by multiply it with 0.02 and bias as 0 initially

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g) * 0.02
b1 = torch.randn(200, generator=g) * 0
W2 = torch.randn((200, 27), generator=g) * 0.02
b2 = torch.randn(27, generator=g) * 0
parameters = [C, W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

In [7]:
for i in range(30000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  print(loss.item())

  break

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  #lr = 0.1 if i < 10000 else 0.01
  lr = 0.001
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

#print(loss.item())

3.296635627746582
3.296635627746582


#### Insights: Making the initial weights to nearly uniform reduces the initial loss from 22.844 to 3.296

### E01: I did not get around to seeing what happens when you initialize all weights and biases to zero. Try this and train the neural net. You might think either that 1) the network trains just fine or 2) the network doesn't train at all, but actually it is 3) the network trains but only partially, and achieves a pretty bad final performance. Inspect the gradients and activations to figure out what is happening and why the network is only partially training, and what part is being trained exactly.

In [18]:
### Making the neuron weights to approximately uniform by multiply it with 0.02 and bias as 0 initially

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g) * 0.02
b1 = torch.randn(200, generator=g) * 0
W2 = torch.randn((200, 27), generator=g) * 0.02
b2 = torch.randn(27, generator=g) * 0
parameters = [C, W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

In [19]:
for i in range(30000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())

  #break

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  #lr = 0.1 if i < 10000 else 0.01
  lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

print(loss.item())

2.5573537349700928


In [20]:
# validation loss
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.3422, grad_fn=<NllLossBackward0>)

#### Weights and bias to 0

In [21]:
### Making the neuron weights and bias to 0 initially

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 10), generator=g)
W1 = torch.randn((30, 200), generator=g) * 0
b1 = torch.randn(200, generator=g) * 0
W2 = torch.randn((200, 27), generator=g) * 0
b2 = torch.randn(27, generator=g) * 0
parameters = [C, W1, b1, W2, b2]

for p in parameters:
  p.requires_grad = True

In [22]:
for i in range(30000):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (32,))

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())

  #break

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = lrs[i]
  #lr = 0.1 if i < 10000 else 0.01
  lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # # track stats
  # #lri.append(lre[i])
  # stepi.append(i)
  # lossi.append(loss.log10().item())

print(loss.item())

2.8362462520599365


In [14]:
# validation loss
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.8227, grad_fn=<NllLossBackward0>)

#### Although there is a small difference in loss, it looks like despite of weights and bias being initialized to 0, the model trains. However as mentioned in the lecture, it just partially trains.

### E02: BatchNorm, unlike other normalization layers like LayerNorm/GroupNorm etc. has the big advantage that after training, the batchnorm gamma/beta can be "folded into" the weights of the preceeding Linear layers, effectively erasing the need to forward it at test time. Set up a small 3-layer MLP with batchnorms, train the network, then "fold" the batchnorm gamma/beta into the preceeding Linear layer's W,b by creating a new W2, b2 and erasing the batch norm. Verify that this gives the same forward pass during inference. i.e. we see that the batchnorm is there just for stabilizing the training, and can be thrown out after training is done! pretty cool.

In [33]:
# With gamma and beta
vocab_size = 27
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) #* 0.2
#b1 = torch.randn(n_hidden,                        generator=g) * 0.01
W3 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.01
b3 = torch.randn(vocab_size,                      generator=g) * 0

# BatchNorm parameters
gamma = torch.ones((1, n_hidden))   # gamma
beta = torch.zeros((1, n_hidden))  # beta
# bnmean_running = torch.zeros((1, n_hidden))
# bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, W3, b3, gamma, beta]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True


12097


In [34]:
# same optimization as last time
max_steps = 2000
batch_size = 32
lossi = []

for i in range(max_steps):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

  # forward pass
  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
  # Linear layer
  hpreact = embcat @ W1 #+ b1 # hidden layer pre-activation
  # BatchNorm layer
  # -------------------------------------------------------------
  bnmeani = hpreact.mean(0, keepdim=True)
  bnstdi = hpreact.std(0, keepdim=True)
  hpreact = gamma * (hpreact - bnmeani) / bnstdi + beta
  # with torch.no_grad():
  #   bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
  #   bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
  # -------------------------------------------------------------
  # Non-linearity
  h = torch.tanh(hpreact) # hidden layer
  logits = h @ W3 + b3 # output layer
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = 0.1 if i < 10000 else 0.01 # step learning rate decay
  lr = 0.1
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

#print(loss)


      0/   2000: 3.3239


#### Replacing gamma and beta with another layers with W2 and b2

In [35]:
# With gamma and beta
vocab_size = 27
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility
C  = torch.randn((vocab_size, n_embd),            generator=g)
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5) #* 0.2
#b1 = torch.randn(n_hidden,                        generator=g) * 0.01
W3 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.01
b3 = torch.randn(vocab_size,                      generator=g) * 0

# BatchNorm parameters
W2 = torch.ones((n_hidden, n_hidden))   # gamma
b2 = torch.zeros((1, n_hidden))  # beta


parameters = [C, W1, W3, b3, W2, b2]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad = True


51897


In [36]:
# same optimization as last time
max_steps = 2000
batch_size = 32
lossi = []

for i in range(max_steps):

  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

  # forward pass
  emb = C[Xb] # embed the characters into vectors
  embcat = emb.view(emb.shape[0], -1) # concatenate the vectors
  # Linear layer
  hpreact = embcat @ W1 #+ b1 # hidden layer pre-activation
  # BatchNorm layer
  # -------------------------------------------------------------
  bnmeani = hpreact.mean(0, keepdim=True)
  bnstdi = hpreact.std(0, keepdim=True)

  hpreact = (hpreact - bnmeani) / bnstdi


  # -------------------------------------------------------------
  # Non-linearity
  #h = torch.tanh(hpreact) # hidden layer
  h = torch.tanh(hpreact @ W2 +b2)

  logits = h @ W3 + b3 # output layer
  loss = F.cross_entropy(logits, Yb) # loss function

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  #lr = 0.1 if i < 10000 else 0.01 # step learning rate decay
  lr = 0.1
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

#print(loss)


      0/   2000: 3.3220


#### Both cases give approximately similar loss in forward pass, which means batch norm is just for stabilizing the activations.