In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
with open('../names.txt', 'r') as file:
    names = file.read().split()

print('Total names:', len(names))
print(names[:5])

Total names: 32033
['emma', 'olivia', 'ava', 'isabella', 'sophia']


In [4]:
# Form stoi and itos
vocab = sorted(list(set(''.join(names))))
stoi = {s: i+1 for i, s in enumerate(vocab)}
stoi['.'] = 0
itos = {stoi[s]: s for s in stoi}

vocab_size = len(stoi)

In [5]:
block_size = 3
def form_dataset(words):
    X = []
    Y = []

    for word in words:
        word = ['.'] * block_size + list(word) + ['.']
        for ind in range(3, len(word)):
            X.append([stoi[x] for x in word[ind-3:ind]])
            Y.append(stoi[word[ind]])
            # print(''.join(word[ind-3:ind]), '--->', word[ind])

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [6]:
import random
random.seed(42)
random.shuffle(names)

X, Y = form_dataset(names)

n1 = int(0.8 * X.shape[0]) 
n2 = int(0.9 * X.shape[0])
Xtr, Xdev, Xts = X.tensor_split((n1, n2), dim=0) # input is split into X[:n1], X[n1:n2] and X[n2:]
Ytr, Ydev, Yts = Y.tensor_split((n1, n2), dim=0)

g = torch.Generator().manual_seed(2147483647)

In [9]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP
batch_size = 32
C = torch.randn(vocab_size, n_embd,             generator=g)
W1 = torch.randn(block_size * n_embd, n_hidden, generator=g) * (5/3)*((block_size * n_embd)**0.5)
b1 = torch.randn(n_hidden,                      generator=g) * 0.1

W2 = torch.randn(n_hidden, vocab_size,          generator=g) * 0.1
b2 = torch.randn(vocab_size,                    generator=g) * 0.1

bngain = torch.ones(1, n_hidden)
bnbias = torch.zeros(1, n_hidden)

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
total_params = 0
for p in parameters:
    total_params += p.nelement()
    p.requires_grad = True

bnmean_running = torch.ones(1, n_hidden)
bnstd_running = torch.ones(1, n_hidden)

print('Total parameters:', total_params)
print('Vocab size:', vocab_size)
print('Examples in training set:', Xtr.shape)

Total parameters: 4137
Vocab size: 27
Examples in training set: torch.Size([182516, 3])


## Forward pass

- Write each step in manageable chunk of operation
- Break down F.cross_entropy in terms of operation that takes place

In [10]:
ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

In [None]:
n = Xb.shape[0]
emb = C[Xb]
embcat = emb.view(-1, block_size * n_embd)

# Linear layer 1
hprebn = embcat @ W1 + b1
print(hprebn.shape, vocab_size) # number of examples * neurons in hidden_layer

# Batchnorm
bnmeani = hprebn.sum(dim=0, keepdim=True) / n
bndiff = hprebn - bnmeani
bndiff2 = bndiff ** 2
bnvar = bndiff2.sum(0, keepdim=True) / (n-1)
bnvar_inv = (bnvar + 1e-5)**-0.5

print(bnmeani.shape, bndiff.shape)

bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias

# Non-linearity
h = torch.tanh(hpreact) # hidden layer

print(h.shape)

logits = h @ W2 + b2 # output layer
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True) 
counts_sum_inv = counts_sum ** -1
probs = counts * counts_sum_ inv

logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

print(loss.item())

for p in parameters:
  p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv,
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
  t.retain_grad()
loss.backward()

torch.Size([32, 64]) 27
torch.Size([1, 64]) torch.Size([32, 64])
torch.Size([32, 64])
3.3768866062164307


In [12]:
# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [None]:
# Exercise 1: backprop through the whole thing manually, 
# backpropagating through exactly all of the variables 
# as they are defined in the forward pass above, one by one

# Example: dx stands for derivative of 'x' wrt loss - how does x influence the loss
# size of derivate is equal to the input we derive

# logprobs is batch_size * vocab_size, i.e. for each entry, it contains probability of next character in vocab
# More negative the value of logprobs at Ybs, higher the loss
# But not all the values of logprobs effect the loss, only at the indexes where label match Yb.
# Rule: Shape of dlogprobs will be same as logprops
dlogprobs = torch.zeros_like(logprobs)
dlogprobs[range(n), Yb] = -1.0 / n
cmp('logprobs', dlogprobs, logprobs)

# = dprobs 
# = ∂loss / ∂probs 
# = (∂loss / ∂logprobs) * (∂logprobs / ∂probs)
# = dlogprobs * (∂logprobs / ∂probs)
dprobs = (1.0 / probs) * dlogprobs
cmp('probs', dprobs, probs)

dcounts_sum_inv
# cmp('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
# cmp('counts_sum', dcounts_sum, counts_sum)
# cmp('counts', dcounts, counts)
# cmp('norm_logits', dnorm_logits, norm_logits)
# cmp('logit_maxes', dlogit_maxes, logit_maxes)
# cmp('logits', dlogits, logits)
# cmp('h', dh, h)
# cmp('W2', dW2, W2)
# cmp('b2', db2, b2)
# cmp('hpreact', dhpreact, hpreact)
# cmp('bngain', dbngain, bngain)
# cmp('bnbias', dbnbias, bnbias)
# cmp('bnraw', dbnraw, bnraw)
# cmp('bnvar_inv', dbnvar_inv, bnvar_inv)
# cmp('bnvar', dbnvar, bnvar)
# cmp('bndiff2', dbndiff2, bndiff2)
# cmp('bndiff', dbndiff, bndiff)
# cmp('bnmeani', dbnmeani, bnmeani)
# cmp('hprebn', dhprebn, hprebn)
# cmp('embcat', dembcat, embcat)
# cmp('W1', dW1, W1)
# cmp('b1', db1, b1)
# cmp('emb', demb, emb)
# cmp('C', dC, C)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0


In [None]:
max_steps = 10

batch_size = 32 # Take 32 training data points for each loop of gradient descend

for _ in range(max_steps):
    # mini batch construction
    batch_inds = torch.randint(low=0, high=Xtr.shape[0], size=(batch_size,))
    Xb, Yb = Xtr[batch_inds], Ytr[batch_inds]
    
    # Forward pass
    embs = C[Xb]
    embcat = embs.view(-1, block_size * n_embd)
    hpreact = embcat @ W1 + b1 # hpreact: hidden layer pre activation

    # Batch normalization
    hmean = hpreact.mean(0, keepdim=True)
    hstd = hpreact.std(0, keepdim=True)
    hpreact = bngain * ((hpreact - hmean) / hstd) + bnbias

    with torch.no_grad():
        bnmean_running = bnmean_running * 0.999 + hmean * 0.001 # 0.001 is known as momentum in the pytorch initialization
        bnstd_running = bnstd_running * 0.999 + hstd * 0.001

    # tanh activation
    h = torch.tanh(hpreact)

    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Yb) # loss function

    print(loss.item())

    # Backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # Updating weights
    for p in parameters:
        p.data += -0.1 * p.grad

2.6837503910064697
2.7931134700775146
2.5961287021636963
2.551647663116455
2.56289005279541
2.70651912689209
2.856381416320801
2.50877046585083
2.9261226654052734
2.7769172191619873
