# Backpropagation Ninja

In [1]:
import random
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
# retina display
%config InlineBackend.figure_format = 'retina'

In [2]:
# read the dataset: every line of the file becomes one entry of `words`
# (the context manager closes the file handle as soon as it has been read)
with open('./data/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
print('Number of words:', len(words))

Number of words: 32033


In [3]:
# lookup tables between characters and integer ids
# every distinct character that occurs in any word, in sorted order
chars = sorted(set(''.join(words)))
print('Number of characters:', len(chars))
# character -> id, the sorted characters are numbered from 1 upwards
char2int = dict(zip(chars, range(1, len(chars) + 1)))
# id 0 is reserved for '.', the padding character
char2int['.'] = 0
# id -> character, the inverse of char2int
int2char = dict((idx, ch) for ch, idx in char2int.items())
# vocabulary size (all characters plus the '.' padding entry)
vocab_size = len(char2int)

Number of characters: 26


In [4]:
# build the dataset
block_size = 3  # how many preceding character ids make up one context

def build_dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is extended with a trailing '.' and walked character by
    character. The context starts as `block_size` zeros (0 is the id of
    '.') and slides forward by one id after every emitted example.

    Prints the shapes of both tensors and returns them as `(X, Y)`:
    X holds one row of `block_size` ids per example, Y the id of the
    character that follows that context.
    """
    contexts, targets = [], []
    for word in words:
        # start every word from an all-padding window
        window = [0] * block_size
        for target in (char2int[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(target)
            # slide the window: a new list is built, so the one stored
            # above is never modified afterwards
            window = window[1:] + [target]
    # convert to tensors
    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print('X:', X.shape, 'Y:', Y.shape)
    return X, Y

In [5]:
# split the dataset into training, validation and test sets
# 80%, 10%, 10%
# set the random seed
random.seed(666)
# shuffle a copy instead of `words` itself: the cell is then idempotent
# (re-running it yields the same split) and `words` keeps its file order
shuffled_words = list(words)
random.shuffle(shuffled_words)
# split boundaries, computed once
train_end = int(0.8 * len(shuffled_words))
val_end = int(0.9 * len(shuffled_words))
# split the words
train_words = shuffled_words[:train_end]
val_words = shuffled_words[train_end:val_end]
test_words = shuffled_words[val_end:]

# build the datasets
X_train, Y_train = build_dataset(train_words)
X_val, Y_val = build_dataset(val_words)
X_test, Y_test = build_dataset(test_words)

X: torch.Size([182426, 3]) Y: torch.Size([182426])
X: torch.Size([22807, 3]) Y: torch.Size([22807])
X: torch.Size([22913, 3]) Y: torch.Size([22913])


In [None]:
# utility function to compare a manual gradient against autograd
def compare_gradients(s, dt, t):
    """Print one report line comparing `dt` with `t.grad`.

    s:  label shown at the start of the line (padded to 15 characters)
    dt: gradient calculated by hand
    t:  tensor whose `.grad`, calculated by pytorch, is the reference

    The line reports whether the two are element-wise identical, whether
    they agree within `torch.allclose`'s default tolerances, and the
    largest absolute element-wise difference.
    """
    reference = t.grad
    is_exact = str(torch.all(dt == reference).item())
    is_close = str(torch.allclose(dt, reference))
    worst = (dt - reference).abs().max().item()
    print(f'{s:15s} | exact: {is_exact:5s} | approximate: {is_close:5s} | maxdiff: {worst}')

In [6]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 64 # the number of neurons in the hidden layer of the MLP

# every random draw below goes through this generator so that the whole
# initialisation is reproducible
g = torch.Generator().manual_seed(2147483647) # for reproducibility
# character embedding table: one n_embd-dimensional row per vocabulary entry
C  = torch.randn((vocab_size, n_embd),            generator=g)
# Layer 1 (weights scaled by (5/3) / sqrt(fan_in))
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.randn(n_hidden,                        generator=g) * 0.1 # using b1 just for fun, it's useless because of BN
# Layer 2
W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1
b2 = torch.randn(vocab_size,                      generator=g) * 0.1
# BatchNorm parameters (drawn from g as well; without generator=g they
# would come from the unseeded global RNG and differ on every run)
bngain = torch.randn((1, n_hidden),               generator=g)*0.1 + 1.0
bnbias = torch.randn((1, n_hidden),               generator=g)*0.1

# Note: I am initializing many of these parameters in non-standard ways
# because sometimes initializing with e.g. all zeros could mask an incorrect
# implementation of the backward pass.

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total
# enable autograd tracking on every parameter
for p in parameters:
  p.requires_grad = True

4137
