# Setup

In [1]:
# Read the dataset: one name per line, without the trailing newlines.
# The encoding is pinned so decoding does not depend on the platform's
# default locale encoding.
with open('../data/names.txt', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

# preview the first few names
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
# number of names read from the file
len(words)

32033

## Build Vocabulary

In [3]:
from string import ascii_lowercase

# Token used to pad the start of a word and to terminate it.
BOUNDARY = "."

# Vocabulary: the boundary token first, then the 26 lowercase ASCII letters.
vocab = [BOUNDARY, *ascii_lowercase]

# Lookup table from each vocabulary entry to its position in `vocab`.
vtoi = dict(zip(vocab, range(len(vocab))))

vtoi

{'.': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26}

In [4]:
# vocabulary size: the boundary token plus the lowercase ASCII letters
len(vocab)

27

## Build Datasets

Training: 80%, used to train parameters

Dev: 10%, used to tune hyperparameters

Test: 10%, used to evaluate model performance

In [5]:
import random
from typing import Collection, Tuple

import torch

# Context length: how many preceding characters form one model input.
# This global sizes W1 (n * emb_size rows); build_dataset has its own `n`
# parameter (default 3), so the two must be kept in agreement.
n = 3

def build_dataset(words: "Collection[str]", n: "int" = 3) -> "Tuple[torch.Tensor, torch.Tensor]":
    """Turn words into (context, next-character) training examples.

    Every word is terminated with ``BOUNDARY``. For each character of the
    terminated word one example is emitted: its input is the indices of the
    ``n`` preceding characters (left-padded with the boundary index) and its
    target is the index of the character itself.

    Args:
        words: Words made only of characters that are keys of ``vtoi``.
        n: Context length, i.e. how many preceding characters form one input.

    Returns:
        ``(X, Y)`` as int16 tensors. ``X`` has shape ``(num_examples, n)`` and
        ``Y`` has shape ``(num_examples,)``. Both shapes are also printed.

    Raises:
        KeyError: If a word contains a character that is not in ``vtoi``.
    """
    pad = vtoi[BOUNDARY]  # the boundary token doubles as left padding

    X, Y = [], []
    for word in words:
        context = [pad] * n
        for c in word + BOUNDARY:
            i = vtoi[c]
            X.append(context)
            Y.append(i)

            # slide the window: drop the oldest index, append the newest
            # (this builds a new list, so the one stored in X is not mutated)
            context = context[1:] + [i]

    # Build the int16 tensors directly instead of casting from int64.
    # The explicit reshape keeps X two-dimensional, (0, n), even when `words`
    # is empty; torch.tensor([]) on its own would be one-dimensional.
    X = torch.tensor(X, dtype=torch.int16).reshape(len(Y), n)
    Y = torch.tensor(Y, dtype=torch.int16)
    print(X.shape, Y.shape)
    return X, Y

# Seed right before shuffling so the split is reproducible on a fresh kernel.
# NOTE: the shuffle is in place, so re-running this cell without reloading
# `words` shuffles an already shuffled list and yields a different split.
random.seed(42)
random.shuffle(words)

# Cut points of the 80% / 10% / 10% train / dev / test split.
ix_dev, ix_test = (int(len(words) * fraction) for fraction in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:ix_dev])            # training set
Xdev, Ydev = build_dataset(words[ix_dev:ix_test])   # dev set
Xtest, Ytest = build_dataset(words[ix_test:])       # test set

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


# Backpropagation

This utility function will be used to compare gradients computed by our model with those computed by PyTorch.

In [6]:
import torch

def cmp(name: "str", dt: "torch.Tensor", t: "torch.Tensor"):
    """Print how closely a hand-derived gradient matches ``t.grad``.

    Args:
        name: Label shown at the start of the report line.
        dt: The manually computed gradient.
        t: Tensor whose ``.grad`` attribute holds the reference gradient.

    The printed line reports whether ``dt`` equals ``t.grad`` element for
    element, whether the two pass ``torch.allclose`` with its default
    tolerances, and the largest absolute element-wise difference.
    """
    reference = t.grad
    is_exact = (dt == reference).all().item()
    is_close = torch.allclose(dt, reference)
    worst_gap = (dt - reference).abs().max().item()
    report = f"{name:15s} | exact: {str(is_exact):5s} | approx: {str(is_close):5s} | maxdiff: {worst_gap:10.3e}"
    print(report)

# Model

## Initialization

Note: Many of these parameters are initialized in non-standard ways. Some standard initialization practices (e.g. initializing to all zeros) could mask an incorrect implementation of the backward pass.

In [7]:
import torch

# Hyperparameters
emb_size = 10 # dimensionality of the embedding
h_size = 200 # number of neurons in hidden layer

# Seeded generator shared by every random draw below; the resulting values
# therefore depend on the order in which the draws are made.
g = torch.Generator().manual_seed(2147483647)

# embedding table: one row of emb_size values per vocabulary entry
C = torch.randn(len(vocab), emb_size, generator=g)

# layer 1: maps the n concatenated embeddings to the hidden layer.
# Weights are scaled by (5/3) / sqrt(fan_in), with fan_in = n * emb_size.
# This layer has no bias parameter.
W1 = torch.randn(n * emb_size, h_size, generator=g) * (5/3) / ((n * emb_size)**0.5)

# layer 2: maps the hidden layer to one logit per vocabulary entry
W2 = torch.randn(h_size, len(vocab), generator=g) * 0.1 # small values keep the initial logits small, so the initial probabilities are close to uniform
b2 = torch.randn(len(vocab), generator=g) * 0.1

# batchnorm parameters: gain is drawn around 1.0 and bias around 0.0
# (random rather than exactly 1 and 0)
bn_gain = torch.randn((1, h_size), generator=g) * 0.1 + 1.0
bn_bias = torch.randn((1, h_size), generator=g) * 0.1

# every trainable tensor of the model
parameters = [C, W1, W2, b2, bn_gain, bn_bias]

# enable gradient tracking on each parameter
for p in parameters:
    p.requires_grad = True

# total parameters
sum(p.numel() for p in parameters)

12097

# Single Pass

In [8]:
# Draw one random minibatch: row indices are sampled (with replacement) from
# the training set using the shared seeded generator.
batch_size = 32
ix = torch.randint(0, len(Xtr), (batch_size,), generator=g)

Xb = Xtr[ix]  # contexts of the sampled examples
Yb = Ytr[ix]  # matching next-character targets

In [9]:
# Forward pass written out one elementary operation at a time, so that every
# intermediate tensor has a name and its gradient can be inspected afterwards.

# embedding lookup; Xb is cast to int64 before being used as an index
emb = C[Xb.long()] # (batch_size, n, emb_size)
embcat = emb.view(batch_size, -1) # concatenate the n embeddings: (batch_size, n * emb_size)

# linear layer 1
h_prenorm = embcat @ W1 # (batch_size, h_size)

# batchnorm
bnmean = 1/batch_size * h_prenorm.sum(0, keepdim=True) # per-neuron mean over the batch: (1, h_size)
bn_diff = h_prenorm - bnmean # centered pre-activations
bn_dff_sq = bn_diff**2
bn_var = 1/(batch_size-1) * bn_dff_sq.sum(0, keepdim=True) # note: bessel's correction (dividing by n-1 instead of n)
bn_var_inv = (bn_var + 1e-5)**-0.5 # 1 / sqrt(var + eps); the small eps guards against division by zero
bn_raw = bn_diff * bn_var_inv # normalized pre-activations
h_norm = bn_gain * bn_raw + bn_bias # learnable scale and shift

# activation
h = torch.tanh(h_norm)

# linear layer 2
logits = h @ W2 + b2 # (batch_size, len(vocab))

# cross-entropy loss
logit_maxes = logits.max(1, keepdim=True).values # row-wise maximum: (batch_size, 1)
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum**-1
probs = counts * counts_sum_inv # softmax probabilities; each row sums to 1
log_probs = probs.log()
loss = -log_probs[range(batch_size), Yb.long()].mean() # mean negative log-probability of the target characters

# pytorch backward pass
# clear any gradients left over from a previous backward pass
for p in parameters:
    p.grad = None

# every intermediate tensor of the forward pass
tensors = [
    emb,
    embcat,
    h_prenorm,
    bnmean,
    bn_diff,
    bn_dff_sq,
    bn_var,
    bn_var_inv,
    bn_raw,
    h_norm,
    h,
    logits,
    logit_maxes,
    norm_logits,
    counts,
    counts_sum,
    counts_sum_inv,
    probs,
    log_probs,
    loss,
]

# These are non-leaf tensors, so autograd does not keep their .grad by default;
# retain_grad() asks it to, which makes the gradients available for comparison.
for t in tensors:
    t.retain_grad()

loss.backward()

# display the loss of this single batch
loss

tensor(3.5586, grad_fn=<NegBackward0>)

## Derivatives

In [10]:
d_logprobs = None

d_probs = None

d_counts_sum_inv = None