In [10]:
import random
import torch
from torch.nn import functional as F
random.seed(0x1337_b00b)

# Context length -> How many characters we take as input for the network to predict
# the next
block_size = 3

class Dataset:
    def __init__(self):
        with open('names.txt', 'r') as f:
            names = f.read().split('\n')
        self.names = names

        self.build_vocab()

        # Shuffle names in place
        random.shuffle(self.names)
        # Training set and dev/validation set last index
        # First 80% is used for training, 10% percent for validation, 10% for test
        train_set_idx = int(0.8 * len(names))
        validation_idx = int(0.9 * len(names))
        
        self.X, self.Y = {}, {}
        self.X["train"], self.Y["train"] = self.build_dataset(self.names[:train_set_idx])
        self.X["valid"], self.Y["valid"] = self.build_dataset(self.names[train_set_idx:validation_idx])
        self.X["test"], self.Y["test"] = self.build_dataset(self.names[validation_idx:])
        

    def build_vocab(self):
        # Build vocabulary
        vocab = []
        for name in self.names:
            vocab += name
        self.vocab = sorted(set(vocab))
        
        # Build mapping from letter to integer id and for id to letter
        # Leave the `0` key for `.` (dot) which new treat as a null / terminating char
        self.itos = { i+1:l for i, l in enumerate(self.vocab)}
        self.itos[0] = '.'
        # Build the inverse mapping -> from character to integer id
        self.stoi = { l:i for i, l in self.itos.items()}
                

    def build_dataset(self, words):
        global block_size
        # Inputs
        X = []
        # Targets
        Y = []
        
        # For each name
        for word in words:
            # The start is an empty new context (which contains our designed dot special character)
            context = [0] * block_size
            # For each character in the name (adding dot as a stopping token)
            for ch in word + '.':
                # We add the current context and as an input to the dataset
                X.append(context)
                # Get the index of the current character and add it as a target for a potential
                # generated new character that could follow this context
                idx_ch = self.stoi[ch]
                Y.append(idx_ch)
                # Slide the context window and add the new character to it
                context = context[1:] + [idx_ch]
    
        X = torch.Tensor(X).long()
        Y = torch.Tensor(Y).long()
        return (X, Y)


    def dataset_demo(self, split, count = 10):
        for i, p in zip(self.X[split][:count], self.Y[split][:count]):
            print([self.itos[c.item()] for c in i], "-->", self.itos[p.item()])

In [11]:
# Model building
import torch

g = torch.Generator().manual_seed(2147483647)
d = Dataset()

class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        kaimin_scaling = fan_in**0.5
        self.weights = torch.randn((fan_in, fan_out), generator=g) / kaimin_scaling
        if bias:
            # Bias initializing with zeros or small floats?
            self.bias = torch.zeros(fan_out)
            # self.bias = torch.rand(fan_out, generator=g) * 0.1
        else:
            self.bias = None


    def __call__(self, x_in):
        self.out = x_in @ self.weights
        if self.bias is not None:
            self.out += self.bias
        return self.out


    def parameters(self):
        params = [self.weights]
        if self.bias is not None:
            params += [self.bias]
        return params


class BatchNorm1d:
    def __init__(self, size, eps=1e-05, momentum=0.1):
        """Batch normalization layers defined accoding to the paper with the same name
        and torch docs

        Args:
            size: size of the batch and implictily this layer
            eps: a small variable to control that we are not dividing by zero
            momentum: The amount that each training iteration affects the final running
                mean and std used in model inference at production time. 
        """
        self.size = size
        # Controls if we are using the running mean and std when inferencing (in prod)
        # or we are computing it from the batch (training)
        self.training = True
        # Training parameters that get updated by the backward pass
        self.gamma = torch.ones(size)
        self.beta = torch.zeros(size)
        # Extra variables used to control the behaviour of the normalisation
        self.eps = eps
        self.momentum = momentum
        # Buffers (torch naming) that are not part of the backward pass and training
        self.running_mean = torch.zeros((1, size))
        self.running_std = torch.ones((1, size))


    def __call__(self, x_in):
        # Compute the mean and the std for the input
        if self.training:
            in_mean = x_in.mean(0, keepdim=True)
            in_std = x_in.std(0, keepdim=True)
        else:
            in_mean = self.running_mean
            in_std = self.running_std

        # Normalize the layer
        norm = (x_in - in_mean) / torch.sqrt(in_std + self.eps)
        # Compute the batch norm
        self.out = self.gamma * norm + self.beta

        # If we are training, we need to update the running mean and std
        if self.training:
            with torch.no_grad():
                self.running_mean = (1.-self.momentum) * self.running_mean \
                    + self.momentum * in_mean
                self.running_std = (1.-self.momentum) * self.running_std \
                    + self.momentum * in_std
        
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]


class Tanh():
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out

    
    def parameters(self):
        return []


vocab_size = 27
emb_size = 10
n_hidden = 100

C = torch.randn((vocab_size, emb_size), generator=g)

layers_without_batch_norm = [
    Linear(emb_size * block_size, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, n_hidden), Tanh(),
    Linear(n_hidden, vocab_size),
]

layers_with_batch_norm = [
    Linear(emb_size * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size),
]
layers = layers_with_batch_norm

with torch.no_grad():
    # Make last layer less confident by scaling down the weights
    # layers[-1].weights *= 0.1
    # When we have BatchNorm1d as the last layer ,we need to scale the gamma because
    # we do not have weigths, but gamma effectively acts like weights
    layers[-1].gamma *= 0.1
    # For every other preactivation layer, apply a tanh gain, similar to kaimin init
    for l in layers[:-1]:
        if isinstance(l, Linear):
            # This fights the squashing over the layers done by tanh 
            l.weights *= 5/3


parameters = [C] + [p for layer in layers for p in layer.parameters()]
print("Number of parameters: ", sum([p.nelement() for p in parameters]))
for p in parameters:
    p.requires_grad = True

Number of parameters:  47024


In [13]:
steps = []
losses = []

In [14]:
max_steps = 200000
update_to_data_ratio = []
for idx in range(max_steps):
    # Minibatch construction
    # Sample indexes from X (minibatch of 32 examples)
    idxs = torch.randint(0, d.X["train"].shape[0], (32,))
    
    # Forward pass, only with the minibatch
    emb = C[d.X["train"][idxs]]
    # Transpose the input to match the first layer
    x_in = emb.view(emb.shape[0], block_size * emb_size)

    for layer in layers:
        x_in = layer(x_in)   
    # Compute the loss
    loss = F.cross_entropy(x_in, d.Y["train"][idxs])
    # Reset the gradients
    for p in parameters:
        p.grad = None

    # This makes the backward pass also populate the gradient for the `out` variables
    for layer in layers:
        layer.out.retain_grad()
    # Compute the backward pass
    loss.backward()

    # Gradually increase the learning rate in each step
    # lr = lrs[idx]
    lr = 0.1 if idx < 100000 else 0.01
    # Update / nudge the value in the direction of the gradient
    for p in parameters:
        p.data += -lr * p.grad

    # Track progress
    # lrs_used.append(lr_exponents[idx])
    steps.append(idx)
    # Each 10k steps print the progress of the loss
    if idx % 10000 == 0:
        print(f"{idx:6d} / {max_steps:6d} -> {loss.item():.4f}")
    losses.append(loss.log10().item())

    with torch.no_grad():
        ud = [(lr * p.grad.std() / p.std()).log10().item() for p in parameters]
        update_to_data_ratio.append(ud)

    if idx >= 1000:
        break

loss

     0 / 200000 -> 3.2602


tensor(2.8259, grad_fn=<NllLossBackward0>)

In [29]:
# Utility function to compare manual gradients to torch gradients
def cmp(manual_dt, dt):
    # Check for perfect equality
    are_equal = torch.all(manual_dt == dt).item()
    # Check for equality up to a certail decimal (usefull for unstable floats)
    close_eq = torch.allclose(manual_dt, dt)
    # Check the maximum difference between close equality and perfect equality
    max_diff = (manual_dt - dt).abs().max().item()
    print(f"exact {are_equal:.5s}, approx {close_eq:.5s}, max diff {max_diff}")

In [32]:
cmp(torch.ones(10), torch.zeros(10))

exact 0.00000, approx 0.00000, max diff 1.0
