In [29]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
# Model-wide hyperparameters shared by the cells below.
block_size = 3   # the first Linear layer's fan_in is n_embd * block_size
n_embd = 10      # number of columns in the embedding table C
vocab_size = 27  # number of rows in C and fan_out of the final Linear layer

In [31]:
class Linear:
    """Affine layer computing ``x @ weight`` plus an optional bias.

    The weight has shape ``(fan_in, fan_out)`` and is drawn from a standard
    normal scaled by ``1 / sqrt(fan_in)``. The constructor samples from the
    module-level ``torch.Generator`` named ``g``, so ``g`` must already exist
    when a ``Linear`` is instantiated.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in ** 0.5
        self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
        # The bias starts at zero; bias=False disables it entirely.
        self.bias = torch.zeros(fan_out) if bias else None

    def __call__(self, x):
        """Apply the layer to ``x``; the result is also kept on ``self.out``."""
        out = self.out = x @ self.weight
        if self.bias is None:
            return out
        out += self.bias  # in place, so self.out sees the bias as well
        return out

    def parameters(self):
        """Return the layer's tensors: the weight, then the bias if present."""
        tensors = [self.weight]
        if self.bias is not None:
            tensors.append(self.bias)
        return tensors

In [32]:
class BatchNorm1d:
    """Batch normalization along dimension 0 with a learnable scale and shift.

    Behavior depends on ``self.training``:

    * ``True``  -- normalize with the mean/variance of the current batch and
      fold those statistics into ``running_mean`` / ``running_var`` using an
      exponential moving average weighted by ``momentum``.
    * ``False`` -- normalize with the stored running statistics and leave
      them untouched.

    Args:
        dim: size of the feature dimension being normalized.
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch statistics when the
            running statistics are updated.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True  # flip to False to normalize with the running statistics

        # Learnable scale (gamma) and shift (beta); returned by parameters().
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Running statistics: updated during training, read during evaluation.
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize ``x`` and return the result (also kept on ``self.out``)."""
        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True, unbiased=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        # The output is stored on the instance for easier inspection/debugging.
        self.out = self.gamma * xhat + self.beta

        if self.training:
            # no_grad: the running-statistic updates must not become part of
            # the autograd graph.
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        # Return in both modes (previously only training mode returned a value).
        return self.out

    def parameters(self):
        """Return the learnable tensors: ``[gamma, beta]``."""
        return [self.gamma, self.beta]

In [33]:
class Tanh:
    """Element-wise tanh activation; it has no trainable tensors."""

    def __call__(self, x):
        """Return ``torch.tanh(x)``; the result is also kept on ``self.out``."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list, since there is nothing to train."""
        no_params = []
        return no_params

In [34]:
n_embd = 10     # width of each row of the embedding table C
n_hidden = 100  # number of neurons in each hidden layer
g = torch.Generator().manual_seed(2147483647)  # seeded generator used by C and by Linear.__init__

# Embedding table: one n_embd-wide row per vocabulary entry.
C = torch.randn((vocab_size, n_embd), generator=g)

# Layer widths from input to output: five n_hidden-wide layers, then vocab_size outputs.
widths = [n_embd * block_size] + [n_hidden] * 5 + [vocab_size]
last_idx = len(widths) - 2  # index of the output Linear layer

layers = []
for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
    layers.append(Linear(fan_in, fan_out))
    # Every Linear except the output layer is followed by a Tanh.
    if idx != last_idx:
        layers.append(Tanh())

In [38]:
# Rescale the freshly initialised weights in place. Linear.__init__ already
# divides by sqrt(fan_in); here the Linear layers that feed a Tanh get the
# extra 5/3 gain, and the output layer is shrunk instead.
# NOTE: the `*=` updates compound, so re-run the cell that builds `layers`
# before re-running this one.
with torch.no_grad():  # keep the rescaling out of the autograd graph
    # Output layer: 10x smaller weights give 10x smaller initial outputs
    # (its bias starts at zero) -- presumably so the untrained network is
    # less confident in its predictions; TODO confirm intent.
    layers[-1].weight *= 0.1
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= 5/3

# Flat list of every trainable tensor: the embedding table, then each layer's parameters.
parameters = [C] + [p for layer in layers for p in layer.parameters()]
# Show the last parameter (the output layer's bias). A bare `p` is not defined
# at this point on a fresh kernel because the comprehension variable does not leak.
parameters[-1]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0.], requires_grad=True)

In [39]:
# Report the total number of scalar parameters across all tensors.
n_params = 0
for p in parameters:
    n_params += p.nelement()
print(n_params)

# Enable gradient tracking on every parameter tensor.
for p in parameters:
    p.requires_grad_(True)

46497
