# Torchify batch norm notebook

In [1]:
import torch
import torch.nn.functional as F

Starting from [here](https://www.youtube.com/watch?v=P6sfmUTpUmc&t=4715s).

- Linear Layer
  - n_inputs, n_outputs, bias-flag
  - pass is done in call method
  - parameters returns list
  - remember to scale weights


In [131]:
class Linear:
    def __init__(self, n_in, n_out, bias=True) -> None:
        self.W = torch.randn((n_in, n_out), requires_grad=True)
        self.use_bias = bias
        if bias:
            self.b = torch.zeros((1,n_out), requires_grad=True)
        

    def __call__(self, x):
        self.out = x @ self.W
        if self.use_bias:
            self.out += self.b
        return self.out

    def parameters(self):
        if self.use_bias: return [self.W, self.b]
        return [self.W]

- BatchNorm1d
  - dim
  - eps (division by std)
  - momentum
  - training-flag to toggle running-mean calculation
  - keep track of var instead of std
  - in training:
    - use exact mean/std of batch, still update running statistics


In [132]:
class BatchNorm1d:
    def __init__(self, dim, train=True, momentum=.1, eps=1e-5):
        self.dim = dim
        self.train = train
        self.momentum = momentum
        self.eps = eps

        self.running_mean = torch.zeros((1,dim), requires_grad=False)
        self.running_var = torch.ones((1,dim), requires_grad=False)

        self.gain = torch.zeros((1,dim), requires_grad=True)
        self.scale = torch.ones((1,dim), requires_grad=True)

    def __call__(self, x: torch.Tensor):
        if self.train:
            batch_mean = x.mean(dim=0, keepdim=True)
            batch_var = x.var(dim=0, keepdim=True)
        else:
            batch_mean = self.running_mean
            batch_var = self.running_var

        self.out = (x - batch_mean) / batch_var
        self.out = self.out * self.scale + self.gain

        if self.train:
            with torch.no_grad():
                self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*batch_mean
                self.running_var = (1-self.momentum)*self.running_var + self.momentum*batch_var

        return self.out

    def parameters(self):
        return [self.gain, self.scale]

- Tanh
  - simple


In [133]:
class Tanh:
    def __call__(self, x: torch.Tensor):
        self.out = torch.tanh(x)
        return self.out

    def parameters(self):
        return []

In [134]:
class Embedding:
    def __init__(self, emb_size, n_in):
        self.emb_size = emb_size
        self.n_in = n_in
        self.C = torch.randn((n_in, emb_size), requires_grad=True)

    def __call__(self, x: torch.Tensor):
        self.out = self.C[x] # embed the characters into vectors
        self.out = self.out.view(self.out.shape[0], -1) # concatenate the vectors

        return self.out

    def parameters(self): return [self.C]

- In general:
  - use self.out to store output

- load data from files

In [135]:
from pathlib import Path


def load(filename):
    data = Path('../data/')
    return torch.load(data/filename)


x_train = load('x_train')
y_train = load('y_train')

- build 6 layer network
  - scale last layer weights down by .1
  - scale all other layers by 5/3

In [136]:
class Sequential:
    def __init__(self, layer_list):
        self.layer_list = layer_list

    def __call__(self, x):
        for layer in self.layer_list:
            x = layer(x)

        return x

    def parameters(self):
        return sum((layer.parameters() for layer in self.layer_list), start=[])

In [147]:
import torch.nn

emb_size = 10
n_hidden = 100
vocab_size = 27

net = Sequential([
    Embedding(emb_size, vocab_size),
    Linear(3*emb_size, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
])
g = torch.Generator().manual_seed(2147483647) # for reproducibility

net.layer_list[-1].W.data *= .1
for layer in net.layer_list[:1]:
    if isinstance(layer, Linear):
        layer.W.data *= 5/3


- training loop
  - embed
  - linear application of all layers
  - cross entropy
  - retain_graph on all layer.out

In [148]:
batch_size = 32
max_steps = 1
lossi = []

for i in range(max_steps):

    # minibatch construct
    ix = torch.randint(0, x_train.shape[0], (batch_size,), generator=g)
    xb, yb = x_train[ix], y_train[ix] # batch X,Y

    x = net(xb)
    
    loss = F.cross_entropy(x, yb)
    
    # backward pass
    for layer in net.layer_list[1:]:
        layer.out.retain_grad()

    for p in net.parameters():
      p.grad = None
    loss.backward()
    
    # update
    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay
    for p in net.parameters():
      p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
      print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/      1: 3.8727


- visualization
  - for each layer, take out of tanh
    - mean, std, percent of >.97
  - plot histograms for each layer into one diagram

- same visualization for grads