In [1]:
import numpy as np
import pytest
import bettertimeit

# Design your own Neural Net

## ~~Ray Hettinger~~

## Varun Nayyar


## What

- We're going to write a simple neural net codebase
- We only want a fully connected layer approach with custom optimisation
- We aren't building ResNet, but we want our design to be flexible, efficient and memory aware
- We also want to make it easy to use!

## Why? 

- Neural Nets are easy
- Everyone loves Neural Nets
- Good way to illustrate good software mixed with ML


## This is

- A fun mix of ML and Software
- A deeper dive into Neural Nets than pytorch
- Mostly iterative design and analysis
- A lot of live coding (that I'm going to regret)
- Gratuitous classes

## This is not

- A good way to implement a Neural Net Library in 2019
- Building computational graphs
- Automatic Differentiation (autograd) or how to do it
- GPU programming
- See @chewxy for the above
- Using non standardlib (i.e. no `attrs` et al)


## Neural Nets

![nn.png](resources/nn.png)

- Backprop was invented independently 3 times.
- Was thought to be useless for a long time - Hinton spent many years on approximate methods

## Forward 

- Fully Connected Layer
    - $y=Wx + b$
    - This is just a matrix multiplication
- TanH
    - $y = tanh(x)$


In [2]:
class Layer:
    """Common base class for the layers defined in this notebook.

    Subclasses override ``forward``; the base implementation is a
    placeholder that does nothing and returns ``None``.
    """

    def forward(self, x):
        """Placeholder forward pass: ignores ``x`` and returns ``None``."""
        return None

In [3]:
class Tanh(Layer):
    """Element-wise hyperbolic tangent activation layer."""

    def forward(self, x):
        """Return ``tanh`` applied element-wise to ``x``."""
        activated = np.tanh(x)
        return activated

## Aside

- $Wx+b$ has these shapes
    - x is (Indim,)
    - W is (Outdim, Indim)
    - b is (Outdim,)
- How do we initialize the W and b?
- We want to batch our x, we don't want to do this


In [6]:
def forward(*args):
    """Un-batched forward pass: apply ``W @ vector + b`` one sample at a time.

    This is the slow, loop-based formulation shown as the thing to avoid.
    ``x``, ``W`` and ``b`` are not parameters; they are looked up in the
    enclosing (notebook/global) scope, and ``*args`` is accepted but ignored.

    Returns:
        list: one output per row of ``x``, in iteration order.
    """
    # Multiply the current row (``vector``), not the whole batch ``x``.
    return [W @ vector + b for vector in x]
        

## Let's try this

In [7]:
class FullyConnected(Layer):
    """Affine layer that stores ``W`` as (hiddendim, indim) and applies ``W @ x + b``."""

    def __init__(self, indim, hiddendim):
        """Create an all-ones weight matrix and an all-zeros bias vector."""
        weight_shape = (hiddendim, indim)
        self.W = np.ones(weight_shape)
        self.b = np.zeros(hiddendim)

    def forward(self, x):
        """Return ``W @ x + b``; ``W`` multiplies ``x`` from the left."""
        return self.W @ x + self.b


## Let's quickly test

In [10]:
# Smoke test: push a batch of N random samples with 10 features each
# through a 10 -> 32 fully connected layer.
N = 100
x = np.random.randn(N, 10)  # shape (N, 10): batch on the leading axis
l = FullyConnected(10, 32)
# The recorded output for this cell is a matmul shape-mismatch ValueError.
l.forward(x)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 100 is different from 10)

## Hmm

- What should the shapes of $x$ and $W$ be?
    - if $x$ is (I, N) then W should be (O, I)
        - `W @ x`
    - if $x$ is (N, I) then W should be (I, O)
        - `x @ W`
- is there a compute consideration here?

In [23]:
def forward():
    # Benchmark harness comparing the two candidate matmul layouts.
    # NOTE(review): presumably bettertimeit treats the statements outside the
    # nested ``timeit_*`` functions as un-timed setup and times each nested
    # function separately — confirm against the bettertimeit docs.
    import numpy as np
    N = 3000
    indim = 20
    hiddendim = 40

    w = np.random.randn(indim, hiddendim)
    x = np.random.randn(N, indim)

    # Option 1: batch-major input (N, indim) times (indim, hiddendim) weights.
    def timeit_opt1():
        x @ w

    # Transposes of the same arrays for the (hiddendim, indim) @ (indim, N) form.
    wT = w.T
    xT = x.T

    # Option 2: weights on the left, feature-major input on the right.
    def timeit_opt2():
        wT @ xT

# NOTE(review): the second argument looks like the repeat count (the recorded
# output reports "best of 10") — confirm.
bettertimeit.bettertimeit(forward, 10)

opt1: 10000 loops, best of 10: 110 usec per loop
opt2: 10000 loops, best of 10: 120 usec per loop


## Design

- N >> I (usually)
- $x$ (I, N) is column major!
    - natural mathematical form in code!
    - This would be the efficient layout if this were Fortran, MATLAB or Julia
- Python and C are row major = N, I
    - more efficient access in these languages
    - N as leading index matches most python/C conventions
    - Code no longer matches the mathematics :(

In [12]:
class FullyConnected(Layer):
    """Affine layer that stores ``W`` as (indim, hiddendim) and applies ``x @ W + b``."""

    def __init__(self, indim, hiddendim):
        """Create an all-ones weight matrix and an all-zeros bias vector."""
        weight_shape = (indim, hiddendim)
        self.W = np.ones(weight_shape)
        self.b = np.zeros(hiddendim)

    def forward(self, x):
        """Return ``x @ W + b``; ``x`` multiplies ``W`` from the left."""
        return x @ self.W + self.b


## Quick test

In [13]:
# Smoke test with batch-major input: N random samples of 10 features each
# through a 10 -> 32 fully connected layer.
N = 100
x = np.random.randn(N, 10)
l = FullyConnected(10, 32)
y = l.forward(x)
# Last expression of the cell, so the output shape is what gets displayed.
y.shape

(100, 32)

## Initialisation Design

- zeros is a bad idea (very slow backprop)
- Many different approaches (Xavier, He, etc)
    - Xavier is random normal(0, scale) where scale is $\sqrt{2/(I+O)}$
    - He et al is random normal(0, scale) where scale is $\sqrt{2/I}$
    
- Design: Classmethod or init arg?

## My Opinion

- Classmethod
    - choices would show up in methods (less reliance on doc)
    - user needs to know all the options 
    - many methods would be very similar - code duplication
    - init would either be user unfriendly (takes W and b) or have a bad default
- Init arg
    - classmethods are best when we have very different arguments
    - lot of code can be shared
    - Maybe even allow it to be a function?
    

In [14]:
class FullyConnected(Layer):
    """Affine layer ``x @ W + b`` with a selectable weight initialiser."""

    def __init__(self, indim, hiddendim, init="xavier"):
        """Initialise ``W`` from a scaled standard normal and ``b`` with zeros.

        Args:
            indim: number of input features (rows of ``W``).
            hiddendim: number of output features (columns of ``W``).
            init: ``"xavier"`` scales by ``sqrt(2 / (indim + hiddendim))``;
                ``"he"`` scales by ``sqrt(2 / indim)``.

        Raises:
            ValueError: if ``init`` is neither ``"xavier"`` nor ``"he"``.
        """
        # Both schemes are sqrt(2 / fan); they only differ in what "fan" is.
        if init == "xavier":
            fan = indim + hiddendim
        elif init == "he":
            fan = indim
        else:
            raise ValueError(f"Unknown initialiser: {init}")
        scale = np.sqrt(2 / fan)
        self.W = np.random.randn(indim, hiddendim) * scale
        self.b = np.zeros(hiddendim)

    def forward(self, x):
        """Return ``x @ W + b``."""
        return x @ self.W + self.b


In [15]:
## Test
# Build a 10 -> 32 layer with the default initialiser and check the
# output shape for a batch of N random samples.

N = 100
x = np.random.randn(N, 10)
l = FullyConnected(10, 32)
y = l.forward(x)
# Last expression of the cell, so the output shape is what gets displayed.
y.shape

(100, 32)

# Backward Pass

## The Equations


- Fully Connected
    - $\frac{dy}{dx} = W^T$
    - $\frac{dy}{dW} = x^T$
    - $\frac{dy}{db} = 1$
    - Chain Rule + Matrix math
        - $dL/dy$ is same shape as y - (N,O)
        - $\frac{dL}{dx} = \frac{dL}{dy} W^T$
        - $\frac{dL}{dW} = x^T\frac{dL}{dy}$
        - $\frac{dL}{db} = \sum_n \frac{dL}{dy_n}$ (sum over the batch axis, so the result has the shape of $b$)
- Tanh
    - $\frac{dy}{dx} = 1-tanh^2(x)$
    - $\frac{dL}{dx} = (1-tanh^2(x)) * \frac{dL}{dy}$

In [16]:
class Layer:
    """Base layer exposing the backward-pass hook."""

    def backward(self, dldy):
        """Placeholder backward pass: ignores ``dldy`` and returns ``None``."""
        return None

class Tanh(Layer):
    """Tanh activation, backward pass only."""

    def backward(self, dldy):
        """Scale the upstream gradient ``dldy`` by ``1 - tanh(x)**2``.

        NOTE: ``x`` is neither a parameter nor an attribute here, so it is
        resolved from the enclosing (global) scope when this runs.
        """
        local_grad = 1 - (np.tanh(x)) ** 2
        return local_grad * dldy

class FullyConnected(Layer):
    """Fully connected layer, backward pass only."""

    def backward(self, dldy):
        """Back-propagate ``dldy``, the gradient w.r.t. this layer's output.

        Applies the chain-rule results for a batch-major ``y = x @ W + b``:
        ``dL/dW = x.T @ dL/dy``, ``dL/db = dL/dy`` summed over axis 0, and
        ``dL/dx = dL/dy @ W.T``.

        NOTE: ``x`` is neither a parameter nor an attribute here, so it is
        resolved from the enclosing (global) scope when this runs.

        Returns:
            The gradient with respect to the layer input, ``dldx``.
        """
        # Weight gradient pairs the input with the upstream gradient ...
        dldw = x.T @ dldy
        # ... the bias gradient collapses the batch axis to match ``b`` ...
        dldb = np.sum(dldy, axis=0)
        # ... and the input gradient is what gets handed to the previous layer.
        dldx = dldy @ self.W.T
        # TODO: param updates
        return dldx

## Wait

- We don't have access to the input, $x$ in the backward pass!
- How should we solve this?
    - Cache it on the forward pass?
    - Or expect it as an argument to the backward function?


## My Opinion

- Caching
    - Easiest for user - just call backward and it just works
    - Odd side effects - call forward twice and gradient will only be on second forward
    - Unnecessary work if we're doing inference only
- Argument
    - Explicit is better than implicit!
    - Easier to test - we can separate the functionality.
    - Allows for possible optimisations or memory control.

In [17]:
class Tanh(Layer):
    """Tanh activation whose backward pass is handed the forward input."""

    def backward(self, dldy, x):
        """Recompute the activation via ``self.forward(x)`` and apply the chain rule."""
        activation = self.forward(x)
        local_grad = 1 - activation**2
        return local_grad * dldy
    

class FullyConnected(Layer):
    """Fully connected layer whose backward pass is handed the forward input."""

    def backward(self, dldy, x):
        """Compute the three gradients and return the one w.r.t. the input.

        ``grad_w`` and ``grad_b`` are computed but not yet applied to the
        parameters; only ``dldy @ W.T`` is returned.
        """
        grad_w = x.T @ dldy
        grad_b = np.sum(dldy, axis=0)
        grad_x = dldy @ self.W.T

        # TODO: param updates

        return grad_x

## Consideration

- the backward pass of the tanh can be expressed in terms of the output
- the output of the activation layer is usually the input of the fully connected layer
- passing the input in isn't always the best option!
- ReLU may prefer a different cache value!
- If batch is large, each layer will now store a copy of the data

## Solution

Forward pass returns the value it wants on the backward pass?

In [18]:
class Tanh(Layer):
    """Tanh activation that hands its own output back as the backward cache."""

    def forward(self, x):
        """Return ``(output, cache)``; both entries are the same ``tanh(x)`` result."""
        out = np.tanh(x)
        return out, out

    def backward(self, dldy, y):
        """Chain rule from the cached output ``y``: ``(1 - y**2) * dldy``."""
        local_grad = 1 - y**2
        return local_grad * dldy

class FullyConnected(Layer):
    """Fully connected layer that hands its input back as the backward cache."""

    def forward(self, x):
        """Return ``(x @ W + b, x)``: the output plus the input as cache."""
        out = x @ self.W + self.b
        return out, x

    def backward(self, dldy, x):
        """Compute the three gradients from the cached input ``x``.

        ``grad_w`` and ``grad_b`` are computed but not yet applied to the
        parameters; only ``dldy @ W.T`` is returned.
        """
        grad_w = x.T @ dldy
        grad_b = np.sum(dldy, axis=0)
        grad_x = dldy @ self.W.T

        # TODO: param updates

        return grad_x


More generally

- Pass both input and output to backwards function
- Allow layer to do custom caching if need be, but give control of object to top level

We clearly need a container object here!

In [19]:

class Network:
    """Ordered container that chains layers for the forward and backward passes."""

    def __init__(self, *layers):
        """Store the given layers, in call order, as an immutable tuple."""
        self.network = tuple(layers)

    def forward(self, x):
        """Feed ``x`` through every layer in order.

        Each layer's ``forward`` must return ``(output, cache)``.

        Returns:
            tuple: the final output and the list of per-layer caches,
            in the same order as the layers.
        """
        caches = []
        out = x
        for layer in self.network:
            out, cached = layer.forward(out)
            caches.append(cached)
        return out, caches

    def backward(self, dldx, cachelist):
        """Propagate ``dldx`` from the last layer back to the first.

        ``cachelist`` is the cache list returned by ``forward``; each layer
        receives the running gradient together with its own cache entry.
        """
        grad = dldx
        pairs = zip(reversed(self.network), reversed(cachelist))
        for layer, cached in pairs:
            grad = layer.backward(grad, cached)
        return grad


In [20]:

class Layer:
    """Base class fixing the forward/backward signatures shared by all layers."""

    def forward(self, x):
        """Placeholder forward pass: ignores ``x`` and returns ``None``."""
        return None

    def backward(self, dldy, cache):
        """Placeholder backward pass: ignores both arguments and returns ``None``."""
        return None


class FullyConnected(Layer):
    """Affine layer ``y = x @ W + b`` with a hand-written backward pass."""

    def __init__(self, indim, hiddendim):
        """Draw ``W`` (indim, hiddendim) from a normal scaled by ``sqrt(2 / indim)``; zero ``b``."""
        super().__init__()
        unit_normal = np.random.randn(indim, hiddendim)
        self.W = unit_normal * np.sqrt(2 / indim)
        self.b = np.zeros(hiddendim)

    def forward(self, x):
        """Return ``(x @ W + b, x)``: the output plus the input as cache."""
        out = x @ self.W + self.b
        return out, x

    def backward(self, dldy, x):
        """Compute the gradients from the cached input ``x`` and return ``dldy @ W.T``.

        The weight and bias gradients are computed here but not used further.
        """
        grad_w = x.T @ dldy
        grad_b = np.sum(dldy, axis=0)
        grad_x = dldy @ self.W.T

        return grad_x


class Tanh(Layer):
    """Tanh activation that hands its own output back as the backward cache."""

    def forward(self, x):
        """Return ``(output, cache)``; both entries are the same ``tanh(x)`` result."""
        out = np.tanh(x)
        return out, out

    def backward(self, dldy, y):
        """Chain rule from the cached output ``y``: ``(1 - y**2) * dldy``."""
        local_grad = 1 - y**2
        return local_grad * dldy


class Network(Layer):
    """Layer container that itself follows the ``Layer`` forward/backward signature."""

    def __init__(self, *layers):
        """Store the given layers, in call order, as an immutable tuple."""
        super().__init__()
        self.network = tuple(layers)

    def forward(self, x):
        """Feed ``x`` through every layer in order.

        Returns:
            tuple: the final output and the list of per-layer caches,
            in the same order as the layers.
        """
        caches = []
        out = x
        for layer in self.network:
            out, cached = layer.forward(out)
            caches.append(cached)
        return out, caches

    def backward(self, dldx, cachelist):
        """Propagate ``dldx`` from the last layer back to the first.

        Each layer receives the running gradient together with its own
        entry from ``cachelist``.
        """
        grad = dldx
        pairs = zip(reversed(self.network), reversed(cachelist))
        for layer, cached in pairs:
            grad = layer.backward(grad, cached)
        return grad


In [21]:

# End-to-end shape check: random inputs, targets and upstream gradient for
# a 10 -> 20 -> 3 network with a tanh after each fully connected layer.
x = np.random.randn(100, 10)
y = np.random.randn(100, 3)
dldy = np.random.randn(100, 3)  # stand-in for the loss gradient w.r.t. the output
net = Network(
    FullyConnected(10, 20),
    Tanh(),
    FullyConnected(20, 3),
    Tanh()
)

# Forward returns the prediction plus the per-layer caches that backward needs.
yhat, ca = net.forward(x)
dldx = net.backward(dldy, ca)
# Last expression of the cell, so the input-gradient shape is displayed.
dldx.shape

(100, 10)

# Optimisers!

- SGD
    - $\theta^{new} = \theta^{curr} - \eta \nabla L$
    - stateless
    - $\eta$ is the learning rate
- Momentum Gradient Descent
    - $\nu^{new} = \alpha \nu^{curr}  + \eta \nabla L$
    - $\theta^{new} = \theta^{curr} - \nu^{new}$
    - not stateless
    - $\alpha$ is the momentum param
    
    

In [24]:
# Learning rate: the factor applied to each gradient in the parameter updates below.
LR = 0.01

class FullyConnected(Layer):
    """Fully connected layer whose backward pass also applies a gradient-descent step."""

    def backward(self, dldy, x):
        """Compute the gradients, update ``W`` and ``b`` in place, return ``dldy @ W.T``.

        The input gradient is computed before the parameters are changed,
        so it uses the same weights as the forward pass did.
        """
        grad_w = x.T @ dldy
        grad_b = np.sum(dldy, axis=0)
        grad_x = dldy @ self.W.T

        # In-place step scaled by the module-level learning rate ``LR``.
        self.W -= grad_w * LR
        self.b -= grad_b * LR

        return grad_x


## Design

- Should train be a method of Network or a function?

## My opinion

- Network actually matches the signature of a Layer!
- We can compose a network of networks! 
- Train methods are very variable, we should provide flexibility

In [22]:
class MSELoss:
    """Halved mean-squared-error loss and its gradient w.r.t. the prediction."""

    def loss(self, y, yhat):
        """Return the mean over all elements of ``(y - yhat)**2 / 2``."""
        return np.mean((y - yhat)**2 / 2)

    def loss_gradient(self, y, yhat):
        """Return the gradient of ``loss`` with respect to ``yhat``.

        Because ``loss`` averages over every element, the derivative for
        each element is ``(yhat - y) / number_of_elements``. The result has
        the same shape as ``yhat - y``, so it can be fed straight into a
        backward pass that expects a gradient shaped like the output.
        """
        diff = np.asarray(yhat) - np.asarray(y)
        # Divide by the element count to match the ``np.mean`` in ``loss``.
        return diff / diff.size
    

def train(network, data, numepochs):
    """Run ``numepochs`` full-batch passes over ``data``, printing the loss each time.

    ``data`` is unpacked as an ``(x, y)`` pair on every epoch. Each epoch
    does a forward pass, turns the MSE gradient into the upstream gradient,
    and calls ``network.backward``. The printed loss is computed from the
    predictions made before that backward call.
    """
    mse = MSELoss()
    for i in range(numepochs):
        x, y = data
        yhat, caches = network.forward(x)
        upstream_grad = mse.loss_gradient(y, yhat)
        network.backward(upstream_grad, caches)
        print(f"Epoch {i}, loss: {mse.loss(y, yhat)}")
