# The forward and backward passes

## High Level

- [x] Using mnist dataset, create a basic architecture 
  - two linear layers with a ReLU in between
- [x] Create a loss function for multiclass (for simplicity use MSE)
- [x] Forward pass passing the inputs and computing the loss using output
- [x] Backward pass - compute the gradient of the loss with respect to each layer's output and multiply it by that layer's local derivative, following the chain rule (dy/dx = dy/du * du/dx). This is done in order to compute the gradient of the loss with respect to every input and parameter.
- Refactor : Convert the individual layers as classes
- Refactor : Abstract the repetitive code into base class
- Using Autograd

In [None]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close, test_eq
# Fix the RNG seed so the random weight initialisation further down is repeatable
torch.manual_seed(42)

# Display settings: gray colormap for images, compact printing for tensors and arrays
mpl.rcParams['image.cmap']='gray'
torch.set_printoptions(precision=2,linewidth=125,sci_mode=False)
np.set_printoptions(precision=2,linewidth=125)

# Location of the gzipped MNIST pickle
path_data=Path('data')
path_gz = path_data/'mnist.pkl.gz'
# NOTE(review): pickle.load can execute arbitrary code -- only load this file from a trusted source.
# The pickle unpacks into three items: the train pair, the valid pair, and a third item that is discarded here.
with gzip.open(path_gz, 'rb') as f: ((x_train,y_train),(x_valid,y_valid),_) = pickle.load(f, encoding='latin-1')
# Convert the loaded data to torch tensors
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Foundations version

### Basic architecture

In [None]:
n,m = x_train.shape # n = number of rows (images), m = number of columns (pixels per image)
c = y_train.max() + 1 # number of classes -- assumes the labels start at 0
n, m, c

(50000, 784, tensor(10))

In [None]:
# num hidden - arbitrary
nh = 50

50000 rows X 784 cols @ 784 rows X 10 cols -> 50000 images X 10 outputs
Each row of the input holds the 784 pixel values of one image.

We are going to do this in two steps with hidden layers

50000 X 784 @ 784 X 50 @ 50 X 10

The weight matrices are initialized with random values. We also need to add a bias to each layer; the matrix multiply plus the bias is what makes it a linear (affine) layer.

In [None]:
# First layer: m inputs -> nh hidden units; weights are drawn from a standard normal, biases start at zero
w1 = torch.randn(m,nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1) # only 1 output (the predicted digit value) so that we can use MSE instead of cross-entropy
b2 = torch.zeros(1) 

In [None]:
def lin(x, w, b):
    """Affine map of `x`: matrix-multiply by the weights `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [None]:
x_valid.shape

torch.Size([10000, 784])

In [None]:
t = lin(x_valid, w1, b1);t.shape

torch.Size([10000, 50])

In [None]:
test_eq(tensor(-5.5).clamp_min(0.), tensor(0.))
test_eq(tensor(5.5).clamp_min(0.), tensor(5.5))

In [None]:
def relu(x):
    """Rectified linear unit: negative entries of `x` become 0, the rest pass through unchanged."""
    rectified = x.clamp_min(0.)
    return rectified

In [None]:
t=relu(t);t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [None]:
def model(xb):
    """Forward pass of the two-layer net on a batch `xb`: linear -> ReLU -> linear.

    Uses the module-level parameters `w1`, `b1`, `w2` and `b2`.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [None]:
res = model(x_valid);res.shape

torch.Size([10000, 1])

### Loss function: MSE

To keep things simple, we are using mse.

In [None]:
res.shape, y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

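`res` has shape (10000, 1) and `y_valid` has shape (10000,). Broadcasting aligns shapes starting from the trailing axis, so `y_valid` is treated as (1, 10000): a unit axis is inserted at the front because it has no leading axis to match. Both unit axes are then expanded, which gives a (10000, 10000) result.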

In [None]:
(res - y_valid).shape # this is not what we want to compute mse

torch.Size([10000, 10000])

We need to get rid of the trailing (,1) in order to use mse.

In [None]:
test_eq(res[:, 0].shape, torch.Size([10000]))
test_eq(res.squeeze().shape, torch.Size([10000]))

In [None]:
(res.squeeze() - y_valid).shape

torch.Size([10000])

In [None]:
y_train, y_valid = y_train.float(), y_valid.float()
preds = model(x_train)
preds.shape, y_train.shape

(torch.Size([50000, 1]), torch.Size([50000]))

In [None]:
def mse(output, targ):
    """Mean squared error between `output` (unit axes squeezed away) and `targ`."""
    err = output.squeeze() - targ
    return err.pow(2).mean()

In [None]:
mse(preds, y_train)

tensor(4308.76)

### Gradients and backward pass

A gradient is a slope, i.e. the slope of the tangent to the curve at each point.
Rise/run -> e.g. how much the distance increases as time increases.

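The derivative of the loss with respect to a weight tells us how much the loss changes as we increase that weight, and therefore in which direction and by how much to change the weight. The SGD update is W_new = W - lr * dL/dW, where lr is the learning rate.
Requirement - the loss must be a single number (a scalar).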

In [None]:
#%pip install sympy -Uq

In [None]:
from sympy import symbols,diff
x,y = symbols('x y')
diff(x**2, x) # diff x^2 wrt x

2*x

In [None]:
diff(3*x**2+9, x) # diff 3x^2 + 9 wrt x -> power rule on 3x^2; the constant 9 differentiates to 0 (no chain rule needed here)

6*x

Chain rule visualized - [The Intuitive Notion of the Chain Rule](https://webspace.ship.edu/msrenault/geogebracalculus/derivative_intuitive_chain_rule.html)

Start at the end: take the derivative of the last step and multiply it by the derivative of the step before it, and so on back to the input -> backpropagation using the chain rule
```
L(l2, y)
l2(r, w2, b2)
r(l1)
l1(x, w1, b1)
```

In [None]:
w2.shape, w2.t().shape

(torch.Size([50, 1]), torch.Size([1, 50]))

In [None]:
preds.shape, y_train.shape

(torch.Size([50000, 1]), torch.Size([50000]))

In [None]:
preds[:, 0].shape, preds.squeeze(dim=1).shape, y_train.shape

(torch.Size([50000]), torch.Size([50000]), torch.Size([50000]))

In [None]:
((preds[:, 0] - y_train)[:, None]).shape, x_train.shape

(torch.Size([50000, 1]), torch.Size([50000, 784]))

In [None]:
def lin_grad(inp, out, w, b):
    """Back-propagate through a linear layer `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the gradients with respect to
    the layer's input, weights and bias in `inp.g`, `w.g` and `b.g`. Returns nothing.
    `inp` is expected to be 2-D (rows x features).
    """
    # chain rule (dy/dx = dy/du * du/dx): grad wrt the input is the output grad times the transposed weights
    inp.g = out.g @ w.t()
    # grad wrt the weights is inp^T @ out.g. This equals summing the per-row outer products,
    # (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0), but never materialises the
    # (rows, n_in, n_out) intermediate, which is several GB for the first layer.
    w.g = inp.t() @ out.g
    # the bias is added to every row, so its grad is the output grad summed over rows
    b.g = out.g.sum(0)

In [None]:
def forward_and_backward(inp, targ):
    """Run the two-layer net forward on `inp`, then back-propagate the MSE loss by hand.

    Gradients are stored in a `g` attribute on each tensor involved: on the module-level
    `w1`, `b1`, `w2`, `b2` (through `lin_grad`), on the intermediate activations, and on
    `inp` itself. Nothing is returned.
    """
    # ---- forward ----
    hidden_pre = lin(inp, w1, b1)   # pre-activation of the hidden layer
    hidden = relu(hidden_pre)
    out = lin(hidden, w2, b2)
    err = out[:, 0] - targ          # drop the trailing unit axis so it lines up with targ
    loss = err.pow(2).mean()        # scalar; not needed by the backward pass, kept to show the full forward computation

    # ---- backward: each step stores the grad of the loss wrt its input in `.g` ----
    n_rows = inp.shape[0]
    out.g = 2 * err[:, None] / n_rows          # derivative of mean(err^2) wrt out
    lin_grad(hidden, out, w2, b2)              # fills hidden.g, w2.g, b2.g
    relu_mask = (hidden_pre > 0).float()       # ReLU passes gradient only where its input was positive
    hidden_pre.g = relu_mask * hidden.g
    lin_grad(inp, hidden_pre, w1, b1)          # fills inp.g, w1.g, b1.g

In [None]:
forward_and_backward(x_train, y_train)

In [None]:
# Save for testing against later
def get_grad(o):
    """Return an independent copy of the hand-computed gradient stored in `o.g`."""
    stored = o.g
    return stored.clone()
chks = w1,w2,b1,b2,x_train

In [None]:
grads = w1g,w2g,b1g,b2g,ig = map(get_grad, chks)

In [None]:
# use pytorch 
def mkgrad(x):
    """Return a copy of `x` that autograd will track; `x` itself is left untouched."""
    tracked = x.clone()
    tracked.requires_grad_(True)
    return tracked
ptgrads = w12,w22,b12,b22,xt2 = map(mkgrad, chks)

In [None]:
def forward(inp, targ):
    """MSE loss of the two-layer net on `inp` against `targ`.

    Built from the autograd-enabled parameter copies `w12`, `b12`, `w22`, `b22`, so
    calling `.backward()` on the result fills their `.grad`.
    """
    hidden = relu(lin(inp, w12, b12))
    net_out = lin(hidden, w22, b22)
    return mse(net_out, targ)

In [None]:
loss = forward(xt2, y_train)
loss.backward()

In [None]:
# `grads` holds the hand-computed gradients (plain tensors); `ptgrads` holds the autograd
# leaf tensors, so the PyTorch gradient to compare against lives in `b.grad`, not `a.grad`.
for a,b in zip(grads, ptgrads): test_close(a, b.grad, eps=0.01)

## Refactor model

In [None]:
class Relu():
    """ReLU layer that remembers its input and output so `backward` can use them."""

    def __call__(self, inp):
        self.inp, self.out = inp, inp.clamp_min(0.)
        return self.out

    def backward(self):
        # gradient flows back only where the input was positive
        passes = (self.inp > 0).float()
        self.inp.g = passes * self.out.g

In [None]:
class Lin():
    """Linear layer holding its weights `w` and bias `b`; caches input and output for `backward`."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        self.inp = inp
        self.out = lin(inp, self.w, self.b)
        return self.out

    def backward(self):
        # upstream gradient, set on the output by the layer that consumed it
        grad_out = self.out.g
        self.inp.g = grad_out @ self.w.t()
        self.w.g = self.inp.t() @ grad_out
        self.b.g = grad_out.sum(0)

In [None]:
class Mse():
    """Mean-squared-error loss that caches predictions and targets for `backward`."""

    def __call__(self, inp, targ):
        self.inp = inp
        self.targ = targ
        self.out = mse(inp, targ)
        return self.out

    def backward(self):
        n_rows = self.targ.shape[0]
        err = self.inp.squeeze() - self.targ
        # derivative of mean(err^2) wrt the predictions, with the unit axis restored
        self.inp.g = 2. * err.unsqueeze(-1) / n_rows

In [None]:
class Model():
    """Two-layer net (linear -> ReLU -> linear) with an MSE loss on top."""

    def __init__(self, w1, b1, w2, b2):
        self.loss = Mse()
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]

    def __call__(self, x, targ):
        acts = x
        for layer in self.layers:
            acts = layer(acts)
        return self.loss(acts, targ)

    def backward(self):
        # the loss goes first, then the layers from last to first
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [None]:
model = Model(w1, b1, w2, b2)
loss = model(x_train, y_train)
model.backward()

In [None]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Module.forward()

In [None]:
class Module():
    """Minimal layer base class.

    Calling an instance stores the call arguments in `self.args` and the result of
    `forward(*args)` in `self.out`. `backward()` then hands both to `bwd(out, *args)`.
    Subclasses implement `forward` and `bwd`.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    # The placeholders accept any arguments so that an un-overridden method reports
    # "not implemented" instead of a misleading TypeError about the argument count.
    def forward(self, *args): raise NotImplementedError('not implemented')
    def backward(self): self.bwd(self.out, *self.args)
    def bwd(self, out, *args): raise NotImplementedError('not implemented')

In [None]:
class Relu(Module):
    """ReLU layer on top of `Module`: `forward` clamps at zero, `bwd` masks the gradient."""

    def forward(self, inp):
        return inp.clamp_min(0.)

    def bwd(self, out, inp):
        # gradient flows back only where the input was positive
        passes = (inp > 0).float()
        inp.g = passes * out.g

In [None]:
class Lin(Module):
    """Linear layer on top of `Module`: `forward` computes `inp @ w + b`.

    `bwd` stores the gradients wrt the input, weights and bias in `inp.g`, `self.w.g`
    and `self.b.g`, reading the upstream gradient from `out.g`.
    """
    def __init__(self, w, b): self.w,self.b = w,b
    def forward(self, inp): return inp@self.w + self.b
    def bwd(self, out, inp):
        # use the `out` argument, as Relu.bwd and Mse.bwd do, rather than reaching through self.out
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        self.b.g = out.g.sum(0)

In [None]:
class Mse(Module):
    """Mean-squared-error loss on top of `Module`."""

    def forward(self, inp, targ):
        err = inp.squeeze() - targ
        return err.pow(2).mean()

    def bwd(self, out, inp, targ):
        err = inp.squeeze() - targ
        # derivative of mean(err^2) wrt the predictions, with the unit axis restored
        inp.g = 2*err.unsqueeze(-1) / targ.shape[0]

In [None]:
model = Model(w1, b1, w2, b2)
loss = model(x_train, y_train)
model.backward()

In [None]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Autograd

In [None]:
from torch import nn
import torch.nn.functional as F

In [None]:
class Linear(nn.Module):
    """Linear layer `inp @ w + b` with autograd-tracked weights.

    `w` (n_in x n_out) is drawn from a standard normal and `b` (n_out) starts at zero.
    Both are wrapped in `nn.Parameter` so the module registers them: they then show up
    in `parameters()` / `state_dict()` and follow `.to(...)`. Their gradients are still
    read from `.grad` after `backward()`.
    """
    def __init__(self, n_in, n_out):
        super().__init__()
        self.w = nn.Parameter(torch.randn(n_in,n_out))
        self.b = nn.Parameter(torch.zeros(n_out))
    def forward(self, inp): return inp@self.w + self.b

In [None]:
class Model(nn.Module):
    """Two-layer net (Linear -> ReLU -> Linear) that returns the MSE loss against `targ`.

    The layers are kept in an `nn.ModuleList` so they are registered as sub-modules;
    `model.layers[i]` indexing works as with a plain list. The computation is defined
    in `forward`, so `model(x, targ)` goes through `nn.Module.__call__` and its hooks.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = nn.ModuleList([Linear(n_in,nh), nn.ReLU(), Linear(nh,n_out)])

    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        # targ gets a trailing unit axis so its shape matches the (rows, 1) predictions
        return F.mse_loss(x, targ[:,None])

In [None]:
m, nh

(784, 50)

In [None]:
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()

In [None]:
l0 = model.layers[0]
l0.b.grad

tensor([-19.60,  -2.40,  -0.12,   1.99,  12.78, -15.32, -18.45,   0.35,   3.75,  14.67,  10.81,  12.20,  -2.95, -28.33,
          0.76,  69.15, -21.86,  49.78,  -7.08,   1.45,  25.20,  11.27, -18.15, -13.13, -17.69, -10.42,  -0.13, -18.89,
        -34.81,  -0.84,  40.89,   4.45,  62.35,  31.70,  55.15,  45.13,   3.25,  12.75,  12.45,  -1.41,   4.55,  -6.02,
        -62.51,  -1.89,  -1.41,   7.00,   0.49,  18.72,  -4.84,  -6.52])