# Forwarda nd bakward pass

## Get data

* normalize
* test near zero

In [340]:
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [341]:
import gzip
from pathlib import Path
import torch
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt

In [342]:
from exp.nb_01Magda import *

In [343]:
data_path = Path('/home/magda/datasets/mnist')
data_gzip = data_path/'mnist.pkl.gz'

In [344]:
with gzip.open(data_gzip, 'rb') as data_file:
    (train_x, train_y), (valid_x, valid_y), _ = pickle.load(data_file, encoding='latin-1')

In [345]:
(train_x, train_y, valid_x, valid_y) = [torch.tensor(x, dtype=torch.float) for x in (train_x, train_y, valid_x, valid_y)]

In [346]:
train_y, valid_y = [torch.unsqueeze(x, -1) for x in (train_y, valid_y)]
train_y.shape

torch.Size([50000, 1])

**normalize**
across instances and dimensions cause all pixels treated equal

In [347]:
def normalize(a, mean_a, std_a):
    return (a - mean_a) / std_a

In [348]:
train_mean = train_x.mean()
train_std = train_x.std()

train_x = normalize(train_x, train_mean, train_std)
valid_x = normalize(valid_x, train_mean, train_std)

In [349]:
train_x.mean(), train_x.std(), valid_x.mean(), valid_x.std()

(tensor(-6.2598e-06), tensor(1.), tensor(-0.0059), tensor(0.9924))

## Basic archtecture

* simplified kaiming init
* def simple linear layer
* def simple relu
* kaiming init for relu
* pytorch init kaiming
* model function

In [350]:
in_num, in_dim = train_x.shape
out_dim = 1
hidden_dim = 50

In [351]:
weights1 = torch.normal(0, 1, size=(in_dim, hidden_dim)) / in_dim**0.5
bias1 = torch.zeros(hidden_dim)
weights2 = torch.normal(0, 1, size=(hidden_dim, out_dim)) / hidden_dim**0.5
bias1 = torch.zeros(out_dim)

In [352]:
weights1.mean(), weights1.std(), weights1.std()*(in_dim**0.5)

(tensor(-6.6535e-05), tensor(0.0359), tensor(1.0063))

In [353]:
def lin_layer(x, w, b):
    return x@w + b

In [354]:
hidden = lin_layer(valid_x, weights1, bias1)

In [355]:
hidden.mean(), hidden.std()

(tensor(0.0611), tensor(0.9905))

In [356]:
def relu(x):
    return x.clamp(0)

In [357]:
hidden = relu(lin_layer(valid_x, weights1, bias1))

In [358]:
hidden.mean(), hidden.std()

(tensor(0.4241), tensor(0.5941))

**improved kaimining for relu**

In [359]:
weights1 = torch.normal(0, 1, size=(in_dim, hidden_dim)) / (in_dim/2)**0.5
bias1 = torch.zeros(hidden_dim)
weights2 = torch.normal(0, 1, size=(hidden_dim, out_dim)) / (hidden_dim/2)**0.5
bias2 = torch.zeros(out_dim)

In [360]:
hidden = relu(lin_layer(valid_x, weights1, bias1))

In [361]:
hidden.mean(), hidden.std()

(tensor(0.4999), tensor(0.7810))

**torch init**

In [362]:
from torch.nn import init

In [363]:
w = torch.empty((in_dim, hidden_dim))
init.kaiming_normal_(w, mode='fan_out'); # pytorch w is weights1.T so fan_out instead of fan_in

In [364]:
hidden = relu(lin_layer(valid_x, w, bias1))
hidden.mean(), hidden.std()

(tensor(0.5579), tensor(0.8559))

**model**

In [365]:
def model(x, w1, b1, w2, b2):
    h = relu(lin_layer(x, w1, b1))
    return lin_layer(h, w2, b2).squeeze(-1)

In [366]:
model(valid_x, weights1, bias1, weights2, bias2).shape

torch.Size([10000])

## loss funciton: MSE

* model outputs
* def mse loss

In [367]:
out = model(valid_x, weights1, bias1, weights2, bias2)

In [368]:
def loss_mse(predict, target):
    return ((predict-target)**2).mean()

In [369]:
loss_mse(out, valid_y)

tensor(36.4607)

## gradients and backward pass

* def mse grad
* def relu grad
* def lin grad
* def forwarda and backward as func
* check results with pytorch autograd

In [370]:
def mse_grad(inp, target):
    inp.g = 2*(inp - target) / inp.shape[0]

In [371]:
def relu_grad(inp, out):
    inp.g = (inp>0).float() * out.g

In [372]:
def lin_grad(inp, w, b, out):
    inp.g = out.g@w.T
    w.g = (inp[..., None]*out.g[:,None,:]).sum(0)
    b.g = out.g.sum(0)

In [373]:
mse_grad(out, valid_y)

In [374]:
def forward_backward(x, w1, b1, w2, b2):
    # forward
    l1 = lin_layer(valid_x, weights1, bias1)
    l1_relu = relu(l1)
    out = lin_layer(l1_relu, weights2, bias2)
    # mse
    mse = loss_mse(out, valid_y)
    # backward
    mse_grad(out, valid_y)
    lin_grad(l1_relu, weights2, bias2, out)
    relu_grad(l1, l1_relu)
    lin_grad(valid_x, weights1, bias1, l1)

In [375]:
forward_backward(valid_x, weights1, bias1, weights2, bias2)

**pytorch autograd**

In [376]:
weights1.requires_grad_(True)
bias1.requires_grad_(True)
weights2.requires_grad_(True)
bias2.requires_grad_(True)
valid_x.requires_grad_(True)

tensor([[-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        ...,
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245]],
       requires_grad=True)

In [377]:
def forward(x, w1, b1, w2, b2):
    # forward
    l1 = lin_layer(valid_x, weights1, bias1)
    l1_relu = relu(l1)
    out = lin_layer(l1_relu, weights2, bias2)
    # mse
    return loss_mse(out, valid_y)

In [378]:
mse = forward(valid_x, weights1, bias1, weights2, bias2)

In [379]:
mse.backward()

In [380]:
test_near(weights1.grad, weights1.g, 1e-3, 1e-5)
test_near(weights2.grad, weights2.g)
test_near(bias1.grad, bias1.g)
test_near(bias2.grad, bias2.g)
test_near(valid_x.grad, valid_x.g)

## refactor model - layers as classes

* classes relu, lin, mse, model
* `__call__`, backward


In [381]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clamp(0)
        return self.out
    
    def backward(self):
        self.inp.g = (self.inp>0).float() * self.out.g

In [382]:
class LinLayer():
    def __init__(self, w, b):
        self.w = w
        self.b = b
        
    def __call__(self, inp):
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.g = self.out.g@self.w.T
        self.w.g = (self.inp[..., None]*self.out.g[:,None,:]).sum(0)
        self.b.g = self.out.g.sum(0)

In [383]:
class LossMSE():
    def __call__(self, inp, target):
        self.inp = inp
        self.target = target
        return ((inp - target)**2).mean(0)
        
    def backward(self):
        self.inp.g = 2*(self.inp-self.target) / self.inp.shape[0]

In [384]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.lin1 = LinLayer(w1, b1)
        self.lin2 = LinLayer(w2, b2)
        self.relu = Relu()
        self.loss_func = LossMSE()
        
    def __call__(self, inp, target):
        self.inp = inp
        self.target = target
        self.out = self.lin2(self.relu(self.lin1(inp)))
        self.loss_func(self.out, self.target)
        return self.out
        
    def loss(self):
        return self.loss_func(self.out, self.target) 

    def backward(self):
        self.loss_func.backward()
        self.lin2.backward()
        self.relu.backward()
        self.lin1.backward()

In [385]:
weights1.g, bias1.g, weights2.g, bias2.g, valid_x.g = [None]*5
model = Model(weights1, bias1, weights2, bias2)

In [386]:
model(valid_x, valid_y)

tensor([[-0.2343],
        [-0.4456],
        [-0.3779],
        ...,
        [ 0.4303],
        [-0.4290],
        [-0.9752]], grad_fn=<AddBackward0>)

In [387]:
model.backward()

In [388]:
model.loss()

tensor([36.7830], grad_fn=<MeanBackward1>)

**check agains pytorch autograd**

In [389]:
test_near(weights1.grad, weights1.g, 1e-3, 1e-5)
test_near(weights2.grad, weights2.g)
test_near(bias1.grad, bias1.g)
test_near(bias2.grad, bias2.g)
test_near(valid_x.grad, valid_x.g)

## refactor again with forward and backward

In [390]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*self.args)
        return self.out
    
    def forward(self):
        raise Exception('not implemented')
    
    def backward(self):
        raise Exception('not implemented')

In [391]:
class Relu(Module):   
    def forward(self, inp):
        self.inp = inp
        self.out = self.inp.clamp(0)
        return self.out
    
    def backward(self):
        self.inp.g = (self.inp>0).float() * self.out.g

In [392]:
class LinLayer(Module):
    def __init__(self, w, b):
        self.w = w
        self.b = b
        
    def forward(self, inp):
        self.inp = inp
        self.out = self.inp@self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.g = self.out.g@self.w.T
        self.w.g = (self.inp[..., None]*self.out.g[:,None,:]).sum(0)
        self.b.g = self.out.g.sum(0)

In [393]:
class LossMSE(Module):
    def forward(self, inp, target):
        self.inp = inp
        self.target = target
        return ((inp - target)**2).mean(0)
        
    def backward(self):
        self.inp.g = 2*(self.inp-self.target) / self.inp.shape[0]

In [394]:
class Model(Module):
    def __init__(self, w1, b1, w2, b2):
        self.layers = [LinLayer(w1, b1), Relu(), LinLayer(w2, b2)]
        self.loss_func = LossMSE()
        
    def forward(self, x, target):
        for layer in self.layers:
            x = layer(x)
        self.loss = self.loss_func(x, target)
        return x

    def backward(self):
        self.loss_func.backward()
        for layer in reversed(self.layers):
            layer.backward()

In [395]:
weights1.g, bias1.g, weights2.g, bias2.g, valid_x.g = [None]*5
model = Model(weights1, bias1, weights2, bias2)

In [396]:
model(valid_x, valid_y)

tensor([[-0.2343],
        [-0.4456],
        [-0.3779],
        ...,
        [ 0.4303],
        [-0.4290],
        [-0.9752]], grad_fn=<AddBackward0>)

In [397]:
model.backward()

In [398]:
test_near(weights1.grad, weights1.g, 1e-3, 1e-5)
test_near(weights2.grad, weights2.g)
test_near(bias1.grad, bias1.g)
test_near(bias2.grad, bias2.g)
test_near(valid_x.grad, valid_x.g)

## nn.Linear and nn.Module

In [None]:
import torch.nn as nn
class Model(nn.Module):
    def __init__(self, in_dim, h_dim, out_dim):
        super(Model, self).__init__()
        self.layers = [nn.Linear(in_dim, h_dim), nn.ReLU(), nn.Linear(h_dim, out_dim)]
        self.loss_func = nn.MSELoss()
    
    def forward(self, x, target):
        for layer in self.layers:
            x = layer(x)
        self.loss = self.loss_func(x, target)   

In [505]:
model = Model(in_dim, hidden_dim, out_dim)
model(valid_x, valid_y)

In [502]:
model.layers[0].weight.grad, model.layers[0].bias.grad

(tensor([[-0.1762, -0.1762, -0.1762,  ..., -0.1762, -0.1762, -0.1762],
         [-0.0181, -0.0181, -0.0181,  ..., -0.0181, -0.0181, -0.0181],
         [ 0.0960,  0.0960,  0.0960,  ...,  0.0960,  0.0960,  0.0960],
         ...,
         [ 0.0488,  0.0488,  0.0488,  ...,  0.0488,  0.0488,  0.0488],
         [ 0.0147,  0.0147,  0.0147,  ...,  0.0147,  0.0147,  0.0147],
         [-0.0384, -0.0384, -0.0384,  ..., -0.0384, -0.0384, -0.0384]]),
 tensor([ 0.4151,  0.0427, -0.2261,  0.3407, -0.1978, -0.0524, -0.0128, -0.2895,
          0.0924, -0.1274, -0.2024,  0.4173,  0.0247, -0.5304, -0.1451, -0.5526,
         -0.5436,  0.0343,  0.0569, -0.1933,  0.1652,  0.4987, -0.1080,  0.1236,
         -0.0157,  0.0608, -0.3137,  0.0937, -0.0279,  0.0283,  0.0470,  0.1348,
         -0.0759, -0.0946, -0.2766, -0.0417, -0.0631,  0.4763, -0.7488,  0.3939,
         -0.5718, -0.4260,  0.1591,  0.1749, -0.1195,  0.1956, -0.2956, -0.1149,
         -0.0346,  0.0904]))

In [503]:
model.layers[2].weight.grad, model.layers[2].bias.grad

(tensor([[-1.8590, -0.3108, -1.1811, -1.6412, -1.5475, -0.9389, -0.1614, -2.8028,
          -0.8290, -1.2885, -0.7883, -4.3612, -0.5352, -1.5133, -3.1240, -1.7928,
          -3.4015, -0.7169, -2.2674, -0.5153, -2.1296, -2.0411, -2.3259, -0.3523,
          -2.0078, -1.4588, -1.9275, -2.0649, -0.9860, -1.3555, -0.3945, -0.3820,
          -1.7299, -2.9450, -1.1033, -2.0713, -0.2237, -2.8358, -5.1573, -1.6168,
          -1.8701, -1.5214, -0.4071, -0.5935, -0.2614, -3.6520, -1.0018, -1.8520,
          -0.1577, -1.7664]]),
 tensor([-8.4678]))

In [506]:
model.loss.backward()

The backward pass updates the parameter gradients but the parameters cannot be directly extracted from the model as `model.parameters()` becuase they are included in the list in the `__init__`