In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [1]:
#export 
from exp.nb_01 import *

def get_data():
    mnist_trainset = datasets.MNIST(
        root='./data', train=True, download=True, transform=None)
    x_set = mnist_trainset.data.reshape(mnist_trainset.data.shape[0], 784)
    x_train, x_valid = x_set[:50000], x_set[50000:]
    y_train, y_valid = mnist_trainset.targets[:
                                              50000], mnist_trainset.targets[50000:]
    print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)
    print(x_train.type(), y_train.type(), x_valid.type(), y_valid.type())

    return x_train.float(), y_train, x_valid.float(), y_valid

def normalize(x, m, s): return (x-m)/s

In [5]:
x_train, y_train, x_valid, y_valid = get_data()

torch.Size([50000, 784]) torch.Size([50000]) torch.Size([10000, 784]) torch.Size([10000])
torch.ByteTensor torch.LongTensor torch.ByteTensor torch.LongTensor


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [6]:
x_train, x_valid = x_train.type(torch.FloatTensor), x_valid.type(torch.FloatTensor)

In [7]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(33.3951), tensor(78.6662))

In [8]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

## Important!

Notice that we're using the train mean and standard deviation because we need to keep the values at the same scale.

In [9]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(2.1425e-08), tensor(1.))

In [10]:
#export
def test_near_zero(a, tol=1e-3): assert a.abs() < tol, f"Near zero: {a}"

In [11]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [12]:
n, m = x_train.shape
c = y_train.max()+1 # classes

In [13]:
# print train size, input size, output size
n,m,c

(50000, 784, tensor(10))

## Basic architecture

w1 is a matrix of the number of columns by number of hidden layers

In [14]:
# num hidden
nh = 50

In [15]:
# simplified kaiming init / he init
w1 = torch.randn(m, nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh, 1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [16]:
# updated init
# simplified kaiming init / he init
def init_values():
    w1 = torch.randn(m, nh)*math.sqrt(2./m)
    b1  = torch.zeros(nh)
    w2 = torch.randn(nh, 1)*math.sqrt(2./m)    
    b2 = torch.zeros(1)

    print(f"w1_m: {w1.mean()}, w1_std: {w1.std()-1/math.sqrt(m)}, w2_m: {w2.mean()}, w2_std: {w2.std()}")
   
    return w1, b1, w2, b2

In [17]:
w1, b1, w2, b2 = init_values() # doesn't work

w1_m: -5.425676499726251e-05, w1_std: 0.01473693922162056, w2_m: -0.001893445267342031, w2_std: 0.04803181067109108


In [18]:
print(f"w1_m: {w1.mean()}, w1_std: {w1.std()/math.sqrt(m)}, w2_m: {w2.mean()}, w2_std: {w2.std()}")

w1_m: -5.425676499726251e-05, w1_std: 0.0018018295522779226, w2_m: -0.001893445267342031, w2_std: 0.04803181067109108


In [19]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

AssertionError: Near zero: 0.01473693922162056

In [20]:
w1.shape, b1.shape, w2.shape, b2.shape

(torch.Size([784, 50]), torch.Size([50]), torch.Size([50, 1]), torch.Size([1]))

In [21]:
# should be ~ (0,1) (mean, std)...
x_valid.mean(), x_valid.std()

(tensor(-0.0059), tensor(0.9924))

In [22]:
def lin(x, w, b): return x@w + b

In [23]:
t = lin(x_valid, w1, b1)

In [24]:
#...this too
t.mean(), t.std()

(tensor(-0.0379), tensor(1.3862))

In [25]:
def relu(x): return x.clamp_min(0.) # clamp min just means to replace negative values with 0

In [26]:
t = relu(lin(x_valid, w1, b1))

Because the relu function removes all the values below 0, the distribution of our values no longer has a mean of 0 and std of 1

Paper: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852)

[Initialization Blog](https://pouannes.github.io/blog/initialization/)

[Blog](https://prateekvjoshi.com/2016/03/29/understanding-xavier-initialization-in-deep-neural-networks/)

[Lesson Link](https://course19.fast.ai/videos/?lesson=8)

In [27]:
# kaiming he init for relu - we swap the 1 numerator with a 2
w1 = torch.randn(m, nh)*math.sqrt(2/m)

In [28]:
w1.mean(), w1.std()

(tensor(0.0002), tensor(0.0506))

The initialization with the 2 on top gets a much closer to (0,1) (mean, std)

In [29]:
t = relu(lin(x_valid, w1, b1))
t.mean(), t.std()

(tensor(0.6831), tensor(0.9113))

In [30]:
# because we end up with a mean of ~ 1/2 , given that we deleted everything below 0, so we replace relu with 
# experimental
def relu(x): return x.clamp_min(0.) - 0.5 

In [31]:
#export 
from torch.nn import init

In [32]:
# fan out preserves magnitude of variance in backward pass
w1 = torch.zeros(m, nh)
init.kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))

In [33]:
w1.mean(), w1.std()

(tensor(0.0002), tensor(0.0503))

In [34]:
t.mean(), t.std()

(tensor(-0.0202), tensor(0.8014))

In [35]:
w1.shape

torch.Size([784, 50])

In [36]:
import torch.nn

In [37]:
# the problem is that torch flips the dims
torch.nn.Linear(m, nh).weight.shape

torch.Size([50, 784])

In [38]:
def model(xb):
    l1 = lin(xb, w1, b1)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    return l3

In [39]:
%timeit -n 10 _=model(x_valid)

17.1 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [40]:
assert model(x_valid).shape == torch.Size([x_valid.shape[0],1])

## Loss function: MSE

In [41]:
model(x_valid).shape

torch.Size([10000, 1])

In [42]:
#export
#  we use output.squeeze to remove a given unit axis - we add a -1 param in case the we have input size 1
def mse(output, targ): return (output.squeeze(-1)-targ).pow(2).mean()

In [43]:
y_train, y_valid = y_train.float(), y_valid.float()

In [44]:
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [45]:
mse(preds, y_train)

tensor(27.3405)

## Gradients and backward pass

In [46]:
def mse_grad(inp, targ):
    # grad of loss wrt output of previous layer
    inp.g = 2. * (inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0] # inp.shape[0] is n 

In [47]:
def relu_grad(inp, out):
    # grad of relu wrt input activations
    inp.g = (inp>0).float() * out.g

In [48]:
# gradient of matrix product is matrix product with transpose
def lin_grad(inp, out, w, b):
    # grad of matmul wrt input
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

In [49]:
def forward_and_backward(inp, targ):
    # forward pass
    l1 = inp @ w1 + b1
    l2 = relu(l1)
    out = l2 @ w2 + b2
    # we're actually not using the loss
    loss = mse(out, targ)

    # backward pass
    mse_grad(out, targ) # grad of loss wrt previous layer
    lin_grad(l2, out, w2, b2) # grad of matmul wrt previous layer
    relu_grad(l1, l2) # grad of relu wrt to l1 (first matmul)
    lin_grad(inp, l1, w1, b1) # grad of matmul wrt input

In [50]:
x_train.shape

torch.Size([50000, 784])

In [51]:
forward_and_backward(x_train, y_train)

In [52]:
# save for testing against later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig = x_train.g.clone()

In [53]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

## Main takeaways:

- Kaiming init helps initialize the weights with ~ mean 0 and std 1

In [54]:
def forward(inp, targ):
    # fw pass
    l1 = inp @ w12 + b12
    l2 = relu(l1)
    out = l2 @ w22 + b22
    return mse(out, targ)

In [55]:
loss = forward(xt2, y_train)

In [56]:
loss.backward()

In [57]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig)

### Module.forward()

In [58]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    
    def forward(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

In [59]:
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.)-0.5
    def bwd(self, out, inp): inp.g = (inp>0).float() * out.g

In [60]:
class Lin(Module):
    def __init__(self, w, b): self.w, self.b = w, b

    def forward(self, inp): return inp @ self.w + self.b

    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        self.w.g = torch.einsum("bi,bj->ij", inp, out.g)
        self.b.g = out.g.sum(0)

In [61]:
class Mse(Module):
    def forward(self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze() - targ).unsqueeze(-1) / inp.shape[0]

In [62]:
class Model():
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)

    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [63]:
w1.g, b1.g, w2.g, b2.g = [None]*4
model = Model()

In [64]:
%time loss = model(x_train, y_train)

CPU times: user 485 ms, sys: 23 ms, total: 508 ms
Wall time: 90.1 ms


In [65]:
%time model.backward()

CPU times: user 969 ms, sys: 155 ms, total: 1.12 s
Wall time: 191 ms


In [66]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

In [67]:
#export 
from torch import nn

In [68]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]
        self.loss = mse

    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [69]:
model = Model(m, nh, 1)

In [70]:
%time loss = model(x_train, y_train)

CPU times: user 620 ms, sys: 29.9 ms, total: 649 ms
Wall time: 108 ms


In [71]:
%time loss.backward()

CPU times: user 2.02 s, sys: 30.1 ms, total: 2.05 s
Wall time: 332 ms


In [74]:
!python notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to exp/nb_02.py
