In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from exp.nb_01 import *

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as fin:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(fin, encoding='latin-1')
        
    return map(tensor, (x_train, y_train, x_valid, y_valid))

def normalize(x:torch.Tensor, 
              mean: torch.Tensor, 
              std: torch.Tensor) -> torch.Tensor:
    
    return (x - mean)/std

In [3]:
x_train,y_train,x_valid,y_valid = get_data()

In [4]:
train_mean, train_std = x_train.mean(), x_train.std()
train_mean, train_std

(tensor(0.1304), tensor(0.3073))

In [5]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [6]:
x_train.mean(), x_train.std(), x_valid.mean(), x_valid.std()

(tensor(3.0614e-05), tensor(1.), tensor(-0.0058), tensor(0.9924))

In [7]:
#export
def test_near_zero(value, tol:float=1e-3):
    assert value.abs()<tol, f"Near Zero: {value}"

In [8]:
test_near_zero(x_train.mean())

In [9]:
nrow, ncol = x_train.shape
nclass = y_train.max()+1

assert nclass==10, f"number of class not equal to 10"

nrow, ncol, nclass

(50000, 784, tensor(10))

### Basic Architecture

In [10]:
hid_dim = 50
input_dim = ncol

In [11]:
#simplified kaiming initialization
w1 = torch.randn(input_dim, hid_dim) / math.sqrt(input_dim)
b1 = torch.zeros(hid_dim)

# why 1? why not nclass
w2 = torch.randn(hid_dim, 1) / math.sqrt(hid_dim)
b2 = torch.zeros(1)

In [12]:
w1.shape, b1.shape

(torch.Size([784, 50]), torch.Size([50]))

In [13]:
test_near_zero(w1.mean())

In [14]:
#test_near_zero(w1.std()-1/math.sqrt(hid_dim))

In [15]:
def lin(x:torch.Tensor, 
        w:torch.Tensor, 
        b:torch.Tensor) -> torch.Tensor:
    
    return x @ w + b

In [16]:
t = lin(x_valid, w1, b1)

In [17]:
x_valid.shape, w1.shape, b1.shape

(torch.Size([10000, 784]), torch.Size([784, 50]), torch.Size([50]))

#### Note

Before applying `relu` check the mean and std of the output of `lin(x_valid, w1, b1)`.

In [18]:
#...so should this, because we used kaiming init, which is designed to do this
t.mean(),t.std()

(tensor(-0.0576), tensor(1.0251))

It seems the mean is close to zero and std is close to 1. Which is a good sign. Our objective is after each layer the value should be in normalized state. Let's apply `rely` and see

In [19]:
def relu(x:torch.Tensor): return x.clamp_min(0.)

In [20]:
t = relu(lin(x_valid, w1, b1))

In [21]:
#...actually it really should be this!
t.mean(),t.std()

(tensor(0.3761), tensor(0.5732))

... as you see mean is not equal to `zero` and std is close to `0.5`. The reason is `relu` actually removes all the negative points and clamps them to zero. So you calculate mean with the positive points only so mean is now zero anymore. And also the value range is [0,1] (this give std close to 0.5) unlike `pre-relu` where it was [-1, 1] (which gave std of 1)

> From pytorch docs: a: the negative slope of the rectifier used after this layer (0 for ReLU by default)

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$

In [22]:
# kaiming init / he init for relu
w1 = torch.randn(input_dim,hid_dim)*math.sqrt(2/input_dim)

In [23]:
w1.mean(),w1.std()

(tensor(-0.0002), tensor(0.0507))

**Note:** With the correct initialization, now the mean is updated and close to zero.

In [24]:
t = relu(lin(x_valid, w1, b1))
t.mean(),t.std()

(tensor(0.5172), tensor(0.7935))

In [25]:
#export
from torch.nn import init

In [26]:
w1 = torch.zeros(input_dim,hid_dim)
init.kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))

In [27]:
t.mean(), t.std()

(tensor(0.4596), tensor(0.7325))

In [28]:
init.kaiming_normal_??

In [29]:
w1.mean(),w1.std()

(tensor(0.0001), tensor(0.0506))

In [30]:
# what if...?
def relu(x): return x.clamp_min(0.) - 0.5

**Note**: The above `relu` implementation brings the `mean` again close to zero. Which is great !! See the below code

In [31]:
# kaiming init / he init for relu
w1 = torch.randn(input_dim,hid_dim)*math.sqrt(2./input_dim )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.1304), tensor(0.8756))

In [32]:
def forward_pass(xb: torch.Tensor):
    l1 = lin(xb, w1, b1)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    return l3

In [33]:
%timeit -n 10 _=forward_pass(x_valid)

9.84 ms ± 2.86 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [34]:
torch.Size()

torch.Size([])

In [35]:
assert forward_pass(x_valid).shape==torch.Size([x_valid.shape[0],1])

In [36]:
forward_pass(x_valid).shape

torch.Size([10000, 1])

**Note:** MSE expects the output as 1 vector without the one extra dimension as seen above. So we need to `squeeze` that extra dimention.

We need squeeze() to get rid of that trailing (,1), in order to use mse. (Of course, mse is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use mse for now to keep things simple.)

Alwways pass `dimension` in squeeze function to be safe.

In [37]:
#export
def mse(output, targ): return (output.squeeze(-1) - targ).pow(2).mean()

In [38]:
y_train,y_valid = y_train.float(),y_valid.float()

In [39]:
preds = forward_pass(x_train)

In [40]:
preds.shape

torch.Size([50000, 1])

In [41]:
mse(preds, y_train)

tensor(44.5473)

### Gradient and Backward pass

**Notation:** If $y=f(x)$ then in the following function definition, `inp` is `=x` and `out` is `=y`

```py
y_hat = lin(relu(lin(x)))
loss = mse(y_hat - y) = mse(lin(relu(lin(x))) - y)
```
so for `mse`, outer `lin` is the previous layer 

**Note:** So the output of `lin` is the `inp` for `mse_grad`

In [42]:
def mse_grad(inp:torch.Tensor, 
             target: torch.Tensor) -> None:
    
    """
    In this gradient, the inp is the output of the previous layer that's coming as input in this mse function.44
    Also we need to save this gradient for the previous layer to use it in the chain rule.
    
    Q: How the below code is equal to the derivative ??? 
    """
    inp.g =  2*(inp.squeeze() - target).unsqueeze(-1)/inp.shape[0]

In [43]:
def relu_grad_original(inp:torch.Tensor):
    """
    
    if x>0 return x
    else return 0
    """
    return (inp>0).float()

**Note:** However we need to combine the chain rule to multiple the other partial derivative with the gradient of relu. Because it's like this 

$$y=r(f(x))$$
$$\frac{\delta r}{\delta x} = \frac{\delta r}{\delta f} * \frac{\delta f}{\delta x}$$

where $r$ is the `relu` and `f` is the `lin` here. So to get the gradient of `relu` we need to multiply the $\frac{\delta r}{\delta f}$ with $\frac{\delta f}{\delta x}$. And for doing that we need to pass the $\frac{\delta f}{\delta x}$ to the `relu_grad` function. Now here that is the output of `lin` function. And in the `lin_grad` we store the `grad_of_lin` under `out.g`. So we need to pass `out` to the `relu_grad`

In [44]:
#relu_grad_original(a)

In [45]:
def relu_grad(inp:torch.Tensor, 
              out: torch.Tensor)-> None:
    """
    Applying chain rule with the original relu grad
    """
    inp.g = relu_grad_original(inp)*out.g

**Note:** Always remember the `out` is coming from the previous layer. If $f(g(x))$, then for $f()$, $g()$ is the previous layer and $g(x)$ is the output 

$$y = w*f(g(x)) + b$$
$$\frac{\delta y}{\delta x} = w*(\frac{\delta f}{\delta g}*\frac{\delta g}{\delta x})$$

where $(\frac{\delta f}{\delta g}*\frac{\delta g}{\delta x})$ is the `out.g`

Now due to chain rule we are mulitplying the `out.g` with the `original grad`. [Is this Assumption correct???] 


So 
$$\frac{\delta y}{\delta w} = f(g(x))*out.g$$

and 

$$\frac{\delta y}{\delta b} = 1*out.g$$

In [46]:
def lin_grad(inp: torch.Tensor, 
             out: torch.Tensor, 
             w: torch.Tensor, 
             b: torch.Tensor)-> None:
    
    inp.g = out.g @ w.t()
    w.g = (inp.unsqueeze(-1)*out.g.unsqueeze(1)).sum(0)
    b.g = out.g.sum(0)

In [47]:
def forward_backward(inp: torch.Tensor, 
                     target: torch.Tensor):
    
    # forward pass
    l1 = inp @ w1 + b1
    l2 = relu(l1)
    out = l2 @ w2 + b2
    
    # find loss
    loss = mse(out, target)
    print(f"loss: {loss}")
    
    # backward pass
    
    mse_grad(out, target)
    lin_grad(l2, out, w2, b2)
    relu_grad(l1, l2)
    lin_grad(inp, l1, w1, b1)

**Observation:** In the above formulation `loss` never appears in the gradient calculation

In [48]:
%%time
forward_backward(x_train, y_train)

loss: 44.54726028442383
CPU times: user 9.2 s, sys: 3.02 s, total: 12.2 s
Wall time: 6.25 s


In [49]:
#save for testing later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig = x_train.g.clone()

In [50]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [51]:
def forward(inp, targ):
    # forward pass:
    l1 = inp @ w12 + b12
    l2 = relu(l1)
    out = l2 @ w22 + b22
    # we don't actually need the loss in backward!
    return mse(out, targ)

In [52]:
loss = forward(xt2, y_train)

In [53]:
%%time
loss.backward()

CPU times: user 413 ms, sys: 91 ms, total: 504 ms
Wall time: 617 ms


In [54]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig )

In [55]:
class Relu():
    def __call__(self, inp):
        self.inp = inp
        self.out = self.inp.clamp_min(0.) - 0.5
        return self.out
        
    def backward(self): self.inp.g = (self.inp>0).float()*self.out.g

In [56]:
# class Lin():
#     def __init__(self, w, b): self.w, self.b = w, b
    
#     def __call__(self, inp): 
#         self.inp = inp
#         self.out = inp @ self.w + self.b
#         print("done...")
#         return self.out

#     def backward(self):
#         self.inp.g = self.out.g @ self.w.t()
#         self.w.g = (self.inp.unsqueeze(-1)*self.out.g.unsqueeze(1)).sum(0)
#         self.b.g = self.out.g.sum(0)

class Lin():
    def __init__(self, w, b): self.w,self.b = w,b
        
    def __call__(self, inp):
        self.inp = inp
        self.out = inp@self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.g = self.out.g @ self.w.t()
        # Creating a giant outer product, just to sum it, is inefficient!
        self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = self.out.g.sum(0)

In [57]:
class Mse():
    def __call__(self, inp, target):
        self.inp = inp
        self.target = target
        self.out = (self.inp.squeeze() - target).pow(2).mean()
        return self.out
    
    def backward(self):
        self.inp.g =  2*(self.inp.squeeze() - self.target).unsqueeze(-1)/self.inp.shape[0] 
        

In [58]:
class Model():
    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()
        
    def __call__(self, x, target):
        for l in self.layers: x = l(x)
        return self.loss(x, target)
    
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()


In [59]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [60]:
w1.shape, b1.shape

(torch.Size([784, 50]), torch.Size([50]))

In [61]:
x_train.shape

torch.Size([50000, 784])

In [62]:
%%time
loss = model(x_train, y_train)

CPU times: user 241 ms, sys: 337 µs, total: 242 ms
Wall time: 120 ms


In [63]:
%prun model.backward()

 

In [64]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

 ## Module Forward

In [92]:
class Module():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    
    def forward(self): raise Exception("Not Implemented")
    def backward(self): self.bwd(self.out, *self.args)

In [93]:
class Relu(Module):
    def forward(self, inp): return inp.clamp_min(0.) - 0.5
    def bwd(self, out, inp): inp.g = (inp>0).float()*out.g

In [103]:
class Lin(Module):
    def __init__(self, w, b): self.w, self.b = w, b
    
    def forward(self, inp): return inp @ self.w + self.b
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        self.w.g = torch.einsum("bi,bj->ij", inp, out.g)
        self.b.g = out.g.sum(0)

In [95]:
class Mse(Module):
    def forward (self, inp, targ): return (inp.squeeze() - targ).pow(2).mean()
    def bwd(self, out, inp, targ): inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / targ.shape[0]

In [96]:
class Model():
    def __init__(self):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x, targ)
    
    def backward(self):
        self.loss.backward()
        for l in reversed(self.layers): l.backward()

In [97]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [98]:
%time loss = model(x_train, y_train)

CPU times: user 151 ms, sys: 2.9 ms, total: 154 ms
Wall time: 78 ms


In [99]:
%time model.backward()

CPU times: user 217 ms, sys: 68.3 ms, total: 286 ms
Wall time: 143 ms


**Note:** Difference of `backward()` and `bwd()`

In [104]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### nn.Linear and nn.Module

In [105]:
#export
from torch import nn

In [106]:
class Model(nn.Module):
    def __init__(self, input_dim, hid_dim, out_dim):
        super().__init__()
        self.layers = [nn.Linear(input_dim,hid_dim), nn.ReLU(), nn.Linear(hid_dim,out_dim)]
        self.loss = mse
        
    def __call__(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [108]:
model = Model(input_dim, hid_dim, 1)

In [109]:
%time loss = model(x_train, y_train)

CPU times: user 175 ms, sys: 16.8 ms, total: 192 ms
Wall time: 330 ms


In [110]:
%time loss.backward()

CPU times: user 185 ms, sys: 11.2 ms, total: 196 ms
Wall time: 122 ms


In [111]:
!python notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to exp/nb_02.py
