In [1]:
#export
from pathlib import Path
import os
import torchvision.datasets as datasets
from IPython.core.debugger import set_trace
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
import torch

In [3]:
def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def near(a,b):
    """Return whether tensors `a` and `b` are elementwise close (rtol=1e-3, atol=1e-5)."""
    rel_tol, abs_tol = 1e-3, 1e-5
    return torch.allclose(a, b, rtol=rel_tol, atol=abs_tol)
def test_near(a,b):
    """Assert that `a` and `b` are close as judged by `near`, using the generic `test` helper."""
    test(a, b, cmp=near)

In [4]:
#export
def get_data(root='C:\\Users\\omar_\\Part 2 Deep Learning from the Foundations\\data', download=False):
    """Load the MNIST training set and return flattened train/validation tensors.

    The MNIST training images are split row-wise into the first 50,000
    (training) and the following 10,000 (validation). Images are flattened to
    one row per image and divided by 256.0; labels are converted to float.

    Args:
        root: Directory handed to `torchvision.datasets.MNIST`. The default is
            the original author's machine-specific path, kept only so existing
            zero-argument calls behave as before; pass your own directory
            elsewhere.
        download: Forwarded to `datasets.MNIST`. Defaults to False, so the
            files must already be present under `root`.

    Returns:
        Tuple `(x_train, y_train, x_valid, y_valid)`.
    """
    n_train, n_valid = 50000, 10000
    # makedirs (not mkdir) so missing parent directories are created too and an
    # already-existing directory is not an error.
    os.makedirs(root, exist_ok=True)
    train_set = datasets.MNIST(root=root, train=True, download=download)
    # `.data` / `.targets` are the current names of the deprecated
    # `.train_data` / `.train_labels` attributes.
    x_train, x_valid = train_set.data.split([n_train, n_valid])
    y_train, y_valid = train_set.targets.split([n_train, n_valid])
    return (x_train.view(n_train, -1) / 256.0, y_train.float(),
            x_valid.view(n_valid, -1) / 256.0, y_valid.float())

def normalize(x , m , s):
    """Shift `x` by `m` and scale by `s`, i.e. return `(x - m) / s`."""
    centered = x - m
    return centered / s

In [5]:
# Load the flattened, scaled MNIST tensors: inputs and float labels for the
# training and validation splits.
x_train,y_train,x_valid,y_valid = get_data()

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [6]:
# Sanity check: one row per training image, one column per flattened pixel.
x_train.shape

torch.Size([50000, 784])

In [7]:
# Mean and standard deviation over every element of the raw training inputs;
# these are the statistics used to normalize both splits below.
train_mean , train_std = x_train.mean() , x_train.std()
train_mean , train_std

(tensor(0.1304), tensor(0.3073))

In [8]:
# Standardize both splits with the training-set statistics.
# NOTE: x_train/x_valid are rebound in place, so running this cell a second
# time normalizes already-normalized data.
x_train = normalize(x_train, train_mean, train_std)
# NB: Use the training (not validation) mean and std for the validation set
x_valid = normalize(x_valid, train_mean, train_std)

In [9]:
# Re-check after normalization: the mean should now be ~0 and the std ~1.
# NOTE: this rebinds train_mean/train_std to the post-normalization values, so
# the original statistics are no longer available under these names.
train_mean , train_std = x_train.mean(),x_train.std()
train_mean , train_std

(tensor(3.8966e-08), tensor(1.))

In [10]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [11]:
# n: number of training rows, m: number of input features per row.
n , m = x_train.shape
# c: largest label value + 1. y_train is a float tensor, so c is a float tensor too.
c = y_train.max() + 1
n , m , c

(50000, 784, tensor(10.))

In [12]:
# Number of hidden units (second dim of w1, first dim of w2).
nh = 50

In [13]:
# Parameters of a one-hidden-layer network: m inputs -> nh hidden units -> 1 output.
# No random seed is set, so these values differ on every run.
# NOTE(review): torch.rand draws uniformly from [0, 1), so every initial weight
# and bias is non-negative — confirm this is intended rather than torch.randn.
w1 = torch.rand(m , nh) / math.sqrt(m)
b1 = torch.rand(nh)
# NOTE(review): w2 is scaled by sqrt(m) although its input dimension is nh —
# confirm this is intended.
w2 = torch.rand(nh , 1) / math.sqrt(m)
b2 = torch.rand(1)

In [14]:
# Confirm the parameter shapes line up: (m, nh), (nh,), (nh, 1), (1,).
print(w1.shape)
print(b1.shape)
print(w2.shape)
print(b2.shape)

torch.Size([784, 50])
torch.Size([50])
torch.Size([50, 1])
torch.Size([1])


In [15]:
def lin(x , w , b):
    """Affine map: matrix-multiply `x` by `w`, then add `b`."""
    projected = x @ w
    return projected + b

In [16]:
def relu(x):
    """Shifted ReLU: clamp values below zero to zero, then subtract 0.5."""
    rectified = x.clamp_min(0.)
    return rectified - 0.5

In [17]:
def model(x):
    """Forward pass: linear (w1, b1) -> shifted ReLU -> linear (w2, b2).

    Reads the parameters w1, b1, w2, b2 from the enclosing (notebook) scope.
    """
    hidden = relu(lin(x , w1 , b1))
    return lin(hidden , w2 , b2)

In [18]:
# Time the forward pass on the validation set (10 loops per measurement).
%timeit -n 10 _=model(x_valid)

10 loops, best of 5: 23.1 ms per loop


In [19]:
#export
def mse(x , y):
    """Mean squared error between `x` (trailing unit dimension removed) and `y`."""
    residual = x.squeeze(-1) - y
    return (residual ** 2).mean()

In [20]:
# Make sure the targets are float for the MSE loss. get_data() already returns
# float labels, so this does not change the values.
y_train , y_valid = y_train.float() , y_valid.float()

In [21]:
# Forward pass over the whole training set with the freshly initialized parameters.
preds = model(x_train)

In [22]:
# Training loss before any parameter update.
mse(preds , y_train)

tensor(19.7370)

In [23]:
def mse_grad(inp , targ):
    """Store the gradient of the MSE loss with respect to `inp` on `inp.g`.

    The result is 2 * (inp - targ) / inp.shape[0], with a trailing unit
    dimension restored after the squeeze.
    """
    n_rows = inp.shape[0]
    diff = inp.squeeze() - targ
    inp.g = 2. * diff.unsqueeze(-1) / n_rows

In [24]:
def relu_grad(inp , out):
    """Backward pass of the ReLU: pass `out.g` through where `inp` > 0, zero elsewhere.

    The result is stored on `inp.g`.
    """
    positive = (inp > 0).float()
    inp.g = positive * out.g

In [25]:
def lin_grad(inp , out , w , b):
    """Backward pass of `out = inp @ w + b`, given the upstream gradient `out.g`.

    Stores the gradients on `inp.g`, `w.g` and `b.g`.
    """
    upstream = out.g
    inp.g = upstream @ w.t()
    # Per-row outer products, summed over the first dimension. For 2-D inputs the
    # broadcast builds an intermediate of shape (rows, in_features, out_features)
    # before the sum.
    outer = inp.unsqueeze(-1) * upstream.unsqueeze(1)
    w.g = outer.sum(0)
    b.g = upstream.sum(0)

In [26]:
def forward_and_backward(inp , targ):
    """Run one forward and one backward pass through the two-layer network.

    Uses the parameters w1, b1, w2, b2 from the enclosing (notebook) scope and
    leaves gradients on their `.g` attributes (and on the intermediate
    activations and `inp`). Returns nothing; the loss value is computed but not
    used by the backward pass.
    """
    # Forward: linear -> shifted ReLU -> linear.
    l1 = lin(inp , w1 , b1)
    l2 = relu(l1)
    out = lin(l2 , w2 , b2)
    _ = mse(out , targ)

    # Backward: apply each gradient function in the reverse order of the forward pass.
    mse_grad(out , targ)
    lin_grad(l2 , out , w2 , b2)
    relu_grad(l1 , l2)
    lin_grad(inp , l1 , w1 , b1)

In [27]:
# Populate the .g gradients on w1, b1, w2 and b2 using the full training set.
forward_and_backward(x_train , y_train)

**Refactor model**

In [28]:
class Relu():
    """Shifted-ReLU layer object that remembers its last input and output.

    Calling the layer computes `inp.clamp_min(0.) - 0.5`; `backward()` then
    reads the gradient stored on the cached output (`out.g`) and writes the
    gradient for the cached input (`inp.g`).
    """

    def __call__(self , inp):
        activated = inp.clamp_min(0.) - 0.5
        self.inp , self.out = inp , activated
        return activated

    def backward(self):
        mask = (self.inp > 0).float()
        self.inp.g = mask * self.out.g

In [29]:
class Lin():
    """Linear layer object (`inp @ w + b`) that caches its last input and output.

    `backward()` reads the upstream gradient from `out.g` and stores gradients
    on `inp.g`, `w.g` and `b.g`.
    """

    def __init__(self , w , b):
        self.w = w
        self.b = b

    def __call__(self , inp):
        result = inp @ self.w + self.b
        self.inp , self.out = inp , result
        return result

    def backward(self):
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        # Per-row outer products, summed over the first dimension. For 2-D inputs
        # the broadcast builds an intermediate of shape
        # (rows, in_features, out_features) before the sum.
        outer = self.inp.unsqueeze(-1) * upstream.unsqueeze(1)
        self.w.g = outer.sum(0)
        self.b.g = upstream.sum(0)

In [30]:
class Mse():
    """Mean-squared-error loss object that caches its last input, target and value.

    `backward()` stores 2 * (inp - targ) / targ.shape[0] on `inp.g`, with a
    trailing unit dimension restored after the squeeze.
    """

    def __call__(self , inp , targ):
        self.inp , self.targ = inp , targ
        residual = inp.squeeze(-1) - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        n_rows = self.targ.shape[0]
        diff = self.inp.squeeze() -  self.targ
        self.inp.g = 2. * diff.unsqueeze(-1) / n_rows

In [31]:
class Model():
    """Two-layer network built from layer objects: Lin -> Relu -> Lin, with an Mse loss.

    Calling the model runs the forward pass and returns the loss;
    `backward()` then propagates gradients from the loss back through the
    layers in reverse order.
    """

    def __init__(self , w1 , b1 , w2 , b2):
        self.layers = [
            Lin(w1 , b1),
            Relu(),
            Lin(w2 , b2),
        ]
        self.loss = Mse()

    def __call__(self , x , targ):
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation , targ)

    def backward(self):
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [32]:
# Clear any gradients left on the parameters by earlier cells, then build the
# layer-object model around them.
# NOTE: this rebinds the name `model`, which previously referred to the
# model() function defined earlier in the notebook.
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)

In [33]:
# Time one forward pass (including the loss) over the full training set.
%time loss = model(x_train, y_train)

CPU times: user 112 ms, sys: 288 µs, total: 113 ms
Wall time: 113 ms


In [34]:
# Time the backward pass; this version of Lin.backward uses the
# broadcast-and-sum weight gradient.
%time model.backward()

CPU times: user 4.98 s, sys: 47.2 ms, total: 5.03 s
Wall time: 5 s


**Module.forward()**

In [35]:
class Module():
    """Minimal base class for layers with a hand-written backward pass.

    Subclasses implement `forward(*args)` to compute the output and
    `bwd(out, *args)` to store gradients. Calling the module records the call
    arguments and the output so that `backward()` can hand them to `bwd`.
    """

    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the output; must be overridden.

        Accepts `*args` so that calling a subclass that forgot to override it
        fails with the intended error rather than an argument-count TypeError.
        """
        raise NotImplementedError('not implemented')

    def bwd(self, out, *args):
        """Store gradients given the cached output and inputs; must be overridden."""
        raise NotImplementedError('not implemented')

    def backward(self):
        """Run `bwd` with the output and arguments recorded by the last call."""
        self.bwd(self.out, *self.args)

In [36]:
class Relu(Module):
    """Shifted ReLU as a `Module`: forward is `inp.clamp_min(0.) - 0.5`."""

    def forward(self, inp):
        return inp.clamp_min(0.)-0.5

    def bwd(self, out, inp):
        # Gradient flows only where the input was strictly positive.
        mask = (inp>0).float()
        inp.g = mask * out.g

In [37]:
class Lin(Module):
    """Linear layer as a `Module`: forward is `inp @ w + b`."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        return inp@self.w + self.b

    def bwd(self, out, inp):
        upstream = out.g
        inp.g = upstream @ self.w.t()
        # Weight gradient as a single contraction over the first (b) index.
        self.w.g = torch.einsum("bi,bj->ij", inp, upstream)
        self.b.g = upstream.sum(0)

In [38]:
class Mse(Module):
    """Mean-squared-error loss as a `Module`."""

    def forward (self, inp, targ):
        diff = inp.squeeze() - targ
        return diff.pow(2).mean()

    def bwd(self, out, inp, targ):
        diff = inp.squeeze()-targ
        inp.g = 2*diff.unsqueeze(-1) / targ.shape[0]

In [39]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss, built from `Module` layers.

    The parameters w1, b1, w2, b2 are read from the enclosing (notebook) scope
    when the model is constructed.
    """

    def __init__(self):
        self.layers = [
            Lin(w1,b1),
            Relu(),
            Lin(w2,b2),
        ]
        self.loss = Mse()

    def __call__(self, x, targ):
        activation = x
        for layer in self.layers:
            activation = layer(activation)
        return self.loss(activation, targ)

    def backward(self):
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [40]:
# Reset the parameter gradients, then rebuild the model from the Module-based
# layers (this Model reads w1, b1, w2, b2 from the notebook scope).
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [41]:
# Time one forward pass (including the loss) with the Module-based layers.
%time loss = model(x_train, y_train)

CPU times: user 122 ms, sys: 0 ns, total: 122 ms
Wall time: 125 ms


In [42]:
# Time the backward pass; Lin.bwd now computes the weight gradient with einsum.
%time model.backward()

CPU times: user 213 ms, sys: 4.36 ms, total: 218 ms
Wall time: 218 ms


**nn.Linear and nn.Module**

In [43]:
#export
from torch import nn

In [44]:
class Model(nn.Module):
    """Two-layer fully connected network built from `torch.nn` layers, returning the MSE loss.

    Args:
        n_in: Number of input features.
        nh: Number of hidden units.
        n_out: Number of outputs.

    Calling the model with `(x, targ)` runs Linear -> ReLU -> Linear and
    returns `mse` of the squeezed output against `targ`.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # ModuleList (rather than a plain Python list) registers the layers as
        # sub-modules, so parameters(), state_dict() and .to() can see them.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])
        self.loss = mse

    # Implemented as forward() so calls go through nn.Module.__call__;
    # `model(x, targ)` works exactly as before.
    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

# Reference for this notebook

In [1]:
#https://github.com/fastai/course-v3/blob/master/nbs/dl2/02_fully_connected.ipynb