In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor
from fastcore.test import test_close
# Seed torch's RNG so the random weight initialisation further down is reproducible.
torch.manual_seed(42)

# Display images in grayscale by default and keep printed tensors/arrays compact
# (2 decimals, 125-character lines, no scientific notation for torch).
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

In [2]:
# Remote location of the gzipped MNIST pickle and the local path it is cached at.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
# Create the data directory on first run; exist_ok avoids an error on re-runs.
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

In [3]:
from urllib.request import urlretrieve
# Download the archive only when it is not already cached, so re-running the cell is cheap.
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

In [4]:
# Paths are re-declared here (same values as in the earlier cell) so this cell can run on its own.
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
# SECURITY NOTE(review): pickle.load executes arbitrary code embedded in the file, and this
# file was downloaded from the network. Only load it if the source is trusted.
# The third element of the pickled tuple is discarded.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert all four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])


In [5]:
# n = number of training rows, m = number of features per row.
n,m = x_train.shape
# c = largest label value + 1. Note this is a 0-d tensor, not a Python int.
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [6]:
# number of hidden units (the width of the hidden layer), not a number of layers
nh =50

In [7]:
# First layer: m inputs -> nh hidden units. Second layer: nh hidden units -> 1 output.
# Weights are sampled with torch.randn (standard normal); biases start at zero.
w1 = torch.randn(m,nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)
b2 = torch.zeros(1)

In [8]:
def lin(x,w,b):
  """Affine transform: matrix-multiply `x` by `w`, then add the bias `b`."""
  projected = x @ w
  return projected + b

In [9]:
x_valid.shape

torch.Size([10000, 784])

In [10]:
# Apply the first linear layer to the validation inputs and inspect the resulting shape.
t = lin(x_valid,w1,b1)
t.shape

torch.Size([10000, 50])

In [11]:
def relu(x):
  """Rectified linear unit: replace every negative entry of `x` with zero."""
  return x.clamp(min=0.)

In [12]:
# Overwrite t with its rectified version; ReLU keeps the shape unchanged.
t =relu(t)
t.shape

torch.Size([10000, 50])

In [13]:
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [14]:
def model_(xb):
  """Forward pass: linear layer -> ReLU -> linear layer.

  Uses the notebook-level parameters w1, b1, w2, b2.
  """
  hidden = relu(lin(xb,w1,b1))
  return lin(hidden,w2,b2)

In [15]:
# Run the full forward pass on the validation set; the result has one output column per row.
res = model_(x_valid)
res.shape

torch.Size([10000, 1])

## LOSS Function
Let's start with MSE for now, and return later to a better loss function, because MSE is not a good fit for multi-class classification.

In [16]:
res.shape,y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [17]:
y_valid[:,None].shape

torch.Size([10000, 1])

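Broadcasting is applied here. Shapes are aligned from right to left: `res` has shape `[10000, 1]` and `y_valid` has shape `[10000]`, so `y_valid` is treated as `[1, 10000]`. Each size-1 dimension is then expanded to 10000, and the result has shape `[10000, 10000]`: every prediction is subtracted from every target, which is not what we want. If we instead change `y_valid` to shape `[10000, 1]` with `y_valid[:, None]`, the subtraction is element-wise and gives the right result.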
### Alternatively
### we can change the shape of `res` to just `[10000]` by removing the trailing unit dimension, either with `res[:,0]` or with `res.squeeze()`, which removes all unit dimensions, as in the examples below.

In [18]:
# Pitfall: [10000, 1] minus [10000] broadcasts to a [10000, 10000] matrix, not element-wise differences.
(res -y_valid).shape

torch.Size([10000, 10000])

In [19]:
# Giving the targets a trailing unit axis makes both operands [10000, 1], so the subtraction is element-wise.
(res-y_valid[:,None]).shape

torch.Size([10000, 1])

In [20]:
res[:,0].shape

torch.Size([10000])

In [21]:
# Indexing with None inserts new unit axes: [10000, 1] becomes [1, 10000, 1, 1].
q = res[None,:,None]
q.shape

torch.Size([1, 10000, 1, 1])

In [22]:
q.squeeze().shape

torch.Size([10000])

In [23]:
res.squeeze().shape

torch.Size([10000])

In [24]:
(res[:,0]-y_valid).shape

torch.Size([10000])

In [25]:
# Cast the targets to float (presumably so they share a float dtype with the predictions
# in the loss; confirm against the dtype of the loaded labels).
y_train,y_valid = y_train.float(),y_valid.float()
# Forward pass over the full training set.
preds = model_(x_train)
preds.shape

torch.Size([50000, 1])

In [26]:
def mse(output, target):
  """Mean squared error between the first column of `output` and `target`."""
  residual = output[:,0] - target
  return residual.pow(2).mean()

In [27]:
mse(preds, y_train)

tensor(4308.76)

### Gradient and backward pass

In this notebook we can use the sympy library to compute and display symbolic derivatives.

In [28]:
from sympy import symbols, diff
# Declare symbolic variables (y is not used in the cells shown here).
x,y  = symbols('x y')
# Symbolic derivative of x**2 with respect to x.
diff(x**2,x)

2*x

In [29]:
diff(3*x**2+9,x)

6*x

The Python debugger (pdb) is quite helpful here.
We replaced `w.g = (i * o).sum(0)` with a plain matrix multiplication on the transposed input. Evaluating the equivalent einsum inside pdb (`p torch.einsum('ij,ik->jk', inp, out.g)`) shows that it gives the same result as `w.g = inp.T @ out.g`.

In [30]:
def lin_grad(inp,out,w,b):
  """Backward pass of a linear layer.

  Reads the upstream gradient from `out.g` and stores the gradients with
  respect to the input, weights and bias on `inp.g`, `w.g` and `b.g`.
  """
  upstream = out.g
  # Gradient with respect to the layer input.
  inp.g = upstream @ w.t()
  # Gradient with respect to the weights. This matrix product replaces the
  # broadcast form (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0).
  w.g = inp.T @ upstream
  # Gradient with respect to the bias: sum the upstream gradient over dim 0.
  b.g = upstream.sum(0)

In [31]:
def forward_and_backward(inp,targ):
  """Run one forward pass and one manual backward pass through the two-layer net.

  Uses the notebook-level parameters w1, b1, w2, b2. After the call, the
  gradients of the MSE loss are stored on the `.g` attribute of each
  parameter (and of `inp`), as written by `lin_grad`.

  Returns the scalar MSE loss, so callers can monitor it instead of it
  being computed and then thrown away.
  """
  # forward pass
  l1 = lin(inp,w1,b1)
  l2 = relu(l1)
  out = lin(l2,w2,b2)
  # Named `err` rather than `diff` to avoid shadowing sympy's `diff`, imported earlier in the notebook.
  err = out[:,0] - targ
  loss = err.pow(2).mean()

  # backward pass
  # d(loss)/d(out): derivative of the mean of squared errors; [:,None] restores the column axis of `out`.
  out.g = 2.*err[:,None] / inp.shape[0]
  lin_grad(l2,out,w2,b2)
  # ReLU passes the gradient through only where its input was positive.
  l1.g = (l1>0).float() * l2.g
  lin_grad(inp,l1,w1,b1)
  return loss


In [32]:
# One forward + backward pass over the full training set; gradients end up on the
# .g attributes of w1, b1, w2, b2 (and of x_train).
forward_and_backward(x_train,y_train)

## Refactor Model
#### Layers as Classes

In [48]:
class Relu():
  """ReLU layer that remembers its input and output for the backward pass."""

  def __call__(self,inp):
    """Clamp negative entries of `inp` to zero and cache both tensors."""
    rectified = inp.clamp(min=0.)
    self.inp, self.out = inp, rectified
    return rectified

  def backward(self):
    """Store on `inp.g` the gradient from `out.g`, zeroed wherever the input was not positive."""
    positive_mask = (self.inp>0.).float()
    self.inp.g = positive_mask * self.out.g

In [49]:
class Lin():
  """Linear layer holding references to its weight `w` and bias `b`."""

  def __init__(self,w,b):
    self.w = w
    self.b = b

  def __call__(self,inp):
    """Apply `lin` to `inp` and cache input and output for the backward pass."""
    result = lin(inp,self.w,self.b)
    self.inp, self.out = inp, result
    return result

  def backward(self):
    """Store gradients on `inp.g`, `w.g` and `b.g`, reading the upstream gradient from `out.g`."""
    upstream = self.out.g
    self.inp.g = upstream @ self.w.t()
    self.w.g = self.inp.t() @ upstream
    self.b.g = upstream.sum(0)

In [50]:
class Mse():
  """Mean-squared-error loss layer that caches its inputs for the backward pass."""

  def __call__(self,inp,targ):
    """Compute `mse(inp, targ)`, caching the prediction and target tensors."""
    self.inp,self.targ = inp,targ
    self.out = mse(inp,targ)
    return self.out

  def backward(self):
    """Store on `inp.g` the gradient of the loss with respect to `inp`.

    The forward pass only looks at column 0 of `inp`, so only that column
    gets a non-zero gradient. Indexing with `[:,0]` (instead of `squeeze()`)
    keeps the backward pass consistent with `mse` and guarantees that
    `inp.g` always has the same shape as `inp`.
    """
    n = self.targ.shape[0]
    grad = torch.zeros_like(self.inp)
    # d/d inp[i,0] of mean_j (inp[j,0] - targ[j])**2  =  2 * (inp[i,0] - targ[i]) / n
    grad[:,0] = 2. * (self.inp[:,0] - self.targ) / n
    self.inp.g = grad

In [51]:
class Model():
  """Two-layer network (Lin -> Relu -> Lin) followed by an Mse loss."""

  def __init__(self,w1,b1,w2,b2):
    self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
    self.loss = Mse()

  def __call__(self,x,targ):
    """Feed `x` through every layer in order and return the loss against `targ`."""
    activ = x
    for layer in self.layers: activ = layer(activ)
    return self.loss(activ,targ)

  def backward(self):
    """Back-propagate: loss first, then the layers from last to first."""
    self.loss.backward()
    for layer in self.layers[::-1]: layer.backward()


In [52]:
# Build the class-based model around the same notebook-level parameter tensors.
model = Model(w1,b1,w2,b2)