<a href="https://colab.research.google.com/github/mzohaibnasir/NeuralNotes/blob/main/02_deepDiveIntoBasics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Forward & Backward passes

In [47]:
from pathlib import Path
import pickle , gzip,math, os, time, shutil,matplotlib.pyplot as plt, matplotlib as mpl, numpy as np

import torch
from torch import tensor
from fastcore.test import test_close


In [48]:
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/d15df08a69ed33ae16a2fff874f83b57a956172c/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True )
path_gz=path_data/'mnist.pkl.gz'
path_gz

PosixPath('data/mnist.pkl.gz')

In [49]:
from urllib.request import urlretrieve

if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

In [50]:
torch.manual_seed(42)


mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)


In [51]:
!ls -l data

total 16656
-rw-r--r-- 1 root root 17051982 Jan 15 14:38 mnist.pkl.gz


In [52]:
with gzip.open(path_gz, 'rb') as f:   #read as binary as opposed to text
   ((x_train,y_train), (x_valid,y_valid), _) = pickle.load(f, encoding='latin-1') #destructuring
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [53]:
np.array(x_train).shape

(50000, 784)

In [54]:
path_gz

PosixPath('data/mnist.pkl.gz')

In [55]:
n,m = x_train.shape  # n: # of training samples   m: pixels
c = y_train.max()+1  # posible ouputs
n, m, c

(50000, 784, tensor(10))

In [56]:
nh = 50  # hidden activations # # of REctified lines to add up

In [57]:
w1 = torch.randn(m, nh)
b1 = torch.zeros(nh) # bias
w2 = torch.randn(nh,1)  # 1  output
b2 = torch.zeros(1)

w1.shape, b1.shape,w2.shape, b2.shape

(torch.Size([784, 50]), torch.Size([50]), torch.Size([50, 1]), torch.Size([1]))

In [58]:
def lin(x, w, b):
  print(f" x:{x.shape} \n w:{w.shape} \n b:{b.shape}")
  return x@w + b

In [59]:
t = lin(x_valid, w1, b1)
t.shape

 x:torch.Size([10000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])


torch.Size([10000, 50])

In [60]:
t

tensor([[ -0.09,  11.87, -11.39,  ...,   5.48,   2.14,  15.30],
        [  5.38,  10.21, -14.49,  ...,   0.88,   0.08,  20.23],
        [  3.31,   0.12,   3.10,  ...,  16.89,  -6.05,  24.74],
        ...,
        [  4.01,  10.35, -11.25,  ...,   0.23,  -5.30,  18.28],
        [ 10.62,  -4.27,  10.72,  ...,  -2.87,  -2.87,  18.23],
        [  2.84,  -0.22,   1.43,  ...,  -3.91,   5.75,   2.12]])

In [61]:
def relu(x):
  return x.clamp_min(0.)  # under 0 would be 0


t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [62]:
def model(xb):
  l1 = lin(xb, w1, b1)
  l2 = relu(l1)
  return lin(l2,w2,b2)

model(x_valid).shape

 x:torch.Size([10000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([10000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


torch.Size([10000, 1])

In [63]:
res  = model(x_valid)
res.shape

 x:torch.Size([10000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([10000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


torch.Size([10000, 1])

## Loss: MSE
ofcourse MSE is not suitable

In [64]:
res.shape, y_valid.shape  # y_valid is to compare with   # 1 in (10000,1) would be first broadcasted to corresponding y_valid first
                            # and when it comes 10000 in (10000,1) y_valis's shape becomes (1, 10000); repeating everything

(torch.Size([10000, 1]), torch.Size([10000]))

In [65]:
(res - y_valid).shape  # each element in res's distance to each element in y_valid so a matrix

torch.Size([10000, 10000])

In [66]:
# so make Y-valid shapes (10000, 1)

(res - y_valid[:,None]).shape, (res - y_valid[:,None])

(torch.Size([10000, 1]),
 tensor([[  22.75],
         [ -21.06],
         [-120.79],
         ...,
         [ -72.44],
         [ -80.48],
         [ -68.19]]))

In [67]:
(res.squeeze() - y_valid).shape, (res.squeeze() - y_valid)

(torch.Size([10000]),
 tensor([  22.75,  -21.06, -120.79,  ...,  -72.44,  -80.48,  -68.19]))

In [68]:
#or
#res[:, 0].shape : extracts first columns
#res[0,:].shape : extracts first row


res.shape, res[:, 0].shape, res[0,:].shape

(torch.Size([10000, 1]), torch.Size([10000]), torch.Size([1]))

In [69]:
res.shape, res[None, : , None].shape, (res[None, : , None]).squeeze().shape

(torch.Size([10000, 1]), torch.Size([1, 10000, 1, 1]), torch.Size([10000]))

In [70]:
# so,

(res[:,0] - y_valid).shape

torch.Size([10000])

In [71]:
y_train, y_valid = y_train.float(), y_valid.float()

preds = model(x_train)
preds.shape

 x:torch.Size([50000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([50000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


torch.Size([50000, 1])

In [72]:
def mse( output, targ):
  print(f"output: {output.shape}, targ: {targ.shape}")
  return (output[:, 0] - targ).pow(2).mean()

mse(preds, y_train)

output: torch.Size([50000, 1]), targ: torch.Size([50000])


tensor(4308.76)

## Gradients & backward pass

In [73]:
from sympy import symbols, diff

In [74]:
x,y = symbols('x y')
x,y

(x, y)

In [75]:
diff(x**2, x) # taking differential

2*x

In [76]:
diff(3*x**2 +9, x)

6*x

In [77]:
def lin_grad(inp, out, w, b):   # inp is being manipulated here  # just multiply by output gradients in all values
  # grad of matmul w.r.t input

  # here  inp contains the original input values used in the forward pass. inp.g attribute of inp is assigned the gradient information calculated during the backward pass.
  #The original values of inp are still present in the tensor, but now the .g attribute holds the calculated gradients.
  inp.g = out.g @ w.t()  # gradient of output w.r.t input   # multiplying weights with gradients of ouput
  # import pdb; pdb.set_trace()  # tellls debugger its breakpoint
  # breakpoint()

  # print(f"inp: {inp.shape}  ******   inp.g: {inp.g.shape}" )


  # print(f"   inp.unsqueeze(-1): {inp.unsqueeze(-1).shape} ||||  inp: {inp.shape}")
  # print(f"** inp: { inp.shape } * out.g: {out.g.shape}")

  # w.g = (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0)  # gradient of output w.r.t weights   # (inputs * output weights gradients ebcause of chain rule) summed over
  # i,o = inp.unsqueeze(-1), out.g.unsqueeze(1)
  # w.g = (i*o).sum(0)
  # w.g = torch.einsum('ij,ik->jk', inp, out.g)
  """
  but ij,ik->jk is not same as matrix multiplication.  we'd need to sawp i and j by using transpose
  """

  w.g = inp.T@out.g



  # print(f"(inp.unsqueeze(-1) * out.g.unsqueeze(1)): {(inp.unsqueeze(-1) * out.g.unsqueeze(1)).shape}  || (inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0).shape: {(inp.unsqueeze(-1) * out.g.unsqueeze(1)).sum(0).shape}")
  # print(f"++ inp.unsqueeze(-1): { inp.unsqueeze(-1).shape } * out.g.unsqueeze(1): {out.g.unsqueeze(1).shape}")
  b.g = out.g.sum(0) # gradients of output summed over
  # print(f" out.g.sum(0): {b.g} ||| out.g : {out.g.shape}")

  # breakpoint()

  """


  inp.g: Gradient of the input with respect to the loss. This line calculates the gradient of the loss with respect to the input inp. The formula used here is a common one in backpropagation for a linear layer. The gradient is computed by taking the dot product of the gradient of the output (out.g) and the transpose of the weight matrix (w.t()). This operation essentially backpropagates the error from the output to the input through the weight matrix.

Mathematically, if L is the loss and i represents the input, this operation can be expressed as:

dL/di = (dL/dout) * w.t()
"""

In [78]:
def forward_and_backward(inp, targ):
  # forward
  l1 = lin(inp, w1,b1)
  l2 = relu(l1)
  out = lin(l2, w2, b2)  # output of second layer
  diff = out[:, 0]-targ
  loss = diff.pow(2).mean()

  #backward pass:
  out.g = 2.*diff[:,None]/inp.shape[0]   # creating new attribute `.g`    # 2*diff because we took diff square    # divided my inp.shape[0] bcz we took mean
  # print(f"inp.shape[0]: {inp.shape[0]}   |   diff[:,None]: {diff[:,None].shape}  |  diff: {diff.shape}")
  lin_grad(l2, out, w2,b2)
  l1.g = (l1>0).float() * l2.g  # as we are using l1 l2 of forward pass here,, so we need to store each of intermediary layer calculations
  lin_grad(inp, l1,w1,b1)


forward_and_backward(x_train, y_train)


 x:torch.Size([50000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([50000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


In [79]:
 w1.g.shape, x_train.g.shape

(torch.Size([784, 50]), torch.Size([50000, 784]))

In [80]:
x_train.g

tensor([[    -0.00,     -0.01,      0.00,  ...,     -0.00,      0.00,      0.00],
        [    -0.03,     -0.03,      0.01,  ...,     -0.04,     -0.01,     -0.01],
        [     0.00,      0.00,     -0.00,  ...,      0.00,     -0.00,      0.00],
        ...,
        [    -0.00,     -0.02,      0.01,  ...,     -0.00,     -0.00,      0.00],
        [    -0.02,     -0.01,      0.01,  ...,     -0.01,      0.01,     -0.00],
        [    -0.00,     -0.00,      0.00,  ...,     -0.00,     -0.00,     -0.00]])

In [81]:
x_train.g.zero_()

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [82]:
forward_and_backward(x_train, y_train)

 x:torch.Size([50000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([50000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


In [83]:
forward_and_backward(x_train, y_train)

 x:torch.Size([50000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([50000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])


## Geradients and backward pass

## Refactor model

### Layers as classes

In [84]:
class Relu():
  def __call__(self,inp):
    self.inp = inp     # # as we are using l1 l2 of forward pass here,, so we need to store each of intermediary layer calculations so, here relu is
    # not ust doing calculations but also stores input and output in self.inp and self.out  so when we cakll backward, we can use them as shown in backward() below
    self.out = inp.clamp_min(0.)
    return self.out

  def backward(self):
    self.inp.g = (self.inp>0).float() * self.out.g



In [85]:
class Lin():
  def __init__(self, w, b):  # it has init because it needs additional input that relu didnt
    self.w, self.b = w,b

  def __call__(self, inp):
    self.inp = inp
    self.out = lin(inp, self.w, self.b)
    return self.out

  def backward(self):
    self.inp.g = self.out.g @ self.w.t()
    self.w.g = self.inp.t() @ self.out.g
    self.b.g = self.out.g.sum(0)




In [86]:
class Mse():
  def __call__(self, inp, targ):
    self.inp, self.targ = inp, targ
    self.out = mse(inp, targ)
    return self.out


  def backward(self):
    self.inp.g = 2. * (self.inp.squeeze() - self.targ).unsqueeze(-1) / self.targ.shape[0]


In [87]:
x = torch.randn((2,2))
x.shape, x[:,None].shape,  x[:, 0 ].shape,  x, x[:,None],  x[:, 0 ]

(torch.Size([2, 2]),
 torch.Size([2, 1, 2]),
 torch.Size([2]),
 tensor([[ 0.57, -0.99],
         [ 1.62, -0.98]]),
 tensor([[[ 0.57, -0.99]],
 
         [[ 1.62, -0.98]]]),
 tensor([0.57, 1.62]))

In [88]:
class Model():
  def __init__(self, w1, b1, w2, b2):
    self.layers = [Lin(w1, b1), Relu(), Lin(w2,b2)]  # we are creating instances of each classes
    self.loss = Mse()


  def __call__(self, x, targ):
    for l in self.layers:
      x = l(x)                                    # calling layer
    return self.loss(x, targ)     # loss function is being calculated in the model

  def backward(self):
    self.loss.backward()
    for l in reversed(self.layers):
      l.backward()

In [89]:
model = Model(w1,b1, w2, b2)

In [90]:
loss= model(x_train, y_train)

 x:torch.Size([50000, 784]) 
 w:torch.Size([784, 50]) 
 b:torch.Size([50])
 x:torch.Size([50000, 50]) 
 w:torch.Size([50, 1]) 
 b:torch.Size([1])
output: torch.Size([50000, 1]), targ: torch.Size([50000])


In [91]:
model.backward()

In [92]:
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

NameError: name 'w2g' is not defined

In [None]:
class A:
  def __init__(self):
    print("INITITALIZED!!")

  def __call__(self, x):
    print(f"CALLLED {x}???")




A()

In [None]:
A()("ME?")

In [None]:
a=A()


In [None]:
a("ME?")