<a href="https://colab.research.google.com/github/mzohaibnasir/NeuralNotes/blob/main/03_deepDiveIntoBasics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Minibatch Training

In [35]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close


In [36]:
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/d15df08a69ed33ae16a2fff874f83b57a956172c/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True )
path_gz=path_data/'mnist.pkl.gz'
path_gz

PosixPath('data/mnist.pkl.gz')

In [37]:
from urllib.request import urlretrieve

if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

In [38]:
torch.manual_seed(42)


mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)


In [39]:
!ls -l data

total 16656
-rw-r--r-- 1 root root 17051982 Jan 18 11:43 mnist.pkl.gz


In [40]:
# Load the gzipped MNIST pickle and unpack the splits it contains.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only load
# archives obtained from a source you trust.
with gzip.open(path_gz, 'rb') as f:   # open in binary mode, as opposed to text
   # Destructure the two (x, y) pairs used below; the third element is discarded.
   ((x_train,y_train), (x_valid,y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert each of the four loaded objects to a torch tensor.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [41]:
n,m = x_train.shape  # n = number of rows (examples), m = number of columns (inputs per example)
c = y_train.max()+1  # largest label + 1 (class count, assuming labels run 0..c-1); a 0-dim tensor, not a Python int
nh = 50  # number of hidden units handed to Model


In [42]:
class Model(nn.Module):
    """Fully connected network with one hidden layer: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features per example.
        nh: number of hidden units.
        n_out: number of outputs (one activation per class).
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (instead of a plain Python list) registers every layer as a
        # submodule, so parameters(), state_dict(), .to(device) and optimizers can
        # see the weights. It still supports iteration and indexing like a list.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Pass `x` through every layer in order and return the final activations."""
        # forward() is implemented rather than overriding __call__, so that
        # nn.Module.__call__ (which runs registered hooks) stays intact;
        # calling model(x) works exactly as before.
        for l in self.layers: x = l(x)
        return x

In [43]:
model = Model(m, nh, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

In [44]:
### CE loss

In [45]:
def log_softmax(x):
    """Shape probe: print the shape of `x`, then return the shape of its softmax.

    No logarithm is taken in this version; it only shows that normalising over
    the last dimension leaves the shape of the input unchanged.
    """
    print(x.shape)
    exps = x.exp()
    row_totals = exps.sum(-1, keepdim=True)
    softmax = exps / row_totals
    return softmax.shape
log_softmax(pred)

torch.Size([50000, 10])


torch.Size([50000, 10])

In [46]:
def log_softmax(x):
    """Naive log-softmax: exponentiate, normalise over the last dim, then take the log."""
    exps = x.exp()
    probs = exps / exps.sum(-1, keepdim=True)
    return probs.log()
log_softmax(pred)

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<LogBackward0>)

In [47]:
pred.exp().shape , pred.exp().sum(-1)[:,None].shape

(torch.Size([50000, 10]), torch.Size([50000, 1]))

In [48]:
pred.shape

torch.Size([50000, 10])

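We'll use the rules of logarithms to simplify the log-softmax function.

`e^x` and `log x` are inverses of each other, and $\log\frac{a}{b} = \log a - \log b$, so $\log\frac{e^{x_i}}{\sum_j e^{x_j}} = x_i - \log\sum_j e^{x_j}$.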

In [49]:
def log_softmax(x):
    """Log-softmax via log(a/b) = log(a) - log(b): `x` minus the log of its summed exponentials."""
    log_total = x.exp().sum(-1, keepdim=True).log()
    return x - log_total

In [50]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Uses the LogSumExp trick: the maximum `m` of each slice is subtracted before
    exponentiating, so the largest exponent is exp(0) = 1 and cannot overflow;
    `m` is then added back outside the log.

    Args:
        x: tensor of any rank >= 1.

    Returns:
        Tensor with the last dimension of `x` reduced away.
    """
    # keepdim=True lets the maximum broadcast against `x` whatever its rank,
    # rather than assuming a 2-D input.
    m = x.max(-1, keepdim=True)[0]
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [51]:
# in pytorch

def log_softmax(x):
  return x - x.logsumexp(-1, keepdim=True)

In [52]:
test_close( logsumexp(pred), pred.logsumexp(-1))
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<SubBackward0>)

The cross entropy loss for some target $x$ and some prediction $p(x)$ is given by:

$$ -\sum x \, \log p(x) $$

But since our $x$s are 1-hot encoded (actually, they're just the integer indices), this can be rewritten as $-\log(p_{i})$ where $i$ is the index of the desired target.

This can be done using NumPy-style [integer array indexing](https://numpy.org/doc/stable/user/basics.indexing.html#integer-array-indexing). Note that PyTorch supports all the tricks in the advanced indexing methods discussed in that link.

In [53]:
y_train[:3]  # actual values of y_train

tensor([5, 0, 4])

In [54]:
# From the log-softmax output `sm_pred`, pick the entry at each row's target class:
# column 5 of row 0, column 0 of row 1 and column 4 of row 2 (the targets are y_train[:3] = [5, 0, 4]).

sm_pred[0,5], sm_pred[1,0], sm_pred[2,4]

(tensor(-2.40, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.14, grad_fn=<SelectBackward0>))

In [55]:
sm_pred.shape

torch.Size([50000, 10])

In [56]:
sm_pred[[0,1,2], y_train[:3]]     # [rows, cols]

tensor([-2.40, -2.37, -2.14], grad_fn=<IndexBackward0>)

In [57]:
sm_pred[range(y_train.shape[0])]

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<IndexBackward0>)

In [58]:
def nll(input, target):
    """Negative log-likelihood loss.

    `input` is indexed as [row, class] and is expected to hold log-probabilities;
    `target` holds one integer class index per row. The result is minus the mean
    of the entries selected by pairing each row with its target class. Applied
    to log-softmax output, this is the cross-entropy loss.
    """
    row_idx = range(target.shape[0])
    picked = input[row_idx, target]
    return -picked.mean()

In [59]:
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [60]:
# Then use PyTorch's implementation.
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)





In [61]:
# In PyTorch, F.log_softmax and F.nll_loss are combined in one optimized function, F.cross_entropy.
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)


# Basic training loop

The training loop repeats the following steps:

 * get the output of the model on a batch of inputs
 * compare the output to the labels we have and compute a loss
 * calculate the gradients of the loss with respect to every parameter of the model
 * update said parameters with those gradients to make them a little bit better

In [62]:
loss_func = F.cross_entropy

In [63]:
bs = 60  # batch size
xb = x_train[0:bs]  # a mini batch from x
preds = model(xb)

preds[0], preds.shape

(tensor([-0.05,  0.03,  0.22,  0.02,  0.00, -0.09, -0.04, -0.12, -0.15,  0.20], grad_fn=<SelectBackward0>),
 torch.Size([60, 10]))

In [64]:
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1])

In [65]:
loss_func(preds, yb)

tensor(2.28, grad_fn=<NllLossBackward0>)

In [66]:
preds.argmax(dim=1) # for each row of preds, the index of its largest value (the predicted class)

tensor([2, 2, 4, 2, 2, 9, 2, 2, 1, 9, 9, 9, 2, 2, 1, 2, 2, 2, 9, 9, 2, 2, 9, 2, 2, 2, 2, 2, 2, 1, 9, 1, 2, 2, 2, 9, 2, 9, 2,
        9, 9, 2, 9, 2, 9, 2, 2, 2, 2, 2, 9, 2, 2, 9, 2, 2, 2, 9, 2, 2])

In [67]:
def accuracy(out, yb):
    """Fraction of rows of `out` whose arg-max column equals the matching label in `yb`."""
    predicted = out.argmax(dim = 1)
    hits = (predicted == yb).float()
    return hits.mean()

In [68]:
accuracy(preds, yb)

tensor(0.15)

In [77]:
lr = 0.5
epochs = 3

In [78]:
def report(loss, preds, yb):
    """Print `loss` and the accuracy of `preds` against `yb`, each to two decimals."""
    line = f"{loss:.2f}, {accuracy(preds,yb):.2f}"
    print(line)

In [79]:
xb,yb = x_train[:bs],y_train[:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

2.28, 0.15


In [80]:
# Hand-written SGD: sweep the training set in mini-batches of `bs`, `epochs` times.
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))  # the final batch may be shorter than bs
        xb = x_train[s]
        yb = y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()

        # The parameter update itself must not be tracked by autograd.
        with torch.no_grad():
            for l in model.layers:
                # Layers without a `weight` attribute (the ReLU) have nothing to update.
                if not hasattr(l, 'weight'): continue
                l.weight -= l.weight.grad * lr
                l.bias -= l.bias.grad * lr

                # Gradients accumulate across backward() calls, so reset them
                # before the next mini-batch.
                l.weight.grad.zero_()
                l.bias.grad.zero_()

    # Loss and accuracy of the last mini-batch processed in this epoch.
    report(loss, preds, yb)

0.18, 0.95
0.11, 1.00
0.05, 1.00
