In [36]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close
from urllib.request import urlretrieve
#torch.manual_seed(42)

# Render images with a grayscale colormap by default.
mpl.rcParams['image.cmap'] = 'gray'
# Keep printed tensors/arrays compact: 2 decimals, wide lines, no scientific notation for tensors.
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Location of the pickled, gzipped MNIST archive and where it is cached locally.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download only once; later runs reuse the cached file.
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

# NOTE(security): pickle.load can execute arbitrary code from the file it reads.
# The archive comes from the network, so only run this against a source you trust.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the loaded arrays to torch tensors; the third split in the archive is discarded above.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Setup


In [37]:
# x_train is 2-D: n rows (one per training example) by m columns (inputs per example).
n,m = x_train.shape
# Largest label + 1; this is the class count if labels run 0..max — TODO confirm.
# Note the result is a 0-dim tensor, not a Python int.
c = y_train.max() +1
# Width of the hidden layer passed to Model.
nh = 50

In [38]:
class Model(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features per example.
        nh: number of hidden units.
        n_out: number of outputs.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers the layers as
        # submodules, so parameters(), state_dict(), .to(device), etc. can see them.
        # It is still iterable, so code that loops over `model.layers` keeps working.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Pass `x` through each layer in order and return the final output."""
        # Defining forward() instead of overriding __call__ keeps nn.Module's own
        # __call__ machinery (e.g. hooks) intact; `model(x)` still works as before.
        for l in self.layers: x = l(x)
        return x

In [39]:
# Build the network with m inputs, nh hidden units and 10 outputs, then run the
# whole training set through it in a single forward pass.
model = Model(m,nh,10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

## Cross-entropy loss

In [40]:
def log_softmax(x):
    """Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x)))."""
    exps = x.exp()
    return (exps / exps.sum(-1, keepdim=True)).log()

In [41]:
# Apply the log-softmax defined above to the model outputs for the training set.
log_softmax(pred)

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<LogBackward0>)

In [42]:
def log_softmax(x):
    """Log-softmax over the last dimension, written as x - log(sum(exp(x))).

    Uses log(a/b) = log(a) - log(b) so the numerator never has to be exponentiated.
    """
    log_denom = x.exp().sum(-1, keepdim=True).log()
    return x - log_denom

In [43]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Subtracting the per-row maximum before exponentiating keeps exp() from
    overflowing; the maximum is added back afterwards.

    Args:
        x: tensor of any rank >= 1.

    Returns:
        Tensor with the last dimension of `x` reduced away.
    """
    # keepdim=True lets the maximum broadcast against x for any number of leading
    # dimensions (the previous `m[:,None]` only worked for 2-D input).
    m = x.max(-1, keepdim=True).values
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [44]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [45]:
# The hand-written logsumexp should agree with the tensor's built-in method.
test_close(logsumexp(pred), pred.logsumexp(-1))
# Log-softmax of the model outputs for the whole training set; reused by the loss cells.
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.36, -2.28, -2.09,  ..., -2.43, -2.47, -2.11],
        [-2.37, -2.25, -2.09,  ..., -2.46, -2.43, -2.11],
        [-2.34, -2.31, -2.14,  ..., -2.44, -2.48, -2.14],
        ...,
        [-2.26, -2.25, -2.13,  ..., -2.36, -2.53, -2.17],
        [-2.39, -2.30, -2.18,  ..., -2.38, -2.42, -2.11],
        [-2.40, -2.25, -2.14,  ..., -2.38, -2.45, -2.23]], grad_fn=<SubBackward0>)

In [46]:
# Labels of the first three training examples.
y_train[:3]

tensor([5, 0, 4])

In [47]:
# For each of the first three rows, pick the column given by that row's label (5, 0, 4).
sm_pred[0,5], sm_pred[1,0], sm_pred[2,4]

(tensor(-2.40, grad_fn=<SelectBackward0>),
 tensor(-2.37, grad_fn=<SelectBackward0>),
 tensor(-2.14, grad_fn=<SelectBackward0>))

In [48]:
# Same three values in one indexing operation: row i is paired with column y_train[i].
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.40, -2.37, -2.14], grad_fn=<IndexBackward0>)

In [49]:
def nll(input, target):
    """Negative mean of the entries of `input` selected by `target`.

    Row i contributes `input[i, target[i]]`; the result is minus the average of
    those picked values.
    """
    rows = range(target.shape[0])
    picked = input[rows, target]
    return -picked.mean()

In [50]:
# Negative log-likelihood of the training labels under the log-softmax outputs.
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [51]:
# The hand-written log_softmax + nll should match PyTorch's equivalents to within 1e-3.
test_close(F.nll_loss(F.log_softmax(pred,-1), y_train),loss, 1e-3)

In [52]:
# F.cross_entropy applied to the raw outputs should give the same loss to within 1e-3.
test_close(F.cross_entropy(pred,y_train),loss,1e-3)

## Basic training loop

In [53]:
# Use PyTorch's cross-entropy as the loss for the rest of the notebook.
loss_func = F.cross_entropy

In [54]:
bs=64        # batch size
xb = x_train[0:bs]   # first mini-batch of inputs
preds =model(xb)     # forward pass on that mini-batch
# Show the outputs for the first example and the shape of the whole batch of outputs.
preds[0], preds.shape

(tensor([-0.05,  0.03,  0.22,  0.02,  0.00, -0.09, -0.04, -0.12, -0.15,  0.20], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [55]:
# Labels for the same mini-batch as xb; slicing with bs (rather than a literal 64)
# keeps inputs and labels aligned if the batch size is changed.
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0])

In [56]:
# Loss on the first mini-batch.
loss_func(preds, yb)

tensor(2.28, grad_fn=<NllLossBackward0>)

In [57]:
# Predicted class per example: the index of the largest output in each row.
preds.argmax(dim=1)

tensor([2, 2, 4, 2, 2, 9, 2, 2, 1, 9, 9, 9, 2, 2, 1, 2, 2, 2, 9, 9, 2, 2, 9, 2, 2, 2, 2, 2, 2, 1, 9, 1, 2, 2, 2, 9, 2, 9, 2,
        9, 9, 2, 9, 2, 9, 2, 2, 2, 2, 2, 9, 2, 2, 9, 2, 2, 2, 9, 2, 2, 2, 2, 2, 2])

In [58]:
def accuracy(out,yb):
    """Fraction of rows in `out` whose arg-max along dim 1 equals the label in `yb`."""
    predicted = out.argmax(dim=1)
    hits = (predicted == yb).float()
    return hits.mean()

In [59]:
# Accuracy on the first mini-batch.
accuracy(preds,yb)

tensor(0.14)

In [68]:
lr=0.5     # learning rate: scales each gradient step in the training loop
epochs=3   # number of full passes over the training set

In [69]:
def report(loss, preds, yb):
    """Print the loss and the accuracy of `preds` against `yb`, each to two decimals."""
    print(f'{loss:.2f}, {accuracy(preds, yb):.2f}')

In [71]:
# Loss and accuracy on the first mini-batch.
# NOTE(review): the recorded output (0.04, 0.98) and the gaps in the execution counts
# suggest this cell was last run after the training loop had already updated the model;
# a fresh Restart & Run All would presumably print different numbers here. Confirm and re-run.
xb,yb = x_train[:bs],y_train[:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

0.04, 0.98


In [78]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        # Current mini-batch; the last one is shorter when bs does not divide n.
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]

        # Forward pass, loss, and gradients.
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()

        # Log once per epoch, on the first mini-batch, before the weights are updated.
        if i == 0: print(loss.item(), accuracy(preds, yb).item())

        # Manual SGD step. no_grad() keeps the in-place updates out of the autograd graph.
        with torch.no_grad():
            for l in model.layers:
                # Only layers that carry a weight have something to update.
                if not hasattr(l, 'weight'): continue
                for param in (l.weight, l.bias):
                    param -= param.grad * lr
                    # Reset so the next backward() does not accumulate onto stale gradients.
                    param.grad.zero_()

0.004525449126958847 1.0
0.003806290216743946 1.0
0.0036004765424877405 1.0
