In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close
from urllib.request import urlretrieve
#torch.manual_seed(42)

# Display defaults: grayscale colormap for images, compact 2-decimal printing for tensors/arrays.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Remote location of the gzipped MNIST pickle, and the local directory/file it is cached in.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download only when the archive is not already on disk.
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)
# NOTE(review): pickle.load can execute arbitrary code from the file it reads, and this file comes
# from the network -- only run this against a source you trust.
# The pickle holds three pairs; the first two are unpacked as (x, y) train/valid, the third is discarded into `_`.
# NOTE(review): encoding='latin-1' presumably because the pickle was written by Python 2 -- confirm.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert all four loaded objects to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Setup


In [2]:
# n = number of rows in x_train, m = number of columns per row (x_train is unpacked as 2-D here).
n,m = x_train.shape
# c = largest label value + 1; note this stays a 0-dim tensor, not a Python int.
c = y_train.max() +1
# nh = hidden-layer width handed to the models built in later cells.
nh = 50
n,m,c

(50000, 784, tensor(10))

In [3]:
class Model(nn.Module):
    """Small fully-connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.

    The layers are exposed as `self.layers`, which can be iterated and indexed
    like a list.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList instead of a plain list: a plain list hides the layers from
        # nn.Module, so parameters(), state_dict(), .to(device) etc. would see nothing.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Apply each layer in order and return the final activations."""
        # Defined as `forward` (not `__call__`) so nn.Module.__call__ still runs its hooks.
        for l in self.layers: x = l(x)
        return x

In [4]:
model = Model(m,nh,10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

## Cross Entropy loss

In [5]:
def log_softmax(x):
    "Log-softmax over the last dim, written literally as log(exp(x) / sum(exp(x)))."
    exps = x.exp()
    return (exps / exps.sum(-1, keepdim=True)).log()

In [6]:
log_softmax(pred)

tensor([[-2.24, -2.39, -2.35,  ..., -2.26, -2.38, -2.47],
        [-2.30, -2.41, -2.19,  ..., -2.33, -2.45, -2.48],
        [-2.34, -2.40, -2.26,  ..., -2.25, -2.34, -2.53],
        ...,
        [-2.28, -2.40, -2.28,  ..., -2.28, -2.39, -2.49],
        [-2.36, -2.33, -2.40,  ..., -2.30, -2.34, -2.41],
        [-2.36, -2.41, -2.29,  ..., -2.32, -2.44, -2.47]], grad_fn=<LogBackward0>)

In [7]:
def log_softmax(x):
    "Log-softmax over the last dim using log(a/b) = log(a) - log(b)."
    log_denominator = x.exp().sum(-1, keepdim=True).log()
    return x - log_denominator

In [8]:
def logsumexp(x):
  """Numerically stable log(sum(exp(x))) over the last dimension.

  The row maximum is subtracted before exponentiating so that large inputs do not
  overflow, then added back outside the log.

  Args:
    x: tensor with at least one dimension; the reduction is over dim -1.

  Returns:
    Tensor with the last dimension of `x` removed.
  """
  # keepdim=True lets the max broadcast against x for any rank (the previous
  # `m[:,None]` indexing only worked for 2-D input).
  x_max = x.max(-1, keepdim=True)[0]
  return x_max.squeeze(-1) + (x - x_max).exp().sum(-1).log()

In [9]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [10]:
# Check the hand-written logsumexp against the tensor method of the same name.
test_close(logsumexp(pred), pred.logsumexp(-1))
# Log-probabilities for the model's predictions, using the latest log_softmax definition.
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.24, -2.39, -2.35,  ..., -2.26, -2.38, -2.47],
        [-2.30, -2.41, -2.19,  ..., -2.33, -2.45, -2.48],
        [-2.34, -2.40, -2.26,  ..., -2.25, -2.34, -2.53],
        ...,
        [-2.28, -2.40, -2.28,  ..., -2.28, -2.39, -2.49],
        [-2.36, -2.33, -2.40,  ..., -2.30, -2.34, -2.41],
        [-2.36, -2.41, -2.29,  ..., -2.32, -2.44, -2.47]], grad_fn=<SubBackward0>)

In [11]:
y_train[:3]

tensor([5, 0, 4])

In [12]:
sm_pred[0,5], sm_pred[1,0], sm_pred[2,4]

(tensor(-2.08, grad_fn=<SelectBackward0>),
 tensor(-2.30, grad_fn=<SelectBackward0>),
 tensor(-2.38, grad_fn=<SelectBackward0>))

In [13]:
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.08, -2.30, -2.38], grad_fn=<IndexBackward0>)

In [14]:
def nll(input, target):
    "Negative mean of `input[i, target[i]]` over all rows i (negative log-likelihood when `input` holds log-probs)."
    rows = range(target.shape[0])
    picked = input[rows, target]
    return -picked.mean()

In [15]:
loss = nll(sm_pred, y_train)
loss

tensor(2.31, grad_fn=<NegBackward0>)

In [16]:
test_close(F.nll_loss(F.log_softmax(pred,-1), y_train),loss, 1e-3)

In [17]:
test_close(F.cross_entropy(pred,y_train),loss,1e-3)

## Basic training loop

In [18]:
loss_func = F.cross_entropy

In [19]:
bs=64        # batch size
# First mini-batch of inputs: rows 0..bs-1 of x_train.
xb = x_train[0:bs]
# Forward pass on that mini-batch.
preds =model(xb)
# Show the first row of predictions alongside the full output shape.
preds[0], preds.shape

(tensor([ 0.06, -0.09, -0.05,  0.08, -0.04,  0.23, -0.04,  0.05, -0.08, -0.17], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [20]:
# Labels for the same rows as `xb`; sliced with `bs` (not a literal 64) so the two stay aligned
# if the batch size is changed.
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0])

In [21]:
xb.shape, yb.shape

(torch.Size([64, 784]), torch.Size([64]))

In [22]:
loss_func(preds, yb)

tensor(2.32, grad_fn=<NllLossBackward0>)

In [23]:
preds.argmax(dim=1)

tensor([5, 5, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 6, 5, 5, 5, 6, 6, 6, 5, 5])

In [24]:
def accuracy(out,yb):
    "Fraction of rows whose argmax over dim 1 equals the corresponding entry of `yb`."
    predicted = out.argmax(dim=1)
    return (predicted == yb).float().mean()

In [25]:
accuracy(preds,yb)

tensor(0.06)

In [26]:
# Learning rate: multiplies each gradient in the manual parameter updates.
lr=0.5
# Number of full passes over the training set.
epochs=3

In [27]:
def report(loss, preds, yb):
    "Print `loss` and the accuracy of `preds` against `yb`, each formatted to two decimals."
    print(f'{loss:.2f}, {accuracy(preds, yb):.2f}')

In [28]:
xb.shape, yb.shape

(torch.Size([64, 784]), torch.Size([64]))

In [29]:
# xb,yb = x_train[:bs],y_train[:bs]
# preds = model(xb)
# report(loss_func(preds, yb), preds, yb)

In [30]:
xb.shape, yb.shape

(torch.Size([64, 784]), torch.Size([64]))

In [31]:
# Manual SGD: walk the training set in mini-batches of `bs`, backprop the loss,
# then step every layer that owns a weight (the ReLU has none and is skipped).
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n,i+bs))          # last batch may be shorter than bs
        xb_ = x_train[s]
        yb_ = y_train[s]
        preds = model(xb_)
        loss = loss_func(preds, yb_)
        loss.backward()
        # Updates must not be tracked by autograd.
        with torch.no_grad():
            for l in model.layers:
                if not hasattr(l, 'weight'): continue
                for p in (l.weight, l.bias):
                    p -= p.grad * lr
                    p.grad.zero_()         # reset so the next batch's gradients don't accumulate
    #report(loss, preds, yb)

In [32]:
# for epoch in range(epochs):
#   for i in range(0,n,bs):
#     s = slice(i, min(n,i+bs))
#     xb,yb = x_train[s],y_train[s]
#     preds = model(xb)
#     loss = loss_func(preds,yb)
#     loss.backward()
#     if i==0:print(loss.item(), accuracy(preds,yb).item())
#     with torch.no_grad():
#       for l in model.layers:
#         if hasattr(l, 'weight'):
#           l.weight -= l.weight.grad*lr
#           l.bias   -= l.bias.grad   * lr
#           l.weight.grad.zero_()
#           l.bias.grad.zero_()

#   #report(loss,preds,yb)

In [33]:
xb.shape, yb.shape

(torch.Size([64, 784]), torch.Size([64]))

## Parameters and Optim

#### Parameters

In [34]:
m1 = nn.Module()
m1.foo = nn.Linear(3,4)
m1

Module(
  (foo): Linear(in_features=3, out_features=4, bias=True)
)

In [35]:
(list(m1.named_children()))

[('foo', Linear(in_features=3, out_features=4, bias=True))]

In [36]:
m1.named_children()

<generator object Module.named_children at 0x79016701d4d0>

In [37]:
list(m1.parameters())

[Parameter containing:
 tensor([[ 0.15,  0.11,  0.15],
         [-0.37, -0.50, -0.40],
         [-0.15,  0.01,  0.02],
         [-0.51,  0.53,  0.33]], requires_grad=True),
 Parameter containing:
 tensor([-0.46, -0.08, -0.34, -0.03], requires_grad=True)]

In [38]:
class MLP(nn.Module):
  """Two linear layers with a ReLU in between, each stored as a named attribute.

  Assigning the layers as attributes is what makes nn.Module register them, so they
  show up in `named_children()` and `parameters()`.
  """
  def __init__(self,n_in,nh,n_out):
    super().__init__()
    self.l1 = nn.Linear(n_in,nh)
    self.l2 = nn.Linear(nh,n_out)
    self.relu = nn.ReLU()

  def forward(self,x):
    "Compute l2(relu(l1(x)))."
    hidden = self.relu(self.l1(x))
    return self.l2(hidden)

In [39]:
model = MLP(m,nh,10)
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [40]:
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
  (relu): ReLU()
)

In [41]:
for name,l in model.named_children(): print(f'{name}: {l}')

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)
relu: ReLU()


In [42]:
for p in model.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [43]:
def fit():
  """Train the global `model` on `x_train`/`y_train` with plain SGD.

  Runs `epochs` passes in mini-batches of `bs`, using the globals `loss_func` and `lr`.
  Prints the loss and accuracy of the first batch of every epoch.
  """
  for epoch in range(epochs):
    for start in range(0, n, bs):
      stop = min(n, start+bs)
      xb, yb = x_train[start:stop], y_train[start:stop]
      preds = model(xb)
      loss = loss_func(preds, yb)
      loss.backward()
      is_first_batch = (start == 0)
      if is_first_batch: print(f'{loss.item():.2f}, {accuracy(preds,yb).item():.2f}')
      # The parameter step must not be recorded by autograd.
      with torch.no_grad():
        for p in model.parameters(): p.sub_(p.grad * lr)
      # Clear gradients so the next batch starts from zero.
      model.zero_grad()

In [44]:
fit()

2.30, 0.14
0.12, 0.97
0.12, 0.97


Let's create our own nn.Module

In [45]:
class MyModule:
  """Hand-rolled stand-in for nn.Module that records its public attributes in `_modules`.

  Every attribute whose name does not start with an underscore is stored in the
  `_modules` dict as well as on the instance; `parameters()` walks that dict.
  """
  def __init__(self, n_in, nh, n_out):
    # Has to exist before any public attribute is set, because __setattr__ writes into it.
    self._modules = {}
    self.l1 = nn.Linear(n_in, nh)
    self.l2 = nn.Linear(nh, n_out)

  def __setattr__(self, k, v):
    "Register public attributes in `_modules`, then set the attribute as usual."
    is_private = k.startswith("_")
    if not is_private: self._modules[k] = v
    super().__setattr__(k, v)

  def __repr__(self): return f'{self._modules}'

  def parameters(self):
    "Yield the parameters of every registered entry, in registration order."
    for l in self._modules.values():
      for p in l.parameters(): yield p

In [46]:
md1 = MyModule(m,nh,10)
md1

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

In [47]:
for p in md1.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


## Registering Modules

In [48]:
from functools import reduce

In [49]:
layers = [nn.Linear(m,nh), nn.ReLU(), nn.Linear(nh,10)]

In [50]:
class Module(nn.Module):
  """Sequential container built from a list of layers.

  The list is kept as `self.layers`, and each entry is also registered with
  `add_module` under the name `layer_<index>` so nn.Module knows about it.
  """
  def __init__(self,layers):
    super().__init__()
    self.layers = layers
    for i,l in enumerate(layers): self.add_module(f'layer_{i}',l)

  def forward(self,x):
    "Feed `x` through every layer in order."
    for layer in self.layers: x = layer(x)
    return x

In [51]:
model = Module(layers)
model

Module(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

In [52]:
model(xb).shape

torch.Size([64, 10])

In [53]:
xb.shape,yb.shape

(torch.Size([64, 784]), torch.Size([64]))