In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close
from urllib.request import urlretrieve
#torch.manual_seed(42)

# Display defaults: grayscale images and compact tensor/array printing.
mpl.rcParams['image.cmap'] = 'gray'
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)

# Location of the pickled MNIST archive and where it is cached locally.
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

if not path_gz.exists():
    # Download under a temporary name and rename only on success, so an
    # interrupted download cannot leave a truncated file that later runs
    # would accept as the cached archive.
    path_tmp = path_gz.with_name(path_gz.name + '.part')
    urlretrieve(MNIST_URL, path_tmp)
    path_tmp.replace(path_gz)

# NOTE: pickle.load can execute arbitrary code; only use an archive from a trusted source.
# The third element of the pickled tuple is unpacked into `_` and not used here.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
# Convert the four loaded arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Setup


In [2]:
# n and m are the two dimensions of x_train (rows, columns).
n,m = x_train.shape
# Largest label + 1; presumably the number of classes if labels are 0-based.
# Note this stays a 0-dim tensor rather than a Python int (see the output).
c = y_train.max() +1
# nh is used below as the hidden-layer width when building the models.
nh = 50
n,m,c

(50000, 784, tensor(10))

In [3]:
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (instead of a plain Python list) registers every layer
        # as a submodule, so parameters(), state_dict() and .to() can see them.
        # It is still iterable, so code that loops over `model.layers` keeps working.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])

    def forward(self, x):
        """Pass `x` through each layer in order and return the final output.

        Defined as `forward` (not `__call__`) so that calling the model goes
        through nn.Module.__call__, which also runs any registered hooks.
        """
        for l in self.layers: x = l(x)
        return x

In [4]:
# Build the network with m inputs, nh hidden units and 10 outputs,
# then run the whole training set through it in one call.
model = Model(m,nh,10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

## Cross-entropy loss

In [5]:
def log_softmax(x):
    """Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x)))."""
    exps = x.exp()
    totals = exps.sum(-1, keepdim=True)
    return (exps / totals).log()

In [6]:
# Apply the log-softmax defined above to the raw model outputs and display the result.
log_softmax(pred)

tensor([[-2.31, -2.51, -2.34,  ..., -2.53, -2.22, -2.27],
        [-2.27, -2.52, -2.44,  ..., -2.53, -2.14, -2.20],
        [-2.20, -2.42, -2.43,  ..., -2.44, -2.21, -2.25],
        ...,
        [-2.22, -2.45, -2.37,  ..., -2.53, -2.24, -2.31],
        [-2.27, -2.43, -2.39,  ..., -2.45, -2.23, -2.24],
        [-2.23, -2.49, -2.44,  ..., -2.54, -2.18, -2.19]], grad_fn=<LogBackward0>)

In [7]:
def log_softmax(x):
    """Log-softmax over the last dimension, using log(a/b) = log(a) - log(b)."""
    log_total = x.exp().sum(-1, keepdim=True).log()
    return x - log_total

In [8]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension.

    Subtracts the per-row maximum before exponentiating so large inputs do not
    overflow. Works for input of any rank >= 1; the last dimension is reduced.

    Args:
        x: tensor whose last dimension is reduced.

    Returns:
        Tensor with the shape of `x` minus its last dimension.
    """
    # keepdim=True keeps the max broadcastable against x for any number of
    # leading dimensions (indexing with [:, None] only worked for 2-D input).
    m = x.max(-1, keepdim=True)[0]
    # A non-finite max (a row of all -inf, or a +inf entry) would turn `x - m`
    # into NaN; shifting by 0 instead lets the result come out as -inf / +inf.
    m = m.masked_fill(m.isinf(), 0)
    return m.squeeze(-1) + (x - m).exp().sum(-1).log()

In [9]:
def log_softmax(x): return x - x.logsumexp(-1,keepdim=True)

In [10]:
# The hand-written logsumexp must agree with the tensor's built-in method.
test_close(logsumexp(pred), pred.logsumexp(-1))
# Log-probabilities from the most recently defined log_softmax.
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.31, -2.51, -2.34,  ..., -2.53, -2.22, -2.27],
        [-2.27, -2.52, -2.44,  ..., -2.53, -2.14, -2.20],
        [-2.20, -2.42, -2.43,  ..., -2.44, -2.21, -2.25],
        ...,
        [-2.22, -2.45, -2.37,  ..., -2.53, -2.24, -2.31],
        [-2.27, -2.43, -2.39,  ..., -2.45, -2.23, -2.24],
        [-2.23, -2.49, -2.44,  ..., -2.54, -2.18, -2.19]], grad_fn=<SubBackward0>)

In [11]:
# The first three training labels.
y_train[:3]

tensor([5, 0, 4])

In [12]:
# For rows 0, 1, 2 read the log-probability at columns 5, 0, 4 (the three labels shown above).
sm_pred[0,5], sm_pred[1,0], sm_pred[2,4]

(tensor(-2.13, grad_fn=<SelectBackward0>),
 tensor(-2.27, grad_fn=<SelectBackward0>),
 tensor(-2.09, grad_fn=<SelectBackward0>))

In [13]:
# The same lookup in one indexing expression: row i is paired with column y_train[i].
sm_pred[[0,1,2], y_train[:3]]

tensor([-2.13, -2.27, -2.09], grad_fn=<IndexBackward0>)

In [14]:
def nll(input, target):
    """Negative mean of `input[i, target[i]]` over all rows i.

    `input` is indexed with one row index per element of `target`, so with
    log-probabilities as input this is the negative log-likelihood.
    """
    rows = range(target.shape[0])
    picked = input[rows, target]
    return -picked.mean()

In [15]:
# Loss of the current predictions over the full training set, using the hand-written nll.
loss = nll(sm_pred, y_train)
loss

tensor(2.32, grad_fn=<NegBackward0>)

In [16]:
# PyTorch's log_softmax + nll_loss must match the hand-written loss (1e-3 passed as the tolerance).
test_close(F.nll_loss(F.log_softmax(pred,-1), y_train),loss, 1e-3)

In [17]:
# F.cross_entropy on the raw outputs must match as well, i.e. it combines both steps.
test_close(F.cross_entropy(pred,y_train),loss,1e-3)

## Basic training loop

In [18]:
# From here on use PyTorch's cross entropy, which is applied to the raw model outputs.
loss_func = F.cross_entropy

In [19]:
bs=64        # batch size
# Take the first bs training rows as one minibatch and run it through the model.
xb = x_train[0:bs]
preds =model(xb)
# Show the outputs for the first row and the shape of the whole batch of outputs.
preds[0], preds.shape

(tensor([ 0.01, -0.19, -0.03, -0.10,  0.35,  0.18, -0.15, -0.22,  0.09,  0.04], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [20]:
# Slice with bs rather than a literal 64 so the labels always line up with
# the xb minibatch, even if the batch size is changed.
yb = y_train[0:bs]
yb

tensor([5, 0, 4, 1, 9, 2, 1, 3, 1, 4, 3, 5, 3, 6, 1, 7, 2, 8, 6, 9, 4, 0, 9, 1, 1, 2, 4, 3, 2, 7, 3, 8, 6, 9, 0, 5, 6, 0, 7,
        6, 1, 8, 7, 9, 3, 9, 8, 5, 9, 3, 3, 0, 7, 4, 9, 8, 0, 9, 4, 1, 4, 4, 6, 0])

In [21]:
# Loss on this single minibatch.
loss_func(preds, yb)

tensor(2.33, grad_fn=<NllLossBackward0>)

In [22]:
# Index of the largest output in each row, i.e. the predicted class per sample.
preds.argmax(dim=1)

tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 5,
        4, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4])

In [23]:
def accuracy(out, yb):
    """Fraction of rows in `out` whose argmax along dim 1 equals the label in `yb`."""
    hits = out.argmax(dim=1) == yb
    return hits.float().mean()

In [24]:
# Accuracy of the untrained model on the first minibatch.
accuracy(preds,yb)

tensor(0.16)

In [25]:
# lr scales each gradient step; epochs is the number of passes over the training data.
lr=0.5
epochs=3

In [26]:
def report(loss, preds, yb):
    """Print the loss and the accuracy of `preds` against `yb`, both to two decimals."""
    line = f'{loss:.2f}, {accuracy(preds, yb):.2f}'
    print(line)

In [27]:
# Baseline before training: loss and accuracy on the first minibatch.
xb,yb = x_train[:bs],y_train[:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

2.33, 0.16


In [28]:
# Minibatch SGD written out by hand: forward pass, backward pass,
# parameter update, then reset of the gradients.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # min() keeps the final, possibly shorter, batch within the n rows.
        s = slice(i, min(n, i + bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        # Log once per epoch, on the first batch, before the weights are updated.
        if i == 0:
            print(loss.item(), accuracy(preds, yb).item())
        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            for l in model.layers:
                if not hasattr(l, 'weight'):
                    continue  # layers without a weight have nothing to update
                for p in (l.weight, l.bias):
                    p -= p.grad * lr
                    # Gradients accumulate across backward() calls, so clear them.
                    p.grad.zero_()

2.328364849090576 0.15625
0.1482311487197876 0.96875
0.08862613141536713 0.953125


## Parameters and Optim

#### Parameters

In [29]:
# Assigning a layer as an attribute of an nn.Module registers it as a child,
# which is why it shows up in the module's repr below.
m1 = nn.Module()
m1.foo = nn.Linear(3,4)
m1

Module(
  (foo): Linear(in_features=3, out_features=4, bias=True)
)

In [30]:
# Materialise the (name, module) pairs of the registered children.
(list(m1.named_children()))

[('foo', Linear(in_features=3, out_features=4, bias=True))]

In [31]:
# Without list() the call only yields a generator object, as the output shows.
m1.named_children()

<generator object Module.named_children at 0x7fb4174161f0>

In [32]:
# The registered child's weight and bias are exposed through the parent's parameters().
list(m1.parameters())

[Parameter containing:
 tensor([[-0.14, -0.09,  0.04],
         [-0.14, -0.39, -0.37],
         [-0.54, -0.39, -0.22],
         [-0.03, -0.05,  0.24]], requires_grad=True),
 Parameter containing:
 tensor([-0.26, -0.21,  0.46, -0.08], requires_grad=True)]

In [33]:
class MLP(nn.Module):
    """Two-layer perceptron whose layers are stored as plain attributes.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # Assignment order determines the order of named_children() and parameters().
        self.l1 = nn.Linear(n_in, nh)
        self.l2 = nn.Linear(nh, n_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Compute l2(relu(l1(x)))."""
        hidden = self.relu(self.l1(x))
        return self.l2(hidden)

In [34]:
# Replace the earlier model with a fresh MLP; its layers are reachable as attributes.
model = MLP(m,nh,10)
model.l1

Linear(in_features=784, out_features=50, bias=True)

In [35]:
# The module's repr lists every registered child layer.
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
  (relu): ReLU()
)

In [36]:
# Walk the registered children by name.
for name,l in model.named_children(): print(f'{name}: {l}')

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)
relu: ReLU()


In [37]:
# One weight and one bias tensor per Linear layer; the ReLU contributes none (see output).
for p in model.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [38]:
def fit():
    """Train the global `model` with hand-written minibatch SGD.

    Reads `epochs`, `n`, `bs`, `lr`, `loss_func`, `x_train` and `y_train` from
    the enclosing namespace. Prints loss and accuracy of the first batch of
    every epoch, measured before that batch's update is applied.
    """
    for _ in range(epochs):
        for start in range(0, n, bs):
            # min() keeps the final, possibly shorter, batch within the n rows.
            stop = min(n, start + bs)
            xb, yb = x_train[start:stop], y_train[start:stop]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            if start == 0:
                print(f'{loss.item():.2f}, {accuracy(preds,yb).item():.2f}')
            # Update every registered parameter without tracking the update
            # in autograd, then clear all gradients in one call.
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()

In [39]:
# Run the training loop defined above; it prints one line per epoch.
fit()

2.30, 0.11
0.14, 0.94
0.13, 0.95
