In [1]:
#| default_exp training

In [2]:
#|export
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path
from torch import tensor,nn
import torch.nn.functional as F
from fastcore.test import test_close

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

In [3]:
# Load the pickled MNIST splits and convert them to tensors (the third split is read but unused).
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
# NOTE: unpickling can execute arbitrary code -- only load dataset files from a trusted source.
# 'latin-1' is the canonical codec name for decoding byte strings stored in the pickle.
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

In [4]:
n,m = x_train.shape  # n: number of training rows, m: number of columns (features) per row
c = y_train.max()+1  # largest label value plus one (the class count if labels run 0..max)
nh = 50  # hidden-layer width passed to the model below

In [5]:
n

50000

In [6]:
class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of output activations.
    """

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain Python list) registers each layer as a
        # submodule, so `parameters()`, `state_dict()` and `.to(device)` can see them.
        # It is still iterable and indexable, so `for l in model.layers` keeps working.
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])

    def forward(self, x):
        """Run `x` through every layer in order and return the final activations.

        Defined as `forward` (not `__call__`) so that `model(x)` goes through
        `nn.Module.__call__`, which also runs any registered hooks.
        """
        for layer in self.layers:
            x = layer(x)
        return x

In [7]:
# Build the network and push the whole training set through it once (weights are still untrained).
model = Model(m, nh, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

$$\hbox{testing(x)}_{i,j}$$

First, we will need to compute the softmax of our activations. This is defined by:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$

or more concisely:

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{\sum\limits_{0 \leq j \lt n} e^{x_{j}}}$$ 

In practice, we will need the log of the softmax when we calculate the loss.

In [8]:
x_train.exp(), x_train.exp().shape

(tensor([[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         ...,
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.]]),
 torch.Size([50000, 784]))

In [9]:
x_train.exp().unique()

tensor([1.00, 1.00, 1.01, 1.01, 1.02, 1.02, 1.02, 1.03, 1.03, 1.04, 1.04, 1.04, 1.05, 1.05, 1.06, 1.06, 1.06, 1.07, 1.07, 1.08, 1.08, 1.09,
        1.09, 1.09, 1.10, 1.10, 1.11, 1.11, 1.12, 1.12, 1.12, 1.13, 1.13, 1.14, 1.14, 1.15, 1.15, 1.16, 1.16, 1.16, 1.17, 1.17, 1.18, 1.18,
        1.19, 1.19, 1.20, 1.20, 1.21, 1.21, 1.22, 1.22, 1.23, 1.23, 1.23, 1.24, 1.24, 1.25, 1.25, 1.26, 1.26, 1.27, 1.27, 1.28, 1.28, 1.29,
        1.29, 1.30, 1.30, 1.31, 1.31, 1.32, 1.32, 1.33, 1.34, 1.34, 1.35, 1.35, 1.36, 1.36, 1.37, 1.37, 1.38, 1.38, 1.39, 1.39, 1.40, 1.40,
        1.41, 1.42, 1.42, 1.43, 1.43, 1.44, 1.44, 1.45, 1.45, 1.46, 1.47, 1.47, 1.48, 1.48, 1.49, 1.50, 1.50, 1.51, 1.51, 1.52, 1.52, 1.53,
        1.54, 1.54, 1.55, 1.55, 1.56, 1.57, 1.57, 1.58, 1.59, 1.59, 1.60, 1.60, 1.61, 1.62, 1.62, 1.63, 1.64, 1.64, 1.65, 1.66, 1.66, 1.67,
        1.67, 1.68, 1.69, 1.69, 1.70, 1.71, 1.71, 1.72, 1.73, 1.73, 1.74, 1.75, 1.76, 1.76, 1.77, 1.78, 1.78, 1.79, 1.80, 1.80, 1.81, 1.82,
        1.82, 1.83, 

In [10]:
x_train.exp().sum()

tensor(47351144.)

In [11]:
# Log-softmax written out directly: exponentiate x, divide each row by the sum of its
# exponentials (taken over the last dimension), then take the natural log of the result.
# keepdim=True keeps the summed dimension with size 1, so the division broadcasts row by row.
(x_train.exp()/x_train.exp().sum(-1, keepdim=True)).log()

tensor([[-6.86, -6.86, -6.86,  ..., -6.86, -6.86, -6.86],
        [-6.88, -6.88, -6.88,  ..., -6.88, -6.88, -6.88],
        [-6.80, -6.80, -6.80,  ..., -6.80, -6.80, -6.80],
        ...,
        [-6.83, -6.83, -6.83,  ..., -6.83, -6.83, -6.83],
        [-6.83, -6.83, -6.83,  ..., -6.83, -6.83, -6.83],
        [-6.85, -6.85, -6.85,  ..., -6.85, -6.85, -6.85]])

In [12]:
def log_softmax(x):
    "Log of the softmax of `x` over its last dimension, computed directly from the definition."
    exps = x.exp()
    row_totals = exps.sum(-1, keepdim=True)
    return (exps / row_totals).log()

Note that the formula:

$$\log \left ( \frac{a}{b} \right ) = \log(a) - \log(b)$$

gives us a simplification when we compute the log softmax:

In [13]:
def log_softmax(x):
    """Log of the softmax of `x` over its last dimension.

    Uses log(a/b) = log(a) - log(b): log(exp(x) / sum(exp(x))) = x - log(sum(exp(x))).
    """
    # The log applies to the row sum of exponentials only; keepdim=True lets it broadcast against x.
    return x - x.exp().sum(-1, keepdim=True).log()

Then, there is a way to compute the log of the sum of exponentials in a more stable way, called the [LogSumExp trick](https://en.wikipedia.org/wiki/LogSumExp). The idea is to use the following formula:

$$\log \left ( \sum_{j=1}^{n} e^{x_{j}} \right ) = \log \left ( e^{a} \sum_{j=1}^{n} e^{x_{j}-a} \right ) = a + \log \left ( \sum_{j=1}^{n} e^{x_{j}-a} \right )$$

where $a$ is the maximum of the $x_{j}$.

In [14]:
def logsumexp(x):
    """Numerically stable log(sum(exp(x))) over the last dimension of `x`.

    Subtracts the per-row maximum before exponentiating so large activations do
    not overflow, then adds it back. Works for input of any rank >= 1; the last
    dimension is reduced away in the result.
    """
    # keepdim=True keeps the max broadcastable against x whatever the rank of x is.
    a = x.max(-1, keepdim=True)[0]
    return a.squeeze(-1) + (x - a).exp().sum(-1).log()

This way, we will avoid an overflow when taking the exponential of a big activation. In PyTorch, this is already implemented for us. 

In [15]:
def log_softmax(x):
    "Log of the softmax of `x` over its last dimension, using PyTorch's built-in `logsumexp`."
    log_normaliser = x.logsumexp(-1, keepdim=True)
    return x - log_normaliser

In [18]:
# The hand-written logsumexp should agree with the tensor's built-in logsumexp method.
test_close(logsumexp(pred), pred.logsumexp(-1))
sm_pred = log_softmax(pred)
sm_pred

tensor([[-2.37, -2.49, -2.36,  ..., -2.31, -2.28, -2.22],
        [-2.37, -2.44, -2.44,  ..., -2.27, -2.26, -2.16],
        [-2.48, -2.33, -2.28,  ..., -2.30, -2.30, -2.27],
        ...,
        [-2.33, -2.52, -2.34,  ..., -2.31, -2.21, -2.16],
        [-2.38, -2.38, -2.33,  ..., -2.29, -2.26, -2.17],
        [-2.33, -2.55, -2.36,  ..., -2.29, -2.27, -2.16]], grad_fn=<SubBackward0>)

In [19]:
y_train[:3]

tensor([5, 0, 4])

In [20]:
sm_pred[0,5]

tensor(-2.20, grad_fn=<SelectBackward0>)

In [22]:
def nll(input, target):
    "Negative mean of the entries of `input` selected by `target`, one per row."
    rows = range(target.shape[0])
    picked = input[rows, target]
    return -picked.mean()

In [24]:
loss = nll(sm_pred, y_train)
loss

tensor(2.30, grad_fn=<NegBackward0>)

In [25]:
test_close(F.nll_loss(F.log_softmax(pred, -1), y_train), loss, 1e-3)

In [26]:
test_close(F.cross_entropy(pred, y_train), loss, 1e-3)

## Basic training loop

Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [27]:
loss_func = F.cross_entropy

In [31]:
bs = 50

xb = x_train[0:bs]
preds = model(xb)
preds[0], preds.shape

(tensor([-0.09, -0.21, -0.08,  0.10, -0.04,  0.08, -0.04, -0.03,  0.01,  0.06], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [32]:
yb = y_train[0:bs]

In [33]:
loss_func(preds, yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [34]:
preds.argmax(dim=1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])

In [37]:
#|export
def accuracy(out, yb):
    "Fraction of rows of `out` whose argmax along dim 1 equals the matching entry of `yb`."
    predicted = out.argmax(dim=1)
    hits = predicted == yb
    return hits.float().mean()

In [38]:
accuracy(preds, yb)

tensor(0.08)

In [None]:
lr = 0.5
epochs = 3

In [39]:
#|export
def report(loss, preds, yb):
    "Print `loss` and the accuracy of `preds` against `yb`, each with two decimals."
    summary = f'{loss:.2f}, {accuracy(preds, yb):.2f}'
    print(summary)

In [40]:
xb, yb = x_train[0:bs], y_train[0:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

2.30, 0.08


In [None]:
# Plain mini-batch gradient descent: `epochs` passes over the training set, `bs` rows per step.
for epoch in range(epochs):
    for i in range(0, n, bs):
        # Rows [i, i+bs) of the training set; the end is clamped so the last batch stops at `n`.
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        # Forward pass, loss, then gradients of the loss w.r.t. every parameter.
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        # The parameter updates themselves must not be tracked by autograd.
        with torch.no_grad():
            for l in model.layers:
                # Only layers that carry parameters are updated (the ReLU has no `weight`).
                if hasattr(l, 'weight'):
                    l.weight -= l.weight.grad * lr
                    l.bias -= l.bias.grad * lr
                    # Gradients accumulate across backward() calls, so clear them for the next step.
                    l.weight.grad.zero_()
                    l.bias.grad.zero_()
    # Once per epoch, report loss and accuracy on the last batch processed.
    report(loss, preds, yb)
