📝 **Author:** Amirhossein Heydari - 📧 **Email:** amirhosseinheydari78@gmail.com - 📍 **Linktree:** [linktr.ee/mr_pylin](https://linktr.ee/mr_pylin)

---

# Dependencies

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, random_split
from torchinfo import summary
from torchmetrics import Accuracy, ConfusionMatrix
from torchvision.datasets import MNIST
from torchvision.transforms import v2

In [2]:
# reproducibility: pin the global torch RNG and ask cuDNN for deterministic behaviour
random_state = 42
torch.backends.cudnn.benchmark = False  # no per-run kernel auto-tuning
torch.manual_seed(seed=random_state)
torch.backends.cudnn.deterministic = True

In [3]:
# run on the GPU whenever torch can see one, otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
device

'cuda'

# Pre-Processing
   - transforms: [pytorch.org/vision/main/transforms.html](https://pytorch.org/vision/main/transforms.html)
   - available datasets: [pytorch.org/vision/main/datasets.html](https://pytorch.org/vision/main/datasets.html)
   - Datasets & DataLoader: [pytorch.org/tutorials/beginner/basics/data_tutorial.html](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

## Load Dataset
   - v2.ToImage:
      - Convert a tensor, ndarray, or PIL Image to [Image](https://pytorch.org/vision/main/generated/torchvision.tv_tensors.Image.html#torchvision.tv_tensors.Image)
      - [pytorch.org/vision/main/generated/torchvision.transforms.v2.ToImage.html](https://pytorch.org/vision/main/generated/torchvision.transforms.v2.ToImage.html)
   - v2.ToDtype:
      - Converts the input to a specific dtype, optionally scaling the values for images or videos
      - [pytorch.org/vision/main/generated/torchvision.transforms.v2.ToDtype.html](https://pytorch.org/vision/main/generated/torchvision.transforms.v2.ToDtype.html)

In [4]:
# base pipeline: wrap each sample as an Image, then convert it to float32 with value scaling
transforms = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])

# load both MNIST splits with the same root folder and the same (shared) pipeline
mnist_options = dict(root='./dataset', download=True, transform=transforms)
trainset = MNIST(train=True, **mnist_options)
testset = MNIST(train=False, **mnist_options)

# log
# NOTE: `.data` has no explicit channel dimension for MNIST, i.e. [N, 28, 28] rather than [N, 1, 28, 28]
for split_name, split in (('trainset', trainset), ('testset', testset)):
    print(f"{split_name}:")
    described = (
        (f"{split_name}.data.shape", split.data.shape),
        (f"{split_name}.data.dtype", split.data.dtype),
        (f"type({split_name}.data)", type(split.data)),
        (f"type({split_name}.targets)", type(split.targets)),
    )
    for label, value in described:
        # labels are left-aligned in a 23-character column so the colons line up
        print(f"    -> {label:<23}: {value}")
    print('-' * 50)
print(f"classes: {trainset.classes}")
print(f"trainset distribution: {np.unique(trainset.targets, return_counts=True)[1]}")
print(f"testset  distribution: {np.unique(testset.targets, return_counts=True)[1]}")

trainset:
    -> trainset.data.shape    : torch.Size([60000, 28, 28])
    -> trainset.data.dtype    : torch.uint8
    -> type(trainset.data)    : <class 'torch.Tensor'>
    -> type(trainset.targets) : <class 'torch.Tensor'>
--------------------------------------------------
testset:
    -> testset.data.shape     : torch.Size([10000, 28, 28])
    -> testset.data.dtype     : torch.uint8
    -> type(testset.data)     : <class 'torch.Tensor'>
    -> type(testset.targets)  : <class 'torch.Tensor'>
--------------------------------------------------
classes: ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
trainset distribution: [5923 6742 5958 6131 5842 5421 5918 6265 5851 5949]
testset  distribution: [ 980 1135 1032 1010  982  892  958 1028  974 1009]


In [None]:
# plot the first 32 raw training digits in a 4x8 grid, titled with their class names
fig, axs = plt.subplots(nrows=4, ncols=8, figsize=(12, 6), layout='compressed')
for idx, ax in enumerate(axs.flat):  # row-major order, so idx == row * 8 + col
    ax.imshow(trainset.data[idx], cmap='gray')
    ax.set_title(trainset.classes[trainset.targets[idx]])
    ax.axis('off')
plt.show()

## Split trainset into [trainset, validationset]
   - [pytorch.org/docs/stable/data.html](https://pytorch.org/docs/stable/data.html)

In [6]:
# hold out 10% of the training data for validation (random_split returns List[Subset])
trainset, validationset = random_split(trainset, lengths=[.9, .1])

# log
all_splits = (('trainset', trainset), ('validationset', validationset), ('testset', testset))
for position, (split_name, split) in enumerate(all_splits):
    if position:
        print()  # blank line between two sections
    first_x, first_y = split[0]
    len_label = f"len({split_name})"
    x_label = f"{split_name}[0][0]"
    y_label = f"{split_name}[0][1]"
    type_label = f"type({split_name})"
    print(f"{split_name}:")
    print(f"    -> {len_label:<20}: {len(split)}")
    print(f"    -> {x_label:<20}: {first_x.shape}")
    print(f"    -> {y_label:<20}: {first_y}")
    print(f"    -> {type_label:<20}: {type(split)}")

trainset:
    -> len(trainset)       : 54000
    -> trainset[0][0]      : torch.Size([1, 28, 28])
    -> trainset[0][1]      : 6
    -> type(trainset)      : <class 'torch.utils.data.dataset.Subset'>

validationset:
    -> len(validationset)  : 6000
    -> validationset[0][0] : torch.Size([1, 28, 28])
    -> validationset[0][1] : 1
    -> type(validationset) : <class 'torch.utils.data.dataset.Subset'>

testset:
    -> len(testset)        : 10000
    -> testset[0][0]       : torch.Size([1, 28, 28])
    -> testset[0][1]       : 7
    -> type(testset)       : <class 'torchvision.datasets.mnist.MNIST'>


## Normalization
   1. Min-Max Normalization
      - 0-1 Normalization
         - Scales the pixel values to [0, 1] range
      - ...
   1. Mean-STD Normalization
      - Standardization (Z-score normalization)
         - Transforms the data to have a mean of 0 and a standard deviation of 1
      - Mean Normalization
         - It centers the data around zero
      - Scale and Center Images
         - Rescale the pixel values to have a mean of 0.5 and a standard deviation of 0.5
      - ...
   1. ...


In [7]:
# materialise the whole training subset as one batch (batch_size == number of samples)
whole_train_loader = DataLoader(trainset, batch_size=len(trainset))
all_train_images = next(iter(whole_train_loader))[0]

# statistics over every pixel of every training image
train_mean = all_train_images.mean().item()  # 0.1307
train_std = all_train_images.std().item()  # 0.3081

# drop the temporary full-size copy
del whole_train_loader, all_train_images

# log
print(f"train mean per channel: {train_mean}")
print(f"train std  per channel: {train_std}")

train mean per channel: 0.13067437708377838
train std  per channel: 0.30812761187553406


## Transform
   - on-the-fly data augmentation
   - Disadvantage:
      - the same transforms are re-applied to every sample in each epoch (repeated computation)
   - Advantage:
      - Reduced Memory Usage, Regularization & Data Diversity [random transforms e.g. RandomCrop]

In [8]:
# echo the current pipeline (the cell displays the Compose repr)
transforms

Compose(
      ToImage()
      ToDtype(scale=True)
)

In [9]:
# `transforms` is the very Compose object that was passed to both MNIST datasets,
# so extending its list in place changes the pipeline of trainset, validationset and testset at once.
# The guard keeps the cell idempotent: re-running it must not stack a second Normalize.
if not any(isinstance(step, v2.Normalize) for step in transforms.transforms):
    transforms.transforms.append(v2.Normalize(mean=(train_mean,), std=(train_std,)))

# log
print(f"trainset.dataset.transforms:\n{trainset.dataset.transforms}\n")
print(f"validationset.dataset.transforms:\n{validationset.dataset.transforms}\n")
print(f"testset.transforms:\n{testset.transforms}")

trainset.dataset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )
validationset.dataset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )
testset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )


In [10]:
# log: compare one stored sample with the same sample fetched through the dataset pipeline
raw_image = testset.data[0]
transformed_image = testset[0][0]

print("before applying transform:")
print(f"    -> type(testset.data[0]) : {type(raw_image)}")
print(f"    -> testset.data[0].dtype : {raw_image.dtype}")
print(f"    -> testset.data[0].shape : {raw_image.shape}")
print('-' * 50)
print("after applying transform:")
print(f"    -> type(testset[0][0])   : {type(transformed_image)}")
print(f"    -> testset[0][0].dtype   : {transformed_image.dtype}")
print(f"    -> testset[0][0].shape   : {transformed_image.shape}")

before applying transform:
    -> type(testset.data[0]) : <class 'torch.Tensor'>
    -> testset.data[0].dtype : torch.uint8
    -> testset.data[0].shape : torch.Size([28, 28])
--------------------------------------------------
after applying transform:
    -> type(testset[0][0])   : <class 'torchvision.tv_tensors._image.Image'>
    -> testset[0][0].dtype   : torch.float32
    -> testset[0][0].shape   : torch.Size([1, 28, 28])


## DataLoader
   - [pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)

In [11]:
batch_size = 64

# settings shared by all three loaders
loader_options = dict(batch_size=batch_size, num_workers=2)

# only the training data is shuffled; validation and test keep a fixed order
trainloader = DataLoader(dataset=trainset, shuffle=True, **loader_options)
validationloader = DataLoader(dataset=validationset, shuffle=False, **loader_options)
testloader = DataLoader(dataset=testset, shuffle=False, **loader_options)

In [12]:
# log: fetch one batch from every loader and report its shapes and dtypes
first_train_batch      = next(iter(trainloader))
first_validation_batch = next(iter(validationloader))
first_test_batch       = next(iter(testloader))

print(f"trainloader      first batch     -> x.shape: {first_train_batch[0].shape} - y.shape: {first_train_batch[1].shape} - x.dtype: {first_train_batch[0].dtype} - y.dtype: {first_train_batch[1].dtype}")
print(f"validationloader first batch     -> x.shape: {first_validation_batch[0].shape} - y.shape: {first_validation_batch[1].shape} - x.dtype: {first_validation_batch[0].dtype} - y.dtype: {first_validation_batch[1].dtype}")
print(f"testloader       first batch     -> x.shape: {first_test_batch[0].shape} - y.shape: {first_test_batch[1].shape} - x.dtype: {first_test_batch[0].dtype} - y.dtype: {first_test_batch[1].dtype}")

# the last batch holds the remainder; when the dataset size is an exact multiple of
# batch_size the remainder is 0 and the last batch is simply a full one
print(f"trainloader      last batch-size -> {len(trainset) % batch_size or batch_size}")
print(f"validationloader last batch-size -> {len(validationset) % batch_size or batch_size}")
print(f"testloader       last batch-size -> {len(testset) % batch_size or batch_size}")

trainloader      first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
validationloader first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
testloader       first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
trainloader      last batch-size -> 48
validationloader last batch-size -> 48
testloader       last batch-size -> 16


# Network Structure: Fully-Connected Networks
   - Sequential Model
      - Use torch.nn.Sequential to create a sequence of layers or modules
      - [pytorch.org/docs/stable/generated/torch.nn.Sequential.html](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)
   - Functional Model
      - for stateless operations like activation functions, loss functions, and other operations within the forward method of custom modules or in custom functions
      - [pytorch.org/docs/stable/nn.functional.html](https://pytorch.org/docs/stable/nn.functional.html)
   - Mixed Model

**Notes**:
   - loss function : 
      - multi-class classification : `torch.nn.CrossEntropyLoss` = `torch.nn.LogSoftmax` + `torch.nn.NLLLoss`
      - [pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)
      - [pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html)
   - activation function for the last layer:
      - when using `torch.nn.CrossEntropyLoss` as a loss function, the output layer doesn't need an activation function
      - `torch.nn.CrossEntropyLoss` calculates `torch.nn.LogSoftmax` and `torch.nn.NLLLoss` internally.
      - [pytorch.org/docs/stable/generated/torch.nn.Softmax.html](https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html)
      - [pytorch.org/docs/stable/generated/torch.nn.LogSoftmax.html](https://pytorch.org/docs/stable/generated/torch.nn.LogSoftmax.html)
   - `torch.nn.Linear`
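      - Weights
         - PyTorch default: `kaiming_uniform_` with `a=sqrt(5)`, i.e. $W \sim \mathcal{U}\left(-\frac{1}{\sqrt{n_{\text{in}}}}, \frac{1}{\sqrt{n_{\text{in}}}}\right)$ (similar in spirit to, but not the same as, Xavier/Glorot initialization)
         - Xavier/Glorot Uniform Distribution (`torch.nn.init.xavier_uniform_`): $W \sim \mathcal{U}\left(-{gain}\times\sqrt{\frac{6}{n_{\text{in}} + n_{\text{out}}}}, {gain}\times\sqrt{\frac{6}{n_{\text{in}} + n_{\text{out}}}}\right)$
         - Xavier/Glorot Normal Distribution (`torch.nn.init.xavier_normal_`, gain = 1): $W \sim \mathcal{N}\left(0, \frac{2}{n_{\text{in}} + n_{\text{out}}}\right)$
      - Biases:
         - PyTorch default: $b \sim \mathcal{U}\left(-\frac{1}{\sqrt{n_{\text{in}}}}, \frac{1}{\sqrt{n_{\text{in}}}}\right)$ (not zero)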
      - [pytorch.org/docs/stable/nn.init.html](https://pytorch.org/docs/stable/nn.init.html)
      - Paper: [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010).](https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)

**Playground**:
   - [deeperplayground.org](https://deeperplayground.org/)
   - [alexlenail.me/NN-SVG](https://alexlenail.me/NN-SVG/)

<figure style="text-align: center;">
    <img src="../../assets/images/original/mlp/multi-layer-perceptrons.svg" alt="multi-layer-perceptrons.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Multi-Layer-Perceptron (aka fully connected layers)</figcaption>
</figure>

<table style="margin: 0 auto; text-align:center;">
   <thead>
      <tr>
         <th colspan="2">hidden<sub>1</sub> parameters</th>
         <th colspan="2">hidden<sub>2</sub> parameters</th>
         <th colspan="2">logits parameters</th>
      </tr>
   </thead>
   <tbody>
      <tr>
         <td>Weights</td>
         <td>Biases</td>
         <td>Weights</td>
         <td>Biases</td>
         <td>Weights</td>
         <td>Biases</td>
      </tr>
      <tr>
         <td>A × B</td>
         <td>B</td>
         <td>B × C</td>
         <td>C</td>
         <td>C × D</td>
         <td>D</td>
      </tr>
   </tbody>
   <tfoot>
      <tr>
         <td colspan="2">(A + 1) × B</td>
         <td colspan="2">(B + 1) × C</td>
         <td colspan="2">(C + 1) × D</td>
      </tr>
   </tfoot>
</table>

In [13]:
# layer sizes: the input size is the number of values in one (depth, height, width) sample
depth, height, width = trainset[0][0].shape

hidden_dim = [64, 32]
output_dim = len(testset.classes)  # one logit per class
input_dim = depth * height * width

# log
print(f"input_dim  : {input_dim}")
print(f"hidden_dim : {hidden_dim}")
print(f"output_dim : {output_dim}")

input_dim  : 784
hidden_dim : [64, 32]
output_dim : 10


## Sequential Model
   - The last layer has no activation function because `torch.nn.CrossEntropyLoss` applies `LogSoftmax` to the logits internally

In [14]:
# flatten -> two hidden Linear+ReLU stages -> logits (no activation on the output layer)
mlp_layers = [
    nn.Flatten(start_dim=1),
    nn.Linear(in_features=input_dim, out_features=hidden_dim[0]),
    nn.ReLU(),
    nn.Linear(in_features=hidden_dim[0], out_features=hidden_dim[1]),
    nn.ReLU(),
    nn.Linear(in_features=hidden_dim[1], out_features=output_dim),
]
sequential_model = nn.Sequential(*mlp_layers)

In [15]:
# move the model to the selected device; the cell displays the module structure
sequential_model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=64, bias=True)
  (2): ReLU()
  (3): Linear(in_features=64, out_features=32, bias=True)
  (4): ReLU()
  (5): Linear(in_features=32, out_features=10, bias=True)
)

In [16]:
# per-layer output shapes and parameter counts for an input of shape (batch_size, *sample_shape)
summary(sequential_model, input_size=(batch_size, *trainset[0][0].shape))

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [64, 10]                  --
├─Flatten: 1-1                           [64, 784]                 --
├─Linear: 1-2                            [64, 64]                  50,240
├─ReLU: 1-3                              [64, 64]                  --
├─Linear: 1-4                            [64, 32]                  2,080
├─ReLU: 1-5                              [64, 32]                  --
├─Linear: 1-6                            [64, 10]                  330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 3.37
Input size (MB): 0.20
Forward/backward pass size (MB): 0.05
Params size (MB): 0.21
Estimated Total Size (MB): 0.47

## Functional Model

In [17]:
class FunctionalModel(nn.Module):
    """Two-hidden-layer MLP classifier written as an explicit `nn.Module` subclass.

    Args:
        input_dim: number of features per sample after flattening.
        output_dim: number of classes (size of the logits vector).
        hidden_dims: sizes of the two hidden layers; only the first two entries are used.
            When omitted, the notebook-level `hidden_dim` list is used, which is the
            behaviour this class had before the parameter existed.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=None):
        super().__init__()

        # fall back to the module-level setting only when the caller gives no sizes
        if hidden_dims is None:
            hidden_dims = hidden_dim
        first_hidden, second_hidden = hidden_dims[0], hidden_dims[1]

        self.flatten = nn.Flatten(start_dim=1)
        self.linear1 = nn.Linear(input_dim, first_hidden)
        self.relu1   = nn.ReLU()
        self.linear2 = nn.Linear(first_hidden, second_hidden)
        self.relu2   = nn.ReLU()
        self.linear3 = nn.Linear(second_hidden, output_dim)

    def forward(self, x):
        """Return the raw logits for a batch `x` (no activation after the last layer)."""
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        return x


# instantiate the class-based MLP with the input/output sizes computed earlier
functional_model = FunctionalModel(input_dim, output_dim)

In [18]:
# move the model to the selected device; the cell displays the module structure
functional_model.to(device)

FunctionalModel(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=784, out_features=64, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=64, out_features=32, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=32, out_features=10, bias=True)
)

In [19]:
# per-layer output shapes and parameter counts for an input of shape (batch_size, *sample_shape)
summary(functional_model, input_size=(batch_size, *trainset[0][0].shape))

Layer (type:depth-idx)                   Output Shape              Param #
FunctionalModel                          [64, 10]                  --
├─Flatten: 1-1                           [64, 784]                 --
├─Linear: 1-2                            [64, 64]                  50,240
├─ReLU: 1-3                              [64, 64]                  --
├─Linear: 1-4                            [64, 32]                  2,080
├─ReLU: 1-5                              [64, 32]                  --
├─Linear: 1-6                            [64, 10]                  330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 3.37
Input size (MB): 0.20
Forward/backward pass size (MB): 0.05
Params size (MB): 0.21
Estimated Total Size (MB): 0.47

## Mixed Model

In [20]:
class MixModel(nn.Module):
    """Two-hidden-layer MLP classifier: an `nn.Module` subclass wrapping an `nn.Sequential`.

    Args:
        input_dim: number of features per sample after flattening.
        output_dim: number of classes (size of the logits vector).
        hidden_dims: sizes of the two hidden layers; only the first two entries are used.
            When omitted, the notebook-level `hidden_dim` list is used, which is the
            behaviour this class had before the parameter existed.
    """

    def __init__(self, input_dim, output_dim, hidden_dims=None):
        super().__init__()

        # fall back to the module-level setting only when the caller gives no sizes
        if hidden_dims is None:
            hidden_dims = hidden_dim
        first_hidden, second_hidden = hidden_dims[0], hidden_dims[1]

        self.classifier = nn.Sequential(
            nn.Flatten(start_dim=1),
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, output_dim),
        )

    def forward(self, x):
        """Return the raw logits for a batch `x` (no activation after the last layer)."""
        x = self.classifier(x)
        return x


# instantiate the mixed-style MLP with the input/output sizes computed earlier
mix_model = MixModel(input_dim, output_dim)

In [21]:
# move the model to the selected device; the cell displays the module structure
mix_model.to(device)

MixModel(
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [22]:
# per-layer output shapes and parameter counts for an input of shape (batch_size, *sample_shape)
summary(mix_model, input_size=(batch_size, *trainset[0][0].shape))

Layer (type:depth-idx)                   Output Shape              Param #
MixModel                                 [64, 10]                  --
├─Sequential: 1-1                        [64, 10]                  --
│    └─Flatten: 2-1                      [64, 784]                 --
│    └─Linear: 2-2                       [64, 64]                  50,240
│    └─ReLU: 2-3                         [64, 64]                  --
│    └─Linear: 2-4                       [64, 32]                  2,080
│    └─ReLU: 2-5                         [64, 32]                  --
│    └─Linear: 2-6                       [64, 10]                  330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 3.37
Input size (MB): 0.20
Forward/backward pass size (MB): 0.05
Params size (MB): 0.21
Estimated Total Size (MB): 0.47

# Set up remaining Hyper-Parameters
   - Model `Parameters` are learned during training
   - `Hyperparameters` are critical in determining the performance and efficiency of your model

**Common Hyperparameters**:
<table style="margin: 0 auto;">
   <thead>
      <tr>
         <th style="text-align:center;">Hyperparameter</th>
         <th style="text-align:center;">Description</th>
         <th style="text-align:center;">Effect</th>
      </tr>
   </thead>
   <tbody>
      <tr>
         <td>Batch Size</td>
         <td>Number of samples per gradient update</td>
         <td>Larger batch sizes can stabilize training but require more memory</td>
      </tr>
      <tr>
         <td>Number of Epochs</td>
         <td>Full passes through the training dataset</td>
         <td>More epochs can improve learning but may lead to overfitting</td>
      </tr>
      <tr>
         <td>Dropout Rate</td>
         <td>Fraction of units to drop during training</td>
         <td>Helps prevent overfitting but too high can lead to underfitting</td>
      </tr>
      <tr>
         <td>Number of Layers</td>
         <td>Layers in a neural network</td>
         <td>More layers can capture more complexity but can be harder to train</td>
      </tr>
      <tr>
         <td>Number of Neurons</td>
         <td>Neurons in each layer</td>
         <td>More neurons can capture more features but increase computational cost</td>
      </tr>
      <tr>
         <td>Weight Initialization</td>
         <td>Method to initialize weights (e.g., Xavier, He)</td>
         <td>Affects the convergence speed and stability of the training process</td>
      </tr>
      <tr>
         <td>Loss Function</td>
         <td>Function to evaluate model performance (e.g., MSE, Cross-Entropy)</td>
         <td>Determines how the model's performance is measured and impacts training behavior</td>
      </tr>
      <tr>
         <td>Optimizer</td>
         <td>Algorithm to update model parameters (e.g., SGD, Adam)</td>
         <td>Different optimizers can affect the training dynamics and performance</td>
      </tr>
      <tr>
         <td>Learning Rate</td>
         <td>Step size for parameter updates</td>
         <td>Too high can cause divergence, too low can result in slow convergence</td>
      </tr>
      <tr>
         <td>Regularization (weight decay)</td>
         <td>Strength of regularization (e.g., lambda for L2)</td>
         <td>Helps to prevent overfitting by adding a penalty for large weights</td>
      </tr>
      <tr>
         <td>Momentum</td>
         <td>Accelerates gradient vectors in the right directions</td>
         <td>Can help to converge faster by smoothing oscillations</td>
      </tr>
      <tr>
         <td>Learning Rate Decay</td>
         <td>Reduces learning rate over time</td>
         <td>Helps to fine-tune the model as it gets closer to convergence</td>
      </tr>
      <tr>
         <td>Activation Function</td>
         <td>Function applied to neurons (e.g., ReLU, sigmoid)</td>
         <td>Affects the model's ability to capture non-linearities</td>
      </tr>
      <tr>
         <td>Early Stopping</td>
         <td>Stops training when validation performance degrades</td>
         <td>Prevents overfitting by halting training at the optimal point</td>
      </tr>
      <tr>
         <td>Gradient Clipping</td>
         <td>Clamps gradients during backpropagation</td>
         <td>Prevents exploding gradients and helps in stabilizing training</td>
      </tr>
      <tr>
         <td>Batch Normalization</td>
         <td>Normalizes the output of a previous activation layer</td>
         <td>Helps to stabilize and accelerate training</td>
      </tr>
      <tr>
         <td>Data Augmentation</td>
         <td>Techniques to artificially increase training data (e.g., rotation, flipping)</td>
         <td>Improves generalization by making the model invariant to transformations</td>
      </tr>
   </tbody>
</table>

In [23]:
# training setup: only the parameters of `sequential_model` are optimised
num_epochs = 15
lr = 0.001
optimizer = Adam(params=sequential_model.parameters(), lr=lr)
criterion = CrossEntropyLoss()  # expects raw logits and integer class targets

# Train & Validation Loop

### model.train & model.eval
   - Some layers (e.g. BatchNorm, Dropout) behave differently during training than during evaluation and prediction
   - `model.eval()` [`model.train(False)`] switches these layers to their evaluation behavior
   - [pytorch.org/docs/stable/generated/torch.nn.Module.html](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)
   - [pytorch.org/docs/stable/notes/autograd.html#locally-disable-grad-doc](https://pytorch.org/docs/stable/notes/autograd.html#locally-disable-grad-doc)

In [24]:
# per-epoch history, one independent list per curve
train_acc_per_epoch, train_loss_per_epoch = [], []
val_acc_per_epoch, val_loss_per_epoch = [], []

In [25]:
# separate metric objects so training and validation statistics never mix
accuracy_options = dict(task='multiclass', num_classes=len(testset.classes), top_k=1)
train_acc = Accuracy(**accuracy_options).to(device)
val_acc = Accuracy(**accuracy_options).to(device)

In [26]:
for epoch in range(num_epochs):

    # ---------- train loop ----------
    sequential_model.train()
    train_loss = 0

    for x, y in trainloader:

        # send data to the selected device
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = sequential_model(x)
        loss = criterion(y_pred, y_true)

        # backward
        loss.backward()

        # update parameters, then clear the gradients so they do not accumulate into the next step
        optimizer.step()
        optimizer.zero_grad()

        # log loss & accuracy (the criterion averages over the batch, so weight it by the batch size)
        train_loss += loss.item() * len(x)
        train_acc.update(y_pred, y_true)

    # store intermediate loss & accuracy
    train_loss_per_epoch.append(train_loss / len(trainset))
    train_acc_per_epoch.append(train_acc.compute().item())
    train_acc.reset()

    # ---------- validation loop ----------
    sequential_model.eval()
    val_loss = 0

    # During the forward pass, PyTorch saves intermediate results
    # (from each operation that involves tensors with requires_grad=True)
    # in order to compute gradients during the backward pass.
    # torch.no_grad() stops PyTorch from saving these intermediate results
    with torch.no_grad():
        for x, y in validationloader:

            # send data to the selected device
            x, y_true = x.to(device), y.to(device)

            # forward
            y_pred = sequential_model(x)
            loss = criterion(y_pred, y_true)

            # log loss & accuracy
            val_loss += loss.item() * len(x)
            val_acc.update(y_pred, y_true)

    # store intermediate loss & accuracy
    val_loss_per_epoch.append(val_loss / len(validationset))
    val_acc_per_epoch.append(val_acc.compute().item())
    val_acc.reset()

    # log the values just appended; indexing with [-1] instead of [epoch] stays correct
    # when this cell is re-run without clearing the history lists
    print(f"epoch {epoch:>2}  ->  train[loss: {train_loss_per_epoch[-1]:.5f} - acc: {train_acc_per_epoch[-1]:.2f}] | validation[loss: {val_loss_per_epoch[-1]:.5f} - acc: {val_acc_per_epoch[-1]:.2f}]")

epoch  0  ->  train[loss: 0.33980 - acc: 0.90] | validation[loss: 0.19319 - acc: 0.94]
epoch  1  ->  train[loss: 0.15023 - acc: 0.96] | validation[loss: 0.14697 - acc: 0.96]
epoch  2  ->  train[loss: 0.10996 - acc: 0.97] | validation[loss: 0.13199 - acc: 0.96]
epoch  3  ->  train[loss: 0.08858 - acc: 0.97] | validation[loss: 0.13687 - acc: 0.96]
epoch  4  ->  train[loss: 0.07578 - acc: 0.98] | validation[loss: 0.11709 - acc: 0.97]
epoch  5  ->  train[loss: 0.06542 - acc: 0.98] | validation[loss: 0.11954 - acc: 0.97]
epoch  6  ->  train[loss: 0.05596 - acc: 0.98] | validation[loss: 0.11377 - acc: 0.97]
epoch  7  ->  train[loss: 0.04777 - acc: 0.98] | validation[loss: 0.13935 - acc: 0.96]
epoch  8  ->  train[loss: 0.04318 - acc: 0.99] | validation[loss: 0.10793 - acc: 0.97]
epoch  9  ->  train[loss: 0.03676 - acc: 0.99] | validation[loss: 0.11059 - acc: 0.97]
epoch 10  ->  train[loss: 0.03423 - acc: 0.99] | validation[loss: 0.13908 - acc: 0.97]
epoch 11  ->  train[loss: 0.03074 - acc: 0.

## Model Analysis
   - Comparing the training and validation curves is a useful way to detect over-fitting

In [None]:
# plot: one panel per quantity, training and validation curves drawn together
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), layout='compressed')
curves = (
    ('loss', train_loss_per_epoch, val_loss_per_epoch),
    ('accuracy', train_acc_per_epoch, val_acc_per_epoch),
)
for ax, (quantity, train_curve, val_curve) in zip(axs, curves):
    ax.plot(train_curve, label=f"Train {quantity}")
    ax.plot(val_curve, label=f"Validation {quantity}")
    ax.set(title=f"{quantity.capitalize()} over time", xlabel='Epoch', ylabel=quantity.capitalize())
    ax.legend(loc='best', fancybox=True, shadow=True)
plt.show()

# Test Loop

In [28]:
# dedicated accuracy metric for the test set, kept on the same device as the model outputs
test_acc = Accuracy(task='multiclass', num_classes=len(testset.classes), top_k=1).to(device)

In [29]:
sequential_model.eval()
test_loss = 0
predictions = []
targets = []

# start from a clean metric so re-running this cell does not accumulate earlier batches
test_acc.reset()

with torch.no_grad():
    for x, y in testloader:

        # send data to the selected device
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = sequential_model(x)
        loss = criterion(y_pred, y_true)

        # log loss & accuracy (the criterion averages over the batch, so weight it by the batch size)
        test_loss += loss.item() * len(x)
        test_acc.update(y_pred, y_true)

        # keep plain Python ints rather than thousands of 0-d tensors
        predictions.extend(y_pred.argmax(dim=1).tolist())
        targets.extend(y_true.tolist())

# log
print(f"test[loss: {test_loss / len(testset):.5f} - acc: {test_acc.compute().item():.2f}]")

test[loss: 0.11213 - acc: 0.98]


## Metrics
   - Loss
   - Accuracy
   - Recall
   - Precision
   - F1-Score
   - Confusion Matrix
   - Area Under the ROC Curve (AUC-ROC)
   - Area Under the Precision-Recall Curve (AUC-PR)
   - ...

**Docs**:
   - [lightning.ai/docs/torchmetrics/stable/all-metrics.html](https://lightning.ai/docs/torchmetrics/stable/all-metrics.html)
   - [scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

In [30]:
# classification report: per-class precision, recall, f1-score and support on the test set
print(classification_report(targets, predictions))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       980
           1       0.99      0.99      0.99      1135
           2       0.97      0.98      0.98      1032
           3       0.96      0.98      0.97      1010
           4       0.97      0.98      0.98       982
           5       0.98      0.96      0.97       892
           6       0.98      0.98      0.98       958
           7       0.97      0.98      0.97      1028
           8       0.98      0.94      0.96       974
           9       0.97      0.96      0.97      1009

    accuracy                           0.98     10000
   macro avg       0.98      0.97      0.97     10000
weighted avg       0.98      0.98      0.98     10000



In [36]:
# confusion matrix (rows: true class, columns: predicted class)
# the class count comes from the dataset, matching the Accuracy metrics, instead of a hard-coded 10
metric = ConfusionMatrix(task='multiclass', num_classes=len(testset.classes))
confusion_matrix = metric(torch.tensor(predictions), torch.tensor(targets))

# log
print(confusion_matrix)

# plot
fig, ax = plt.subplots(figsize=(8, 8))
metric.plot(ax=ax)
plt.show()

tensor([[ 971,    0,    2,    0,    1,    0,    3,    1,    1,    1],
        [   1, 1123,    4,    1,    0,    0,    2,    2,    2,    0],
        [   2,    0, 1014,    5,    1,    0,    1,    6,    3,    0],
        [   0,    1,    4,  990,    2,    2,    0,    9,    1,    1],
        [   2,    1,    3,    0,  962,    0,    3,    3,    0,    8],
        [   4,    1,    1,   12,    1,  860,    6,    2,    2,    3],
        [   4,    2,    1,    1,    2,    3,  941,    1,    3,    0],
        [   3,    2,    9,    1,    1,    0,    0, 1004,    0,    8],
        [   6,    0,    7,   13,    6,    8,    5,    4,  916,    9],
        [   3,    2,    0,    9,   15,    1,    1,    5,    2,  971]])


# Prediction

In [32]:
def predict(
    model: nn.Module,
    data: "np.ndarray | torch.Tensor",
    classes: list,
    transform: "v2.Compose | None" = None,
) -> np.ndarray:
    """Return the predicted class label for every sample in `data`.

    Args:
        model: classifier that maps a batch to per-class scores of shape (N, num_classes).
        data: a batch of samples, or one 2-D (H, W) image. A 2-D input is expanded to
            (1, H, W, 1) before the transform is applied.
        classes: class labels ordered by class index.
        transform: optional callable applied to each sample; the results are stacked
            into one batch. Without it, `data` is converted to a tensor as-is and must
            already have the dtype and layout the model expects.

    Returns:
        1-D numpy array holding one entry of `classes` per sample.
    """
    # add batch & channel dimension to a single image
    if data.ndim == 2:
        data = np.expand_dims(data, axis=(0, 3))

    # build the input batch
    if transform is not None:
        data = torch.stack([transform(sample) for sample in data])
    else:
        # ndarray has no `.to`, so make sure the model always receives a tensor
        data = torch.as_tensor(data)

    # run the forward pass wherever the model's weights live (CPU for a parameter-free model)
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    # predict
    model.eval()
    with torch.no_grad():
        class_idx = model(data.to(model_device)).argmax(dim=1).cpu().numpy()

    # idx to labels
    return np.array(classes)[class_idx]

In [33]:
# some raw data: the first 32 untransformed test images
raw_testset = MNIST(root='./dataset', train=False, download=True, transform=None)
raw_data = raw_testset.data[:32]

# predict (the normalising pipeline is applied inside `predict`)
y_pred = predict(model=sequential_model, data=raw_data, classes=testset.classes, transform=transforms)

# log
print(f"predictions:\n{y_pred}")

predictions:
['7 - seven' '2 - two' '1 - one' '0 - zero' '4 - four' '1 - one'
 '4 - four' '9 - nine' '5 - five' '9 - nine' '0 - zero' '6 - six'
 '9 - nine' '0 - zero' '1 - one' '5 - five' '9 - nine' '7 - seven'
 '3 - three' '4 - four' '9 - nine' '6 - six' '6 - six' '5 - five'
 '4 - four' '0 - zero' '7 - seven' '4 - four' '0 - zero' '1 - one'
 '3 - three' '1 - one']


In [None]:
# plot
fig, axs = plt.subplots(nrows=4, ncols=8, figsize=(12, 6), layout='compressed')

# classify all displayed images in one forward pass instead of one call per subplot
shown_images = raw_data[:axs.size]
shown_labels = predict(sequential_model, shown_images, testset.classes, transform=transforms)

for ax, image, label in zip(axs.flat, shown_images, shown_labels):
    ax.imshow(image, cmap='gray')
    # `label` is a single string, so the title reads "7 - seven" rather than "['7 - seven']"
    ax.set_title(label)
    ax.axis('off')
plt.show()