# Dependencies

In [1]:
import numpy as np

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, random_split

from torchvision.transforms import v2
from torchvision.datasets import MNIST

from torchsummary import summary

from torchmetrics import Accuracy, ConfusionMatrix

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

In [2]:
# pin every random number generator used in this notebook to one seed
random_state = 42
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
    seed_fn(random_state)

In [3]:
# run on the GPU when CUDA is usable, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

# Pre-Processing

## Load Dataset

In [4]:
# base preprocessing shared by both splits:
# convert each sample to an image tensor, then to float32 with value scaling enabled
transforms = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
])

# load (and download if missing) the MNIST train and test splits with the same settings
mnist_kwargs = dict(root='./dataset', download=True, transform=transforms)
trainset = MNIST(train=True, **mnist_kwargs)
testset = MNIST(train=False, **mnist_kwargs)

# class counts per split (second element returned by np.unique with return_counts)
train_counts = np.unique(trainset.targets, return_counts=True)[1]
test_counts = np.unique(testset.targets, return_counts=True)[1]
separator = '-' * 50

# log
print('trainset:')
# `.data` is the raw stored array and has no channel axis: [60000, 28, 28], not [60000, 1, 28, 28]
print(f"    -> trainset.data.shape    : {trainset.data.shape}")
print(f"    -> trainset.data.dtype    : {trainset.data.dtype}")
print(f"    -> type(trainset.data)    : {type(trainset.data)}")
print(f"    -> type(trainset.targets) : {type(trainset.targets)}")
print(separator)
print('testset:')
print(f"    -> testset.data.shape     : {testset.data.shape}")
print(f"    -> testset.data.dtype     : {testset.data.dtype}")
print(f"    -> type(testset.data)     : {type(testset.data)}")
print(f"    -> type(testset.targets)  : {type(testset.targets)}")
print(separator)
print(f"classes: {trainset.classes}")
print(f"trainset distribution: {train_counts}")
print(f"testset  distribution: {test_counts}")

trainset:
    -> trainset.data.shape    : torch.Size([60000, 28, 28])
    -> trainset.data.dtype    : torch.uint8
    -> type(trainset.data)    : <class 'torch.Tensor'>
    -> type(trainset.targets) : <class 'torch.Tensor'>
--------------------------------------------------
testset:
    -> testset.data.shape     : torch.Size([10000, 28, 28])
    -> testset.data.dtype     : torch.uint8
    -> type(testset.data)     : <class 'torch.Tensor'>
    -> type(testset.targets)  : <class 'torch.Tensor'>
--------------------------------------------------
classes: ['0 - zero', '1 - one', '2 - two', '3 - three', '4 - four', '5 - five', '6 - six', '7 - seven', '8 - eight', '9 - nine']
trainset distribution: [5923 6742 5958 6131 5842 5421 5918 6265 5851 5949]
testset  distribution: [ 980 1135 1032 1010  982  892  958 1028  974 1009]


In [None]:
# show the first 32 raw training images in a 4x8 grid, each titled with its class name
fig, axs = plt.subplots(nrows=4, ncols=8, figsize=(12, 6), layout='compressed')
for idx, ax in enumerate(axs.flat):  # .flat walks the grid row by row
    ax.imshow(trainset.data[idx], cmap='gray')
    ax.set_title(trainset.classes[trainset.targets[idx]])
    ax.axis('off')
plt.show()

## Split trainset into [trainset, validationset]

In [6]:
# `trainset` is rebound to a Subset below. If this cell is run again, go back to the
# wrapped full dataset first so the split is never applied to an already-split subset.
full_trainset = getattr(trainset, 'dataset', trainset)

# 90% trainset & 10% validationset
validation_size = int(0.1 * len(full_trainset))
train_size = len(full_trainset) - validation_size

# random split (no generator is passed, so it draws from torch's default RNG seeded above)
trainset, validationset = random_split(full_trainset, [train_size, validation_size])

# log
print('trainset:')
print(f"    -> len(trainset) : {len(trainset)}")
print(f"    -> trainset[0][0]: {trainset[0][0].shape}")
print(f"    -> trainset[0][1]: {trainset[0][1]}\n")
print('validationset:')
print(f"    -> len(validationset) : {len(validationset)}")
print(f"    -> validationset[0][0]: {validationset[0][0].shape}")
print(f"    -> validationset[0][1]: {validationset[0][1]}\n")
print('testset:')
print(f"    -> len(testset) : {len(testset)}")
print(f"    -> testset[0][0]: {testset[0][0].shape}")
print(f"    -> testset[0][1]: {testset[0][1]}")

trainset:
    -> len(trainset) : 54000
    -> trainset[0][0]: torch.Size([1, 28, 28])
    -> trainset[0][1]: 6

validationset:
    -> len(validationset) : 6000
    -> validationset[0][0]: torch.Size([1, 28, 28])
    -> validationset[0][1]: 1

testset:
    -> len(testset) : 10000
    -> testset[0][0]: torch.Size([1, 28, 28])
    -> testset[0][1]: 7


## Normalization
   1. Min-Max Normalization
      - 0-1 Normalization
         - Scales the pixel values to [0, 1] range
      - ...
   2. Mean-STD Normalization
      - Standardization (Z-score normalization)
         - Transforms the data to have a mean of 0 and a standard deviation of 1
      - Mean Normalization
         - It centers the data around zero
      - Scale and Center Images
         - Rescale the pixel values to have a mean of 0.5 and a standard deviation of 0.5
      - ...
   3. ...


In [7]:
# load the whole training split as one batch so the statistics come from a single tensor
full_batch_loader = DataLoader(trainset, batch_size=len(trainset))
train_images = next(iter(full_batch_loader))[0]

# scalar mean and standard deviation over every pixel of every training image
train_mean = train_images.mean().item() # 0.1307
train_std = train_images.std().item()   # 0.3081

# the temporary loader and the full-size tensor are not needed any more
del full_batch_loader, train_images

# log
print(f"train mean per channel: {train_mean}")
print(f"train std  per channel: {train_std}")

train mean per channel: 0.13067437708377838
train std  per channel: 0.30812761187553406


## Transform
   - on-the-fly data augmentation
   - Disadvantage:
      - same transform applies to the same data in each epoch
   - Advantage:
      - Reduced Memory Usage, Regularization & Data Diversity [random transforms e.g. RandomCrop]

In [None]:
# display the current preprocessing pipeline
transforms

Compose(
      ToImage()
      ToDtype(scale=True)
)

In [None]:
# Standardize with the mean / std measured on the training split.
# `transforms` is the Compose object that was handed to both MNIST datasets, so
# appending to it in place changes the pipeline of trainset, validationset and testset.
# The guard keeps the cell idempotent: re-running it must not stack a second Normalize.
if not any(isinstance(step, v2.Normalize) for step in transforms.transforms):
    transforms.transforms.append(v2.Normalize(mean= (train_mean,), std= (train_std,)))

# log
print(f"trainset.dataset.transforms: {trainset.dataset.transforms}")
print(f"validationset.dataset.transforms: {validationset.dataset.transforms}")
print(f"testset.transforms: {testset.transforms}")

trainset.dataset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )
validationset.dataset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )
testset.transforms: StandardTransform
Transform: Compose(
                 ToImage()
                 ToDtype(scale=True)
                 Normalize(mean=[0.13067437708377838], std=[0.30812761187553406], inplace=False)
           )


In [None]:
# compare one test image as stored (`.data`) with the same image as returned by indexing
# the dataset, which runs it through the transform pipeline
stored_image = testset.data[0]
transformed_image = testset[0][0]

# log
print("before applying transform:")
print(f"    -> type(testset.data[0]) : {type(stored_image)}")
print(f"    -> testset.data[0].dtype : {stored_image.dtype}")
print(f"    -> testset.data[0].shape : {stored_image.shape}")
print('-' * 50)
print("after applying transform:")
print(f"    -> type(testset[0][0])   : {type(transformed_image)}")
print(f"    -> testset[0][0].dtype   : {transformed_image.dtype}")
print(f"    -> testset[0][0].shape   : {transformed_image.shape}")

before applying transform:
    -> type(testset.data[0]) : <class 'torch.Tensor'>
    -> testset.data[0].dtype : torch.uint8
    -> testset.data[0].shape : torch.Size([28, 28])
--------------------------------------------------
after applying transform:
    -> type(testset[0][0])   : <class 'torchvision.tv_tensors._image.Image'>
    -> testset[0][0].dtype   : torch.float32
    -> testset[0][0].shape   : torch.Size([1, 28, 28])


## DataLoader

In [None]:
batch_size = 64

# settings common to all three loaders
loader_options = dict(batch_size=batch_size, num_workers=2)

# only the training data is reshuffled; validation and test keep a fixed order
trainloader = DataLoader(trainset, shuffle=True, **loader_options)
validationloader = DataLoader(validationset, shuffle=False, **loader_options)
testloader = DataLoader(testset, shuffle=False, **loader_options)

In [None]:
# pull one batch from each loader to check shapes and dtypes
first_train_batch      = next(iter(trainloader))
first_validation_batch = next(iter(validationloader))
first_test_batch       = next(iter(testloader))

print(f"trainloader      first batch     -> x.shape: {first_train_batch[0].shape} - y.shape: {first_train_batch[1].shape} - x.dtype: {first_train_batch[0].dtype} - y.dtype: {first_train_batch[1].dtype}")
print(f"validationloader first batch     -> x.shape: {first_validation_batch[0].shape} - y.shape: {first_validation_batch[1].shape} - x.dtype: {first_validation_batch[0].dtype} - y.dtype: {first_validation_batch[1].dtype}")
print(f"testloader       first batch     -> x.shape: {first_test_batch[0].shape} - y.shape: {first_test_batch[1].shape} - x.dtype: {first_test_batch[0].dtype} - y.dtype: {first_test_batch[1].dtype}")
# a remainder of 0 means the dataset divides evenly, i.e. the last batch is a full one
print(f"trainloader      last batch-size -> {len(trainset) % batch_size or batch_size}")
print(f"validationloader last batch-size -> {len(validationset) % batch_size or batch_size}")
print(f"testloader       last batch-size -> {len(testset) % batch_size or batch_size}")

trainloader      first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
validationloader first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
testloader       first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64]) - x.dtype: torch.float32 - y.dtype: torch.int64
trainloader      last batch-size -> 48
validationloader last batch-size -> 48
testloader       last batch-size -> 16


# Network Structure
   - Sequential Model
      - Use torch.nn.Sequential to create a sequence of layers or modules
   - Functional Model
      - for stateless operations like activation functions, loss functions, and other operations within the forward method of custom modules or in custom functions
   - Mixed Model

Note:
   - loss function : 
      - multi-class classification : `torch.nn.CrossEntropyLoss()`
   - activation function [last layer]:
      - multi-class classification : `torch.nn.Softmax()`
      - we do not add a softmax to the last layer because torch's CrossEntropyLoss already applies (log-)softmax internally
   

<figure style="text-align: center;">
    <img src="../resources/images/SVGs/multi-layer-perceptron.svg" alt="multi-layer-perceptron.svg" style="width: 100%;">
    <figcaption>Multi-Layer-Perceptron (aka fully connected layers)</figcaption>
</figure>

<table style="margin-left:auto;margin-right:auto;text-align:center;">
  <thead>
    <tr>
      <th colspan="2">hidden<sub>1</sub> parameters</th>
      <th colspan="2">hidden<sub>2</sub> parameters</th>
      <th colspan="2">logits parameters</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>Weights</td>
      <td>Biases</td>
      <td>Weights</td>
      <td>Biases</td>
      <td>Weights</td>
      <td>Biases</td>
    </tr>
    <tr>
      <td>A × B</td>
      <td>B</td>
      <td>B × C</td>
      <td>C</td>
      <td>C × D</td>
      <td>D</td>
    </tr>
  </tbody>
  <tfoot>
    <tr>
      <td colspan="2">(A + 1) × B</td>
      <td colspan="2">(B + 1) × C</td>
      <td colspan="2">(C + 1) × D</td>
    </tr>
  </tfoot>
</table>


In [None]:
# layer sizes are read from the data instead of being hard-coded
depth, height, width = trainset[0][0].shape  # shape of one transformed training sample

hidden_dim = [64, 32]                        # widths of the two hidden layers
output_dim = len(testset.classes)            # one logit per class
input_dim = width * height * depth           # number of values per sample once flattened

## Sequential Model

In [None]:
# widths of the two hidden layers
first_hidden, second_hidden = hidden_dim[0], hidden_dim[1]

# flatten -> hidden 1 -> hidden 2 -> logits
# (no softmax at the end: CrossEntropyLoss works on raw logits)
mlp_layers = [
    nn.Flatten(start_dim=1),
    nn.Linear(input_dim, first_hidden),
    nn.ReLU(),
    nn.Linear(first_hidden, second_hidden),
    nn.ReLU(),
    nn.Linear(second_hidden, output_dim),
]
sequential_model = nn.Sequential(*mlp_layers)

In [None]:
# move the model's parameters to the selected device; the returned module is shown as the cell output
sequential_model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=64, bias=True)
  (2): ReLU()
  (3): Linear(in_features=64, out_features=32, bias=True)
  (4): ReLU()
  (5): Linear(in_features=32, out_features=10, bias=True)
)

In [None]:
# layer-by-layer summary (output shapes and parameter counts) for one batch of sample-shaped inputs
summary(sequential_model, input_size= trainset[0][0].shape, batch_size= batch_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [64, 784]               0
            Linear-2                   [64, 64]          50,240
              ReLU-3                   [64, 64]               0
            Linear-4                   [64, 32]           2,080
              ReLU-5                   [64, 32]               0
            Linear-6                   [64, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.19
Forward/backward pass size (MB): 0.48
Params size (MB): 0.20
Estimated Total Size (MB): 0.87
----------------------------------------------------------------


## Functional Model

In [None]:
class FunctionalModel(nn.Module):
    """Two-hidden-layer perceptron with every layer stored as a named attribute.

    Args:
        input_dim: number of values per sample after flattening.
        output_dim: number of output logits (one per class).
        hidden_dims: sizes of the two hidden layers (only the first two entries
            are used). When omitted, the notebook-level `hidden_dim` list is
            read, which is what the two-argument call has always relied on.

    `forward` returns raw logits; no softmax is applied because
    CrossEntropyLoss works on logits directly.
    """

    def __init__(self, input_dim, output_dim, hidden_dims= None):
        super().__init__()

        # fall back to the notebook-level setting so existing two-argument calls keep working
        widths = hidden_dim if hidden_dims is None else hidden_dims
        first_hidden, second_hidden = widths[0], widths[1]

        self.flatten = nn.Flatten(start_dim= 1)
        self.linear1 = nn.Linear(input_dim, first_hidden)
        self.relu1   = nn.ReLU()
        self.linear2 = nn.Linear(first_hidden, second_hidden)
        self.relu2   = nn.ReLU()
        self.linear3 = nn.Linear(second_hidden, output_dim)
        # softmax is handled inside CrossEntropyLoss()

    def forward(self, x):
        """Flatten each sample and pass it through the three linear layers; returns logits."""
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        return x

# build the class-based model with the same input and output sizes as the sequential one
functional_model = FunctionalModel(input_dim, output_dim)

In [None]:
# move the model's parameters to the selected device; the returned module is shown as the cell output
functional_model.to(device)

FunctionalModel(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=784, out_features=64, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=64, out_features=32, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=32, out_features=10, bias=True)
)

In [None]:
# layer-by-layer summary of the class-based model defined in this section
summary(functional_model, input_size= trainset[0][0].shape, batch_size= batch_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [64, 784]               0
            Linear-2                   [64, 64]          50,240
              ReLU-3                   [64, 64]               0
            Linear-4                   [64, 32]           2,080
              ReLU-5                   [64, 32]               0
            Linear-6                   [64, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.19
Forward/backward pass size (MB): 0.48
Params size (MB): 0.20
Estimated Total Size (MB): 0.87
----------------------------------------------------------------


## Mixed Model

In [None]:
class MixModel(nn.Module):
    """Two-hidden-layer perceptron that wraps an nn.Sequential inside a custom module.

    Args:
        input_dim: number of values per sample after flattening.
        output_dim: number of output logits (one per class).
        hidden_dims: sizes of the two hidden layers (only the first two entries
            are used). When omitted, the notebook-level `hidden_dim` list is
            read, which is what the two-argument call has always relied on.

    `forward` returns raw logits; no softmax is applied because
    CrossEntropyLoss works on logits directly.
    """

    def __init__(self, input_dim, output_dim, hidden_dims= None):
        super().__init__()

        # fall back to the notebook-level setting so existing two-argument calls keep working
        widths = hidden_dim if hidden_dims is None else hidden_dims
        first_hidden, second_hidden = widths[0], widths[1]

        self.classifier = nn.Sequential(
            nn.Flatten(start_dim= 1),
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, output_dim),
            # softmax is handled inside CrossEntropyLoss()
        )

    def forward(self, x):
        """Run the wrapped Sequential stack on `x`; returns logits."""
        x = self.classifier(x)
        return x

# build the mixed-style model with the same input and output sizes as the other two
mix_model = MixModel(input_dim, output_dim)

In [None]:
# move the model's parameters to the selected device; the returned module is shown as the cell output
mix_model.to(device)

MixModel(
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [None]:
# layer-by-layer summary of the mixed-style model defined in this section
summary(mix_model, input_size= trainset[0][0].shape, batch_size= batch_size)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [64, 784]               0
            Linear-2                   [64, 64]          50,240
              ReLU-3                   [64, 64]               0
            Linear-4                   [64, 32]           2,080
              ReLU-5                   [64, 32]               0
            Linear-6                   [64, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.19
Forward/backward pass size (MB): 0.48
Params size (MB): 0.20
Estimated Total Size (MB): 0.87
----------------------------------------------------------------


# Set up remaining Hyper-Parameters

In [None]:
# the sequential variant is the one trained, evaluated and used for prediction below
model = sequential_model

In [None]:
# training length and step size
num_epochs = 15
lr = 0.01

# multi-class loss computed on raw logits, and Adam over all parameters of the chosen model
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)

# Train & Validation Loop

### model.train & model.eval
   - Some regularization methods are applied only during training, not during evaluation and prediction
   - e.g. batchNorm, dropout, ...
   - model.eval() switches these layers to their inference behavior (e.g. dropout is turned off and batchNorm uses its running statistics)

In [None]:
# per-epoch history, appended to by the training cell and plotted afterwards
train_acc_per_epoch, train_loss_per_epoch = [], []
val_acc_per_epoch, val_loss_per_epoch = [], []

In [None]:
# identical top-1 multi-class accuracy settings for both metrics; each keeps its own state
accuracy_options = dict(task='multiclass', num_classes=len(testset.classes), top_k=1)

train_acc = Accuracy(**accuracy_options).to(device)
val_acc = Accuracy(**accuracy_options).to(device)

In [None]:
# One pass over the training data followed by one pass over the validation data per epoch.
# Loss and accuracy of each epoch are appended to the *_per_epoch history lists.
for epoch in range(num_epochs):

    # train loop
    model.train()
    train_loss = 0

    for x, y in trainloader:

        # send data to the selected device
        x, y_true = x.to(device), y.to(device)

        # clear gradients first, so anything left over from an interrupted run cannot leak into this step
        optimizer.zero_grad()

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # backward
        loss.backward()

        # update parameters
        optimizer.step()

        # log loss & accuracy
        # the loss is a batch mean; weight it by the batch size so the (smaller) last batch counts correctly
        train_loss += loss.item() * len(x)
        train_acc.update(y_pred, y_true)

    train_loss_per_epoch.append(train_loss / len(trainset))
    train_acc_per_epoch.append(train_acc.compute().item())
    train_acc.reset()


    # validation loop
    model.eval()
    val_loss = 0

    # During the forward pass, PyTorch saves intermediate results
    # (from each operation that involves tensors with requires_grad=True)
    # in order to compute gradients during the backward pass.
    # torch.no_grad() stops PyTorch from saving these intermediate results.
    with torch.no_grad():
        for x, y in validationloader:

            # send data to the selected device
            x, y_true = x.to(device), y.to(device)

            # forward
            y_pred = model(x)
            loss = criterion(y_pred, y_true)

            # log loss & accuracy
            val_loss += loss.item() * len(x)
            val_acc.update(y_pred, y_true)

    val_loss_per_epoch.append(val_loss / len(validationset))
    val_acc_per_epoch.append(val_acc.compute().item())
    val_acc.reset()


    # log
    # read the newest entry ([-1]) rather than [epoch]: if this cell is re-run without clearing
    # the history lists, index `epoch` would point at values from the previous run
    print(f"epoch {epoch:>2}  ->  train[loss: {train_loss_per_epoch[-1]:.5f} - acc: {train_acc_per_epoch[-1]:.2f}] | validation[loss: {val_loss_per_epoch[-1]:.5f} - acc: {val_acc_per_epoch[-1]:.2f}]")


epoch  0  ->  train[loss: 0.29456 - acc: 0.91] | validation[loss: 0.20450 - acc: 0.94]
epoch  1  ->  train[loss: 0.19259 - acc: 0.94] | validation[loss: 0.20727 - acc: 0.95]
epoch  2  ->  train[loss: 0.17141 - acc: 0.95] | validation[loss: 0.18805 - acc: 0.95]
epoch  3  ->  train[loss: 0.15537 - acc: 0.96] | validation[loss: 0.21517 - acc: 0.94]
epoch  4  ->  train[loss: 0.14470 - acc: 0.96] | validation[loss: 0.20678 - acc: 0.95]
epoch  5  ->  train[loss: 0.13822 - acc: 0.96] | validation[loss: 0.21816 - acc: 0.95]
epoch  6  ->  train[loss: 0.13850 - acc: 0.96] | validation[loss: 0.23807 - acc: 0.95]
epoch  7  ->  train[loss: 0.13025 - acc: 0.97] | validation[loss: 0.20442 - acc: 0.95]
epoch  8  ->  train[loss: 0.13073 - acc: 0.97] | validation[loss: 0.19155 - acc: 0.95]
epoch  9  ->  train[loss: 0.12571 - acc: 0.97] | validation[loss: 0.21276 - acc: 0.95]
epoch 10  ->  train[loss: 0.11658 - acc: 0.97] | validation[loss: 0.21281 - acc: 0.96]
epoch 11  ->  train[loss: 0.12149 - acc: 0.

## Model Analysis
   - Comparing the train and validation curves is a useful way to check for over-fitting

In [None]:
# plot
fig, axs = plt.subplots(nrows= 1, ncols= 2, figsize= (10, 4), layout= 'compressed')

axs[0].plot(train_loss_per_epoch, label= "Train loss")
axs[0].plot(val_loss_per_epoch, label= "Validation loss")
axs[0].set(title= "Loss over time", xlabel= 'Epoch', ylabel= 'Loss')
axs[0].legend(loc= 'best', fancybox= True, shadow= True)

axs[1].plot(train_acc_per_epoch, label= "Train accuracy")
axs[1].plot(val_acc_per_epoch, label= "Validation accuracy")
axs[1].set(title= "Accuracy over time", xlabel= 'Epoch', ylabel= 'Accuracy')
axs[1].legend(loc= 'best', fancybox= True, shadow= True)

plt.show()

# Test Loop

In [None]:
# separate top-1 accuracy metric for the held-out test split, kept on the same device as the model outputs
test_acc = Accuracy(task = 'multiclass', num_classes= len(testset.classes), top_k= 1).to(device)

In [None]:
# test loop
model.eval()
test_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for x, y in testloader:

        # send data to GPU
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # log loss & accuracy
        test_loss += loss.item() * len(x)
        test_acc.update(y_pred, y_true)

        predictions.extend(y_pred.argmax(dim= 1).cpu())
        targets.extend(y_true.cpu())

# log
print(f"test[loss: {test_loss / len(testset):.5f} - acc: {test_acc.compute().item():.2f}]")

test[loss: 0.21722 - acc: 0.96]


## Metrics
   - loss
   - accuracy
   - recall
   - precision
   - f1-score
   - ROC Curve
   - AUC (area under the ROC curve)
   - ...

In [None]:
# classification report
print(classification_report(targets, predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       980
           1       0.98      0.99      0.98      1135
           2       0.97      0.95      0.96      1032
           3       0.97      0.96      0.96      1010
           4       0.97      0.95      0.96       982
           5       0.98      0.95      0.96       892
           6       0.98      0.97      0.97       958
           7       0.95      0.96      0.96      1028
           8       0.90      0.94      0.92       974
           9       0.96      0.95      0.95      1009

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000



In [None]:
# confusion matrix
metric = ConfusionMatrix(task= 'multiclass', num_classes= 10)
confusion_matrix = metric(torch.tensor(predictions), torch.tensor(targets))

# log
print(confusion_matrix)

# plot
fig, ax = plt.subplots(figsize= (8, 8))
metric.plot(ax= ax)
plt.show()

tensor([[ 969,    0,    3,    0,    0,    0,    1,    2,    5,    0],
        [   0, 1118,    2,    0,    1,    2,    2,    1,    8,    1],
        [   3,    3,  985,    3,    3,    1,    4,   11,   19,    0],
        [   1,    0,    4,  966,    0,    3,    1,    6,   25,    4],
        [   2,    1,    4,    2,  936,    0,    5,    6,    6,   20],
        [   9,    0,    1,   10,    0,  844,    6,    1,   17,    4],
        [  11,    2,    1,    1,    5,    2,  929,    1,    6,    0],
        [   1,    6,   10,    5,    1,    1,    0,  987,    7,   10],
        [  14,    8,    3,    3,    6,    5,    3,    7,  920,    5],
        [   6,    2,    2,    5,   13,    1,    1,   16,    8,  955]])


# Prediction

In [None]:
def predict(model: nn.Module, data: "np.ndarray | torch.Tensor", classes: list, transform: "v2.Compose" = None) -> np.ndarray:
    """Predict class labels for one image or a batch of images.

    Args:
        model: trained network that returns one row of logits per sample.
        data: a single 2-D image (H, W) or an iterable batch of samples.
            A 2-D input is expanded to shape (1, H, W, 1) before anything else.
        classes: label for each class index, e.g. `testset.classes`.
        transform: optional callable applied to every sample; the results are
            stacked into one batch tensor. When omitted, `data` is converted
            to a tensor as-is (no scaling or normalization is applied).

    Returns:
        A numpy array with one entry of `classes` per sample. A single 2-D
        image still yields an array of length 1.
    """

    # add batch & channel dimension to a single sample
    if len(data.shape) == 2:
        data = np.expand_dims(data, axis= (0, 3))

    if transform is not None:
        # apply the transform sample by sample and rebuild the batch
        data = torch.stack([transform(sample) for sample in data])
    else:
        # no preprocessing requested: the model still needs a tensor, not an ndarray
        data = torch.as_tensor(data)

    # run on whatever device the model's parameters live on
    first_param = next(model.parameters(), None)
    if first_param is not None:
        data = data.to(first_param.device)

    # predict
    model.eval()
    with torch.no_grad():
        class_idx = model(data).argmax(dim= 1).cpu().numpy()

    # idx to labels
    return np.array(classes)[class_idx]

In [None]:
# some raw data
raw_data = MNIST(root= './dataset', train= False, download= True, transform= None).data[:32]

# predict
y_pred = predict(model, data= raw_data, classes= testset.classes, transform= transforms)

# log
print(f"predictions:\n{y_pred}")

predictions:
['7 - seven' '2 - two' '1 - one' '0 - zero' '4 - four' '1 - one'
 '4 - four' '9 - nine' '5 - five' '9 - nine' '0 - zero' '6 - six'
 '9 - nine' '0 - zero' '1 - one' '5 - five' '9 - nine' '7 - seven'
 '3 - three' '4 - four' '9 - nine' '6 - six' '6 - six' '5 - five'
 '4 - four' '0 - zero' '7 - seven' '4 - four' '0 - zero' '1 - one'
 '3 - three' '1 - one']


In [None]:
# plot
fig, axs = plt.subplots(nrows= 4, ncols= 8, figsize= (12, 6), layout= 'compressed')

for i in range(4):
    for j in range(8):
        axs[i, j].imshow(raw_data[i * 8 + j], cmap= 'gray')
        axs[i, j].set_title(predict(model, raw_data[i * 8 + j], testset.classes, transform= transforms))
        axs[i, j].axis('off')

plt.show()