📝 **Author:** Amirhossein Heydari - 📧 **Email:** amirhosseinheydari78@gmail.com - 📍 **Linktree:** [linktr.ee/mr_pylin](https://linktr.ee/mr_pylin)

---

# Dependencies

In [28]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from sklearn.metrics import classification_report
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, random_split
from torchinfo import summary
from torchmetrics.classification import MulticlassAccuracy, MulticlassConfusionMatrix
from torchvision.datasets import MNIST
from torchvision.transforms import v2

In [29]:
# reproducibility: fix the RNG seed, turn off cuDNN benchmark mode
# and request deterministic cuDNN algorithms
seed = 42
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [None]:
# prefer the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# show the selected device as the cell output
device

# Pre-Processing
   - transforms: [pytorch.org/vision/main/transforms.html](https://pytorch.org/vision/main/transforms.html)
   - available datasets: [pytorch.org/vision/main/datasets.html](https://pytorch.org/vision/main/datasets.html)
   - Datasets & DataLoader: [pytorch.org/tutorials/beginner/basics/data_tutorial.html](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

## Load Dataset
   - v2.ToImage:
      - Convert a tensor, ndarray, or PIL Image to [Image](https://pytorch.org/vision/main/generated/torchvision.tv_tensors.Image.html#torchvision.tv_tensors.Image)
      - [pytorch.org/vision/main/generated/torchvision.transforms.v2.ToImage.html](https://pytorch.org/vision/main/generated/torchvision.transforms.v2.ToImage.html)
   - v2.ToDtype:
      - Converts the input to a specific dtype, optionally scaling the values for images or videos
      - [pytorch.org/vision/main/generated/torchvision.transforms.v2.ToDtype.html](https://pytorch.org/vision/main/generated/torchvision.transforms.v2.ToDtype.html)

In [None]:
# initial transforms: convert each sample to an Image tensor, then to float32 with value scaling
transforms = v2.Compose([v2.ToImage(), v2.ToDtype(torch.float32, scale=True)])

# load the MNIST dataset
# download=False: the files are expected to already be present under `root`
# both splits receive the very same `transforms` object
trainset = MNIST(root="../../datasets", train=True, transform=transforms, download=False)
testset = MNIST(root="../../datasets", train=False, transform=transforms, download=False)

# log (`.data` / `.targets` are the raw stored tensors, i.e. before any transform is applied)
print("trainset:")
# raw `.data` carries no explicit channel dimension: [N, 28, 28] rather than [N, 1, 28, 28]
print(
    f"    -> trainset.data.shape    : {trainset.data.shape}"
)
print(f"    -> trainset.data.dtype    : {trainset.data.dtype}")
print(f"    -> type(trainset.data)    : {type(trainset.data)}")
print(f"    -> type(trainset.targets) : {type(trainset.targets)}")
print("-" * 50)
print("testset:")
print(f"    -> testset.data.shape     : {testset.data.shape}")
print(f"    -> testset.data.dtype     : {testset.data.dtype}")
print(f"    -> type(testset.data)     : {type(testset.data)}")
print(f"    -> type(testset.targets)  : {type(testset.targets)}")
print("-" * 50)
print(f"classes: {trainset.classes}")
# np.unique(..., return_counts=True)[1] -> number of samples per distinct label
print(f"trainset distribution: {np.unique(trainset.targets, return_counts=True)[1]}")
print(f"testset  distribution: {np.unique(testset.targets, return_counts=True)[1]}")

In [None]:
# preview the first 32 raw training images in a 4x8 grid, titled with their class names
fig, axs = plt.subplots(nrows=4, ncols=8, figsize=(12, 6), layout="compressed")
for idx, ax in enumerate(axs.flat):  # row-major order, i.e. idx == row * 8 + col
    ax.imshow(trainset.data[idx], cmap="gray")
    ax.set_title(trainset.classes[trainset.targets[idx]])
    ax.axis("off")
plt.show()

## Split trainset into [trainset, validationset]
   - [pytorch.org/docs/stable/data.html](https://pytorch.org/docs/stable/data.html)

In [None]:
# random split (returns List[Subset])
# 90% / 10% split; `trainset` is rebound to the training Subset from here on,
# the original MNIST object stays reachable as `trainset.dataset`
# no explicit generator is passed to random_split
trainset, validationset = random_split(trainset, [0.9, 0.1])

# log (indexing a dataset returns an (image, label) pair with the transform applied)
print("trainset:")
print(f"    -> len(trainset)       : {len(trainset)}")
print(f"    -> trainset[0][0]      : {trainset[0][0].shape}")
print(f"    -> trainset[0][1]      : {trainset[0][1]}")
print(f"    -> type(trainset)      : {type(trainset)}\n")
print("validationset:")
print(f"    -> len(validationset)  : {len(validationset)}")
print(f"    -> validationset[0][0] : {validationset[0][0].shape}")
print(f"    -> validationset[0][1] : {validationset[0][1]}")
print(f"    -> type(validationset) : {type(validationset)}\n")
print("testset:")
print(f"    -> len(testset)        : {len(testset)}")
print(f"    -> testset[0][0]       : {testset[0][0].shape}")
print(f"    -> testset[0][1]       : {testset[0][1]}")
print(f"    -> type(testset)       : {type(testset)}")

## Normalization
   1. Min-Max Normalization
      - 0-1 Normalization
         - Scales the pixel values to [0, 1] range
      - ...
   1. Mean-STD Normalization
      - Standardization (Z-score normalization)
         - Transforms the data to have a mean of 0 and a standard deviation of 1
      - Mean Normalization
         - It centers the data around zero
      - Scale and Center Images
         - Rescale the pixel values to have a mean of 0.5 and a standard deviation of 0.5
      - ...
   1. ...


In [None]:
# pull the whole training split through a single-batch DataLoader and keep only the images
all_train_images = next(iter(DataLoader(trainset, batch_size=len(trainset))))[0]

# statistics over every pixel of every training image
train_mean = all_train_images.mean().item()  # 0.1307
train_std = all_train_images.std().item()  # 0.3081

# the stacked images are only needed for the two statistics above
del all_train_images

# log
print(f"train mean per channel: {train_mean}")
print(f"train std  per channel: {train_std}")

## Transform
   - on-the-fly data augmentation
   - Disadvantage:
      - the same (deterministic) transforms are recomputed for the same data in every epoch
   - Advantage:
      - Reduced Memory Usage, Regularization & Data Diversity [random transforms e.g. RandomCrop]

In [None]:
# show the current transform pipeline as the cell output
transforms

In [None]:
# append normalization to the shared pipeline
# the datasets were built with this very `transforms` object, so mutating it in place
# changes what trainset / validationset / testset return
# guard: re-running this cell must not stack a second Normalize (double normalization)
if not any(isinstance(t, v2.Normalize) for t in transforms.transforms):
    transforms.transforms.append(v2.Normalize(mean=(train_mean,), std=(train_std,)))

# log
print(f"trainset.dataset.transforms:\n{trainset.dataset.transforms}\n")
print(f"validationset.dataset.transforms:\n{validationset.dataset.transforms}\n")
print(f"testset.transforms:\n{testset.transforms}")

In [None]:
# log
# `testset.data[0]` is the raw stored sample; `testset[0][0]` is the same sample
# after passing through the dataset's transform pipeline
print("before applying transform:")
print(f"    -> type(testset.data[0]) : {type(testset.data[0])}")
print(f"    -> testset.data[0].dtype : {testset.data[0].dtype}")
print(f"    -> testset.data[0].shape : {testset.data[0].shape}")
print("-" * 50)
print("after applying transform:")
print(f"    -> type(testset[0][0])   : {type(testset[0][0])}")
print(f"    -> testset[0][0].dtype   : {testset[0][0].dtype}")
print(f"    -> testset[0][0].shape   : {testset[0][0].shape}")

## DataLoader
   - [pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)

In [38]:
# mini-batch size shared by all three loaders
batch_size = 64

# only the training loader shuffles; validation and test keep a fixed order
trainloader = DataLoader(trainset, batch_size=batch_size, num_workers=2, shuffle=True)
validationloader = DataLoader(validationset, batch_size=batch_size, num_workers=2, shuffle=False)
testloader = DataLoader(testset, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
# log
# fetch one batch from each loader to inspect shapes and dtypes
first_train_batch = next(iter(trainloader))
first_validation_batch = next(iter(validationloader))
first_test_batch = next(iter(testloader))

print(
    f"trainloader      first batch     -> x.shape: {first_train_batch[0].shape} - y.shape: {first_train_batch[1].shape} - x.dtype: {first_train_batch[0].dtype} - y.dtype: {first_train_batch[1].dtype}"
)
print(
    f"validationloader first batch     -> x.shape: {first_validation_batch[0].shape} - y.shape: {first_validation_batch[1].shape} - x.dtype: {first_validation_batch[0].dtype} - y.dtype: {first_validation_batch[1].dtype}"
)
print(
    f"testloader       first batch     -> x.shape: {first_test_batch[0].shape} - y.shape: {first_test_batch[1].shape} - x.dtype: {first_test_batch[0].dtype} - y.dtype: {first_test_batch[1].dtype}"
)
# size of the final (possibly partial) batch; a remainder of 0 means the last batch is a
# full one, so report `batch_size` instead of 0 in that case
print(f"trainloader      last batch-size -> {len(trainset) % batch_size or batch_size}")
print(f"validationloader last batch-size -> {len(validationset) % batch_size or batch_size}")
print(f"testloader       last batch-size -> {len(testset) % batch_size or batch_size}")

# Network Structure: Multi-layer Perceptron

<figure style="text-align: center;">
    <img src="../../assets/images/original/mlp/multi-layer-perceptrons.svg" alt="multi-layer-perceptrons.svg" style="width: 100%;">
    <figcaption style="text-align: center;">Multi-Layer-Perceptron (aka fully connected layers)</figcaption>
</figure>

In [None]:
# layers
# a transformed sample unpacks into three dimensions; their product is the flattened input size
depth, height, width = trainset[0][0].shape

input_dim = depth * height * width
# widths of the two hidden layers, in order
hidden_dim = [64, 32]
# one output unit per class
output_dim = len(testset.classes)

# log
print(f"input_dim  : {input_dim}")
print(f"hidden_dim : {hidden_dim}")
print(f"output_dim : {output_dim}")

## Custom MLP Model
   - The activation function is omitted from the last layer because `torch.nn.CrossEntropyLoss` expects raw logits (it applies log-softmax internally)

In [None]:
class CustomMLP(nn.Module):
    """Fully connected classifier: Flatten -> (Linear -> ReLU) per hidden layer -> Linear.

    The last layer has no activation, so the model returns raw class scores.

    Args:
        input_dim: number of features per sample after flattening.
        output_dim: number of output units (one per class).
        hidden_dim: sizes of the hidden layers, in order. Passed explicitly instead
            of being read from a module-level variable; defaults to ``(64, 32)``.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=(64, 32)):
        super().__init__()

        # chain of layer widths: input -> hidden[0] -> ... -> hidden[-1]
        widths = [input_dim, *hidden_dim]

        layers = [nn.Flatten(start_dim=1)]
        for in_features, out_features in zip(widths, widths[1:]):
            layers += [nn.Linear(in_features, out_features), nn.ReLU()]

        # output layer: no activation after it
        layers.append(nn.Linear(widths[-1], output_dim))

        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        return self.classifier(x)


# initialize the model (hidden layer sizes are passed explicitly)
model = CustomMLP(input_dim, output_dim, hidden_dim).to(device)

# log
model

In [None]:
# summarize the model for one batch-shaped input: (batch_size, *shape of a single sample)
summary(model, input_size=(batch_size, *trainset[0][0].shape))

# Set up remaining Hyperparameters

In [43]:
# training length and step size
num_epochs = 15
lr = 0.001

# loss function and the optimizer that updates the model's parameters
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)

# Train & Validation Loop

### model.train & model.eval
   - Some regularization methods (e.g. batchNorm, dropout) are applied only during training, not during evaluation and prediction
   - `model.eval()` [equivalent to `model.train(False)`] disables these types of regularization
   - [pytorch.org/docs/stable/generated/torch.nn.Module.html](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)
   - [pytorch.org/docs/stable/notes/autograd.html#locally-disable-grad-doc](https://pytorch.org/docs/stable/notes/autograd.html#locally-disable-grad-doc)

In [44]:
# per-epoch history (one entry appended per epoch), used for logging and plotting
train_acc_per_epoch, train_loss_per_epoch = [], []
val_acc_per_epoch, val_loss_per_epoch = [], []

In [45]:
# two independent accuracy accumulators (train / validation), both on the model's device
# NOTE(review): `average` is left at the torchmetrics default - confirm it matches the
# intended definition of accuracy
train_acc, val_acc = (
    MulticlassAccuracy(num_classes=len(testset.classes), top_k=1).to(device) for _ in range(2)
)

In [None]:
# one pass over the training set followed by one pass over the validation set per epoch
for epoch in range(num_epochs):

    # train loop
    model.train()
    train_loss = 0

    for x, y in trainloader:

        # move the batch to the selected device
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # backward
        loss.backward()

        # update parameters, then clear the gradients for the next iteration
        optimizer.step()
        optimizer.zero_grad()

        # store loss and accuracy per iteration
        # the batch loss is weighted by the batch size so that dividing by len(trainset)
        # below gives a per-sample average (assumes the criterion returns a batch mean)
        train_loss += loss.item() * len(x)
        train_acc.update(y_pred, y_true)

    # store loss and accuracy per epoch, then reset the metric for the next epoch
    train_loss_per_epoch.append(train_loss / len(trainset))
    train_acc_per_epoch.append(train_acc.compute().item())
    train_acc.reset()

    # validation loop
    model.eval()
    val_loss = 0

    # During the forward pass, PyTorch saves intermediate results
    # (from each operation that involves tensors with requires_grad=True)
    # in order to compute gradients during the backward pass
    # torch.no_grad() stops PyTorch from saving these intermediate results
    with torch.no_grad():
        for x, y in validationloader:

            # move the batch to the selected device
            x, y_true = x.to(device), y.to(device)

            # forward
            y_pred = model(x)
            loss = criterion(y_pred, y_true)

            # store loss and accuracy per iteration
            val_loss += loss.item() * len(x)
            val_acc.update(y_pred, y_true)

    # store loss and accuracy per epoch, then reset the metric for the next epoch
    val_loss_per_epoch.append(val_loss / len(validationset))
    val_acc_per_epoch.append(val_acc.compute().item())
    val_acc.reset()

    # log
    # read the entries just appended ([-1]) rather than indexing by `epoch`: if this cell is
    # re-run without resetting the history lists, `[epoch]` would print stale values
    print(
        f"epoch {epoch+1:0{len(str(num_epochs))}}/{num_epochs}  ->  train[loss: {train_loss_per_epoch[-1]:.5f} - acc: {train_acc_per_epoch[-1]:.2f}] | validation[loss: {val_loss_per_epoch[-1]:.5f} - acc: {val_acc_per_epoch[-1]:.2f}]"
    )

## Model Analysis
   - Comparing the train and validation curves is a useful way to detect over-fitting

In [None]:
# plot train vs. validation curves: loss on the left axis, accuracy on the right
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 4), layout="compressed")
panels = [
    ("Loss over time", "Loss", ("Train loss", train_loss_per_epoch), ("Validation loss", val_loss_per_epoch)),
    (
        "Accuracy over time",
        "Accuracy",
        ("Train accuracy", train_acc_per_epoch),
        ("Validation accuracy", val_acc_per_epoch),
    ),
]
for ax, (title, ylabel, *curves) in zip(axs, panels):
    for label, values in curves:
        ax.plot(values, label=label)
    ax.set(title=title, xlabel="Epoch", ylabel=ylabel)
    ax.legend(loc="best", fancybox=True, shadow=True)
plt.show()

# Test Loop

In [48]:
# accuracy accumulator for the test set, configured like the train / validation ones
test_acc = MulticlassAccuracy(num_classes=len(testset.classes), top_k=1).to(device)

In [None]:
# evaluate on the test set and collect per-sample predictions / targets for the metrics below
model.eval()
test_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for x, y in testloader:

        # move the batch to the selected device
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # store loss and accuracy per iteration
        test_loss += loss.item() * len(x)
        test_acc.update(y_pred, y_true)

        # keep plain Python ints instead of one zero-dim tensor per sample:
        # cheaper to store and faster to convert to arrays / tensors later
        predictions.extend(y_pred.argmax(dim=1).tolist())
        targets.extend(y_true.tolist())

# log
print(f"test[loss: {test_loss / len(testset):.5f} - acc: {test_acc.compute().item():.2f}]")

## Metrics
   - Loss
   - Accuracy
   - Recall
   - Precision
   - F1-Score
   - Confusion Matrix
   - Area Under the ROC Curve (AUC-ROC)
   - Area Under the Precision-Recall Curve (AUC-PR)
   - ...

**Docs**:
   - [lightning.ai/docs/torchmetrics/stable/all-metrics.html](https://lightning.ai/docs/torchmetrics/stable/all-metrics.html)
   - [scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

In [None]:
# classification report
print(classification_report(targets, predictions))

In [None]:
# confusion matrix
metric = MulticlassConfusionMatrix(num_classes=10)
confusion_matrix = metric(torch.tensor(predictions), torch.tensor(targets))

# log
print(confusion_matrix)

# plot
fig, ax = plt.subplots(figsize=(8, 8))
metric.plot(ax=ax)
plt.show()

# Prediction

In [52]:
def predict(
    model: nn.Module,
    data: "np.ndarray | torch.Tensor",
    classes: list,
    transform: "v2.Compose | None" = None,
) -> np.ndarray:
    """Predict class labels for one sample or a batch of samples.

    Args:
        model: the network to run; it is switched to evaluation mode.
        data: either a single 2-D sample or a batch of samples (first axis = batch).
            A single 2-D sample gets a leading batch axis and a trailing channel axis.
        classes: class names, indexed by the predicted class index.
        transform: optional callable applied to every sample before stacking them
            into one batch. When omitted, ``data`` is fed to the model as-is, so it
            must already have a dtype and shape the model accepts.

    Returns:
        The entries of ``classes`` selected by the arg-max of the model output.
        NOTE(review): for a single sample the index is a one-element tensor, which
        NumPy may treat as a scalar index - confirm the returned shape if callers
        rely on it.
    """

    # add batch & channel dimension to a single data
    if len(data.shape) == 2:
        data = np.expand_dims(data, axis=(0, 3))

    # apply the transform sample by sample; without one, just make sure we hold a tensor
    # (an ndarray has no `.to()` and would otherwise fail below)
    if transform is not None:
        data = torch.stack([transform(sample) for sample in data])
    else:
        data = torch.as_tensor(data)

    # run on the device the model lives on instead of relying on a module-level variable
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # predict
    model.eval()
    with torch.no_grad():

        # forward on the model's device, then bring the class indices back to the CPU
        y_pred = model(data.to(target_device)).argmax(dim=1).cpu()

        # idx to labels
        y_pred = np.array(classes)[y_pred]

    return y_pred

In [None]:
# some raw data
raw_data = MNIST(root="../../datasets", train=False, transform=None, download=False).data[:32]

# predict
y_pred = predict(model, data=raw_data, classes=testset.classes, transform=transforms)

# log
print(f"predictions:\n{y_pred}")

In [None]:
# plot
fig, axs = plt.subplots(nrows=4, ncols=8, figsize=(12, 6), layout="compressed")
for i in range(4):
    for j in range(8):
        axs[i, j].imshow(raw_data[i * 8 + j], cmap="gray")
        axs[i, j].set_title(predict(model, raw_data[i * 8 + j], testset.classes, transform=transforms))
        axs[i, j].axis("off")
plt.show()