# Dependencies

In [1]:
import torch
from torch.utils.data import DataLoader, random_split

from torchvision import transforms
from torchvision.datasets import MNIST

from torchsummary import summary

from torchmetrics import Accuracy, ConfusionMatrix

import matplotlib.pyplot as plt

from sklearn.metrics import classification_report

In [2]:
# set a fixed seed
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [3]:
# check if cuda is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Load Dataset

In [4]:
# load the MNIST dataset
trainset = MNIST(root= './dataset', train= True , download= True, transform= transforms.ToTensor())
testset  = MNIST(root= './dataset', train= False, download= True, transform= transforms.ToTensor())

# log
print(f"trainset.data.shape    : {trainset.data.shape}")    # it doesn't explicitly include the channel dimension for MNIST e.g. [60000, 1, 28, 28]
print(f"trainset.targets.shape : {trainset.targets.shape}")
print('-' * 50)
print(f"trainset[0][0].shape   : {trainset[0][0].shape}")
print(f"trainset[0][1]         : {trainset[0][1]}")

trainset.data.shape    : torch.Size([60000, 28, 28])
trainset.targets.shape : torch.Size([60000])
--------------------------------------------------
trainset[0][0].shape   : torch.Size([1, 28, 28])
trainset[0][1]         : 5


In [None]:
# plot
fig, axs = plt.subplots(nrows= 4, ncols= 8, figsize= (12, 6), layout= 'compressed')
for i in range(4):
    for j in range(8):
        axs[i, j].imshow(trainset.data[i * 8 + j], cmap= 'gray')
        axs[i, j].set_title(trainset.classes[trainset.targets[i * 8 + j]])
        axs[i, j].axis('off')
plt.show()

# Pre-Processing

## Split trainset into [trainset, validationset]

In [6]:
# 90% trainset & 10% validationset
validation_size = int(0.1 * len(trainset))
train_size = len(trainset) - validation_size

# random split
trainset, validationset = random_split(trainset, [train_size, validation_size])

# log
print('trainset:')
print(f"    -> len(trainset): {len(trainset)}")
print(f"    -> trainset.dataset[0][0]: {trainset.dataset[0][0].shape}")
print(f"    -> trainset.dataset[0][1]: {trainset.dataset[0][1]}\n")
print('validationset:')
print(f"    -> len(validationset): {len(validationset)}")
print(f"    -> validationset.dataset[0][0]: {validationset.dataset[0][0].shape}")
print(f"    -> validationset.dataset[0][1]: {validationset.dataset[0][1]}\n")
print('testset:')
print(f"    -> len(testset): {len(testset)}")
print(f"    -> testset[0][0]: {testset[0][0].shape}")
print(f"    -> testset[0][1]: {testset[0][1]}")

trainset:
    -> len(trainset): 54000
    -> trainset.dataset[0][0]: torch.Size([1, 28, 28])
    -> trainset.dataset[0][1]: 5

validationset:
    -> len(validationset): 6000
    -> validationset.dataset[0][0]: torch.Size([1, 28, 28])
    -> validationset.dataset[0][1]: 5

testset:
    -> len(testset): 10000
    -> testset[0][0]: torch.Size([1, 28, 28])
    -> testset[0][1]: 7


## Normalization
   1. Min-Max Normalization
      - 0-1 Normalization
         - Scales the pixel values to [0, 1] range
      - ...
   2. Mean-STD Normalization
      - Standardization (Z-score normalization)
         - Transforms the data to have a mean of 0 and a standard deviation of 1
      - Mean Normalization
         - It centers the data around zero
      - Scale and Center Images
         - Rescale the pixel values to have a mean of 0.5 and a standard deviation of 0.5
      - ...
   3. ...


In [7]:
# create a temporary DataLoader for the trainset
temp_trainloader = DataLoader(trainset, batch_size= len(trainset))

# get the whole data
temp_dataset = next(iter(temp_trainloader))

# calculate the mean and standard deviation
train_mean = temp_dataset[0].mean().item() # 0.1307
train_std  = temp_dataset[0].std().item()  # 0.3081

del temp_trainloader
del temp_dataset

## Transform
   - on-the-fly data augmentation
   - Disadvantage:
      - same transform applies to the same data in each epoch
   - Advantage:
      - Reduced Memory Usage, Regularization & Data Diversity [random transforms e.g. RancomCrop]

In [8]:
transform = transforms.Compose([
    transforms.ToTensor(),                      # convert a PIL Image or ndarray to tensor and scale the values accordingly.
    transforms.Normalize(train_mean, train_std)
])

trainset.dataset.transform = transform
validationset.dataset.transform = transform
testset.transform = transform

## DataLoader

In [9]:
batch_size = 64

trainloader      = DataLoader(dataset= trainset     , batch_size = batch_size, shuffle= True , num_workers= 2)
validationloader = DataLoader(dataset= validationset, batch_size = batch_size, shuffle= False, num_workers= 2)
testloader       = DataLoader(dataset= testset      , batch_size = batch_size, shuffle= False, num_workers= 2)

In [10]:
first_train_batch      = next(iter(trainloader))
first_validation_batch = next(iter(validationloader))
first_test_batch       = next(iter(testloader))

print(f"trainloader      first batch     -> x.shape: {first_train_batch[0].shape} - y.shape: {first_train_batch[1].shape}")
print(f"validationloader first batch     -> x.shape: {first_validation_batch[0].shape} - y.shape: {first_validation_batch[1].shape}")
print(f"testloader       first batch     -> x.shape: {first_test_batch[0].shape} - y.shape: {first_test_batch[1].shape}")
print(f"trainloader      last batch-size -> {len(trainset) % batch_size}")
print(f"validationloader last batch-size -> {len(validationset) % batch_size}")
print(f"testloader       last batch-size -> {len(testset) % batch_size}")

trainloader      first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64])
validationloader first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64])
testloader       first batch     -> x.shape: torch.Size([64, 1, 28, 28]) - y.shape: torch.Size([64])
trainloader      last batch-size -> 48
validationloader last batch-size -> 48
testloader       last batch-size -> 16


# Network Structure
   - Sequential Model
      - Use torch.nn.Sequential to create a sequence of layers or modules
   - Functional Model
      - for stateless operations like activation functions, loss functions, and other operations within the forward method of custom modules or in custom functions
   - Mixed Model

Note:
   - loss function : 
      - multi-class classification : `torch.nn.CrossEntropyLoss()`
   - activation function [last layer]:
      - multi-class classification : `torch.nn.Softmax()`
      - we does not include softmax in the last layer because torch's CrossEntropyLoss already use softmax inside itself
   

![alt text](resources/images/mlp.svg)

In [11]:
# layers
depth, height, width = trainset[0][0].shape

input_dim  = depth * height * width
hidden_dim = [64, 32]
output_dim = len(testset.classes)

## Sequential Model

In [12]:
sequential_model = torch.nn.Sequential(
    torch.nn.Flatten(start_dim= 1),

    torch.nn.Linear(input_dim, hidden_dim[0]),
    torch.nn.ReLU(),

    torch.nn.Linear(hidden_dim[0], hidden_dim[1]),
    torch.nn.ReLU(),
    
    torch.nn.Linear(hidden_dim[1], output_dim),
    # softmax is already satisfied inside CrossEntropyLoss()
)

In [13]:
sequential_model.to(device)

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=64, bias=True)
  (2): ReLU()
  (3): Linear(in_features=64, out_features=32, bias=True)
  (4): ReLU()
  (5): Linear(in_features=32, out_features=10, bias=True)
)

In [14]:
summary(sequential_model, (1,  width,  height))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                   [-1, 64]          50,240
              ReLU-3                   [-1, 64]               0
            Linear-4                   [-1, 32]           2,080
              ReLU-5                   [-1, 32]               0
            Linear-6                   [-1, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.20
Estimated Total Size (MB): 0.21
----------------------------------------------------------------


## Functional Model

In [15]:
class FunctionalModel(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(FunctionalModel, self).__init__()
        self.flatten = torch.nn.Flatten(start_dim= 1)
        self.linear1 = torch.nn.Linear(input_dim, hidden_dim[0])
        self.relu1   = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(hidden_dim[0], hidden_dim[1])
        self.relu2   = torch.nn.ReLU()
        self.linear3 = torch.nn.Linear(hidden_dim[1], output_dim)
        # softmax is already satisfied inside CrossEntropyLoss()
    
    def forward(self, x):
        x = self.flatten(x)
        x = self.linear1(x)
        x = self.relu1(x)
        x = self.linear2(x)
        x = self.relu2(x)
        x = self.linear3(x)
        return x

functional_model = FunctionalModel(input_dim, output_dim)

In [16]:
functional_model.to(device)

FunctionalModel(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=784, out_features=64, bias=True)
  (relu1): ReLU()
  (linear2): Linear(in_features=64, out_features=32, bias=True)
  (relu2): ReLU()
  (linear3): Linear(in_features=32, out_features=10, bias=True)
)

In [17]:
summary(functional_model, (1,  width,  height))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                   [-1, 64]          50,240
              ReLU-3                   [-1, 64]               0
            Linear-4                   [-1, 32]           2,080
              ReLU-5                   [-1, 32]               0
            Linear-6                   [-1, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.20
Estimated Total Size (MB): 0.21
----------------------------------------------------------------


## Mixed Model

In [18]:
class MixModel(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MixModel, self).__init__()
        self.classifier = torch.nn.Sequential(
            torch.nn.Flatten(start_dim= 1),
            torch.nn.Linear(input_dim, hidden_dim[0]),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim[0], hidden_dim[1]),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_dim[1], output_dim),
            # softmax is already satisfied inside CrossEntropyLoss()
        )
    
    def forward(self, x):
        x = self.classifier(x)
        return x

mix_model = MixModel(input_dim, output_dim)

In [19]:
mix_model.to(device)

MixModel(
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=784, out_features=64, bias=True)
    (2): ReLU()
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=10, bias=True)
  )
)

In [20]:
summary(mix_model, (1,  width,  height))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                   [-1, 64]          50,240
              ReLU-3                   [-1, 64]               0
            Linear-4                   [-1, 32]           2,080
              ReLU-5                   [-1, 32]               0
            Linear-6                   [-1, 10]             330
Total params: 52,650
Trainable params: 52,650
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.20
Estimated Total Size (MB): 0.21
----------------------------------------------------------------


# Set up remaining Hyper-Parameters

In [21]:
model = sequential_model

In [22]:
lr = 0.01
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params= model.parameters(), lr= lr)
num_epochs = 15

# Train & Validation Loop

In [23]:
train_acc_per_epoch  = []
train_loss_per_epoch = []
val_acc_per_epoch  = []
val_loss_per_epoch = []

In [24]:
train_acc = Accuracy(task = 'multiclass', num_classes= len(testset.classes), top_k= 1).to(device)
val_acc   = Accuracy(task = 'multiclass', num_classes= len(testset.classes), top_k= 1).to(device)

In [25]:
for epoch in range(num_epochs):

    # train loop
    model.train()
    train_loss = 0

    for x, y in trainloader:

        # send data to gpu
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # backward
        loss.backward()

        # update parameters
        optimizer.step()
        optimizer.zero_grad()

        # log loss & accuracy
        train_loss += loss.item() * len(x)
        train_acc.update(y_pred, y_true)

    train_loss_per_epoch.append(train_loss / len(trainset))
    train_acc_per_epoch.append(train_acc.compute().item())
    train_acc.reset()


    # validation loop
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for x, y in validationloader:

            # send data to gpu
            x, y_true = x.to(device), y.to(device)

            # forward
            y_pred = model(x)
            loss = criterion(y_pred, y_true)

            # log loss & accuracy
            val_loss += loss.item() * len(x)
            val_acc.update(y_pred, y_true)
    
    val_loss_per_epoch.append(val_loss / len(validationset))
    val_acc_per_epoch.append(val_acc.compute().item())
    val_acc.reset()
    

    # log
    print(f"epoch {epoch:>2}  ->  train[loss: {train_loss_per_epoch[epoch]:.5f} - acc: {train_acc_per_epoch[epoch]:.2f}] | validation[loss: {val_loss_per_epoch[epoch]:.5f} - acc: {val_acc_per_epoch[epoch]:.2f}]")


epoch  0  ->  train[loss: 0.29456 - acc: 0.91] | validation[loss: 0.20450 - acc: 0.94]
epoch  1  ->  train[loss: 0.18648 - acc: 0.95] | validation[loss: 0.21168 - acc: 0.94]
epoch  2  ->  train[loss: 0.17149 - acc: 0.95] | validation[loss: 0.19605 - acc: 0.95]
epoch  3  ->  train[loss: 0.15955 - acc: 0.96] | validation[loss: 0.20097 - acc: 0.95]
epoch  4  ->  train[loss: 0.15570 - acc: 0.96] | validation[loss: 0.20038 - acc: 0.95]
epoch  5  ->  train[loss: 0.14224 - acc: 0.96] | validation[loss: 0.19857 - acc: 0.95]
epoch  6  ->  train[loss: 0.14045 - acc: 0.96] | validation[loss: 0.18619 - acc: 0.95]
epoch  7  ->  train[loss: 0.13501 - acc: 0.97] | validation[loss: 0.17799 - acc: 0.96]
epoch  8  ->  train[loss: 0.13111 - acc: 0.97] | validation[loss: 0.21187 - acc: 0.95]
epoch  9  ->  train[loss: 0.12707 - acc: 0.97] | validation[loss: 0.23307 - acc: 0.95]
epoch 10  ->  train[loss: 0.13033 - acc: 0.97] | validation[loss: 0.21427 - acc: 0.95]
epoch 11  ->  train[loss: 0.12393 - acc: 0.

## Learning Analysis

In [None]:
# plot
fig, axs = plt.subplots(nrows= 1, ncols= 2, figsize= (10, 4), layout= 'compressed')

axs[0].plot(train_loss_per_epoch, label= 'Train loss')
axs[0].plot(val_loss_per_epoch, label= 'Validation loss')
axs[0].set(xlabel= 'Epoch', ylabel= 'Loss', title= "Loss over time")
axs[0].legend()

axs[1].plot(train_acc_per_epoch, label= 'Train accuracy')
axs[1].plot(val_acc_per_epoch, label= 'Validation accuracy')
axs[1].set(xlabel= 'Epoch', ylabel= 'Accuracy', title= "Accuracy over time")
axs[1].legend()

plt.show()

# Test Loop

In [27]:
test_acc = Accuracy(task = 'multiclass', num_classes= len(testset.classes), top_k= 1).to(device)

In [28]:
# test loop
model.eval()
test_loss = 0
predictions = []
targets = []

with torch.no_grad():
    for x, y in testloader:

        # send data to gpu
        x, y_true = x.to(device), y.to(device)

        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y_true)

        # log loss & accuracy
        test_loss += loss.item() * len(x)
        test_acc.update(y_pred, y_true)

        targets.extend(y_true.cpu())
        predictions.extend(y_pred.argmax(dim= 1).cpu())
        

print(f"test[loss: {test_loss / len(testset):.5f} - acc: {test_acc.compute().item():.2f}]")

test[loss: 0.20450 - acc: 0.96]


In [None]:
# plot
data = next(iter(testloader))[0][:32]

fig, axs = plt.subplots(nrows= 4, ncols= 8, figsize= (12, 6), layout= 'compressed')

for i in range(4):
    for j in range(8):
        axs[i, j].imshow(testset.data[i * 8 + j], cmap= 'gray')
        axs[i, j].set_title(model.cpu()(data[i * 8 + j]).argmax(dim= 1).item())
        axs[i, j].axis('off')

plt.show()

## Metrics
   - loss
   - accuracy
   - recall
   - precision
   - f1-score
   - ROC Curve
   - AUC Curve
   - ...

In [30]:
# classification report
print(classification_report(targets, predictions))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       980
           1       0.96      0.99      0.98      1135
           2       0.96      0.96      0.96      1032
           3       0.97      0.95      0.96      1010
           4       0.97      0.95      0.96       982
           5       0.94      0.96      0.95       892
           6       0.99      0.96      0.97       958
           7       0.97      0.94      0.96      1028
           8       0.93      0.95      0.94       974
           9       0.92      0.96      0.94      1009

    accuracy                           0.96     10000
   macro avg       0.96      0.96      0.96     10000
weighted avg       0.96      0.96      0.96     10000



In [36]:
# confusion matrix
metric = ConfusionMatrix(task= 'multiclass', num_classes= 10)
confusion_matrix = metric(torch.tensor(targets), torch.tensor(predictions))

# log
print(confusion_matrix)

# plot
fig, ax = plt.subplots(figsize= (8, 8))
metric.plot(ax= ax)
plt.show()

tensor([[ 962,    0,    2,    0,    1,    5,    5,    1,    4,    2],
        [   1, 1124,    3,    1,    1,    2,    2,   10,   15,    6],
        [   1,    4,  995,    4,    8,    0,    3,   13,    4,    1],
        [   0,    1,    6,  962,    0,   10,    0,    4,    5,    4],
        [   0,    0,    2,    0,  932,    0,    8,    3,    4,   10],
        [   3,    2,    3,   10,    0,  852,   17,    1,   10,    6],
        [   2,    1,    0,    0,    1,    2,  916,    0,    0,    0],
        [   1,    1,    7,    3,    5,    0,    0,  967,    3,    8],
        [   8,    2,   13,   14,    2,    4,    7,   15,  922,    4],
        [   2,    0,    1,   16,   32,   17,    0,   14,    7,  968]])
