# Teacher's Assignment No. 14 - Q2

***Author:*** *Ofir Paz* $\qquad$ ***Version:*** *15.05.2024* $\qquad$ ***Course:*** *22961 - Deep Learning*

Welcome to question 2 of the fourth assignment of the course *Deep Learning*. \
In this question, we will implement the *DropNorm* network layer and compare it with PyTorch's standard dropout and normalization layers by training on *Fashion-MNIST*.

## Imports

First, we will import the required packages for this assignment.
- [pytorch](https://pytorch.org/) - One of the most fundamental and famous tensor-handling libraries.
- [numpy](https://numpy.org) - The fundamental package for scientific computing with Python.
- [matplotlib](https://matplotlib.org) - Library to plot graphs in Python.

In [1]:
from typing import Tuple
import torch  # pytorch.
import torch.nn as nn  # neural network module.
import torch.nn.functional as F  # functional module.
import torch.optim as optim  # optimizer module.
from torch.utils.data import DataLoader  # data loader.
from torchvision import datasets, transforms  # torchvision, for datasets.
from torchmetrics import Accuracy  # accuracy metric.
import matplotlib.pyplot as plt  # plotting.

## DropNorm Implementation

We will start with the implementation of the *DropNorm* layer, using pytorch.

In [2]:
class DropNorm(nn.Module):
    '''DropNorm layer.

    A combined dropout and batch-normalisation layer. In training mode every
    input element is kept with probability ``p`` (0.5, so roughly half of the
    tensor is zeroed) and the masked tensor is then normalised. In evaluation
    mode no masking is applied and the entire input tensor is normalised.

    In both modes the mean and variance are taken from the current batch, one
    pair per feature (dimension 1), and the normalised values are passed through
    a learnable per-feature scale (``gammas``) and shift (``betas``).
    '''
    def __init__(self, batch_size: int) -> None:
        '''
        Constructor for the DropNorm layer.

        Args:
            batch_size (int) - Size of dimension 1 of the input, i.e. the number of
                features / channels. The parameter name is kept for backward
                compatibility; the learnable parameters are per feature and do not
                depend on how many samples are in a batch.
        '''
        super(DropNorm, self).__init__()

        # Scale starts at 1 and shift at 0 so the layer is a pure normalisation at
        # initialisation (a zero scale would make the initial output constant).
        self.gammas = nn.Parameter(torch.ones(batch_size))
        self.betas = nn.Parameter(torch.zeros(batch_size))
        self.eps = 1e-8  # numerical-stability term added to the variance.
        self.p = 0.5  # probability of keeping an element in training mode.

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''
        Forward pass of the layer.

        Args:
            x (torch.Tensor) - Input tensor of shape (N, #features) or
                (N, #features, ...), e.g. (N, C, H, W) activations of a conv layer.

        Returns:
            torch.Tensor - Output tensor with the same shape as `x`.

        Raises:
            ValueError - If `x` has fewer than two dimensions or its dimension 1
                does not match the number of features the layer was built with.
        '''

        if x.dim() < 2:
            raise ValueError(
                f"DropNorm expects an input with at least 2 dimensions, got {x.dim()}.")
        if x.size(1) != self.gammas.numel():
            raise ValueError(
                f"DropNorm was built for {self.gammas.numel()} features, "
                f"but the input has {x.size(1)} in dimension 1.")

        if self.training:
            # full_like creates the mask on the device / dtype of `x`, so the layer
            # also works for CUDA inputs.
            mask = torch.bernoulli(torch.full_like(x, self.p))
            x = x * mask
            # NOTE(review): the zeroed entries take part in the batch statistics below.

        # Reduce over every dimension except the feature one: for 2-D inputs this is
        # the batch axis, for (N, C, H, W) inputs it is (N, H, W).
        reduce_dims = tuple(d for d in range(x.dim()) if d != 1)
        mean = x.mean(dim=reduce_dims, keepdim=True)
        # Biased variance stays finite even when only one element is reduced.
        var = x.var(dim=reduce_dims, unbiased=False, keepdim=True)
        x = (x - mean) / torch.sqrt(var + self.eps)

        # Reshape the per-feature parameters to (1, C, 1, ...) so they broadcast.
        param_shape = (1, -1) + (1,) * (x.dim() - 2)
        return x * self.gammas.view(param_shape) + self.betas.view(param_shape)

## Training Network on Fashion-MNIST

In [3]:
# Hyper parameters: pick the device first, then the batch size that goes with it.
if torch.cuda.is_available():
    device, batch_size = 'cuda', 512
else:
    device, batch_size = 'cpu', 128

# Pre-processing applied to every image: convert to a tensor, then normalise the
# single channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
])

# Fashion-MNIST splits, stored under ./data (downloaded when missing).
# The test split is used as the validation set.
train_set = datasets.FashionMNIST(root='data', train=True, transform=transform, download=True)
val_set = datasets.FashionMNIST(root='data', train=False, transform=transform, download=True)

# Mini-batch iterators: only the training data is shuffled.
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False)

In [4]:
class BaseModel(nn.Module):
    '''Base model class.

    Provides a shared training loop (`fit`) and accuracy evaluation (`calc_acc`)
    for classification models. Subclasses only need to implement `forward`.
    '''
    def __init__(self) -> None:
        '''Constructor.'''
        super(BaseModel, self).__init__()
        # Total number of epochs trained so far, accumulated across `fit` calls.
        self.epoch = 0

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''Forward pass. Must be implemented by subclasses.'''
        raise NotImplementedError

    def fit(self, train_loader: DataLoader, val_loader: DataLoader,
            num_epochs: int = 30, lr: float = 0.001, wd: float = 0.,
            try_cuda: bool = True, quite: bool = False, print_stride: int = 1) -> \
                  tuple[list[float], list[float], list[float]]:
        '''
        Base function for training the model with Adam and cross-entropy loss.

        Args:
            train_loader (DataLoader) - The dataloader to fit the model to.
            val_loader (DataLoader) - The dataloader to validate the model on.
            num_epochs (int) - Number of epochs.
            lr (float) - Learning rate.
            wd (float) - Weight decay.
            try_cuda (bool) - Try to use CUDA.
            quite (bool) - Quite mode (suppresses the progress prints).
            print_stride (int) - Print stride (in epochs).

        Returns:
            costs (list[float]) - Mean training loss per sample, for each epoch
            train_accs (list[float]) - Training accuracies over all epochs
            val_accs (list[float]) - Validation accuracies over all epochs
        '''

        costs: list[float] = []
        train_accs: list[float] = []
        val_accs: list[float] = []

        use_cuda = try_cuda and torch.cuda.is_available()
        if use_cuda:
            self.cuda()
            print("Using CUDA for traininig.")
        else:
            self.cpu()
            print("Using CPU for training.")

        # Create the optimizer and criterion.
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=wd)
        criterion = nn.CrossEntropyLoss()

        start_epoch = self.epoch
        for epoch in range(num_epochs):
            # Make sure dropout / normalization layers are in training mode, even
            # if the model was left in evaluation mode before this call.
            self.train()
            train_true = 0.
            running_loss = 0.
            seen = 0
            self.epoch += 1
            for mb, (x, y) in enumerate(train_loader):
                if use_cuda:
                    x, y = x.cuda(), y.cuda()

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                y_hat = self(x)

                loss = criterion(y_hat, y)
                loss.backward()
                optimizer.step()

                # Calc loss (the criterion returns the batch mean, so weight it
                # by the batch size before accumulating).
                lloss = loss.item()
                running_loss += lloss * x.size(0)
                seen += x.size(0)

                # Calc accuracy
                train_true += (y_hat.argmax(1) == y).sum().item()

                if not quite:
                    print(f"\r[epoch: {self.epoch:02d}/{start_epoch + num_epochs:02d}", end=" ")
                    print(f"mb: {mb + 1:03d}/{len(train_loader):03d}]", end=" ")
                    print(f"loss: {lloss:.6f}", end="")

            # Average over the samples actually processed, which stays correct
            # when the loader does not yield every sample of its dataset.
            epoch_loss = running_loss / max(seen, 1)
            train_acc = train_true / max(seen, 1)
            val_acc = self.calc_acc(val_loader, use_cuda)
            costs.append(epoch_loss)
            train_accs.append(train_acc)
            val_accs.append(val_acc)

            if not quite and (epoch % print_stride == 0 or epoch == num_epochs - 1):
                print(f"\r[epoch: {self.epoch:02d}/{start_epoch + num_epochs:02d}]", end=" ")
                print(f"[Total Loss: {epoch_loss:.6f}]", end=" ")
                print(f"[Train Acc: {100 * train_acc:.3f}%]", end=" ")
                print(f"[Val Acc: {100 * val_acc:.3f}%]")

        return costs, train_accs, val_accs

    def calc_acc(self, data_loader: DataLoader, use_cuda: bool) -> float:
        '''
        Calculates and returns the accuracy of the model on a give dataset.

        The model is evaluated in evaluation mode without tracking gradients, and
        the mode it was in before the call is restored afterwards.

        Args:
            data_loader (DataLoader) - Data loader.
            use_cuda (bool) - Use CUDA flag.

        Returns:
            float - Fraction of correctly classified samples (0. for an empty loader).
        '''

        was_training = self.training
        self.eval()
        correct = 0
        seen = 0
        try:
            # No graph is needed for evaluation; this saves memory and time.
            with torch.no_grad():
                for x, y in data_loader:
                    if use_cuda:
                        x, y = x.cuda(), y.cuda()
                    # Same argmax-based count as the training accuracy in `fit`;
                    # works for any number of classes.
                    correct += (self(x).argmax(1) == y).sum().item()
                    seen += x.size(0)
        finally:
            self.train(was_training)

        return correct / seen if seen else 0.

In [5]:
class NormalNet(BaseModel):
    '''Normal Net
    Baseline CNN built from PyTorch's stock normalization and dropout layers:
    two Conv -> BatchNorm2d -> ReLU -> MaxPool stages followed by a fully
    connected head that uses Dropout and BatchNorm1d and outputs 10 logits.
    '''
    def __init__(self) -> None:
        super().__init__()

        # Feature extractor: two conv stages, each ending in a 2x2 max-pool.
        self.convs = nn.Sequential(
            *self._conv_stage(1, 32, kernel_size=5, padding=2),
            *self._conv_stage(32, 64, kernel_size=3, padding=1))

        # Classifier head on the flattened 64 * 7 * 7 feature map.
        head = [
            nn.Dropout(0.5),
            nn.Linear(64 * 7 * 7, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, 10),
        ]
        self.fc = nn.Sequential(*head)

    @staticmethod
    def _conv_stage(in_channels: int, out_channels: int,
                    kernel_size: int, padding: int) -> list[nn.Module]:
        '''Builds the layers of one Conv -> BatchNorm2d -> ReLU -> MaxPool(2, 2) stage.'''
        return [
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''Runs the conv stages, flattens per sample and applies the head.'''
        features = self.convs(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [6]:
# Baseline run: build the NormalNet model and train it for 10 epochs,
# reporting the metrics after every epoch.
normal_model = NormalNet()

normal_metrics = normal_model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    lr=0.0025,
    wd=0.0025,
    try_cuda=True,
    print_stride=1,
)

Using CUDA for traininig.
[epoch: 01/10] [Total Loss: 0.543879] [Train Acc: 81.782%] [Val Acc: 86.750%]
[epoch: 02/10] [Total Loss: 0.380921] [Train Acc: 86.937%] [Val Acc: 88.870%]
[epoch: 03/10] [Total Loss: 0.353413] [Train Acc: 87.872%] [Val Acc: 89.390%]
[epoch: 04/10] [Total Loss: 0.336608] [Train Acc: 88.480%] [Val Acc: 88.950%]
[epoch: 05/10] [Total Loss: 0.328862] [Train Acc: 88.550%] [Val Acc: 89.780%]
[epoch: 06/10] [Total Loss: 0.321544] [Train Acc: 89.047%] [Val Acc: 89.250%]
[epoch: 07/10] [Total Loss: 0.320743] [Train Acc: 88.882%] [Val Acc: 90.280%]
[epoch: 08/10] [Total Loss: 0.314472] [Train Acc: 89.203%] [Val Acc: 90.160%]
[epoch: 09/10] [Total Loss: 0.308353] [Train Acc: 89.355%] [Val Acc: 90.150%]
[epoch: 10/10] [Total Loss: 0.307689] [Train Acc: 89.380%] [Val Acc: 90.580%]


In [None]:
class CostumeNet(BaseModel):
    '''Costume Net model.
    Same architecture as the NormalNet model, but with a DropNorm layer where
    NormalNet uses batch normalization, and without separate Dropout layers.
    '''
    def __init__(self) -> None:
        super().__init__()

        # Feature extractor: Conv -> DropNorm -> ReLU -> MaxPool, twice.
        conv_layers: list[nn.Module] = []
        conv_layers += [nn.Conv2d(1, 32, kernel_size=5, padding=2), DropNorm(32)]
        conv_layers += [nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=2, stride=2)]
        conv_layers += [nn.Conv2d(32, 64, kernel_size=3, padding=1), DropNorm(64)]
        conv_layers += [nn.ReLU(inplace=True), nn.MaxPool2d(kernel_size=2, stride=2)]
        self.convs = nn.Sequential(*conv_layers)

        # Classifier head on the flattened 64 * 7 * 7 feature map, 10 output logits.
        self.fc = nn.Sequential(
            nn.Linear(64 * 7 * 7, 128),
            DropNorm(128),
            nn.ReLU(inplace=True),
            nn.Linear(128, 10),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        '''Runs the conv stages, flattens per sample and applies the head.'''
        features = self.convs(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

In [None]:
# DropNorm run: build the CostumeNet model and train it with the same
# hyper parameters as the baseline, reporting the metrics every second epoch.
costume_model = CostumeNet()

costume_metrics = costume_model.fit(
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    lr=0.0025,
    wd=0.0025,
    try_cuda=True,
    print_stride=2,
)

## Results Comparison

In [None]:
# Results comparison

