# Teacher's Assignment - Extra Credit #2

***Author:*** *Ofir Paz* $\qquad$ ***Version:*** *17.07.2024* $\qquad$ ***Course:*** *22961 - Deep Learning* \
***Extra Assignment Course:*** *20999 - Extra Assignment 4*

Welcome to the first question of Extra Assignment #2, part of the course *Deep Learning*. \
In this question we will train several types of autoencoders on the MNIST dataset and compare their results using various plots.

## Imports

In [1]:
import torch  # pytorch.
import torch.nn as nn  # neural network module.
import numpy as np  # numpy - scientific calculations.
from torch.utils.data import DataLoader, Dataset  # data handling.
from sklearn.datasets import fetch_openml  # To fetch the hand-written digits dataset.
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets.
import matplotlib.pyplot as plt  # plotting module.
from tqdm.notebook import tqdm  # Progress bar
from base_model import BaseModel  # base model class.

# Type hinting.
from torch import Tensor
from typing import Optional, Literal, Tuple

## Loading MNIST

In [None]:
# Download the MNIST digits from OpenML.
mnist = fetch_openml('mnist_784', version=1, parser='auto')

# Only the pixel values are kept; an autoencoder has no use for the labels.
dataset = mnist['data']
print(f"The shape of the data array is: {dataset.shape}")

# Hold out one seventh of the samples for validation (fixed seed for
# reproducibility) and convert both splits to plain numpy arrays.
train_set, val_set = (
    np.array(split)
    for split in train_test_split(dataset, test_size=1/7, random_state=42)
)

print(f"Training set shape: {train_set.shape}")
print(f"Validation set shape: {val_set.shape}")

In [3]:
# Dataset wrapper that serves each image as both the input and the target.
class MNISTDataset(Dataset):
    """In-memory image dataset for autoencoder training.

    The raw pixel rows are converted once, at construction time, into a
    float32 tensor of shape (N, 1, 28, 28) scaled by 1/255. Every item is an
    `(input, target)` pair in which both entries are the same image.

    Args:
        data (np.ndarray): Pixel array that can be viewed as (N, 1, 28, 28).
    """
    # (channels, height, width) of a single sample.
    __mnist_shape__: Tuple[int, int, int] = (1, 28, 28)

    def __init__(self, data: np.ndarray):
        pixels = torch.from_numpy(data).to(torch.float32)
        images = pixels.view(-1, *self.__mnist_shape__)
        self.data = images / 255.0

    def __len__(self) -> int:
        """Return the number of stored images."""
        return self.data.size(0)

    def __getitem__(self, idx: int) -> Tuple[Tensor, Tensor]:
        """Return the image at `idx` twice: once as input, once as target."""
        return (self.data[idx], self.data[idx])

In [4]:
batch_size = 256

# Wrap both splits in datasets, then in loaders. Only the training loader
# shuffles; the validation loader keeps a fixed order.
train_dataset, val_dataset = MNISTDataset(train_set), MNISTDataset(val_set)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

## The Fully Connected Auto Encoder

In [5]:
class BaseAutoEncoder(BaseModel):
    """Common scaffolding for the autoencoders in this notebook.

    Subclasses supply the individual layers through `_make_layer`; this class
    chains them into `encoder` and `decoder` `nn.Sequential` stacks and can
    optionally record the ReLU activations of a single-image forward pass.
    """
    def __init__(self, layer_parms: dict, *args, **kwargs) -> None:
        """Build the encoder and decoder stacks.

        Args:
            layer_parms (dict): Per-layer settings read by `_make_layer`. The
                length of its first value sets the number of layers per side.
            *args, **kwargs: Forwarded to the `BaseModel` constructor.
        """
        super().__init__(*args, **kwargs)
        self.save_layer_outs: bool = False
        self._layer_outs: list = []
        self._layer_parms = layer_parms

        depth = len(list(layer_parms.values())[0])
        assert depth > 0, "At least one hidden layer is required."

        # Encoder layout: layer, (ReLU, layer) * (depth - 1), Sigmoid.
        encoder_stack = [self._make_layer(-1, "encoder")]
        for idx in range(depth - 1):
            encoder_stack.append(nn.ReLU())
            encoder_stack.append(self._make_layer(idx, "encoder"))
        encoder_stack.append(nn.Sigmoid())
        self.encoder = nn.Sequential(*encoder_stack)

        # Decoder layout: (layer, ReLU) * (depth - 1), layer, Sigmoid.
        decoder_stack = []
        for idx in range(depth - 1):
            decoder_stack += [self._make_layer(idx, "decoder"), nn.ReLU()]
        decoder_stack += [self._make_layer(-1, "decoder"), nn.Sigmoid()]
        self.decoder = nn.Sequential(*decoder_stack)

    def forward(self, x: Tensor) -> Tensor:
        """Encode `x` and decode the result."""
        return self.decoder(self.encoder(x))

    def _make_layer(self, idx: int, trasnform_type: Literal["encoder", "decoder"]) -> nn.Module:
        """Create one layer; must be overridden by subclasses.

        `idx` is -1 for the layer that touches the input/output space
        (first encoder layer, last decoder layer) and 0, 1, ... otherwise.
        """
        raise NotImplementedError

    @property
    def layer_outs(self) -> list:
        """Return the recorded activations and reset the recorder.

        Reading this property switches recording off and empties the
        internal buffer, so each recording can be read only once.
        """
        self.save_layer_outs = False
        recorded, self._layer_outs = self._layer_outs, []
        return recorded

    def _attach_hooks(self) -> None:
        """Register `_hook_fn` as a forward hook on every encoder and decoder layer."""
        assert hasattr(self, "encoder") and hasattr(self, "decoder"), \
            "Model must have 'encoder' and 'decoder' attributes."
        for layer in [*self.encoder, *self.decoder]:
            layer.register_forward_hook(self._hook_fn)

    def _hook_fn(self, module: nn.Module, input: Tuple[Tensor], output: Tensor) -> None:
        """Store a ReLU output as a square 2-D numpy image while recording is on."""
        if not self.save_layer_outs or not isinstance(module, nn.ReLU):
            return

        assert output.size(0) == 1, "For this operation, only 1 image in a batch is allowed."
        # The activation is shown as an image, so it must reshape to a square.
        assert int(output.numel() ** 0.5) ** 2 == output.numel(), "Only square images are supported."
        side_len = int(output.numel() ** 0.5)
        snapshot = output.clone().view(side_len, side_len)
        self._layer_outs.append(snapshot.cpu().numpy())

In [6]:
class FCAutoEncoder(BaseAutoEncoder):
    """
    Fully connected autoencoder for flattened 28x28 images.

    Args:
        layers (list[int]): Output width of each encoder layer, in order; the
            last entry is the latent size. The decoder mirrors these widths
            in reverse and ends at the input size.
        **kwargs: Forwarded to the base class constructor.

    Attributes:
        encoder (nn.Sequential): The encoder part of the autoencoder.
        decoder (nn.Sequential): The decoder part of the autoencoder.
    """
    # Number of features of a flattened input image.
    __input_size__: int = 28 * 28

    def __init__(self, layers: list[int], **kwargs) -> None:
        super().__init__({"layers": layers}, **kwargs)
        self._attach_hooks()

    def forward(self, x: Tensor) -> Tensor:
        """Flatten each sample to a vector, then run the encoder and decoder."""
        flat = x.view(x.size(0), -1)
        return super().forward(flat)

    def _make_layer(self, idx: int, trasnform_type: Literal["encoder", "decoder"]) -> nn.Module:
        """Return the `nn.Linear` layer for position `idx` of the given stack.

        Raises:
            ValueError: If `trasnform_type` is neither "encoder" nor "decoder".
        """
        if trasnform_type not in ("encoder", "decoder"):
            raise ValueError("Invalid transform type.")

        widths = self._layer_parms["layers"]
        if trasnform_type == "encoder":
            if idx == -1:
                # First encoder layer: input space -> first hidden width.
                fan_in, fan_out = self.__input_size__, widths[0]
            else:
                fan_in, fan_out = widths[idx], widths[idx + 1]
        else:
            if idx == -1:
                # Last decoder layer: first hidden width -> input space.
                fan_in, fan_out = widths[0], self.__input_size__
            else:
                # Walk the widths backwards, starting from the latent size.
                fan_in, fan_out = widths[-idx - 1], widths[-idx - 2]

        return nn.Linear(fan_in, fan_out)

## Training The FC Auto Encoder

To train the autoencoder, I chose the Mean Squared Error (MSE) loss function, since it directly measures the per-pixel difference between the reconstructed image and the original image.

In [7]:
# Hidden-layer layouts for the fully connected autoencoders.

# Sanity check: the latent space is as large as the input, so the model
# should be able to reconstruct the input perfectly.
layers_config1 = [28 * 28]

# A decent architecture: a few hidden layers and a large enough latent space.
layers_config2 = [400, 400, 64]

# A very small latent space; the reconstruction is expected to be poor.
layers_config3 = [16, 4]

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
fc_autoencoder1 = FCAutoEncoder(layers=layers_config1, job_type="regression")
fc_autoencoder1.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
fc_autoencoder1.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
fc_autoencoder2 = FCAutoEncoder(layers=layers_config2, job_type="regression")
fc_autoencoder2.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
fc_autoencoder2.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
fc_autoencoder3 = FCAutoEncoder(layers=layers_config3, job_type="regression")
fc_autoencoder3.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
fc_autoencoder3.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

## Visualizing The Encoding-Decoding Path

In [11]:
def plot_encoding_decoding_path(model: BaseAutoEncoder, input_img: Tensor, input_shape: Tuple) -> None:
    """Plot one image's path through the autoencoder as a two-row grid.

    The top row shows the first half of the stages (input image followed by
    recorded activations) left to right; the bottom row shows the second half
    in reverse order, so the reconstruction sits directly below the input.

    Side effects: switches the model to eval mode and enables activation
    recording; reading `model.layer_outs` afterwards consumes the recording.

    Args:
        model (BaseAutoEncoder): Model whose intermediate outputs are shown.
        input_img (Tensor): A single image with 28 * 28 elements.
        input_shape (Tuple): Shape the image is viewed as before being fed
            to the model.
    """
    model.save_layer_outs = True
    model.eval()
    with torch.no_grad():
        reconstruction = model(input_img.view(*input_shape))
        output_img = reconstruction.view(28, 28).numpy()

    stages = [input_img.view(28, 28).numpy(), *model.layer_outs, output_img]
    total = len(stages)
    cols = total // 2

    plt.figure(figsize=(10, 7))

    # Top row: stages in forward order.
    for col in range(cols):
        plt.subplot(2, cols, col + 1)
        plt.imshow(stages[col], cmap='gray')
        plt.title(f"Layer {col+1} input")

    # Bottom row: remaining stages, mirrored so the last stage comes first.
    for slot in range(cols, total):
        mirrored = total - slot + cols - 1
        plt.subplot(2, cols, slot + 1)
        plt.imshow(stages[mirrored], cmap='gray')
        plt.title(f"Layer {mirrored + 1} output")

    plt.show()

In [None]:
# Push the same validation image through every fully connected model.
specific_input = val_dataset[10][0]
for fc_autoencoder in (fc_autoencoder1, fc_autoencoder2, fc_autoencoder3):
    plot_encoding_decoding_path(fc_autoencoder, specific_input, input_shape=(1, -1))

In [13]:
class ConvolutionalAutoEncoder(BaseAutoEncoder):
    """
    Fully convolutional autoencoder model.

    The encoder is a stack of `nn.Conv2d` layers; the decoder mirrors it with
    `nn.ConvTranspose2d` layers that use the same kernel sizes and channel
    counts in reverse order, so the output has the same shape as the input.

    Args:
        ksizes (list[int]): Kernel size of each encoder convolution, in order.
        channels (Optional[list[int]]): Output channels of each encoder
            convolution, in order. Defaults to one channel per layer.
        **kwargs: Forwarded to the base class constructor.

    Attributes:
        encoder (nn.Sequential): The encoder part of the autoencoder.
        decoder (nn.Sequential): The decoder part of the autoencoder.

    Raises:
        ValueError: If `channels` is given and its length differs from `ksizes`.
    """
    # Number of channels of the input (and reconstructed) image.
    __input_channels__ = 1

    def __init__(self, ksizes: list[int], channels: Optional[list[int]] = None, **kwargs) -> None:
        # Every layer needs exactly one kernel size and one channel count;
        # a mismatch would otherwise build an encoder and decoder that do not mirror.
        if channels is not None and len(channels) != len(ksizes):
            raise ValueError("'channels' and 'ksizes' must have the same length.")

        super().__init__({
            "channels": channels if channels is not None else [self.__input_channels__] * len(ksizes),
            "ksizes": ksizes
            }, **kwargs)

        # The recording hook reshapes each activation into a single square
        # image, which is only possible when every layer has one channel.
        if not channels or np.all(np.array(channels) == 1):
            self._attach_hooks()

    def forward(self, x: Tensor) -> Tensor:
        """Run the encoder and decoder on `x` (no reshaping is applied)."""
        return super().forward(x)

    def _make_layer(self, idx: int, trasnform_type: Literal["encoder", "decoder"]) -> nn.Module:
        """Return the convolutional layer for position `idx` of the given stack.

        `idx == -1` denotes the layer touching the image space (first encoder
        layer / last decoder layer); `idx >= 0` denotes the remaining layers
        in the order they are applied.

        Raises:
            ValueError: If `trasnform_type` is neither "encoder" nor "decoder".
        """
        channels = self._layer_parms["channels"]
        ksizes = self._layer_parms["ksizes"]

        if trasnform_type == "encoder":
            if idx == -1:
                return nn.Conv2d(self.__input_channels__, channels[0], ksizes[0])

            # Encoder layer idx + 1 maps channels[idx] -> channels[idx + 1] and
            # must use that layer's own kernel size, ksizes[idx + 1]. Using
            # ksizes[idx] would repeat ksizes[0] and break the mirror with the
            # decoder whenever the kernel sizes differ.
            return nn.Conv2d(channels[idx], channels[idx + 1], ksizes[idx + 1])

        elif trasnform_type == "decoder":
            if idx == -1:
                return nn.ConvTranspose2d(channels[0], self.__input_channels__, ksizes[0])

            # Undo the encoder layers from the last one backwards.
            return nn.ConvTranspose2d(channels[-idx - 1], channels[-idx - 2], ksizes[-idx - 1])

        else:
            raise ValueError("Invalid transform type.")

In [14]:
# Sanity check: with a single 1x1 kernel the model should be able to
# reconstruct the input perfectly.
ksizes1 = [1]

# A decent architecture with a large enough latent space and few hidden layers.
# Latent space is (28 - 6 - 6)^2 = 256.
ksizes2 = [7, 7]

# A good architecture with multiple channels (its activations cannot be displayed).
# Latent space is (28 - 6 - 6 - 6)^2 * 2 = 200.
channels3 = [2, 2, 2]
ksizes3 = [7, 7, 7]

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
conv_autoencoder1 = ConvolutionalAutoEncoder(ksizes=ksizes1, job_type="regression")
conv_autoencoder1.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
conv_autoencoder1.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
conv_autoencoder2 = ConvolutionalAutoEncoder(ksizes=ksizes2, job_type="regression")
conv_autoencoder2.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
conv_autoencoder2.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

In [None]:
# Train in two passes: first at lr=0.001, then again at a 10x smaller rate.
conv_autoencoder3 = ConvolutionalAutoEncoder(
    ksizes=ksizes3, channels=channels3, job_type="regression",
)
conv_autoencoder3.fit(
    train_loader, val_loader,
    num_epochs=15, lr=0.001, print_stride=3,
)
conv_autoencoder3.fit(
    train_loader, val_loader,
    num_epochs=10, lr=0.0001, print_stride=2,
)

In [None]:
# Push the same validation image through the single-channel convolutional models.
specific_input = val_dataset[10][0]
for conv_autoencoder in (conv_autoencoder1, conv_autoencoder2):
    plot_encoding_decoding_path(conv_autoencoder, specific_input, input_shape=(1, 28, 28))

## Conclusions

We explored the MLP Auto Encoder vs the Fully Convolutional Auto Encoder on the MNIST dataset.

We saw that the MLP Auto Encoder performs better overall than the fully convolutional one, probably because the convolutional one lacks fully connected layers: without them, each latent unit only sees a limited receptive field rather than the whole image.

Also, the larger the latent space of the network, the better it performs, since it doesn't have to encode the image into a smaller space.

Choosing the network parameters and the architecture is obviously crucial for the network's performance.

We chose MSE as the loss function since it evaluates the model based on the pixel similarity between the original and the reconstructed image, which is exactly the metric we want to optimize.