<a href="https://colab.research.google.com/github/masalha-alaa/alexnet-pytorch/blob/master/AlexNet_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AlexNet Neural Network
This is an implementation of the famous AlexNet Neural Net (the winner of ImageNet 2012) which was described in the paper:
https://www.cs.toronto.edu/~hinton/absps/imagenet.pdf

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau, MultiStepLR

In [None]:
NUM_CLASSES = 1000  # Number of categories (classes) in ImageNet
IMAGE_SHAPE = (3, 227, 227)  # Channel count first, spatial size last. There is a mistake in the paper: the crop is 227, not 224.
                             # See: https://datascience.stackexchange.com/questions/29245/what-is-the-input-size-of-alex-net
EPOCHS = 90  # Number of training epochs (the learning-rate notes in this file refer to the paper's 90 epochs)


def calc_out_size(in_size, padding, kernel, stride):
    """
    Return the output side length of a convolution / pooling layer.

    Uses the standard formula: out = floor((in + 2*padding - kernel) / stride) + 1
    Formula from: https://youtu.be/wnK3uWv_WkU?t=234
    Further reading: https://blog.mlreview.com/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807

    Each of padding / kernel / stride may be an int or an indexable (e.g. a tuple). The layer is assumed to be
    symmetric, so only the first element of an indexable is used.
    """
    def first_dim(value):
        # ints are used as they are, anything else contributes its first entry
        return value if isinstance(value, int) else value[0]

    pad, window, step = (first_dim(v) for v in (padding, kernel, stride))
    return (in_size + 2 * pad - window) // step + 1


def same_padding(in_out_size, kernel_size, stride):
    """
    A pading metric called "Same", which doesn't exist in PyTorch. It exists in Keras though.
    I figured out the formula simply by extracting the "padding" variable from the formula used in calc_out_size():
    out = [(in + 2*padding - kernel) / stride] + 1
    where in == out
    padding = (out * stride - out + kernel - stride) // 2
    """
    return (in_out_size * stride - in_out_size + kernel_size - stride * 1) // 2


def weights_init(m, bias=1.0):
    """
    Initialize a layer's parameters in place.

    Weights are drawn from a Gaussian with mean 0 and std 0.01. The bias, when the layer has one, is filled
    with the constant `bias`.

    Args:
        m: a module exposing `weight` (and optionally `bias`), e.g. nn.Conv2d or nn.Linear.
        bias: constant to fill the bias with. Pass False (or None) to leave the bias untouched.
              Note that 0 / 0.0 is a real value here and does fill the bias with zeros.
    """
    torch.nn.init.normal_(m.weight, 0.0, 0.01)

    # Compare by identity instead of truthiness: `bias=0.0` is a legitimate request ("initialize to zero")
    # and must not be mistaken for "skip the bias".
    if bias is None or bias is False:
        return

    # Layers created with bias=False have `bias` set to None; there is nothing to fill.
    if getattr(m, "bias", None) is None:
        return

    torch.nn.init.constant_(m.bias, float(bias))


class AlexNet(nn.Module):
    """
    AlexNet for square input images: five convolutional stages followed by three fully-connected layers.

    While the layers are built, the spatial size after every convolution / pooling layer is tracked in
    `self.current_size` (and printed), so the input width of the first fully-connected layer is derived from
    `dimensions` instead of being hard-coded.

    Args:
        dimensions: side length (in pixels) of the square input image.
        in_channels: number of channels of the input image.
        num_classes: number of output classes.
    """

    def __init__(self, dimensions, in_channels, num_classes):
        super().__init__()
        self.dimensions = dimensions
        self.num_classes = num_classes
        self.current_size = 0  # running spatial size, maintained by update_size()

        # Note: In layers 2,3,4,5 I use "Same" padding. This is a padding metric that keeps the layer's output size the same as the
        # previous layer, by setting the appropriate padding size.
        # I use this because in the paper they don't mention the padding they picked, but one can conclude it from the layers sizes in
        # the snapshot in the paper: 55x55 => 27x27 => 13x13 => 13x13 => 13x13.
        # Also Andrew NG. says that "Same" padding is used in these layers. Watch:
        # https://www.coursera.org/lecture/convolutional-neural-networks/classic-networks-MmYe2
        # 8:04, 8:20+

        # Only the first convolution is strided. Layers 2-5 slide with stride 1: with stride 4 the feature-map
        # size was only preserved because "Same" padding compensated with a huge zero border (e.g. 41 pixels
        # around a 27x27 map), so most kernel positions saw nothing but padding.
        first_conv_stride = 4
        conv_stride = 1  # common for convolution layers 2-5

        # layer 1
        layer_kernel_size = 11
        self.conv1 = nn.Sequential(nn.Conv2d(in_channels=in_channels, out_channels=96, kernel_size=layer_kernel_size,
                                             stride=first_conv_stride, bias=False),
                                   nn.ReLU(),
                                   nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2.0),
                                   nn.MaxPool2d(kernel_size=3, stride=2))
        weights_init(self.conv1[0], bias=False)
        self.update_size(self.conv1[0], dims=self.dimensions)
        self.update_size(self.conv1[3])  # pool

        # layer 2
        layer_kernel_size = 5
        self.conv2 = nn.Sequential(nn.Conv2d(in_channels=self.conv1[0].out_channels, out_channels=256,
                                             kernel_size=layer_kernel_size, stride=conv_stride,
                                             padding=same_padding(self.current_size, kernel_size=layer_kernel_size,
                                                                  stride=conv_stride)),
                                   nn.ReLU(),
                                   nn.LocalResponseNorm(size=5, alpha=0.0001, beta=0.75, k=2.0),
                                   nn.MaxPool2d(kernel_size=3, stride=2))
        weights_init(self.conv2[0])
        self.update_size(self.conv2[0])
        self.update_size(self.conv2[3])  # pool

        # layer 3
        layer_kernel_size = 3
        self.conv3 = nn.Sequential(nn.Conv2d(in_channels=self.conv2[0].out_channels, out_channels=384,
                                             kernel_size=layer_kernel_size, stride=conv_stride,
                                             padding=same_padding(self.current_size, kernel_size=layer_kernel_size,
                                                                  stride=conv_stride),
                                             bias=False),
                                   nn.ReLU())
        weights_init(self.conv3[0], bias=False)
        self.update_size(self.conv3[0])

        # layer 4
        layer_kernel_size = 3
        self.conv4 = nn.Sequential(nn.Conv2d(in_channels=self.conv3[0].out_channels, out_channels=384,
                                             kernel_size=layer_kernel_size, stride=conv_stride,
                                             padding=same_padding(self.current_size, kernel_size=layer_kernel_size,
                                                                  stride=conv_stride)),
                                   nn.ReLU())
        weights_init(self.conv4[0])
        self.update_size(self.conv4[0])

        # layer 5
        layer_kernel_size = 3
        self.conv5 = nn.Sequential(nn.Conv2d(in_channels=self.conv4[0].out_channels, out_channels=256,
                                             kernel_size=layer_kernel_size, stride=conv_stride,
                                             padding=same_padding(self.current_size, kernel_size=layer_kernel_size,
                                                                  stride=conv_stride)),
                                   nn.ReLU(),
                                   nn.MaxPool2d(kernel_size=3, stride=2))
        weights_init(self.conv5[0])
        self.update_size(self.conv5[0])
        self.update_size(self.conv5[2])  # pool

        # Note: Not clear whether Dropout goes before or after in the following 2 layers. To my understanding it's after.
        # See: https://stats.stackexchange.com/questions/240305/where-should-i-place-dropout-layers-in-a-neural-network
        # The hidden fully-connected layers need a non-linearity as well; without it the three Linear layers
        # collapse into a single linear map (apart from dropout). The Linear module stays at index 0 of each
        # Sequential, so parameter names are unchanged.

        # layer 6
        self.fc1 = nn.Sequential(nn.Linear(self.conv5[0].out_channels*self.current_size*self.current_size, 4096),
                                 nn.ReLU(),
                                 nn.Dropout(0.5))
        weights_init(self.fc1[0])

        # layer 7
        self.fc2 = nn.Sequential(nn.Linear(self.fc1[0].out_features, 4096),
                                 nn.ReLU(),
                                 nn.Dropout(0.5))
        weights_init(self.fc2[0])

        # output layer: raw class scores (logits).
        # No Softmax module here: nn.CrossEntropyLoss applies log-softmax itself and expects unnormalized scores,
        # so a softmax inside the model would be applied twice during training and flatten the gradients.
        # For class probabilities at inference time use torch.softmax(model(x), dim=1).
        self.fc3 = nn.Sequential(nn.Linear(self.fc2[0].out_features, self.num_classes))

    def forward(self, x):
        """
        Run the network on a batch of images.

        Args:
            x: batch of images; the first axis is the batch, followed by channels and the two spatial axes.

        Returns:
            Tensor with one row per image and `num_classes` unnormalized scores (logits) per row.
        """
        x = self.conv5(self.conv4(self.conv3(self.conv2(self.conv1(x)))))
        x = x.reshape(x.shape[0], -1)  # flatten everything but the batch axis. TODO: Try torch.flatten(x, 1)
        x = self.fc3(self.fc2(self.fc1(x)))

        return x

    def update_size(self, layer, dims=None):
        """
        Update `self.current_size` with the output size of `layer` and print it.

        Args:
            layer: a layer exposing `padding`, `kernel_size` and `stride` (e.g. nn.Conv2d, nn.MaxPool2d).
            dims: input size to start from; when omitted, the current running size is used.
        """
        # assuming symmetricity
        in_size = self.current_size if dims is None else dims
        self.current_size = calc_out_size(in_size=in_size, padding=layer.padding, kernel=layer.kernel_size,
                                          stride=layer.stride)
        print(f'self.current_size = {self.current_size}')


# Build the network from the module constants: square input side, channel count and number of classes.
model = AlexNet(dimensions=IMAGE_SHAPE[-1], in_channels=IMAGE_SHAPE[0], num_classes=NUM_CLASSES)

# Stochastic gradient descent with momentum and weight decay.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=1e-2,
    momentum=0.9,
    weight_decay=5e-4,
)

# Reducing the Learning Rate (LR):
# In the paper, the LR is divided by 10 each time the validation accuracy stops improving, which happens 3 times during the 90 epochs.
# There are several ways to do this:
#
# 1. Use ReduceLROnPlateau:
#        scheduler = ReduceLROnPlateau(optimizer, factor=0.10, min_lr=0.0001)
#    Read more:
#    https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html#torch.optim.lr_scheduler.ReduceLROnPlateau
#    https://www.geeksforgeeks.org/adjusting-learning-rate-of-a-neural-network-in-pytorch/
#    Note though that the docs don't mention how to choose the plateauing metric. You might need to use LightningModule and override
#    validation_epoch_end():
#    https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html#validation-epoch-level-metrics
#    https://github.com/PyTorchLightning/pytorch-lightning/issues/1598
#
# 2. Use MultiStepLR (the option used here):
#    3 more or less uniformly distributed milestones, which makes the schedule deterministic rather than dynamic
#    (it ignores the loss / accuracy status).
scheduler = MultiStepLR(optimizer=optimizer, milestones=[25, 50, 75], gamma=0.1)

# 3. Update it manually whenever the validation accuracy stops improving:
#        for g in optimizer.param_groups:
#            g['lr'] /= 10
#    See:
#    https://stackoverflow.com/a/48324389/900394

# Paper: "Our network maximizes the multinomial logistic regression objective", which is just the cross entropy
# according to: https://stats.stackexchange.com/questions/432896/what-is-the-loss-function-used-for-cnn#comment807624_432896
loss_function = nn.CrossEntropyLoss()
# Training-time preprocessing. The geometric transforms run on the loaded image first; ToTensor then converts it
# to a tensor, and only after that can Normalize be applied (Normalize operates on tensors, not on PIL images).
training_transformer = transforms.Compose([
    transforms.Resize(256),  # resize smaller edge to 256
    transforms.RandomCrop(IMAGE_SHAPE[-1]),  # Note: In the original paper, they use random cropping to INCREASE the size of the dataset.
                                             # We can do this in PyTorch using FiveCrop or TenCrop, but I'm avoiding it here for the sake
                                             # of simplicity.
                                             # See my answer here for more details:
                                             # https://stackoverflow.com/a/68131471/900394
    transforms.RandomHorizontalFlip(p=0.5),

    transforms.ToTensor(),

    # Must come after ToTensor: Normalize does not accept PIL images.
    # transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))  # ImageNet parameters
    transforms.Normalize((0.485, 0.456, 0.406), (1.0, 1.0, 1.0)),  # Since the paper mentions normalizing only by subtracting the mean, I disabled the STD factor,
                                                                   # but one might want to use the actual STD parameters written above.
])

# Note:
# According to the paper, "augmentation" was also applied to the test set: 5 crops per image (the 4 corners and the center) plus their
# horizontal reflections, which results in 10 patches per image and effectively INCREASES THE TEST SET SIZE. The softmax outputs of the
# 10 patches are then averaged to produce the prediction for the image.
# This can be done using torchvision.transforms.TenCrop, but for the sake of simplicity it is not applied here.
# Test-time preprocessing. Deterministic center crop; ToTensor converts the image to a tensor, and only after
# that can Normalize be applied (Normalize operates on tensors, not on PIL images).
test_transformer = transforms.Compose([
    transforms.Resize(256),  # resize smaller edge to 256
    transforms.CenterCrop(IMAGE_SHAPE[-1]),
    # transforms.TenCrop(227),  # disabled
    transforms.ToTensor(),

    # Must come after ToTensor: Normalize does not accept PIL images.
    transforms.Normalize((0.485, 0.456, 0.406), (1.0, 1.0, 1.0)),  # Although the paper refers only to the training data when talking about normalization,
                                                                   # the test data should always be scaled / normalized just as the training data.
                                                                   # So they surely do that but it's not mentioned.
])

# Note:
# I did not apply PCA data augmentation. It can be found here though:
# https://github.com/koshian2/PCAColorAugmentation/blob/master/pca_aug_numpy_tensor.py
# According to the paper, it reduces the error rate by over 1%. In a nutshell:
# To each training image, add the following:
# [p1,p2,p3][a1g1,a2g2,a3g3]
# where pi and gi are the ith eigenvector and eigenvalue of the 3x3 covariance matrix of the corresponding RGB pixel values,
# and ai is a random variable drawn from a Gaussian with mean 0 and std 0.1.
# On each epoch, generate a new alpha for each image.
# Also read here for adding a custom transform method:
# https://discuss.pytorch.org/t/how-to-add-noise-to-mnist-dataset-when-using-pytorch/59745


self.current_size = 55
self.current_size = 27
self.current_size = 27
self.current_size = 13
self.current_size = 13
self.current_size = 13
self.current_size = 13
self.current_size = 6
