In [1]:
import time
import copy

import numpy as np 
import pandas as pd 

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Subset

import torchvision
import torchvision.transforms as transforms

from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

!pip install torchmetrics
!git clone https://github.com/m0hssn/Metrica.git
from Metrica.metrica import Metrica

Cloning into 'Metrica'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 16 (delta 5), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (16/16), 5.39 KiB | 1.80 MiB/s, done.
Resolving deltas: 100% (5/5), done.


### Overview of GoogLeNet Model – CNN Architecture

The GoogLeNet, also known as Inception V1, emerged from collaborative research by Google and various universities in 2014. Its unveiling in the paper titled “Going Deeper with Convolutions” marked its victory at the ILSVRC 2014 image classification challenge. Notably surpassing prior winners like AlexNet (ILSVRC 2012) and ZF-Net (ILSVRC 2013), as well as yielding a considerably lower error rate than VGG (the 2014 runner-up), this architecture introduced innovations such as 1×1 convolutions positioned within the architecture and global average pooling.

#### Key Aspects of GoogLeNet:

The GoogLeNet architecture markedly diverges from preceding state-of-the-art models like AlexNet and ZF-Net. Its distinctive features, including 1×1 convolution and global average pooling, facilitate the creation of a deeper architecture. Here are the pivotal elements within the architecture:

- 1×1 Convolution: The Inception architecture incorporates 1×1 convolutions to reduce the number of parameters, thereby deepening the architecture. For instance, by employing 1×1 convolutions in intermediate stages, it significantly cuts down the total number of operations compared to traditional convolutions without compromising performance.
  
  ![Example Image](https://media.geeksforgeeks.org/wp-content/uploads/20200429201100/without1x1.png){width=50%}

  - Total Number of operations without 1×1 convolutions: 112.9 M
  - With 1×1 convolution: 5.3M, a significant reduction.

- Global Average Pooling: In contrast to earlier architectures like AlexNet, GoogLeNet employs global average pooling at the network's end. This strategic layer condenses a 7×7 feature map to 1×1, effectively decreasing trainable parameters to zero and enhancing top-1 accuracy by 0.6%.

- Inception Module: Unique to this architecture, the Inception module features parallel 1×1, 3×3, 5×5 convolutions and 3×3 max pooling. These operations, stacked together, allow the handling of objects at multiple scales more efficiently.

  ![Conv Layer](https://media.geeksforgeeks.org/wp-content/uploads/20200429201304/Incepption-module.PNG)

- Auxiliary Classifier for Training: GoogLeNet integrates intermediate classifier branches within the architecture solely for training purposes. These branches comprise various layers aimed at combating gradient vanishing and providing regularization, thereby contributing to the overall loss with a weight of 0.3.

- Model Architecture: Spanning 22 layers, the design prioritizes computational efficiency, making it feasible for implementation even on devices with limited computational resources. It includes two auxiliary classifier layers connected to the output of Inception (4a) and Inception (4d) layers.

  ![Google-Net](https://media.geeksforgeeks.org/wp-content/uploads/20200429201549/Inceptionv1_architecture.png)


In [2]:
class Auxiliary(nn.Module):
    
    def __init__(self, in_channels, num_classes):
        super(Auxiliary, self).__init__()
        self.avgpool = nn.AvgPool2d(kernel_size=5, stride=3)
        self.conv1x1 = ConvBlock(in_channels, 128, kernel_size=1)
        
        self.fc1 = nn.Linear(2048, 1024)
        self.fc2 = nn.Linear(1024, num_classes)
        
        self.dropout = nn.Dropout(0.7)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.avgpool(x)
        x = self.conv1x1(x)
        x = x.reshape(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        x = F.softmax(x, dim=1)
        return x

class ConvBlock(nn.Module):
    
    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        return self.relu(self.bn(self.conv(x)))

class InceptionBlock(nn.Module):
    
    def __init__(self, im_channels, num_1x1, num_3x3_red, num_3x3, num_5x5_red, num_5x5, num_pool_proj):
        super(InceptionBlock, self).__init__()
        
        self.one_by_one = ConvBlock(im_channels, num_1x1, kernel_size=1)
        
        self.tree_by_three_red = ConvBlock(im_channels, num_3x3_red, kernel_size=1)  
        self.tree_by_three = ConvBlock(num_3x3_red, num_3x3, kernel_size=3, padding=1)
        
        self.five_by_five_red = ConvBlock(im_channels, num_5x5_red, kernel_size=1)
        self.five_by_five = ConvBlock(num_5x5_red, num_5x5, kernel_size=5, padding=2)
        
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.pool_proj = ConvBlock(im_channels, num_pool_proj, kernel_size=1)
         
    def forward(self, x):
        x1 = self.one_by_one(x)
        
        x2 = self.tree_by_three_red(x)
        x2 = self.tree_by_three(x2)
        
        x3 = self.five_by_five_red(x)
        x3 = self.five_by_five(x3)
        
        x4 = self.maxpool(x)
        x4 = self.pool_proj(x4)
        
        x = torch.cat([x1, x2, x3, x4], 1)
        return x

    
class LeNet(nn.Module):
    
    def __init__(self, in_channels=3, use_auxiliary=True, num_classes=1000):
        super(LeNet, self).__init__()
        
        self.conv1 = ConvBlock(in_channels, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv2 = ConvBlock(64, 192, kernel_size=3, stride=1, padding=1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception3a = InceptionBlock(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = InceptionBlock(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception4a = InceptionBlock(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = InceptionBlock(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = InceptionBlock(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = InceptionBlock(512, 112, 144, 288, 32, 64, 64)
        self.auxiliary4a = Auxiliary(512, num_classes)
        
        self.inception4e = InceptionBlock(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception5a = InceptionBlock(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = InceptionBlock(832, 384, 192, 384, 48, 128, 128)
        
        self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1)
        self.dropout = nn.Dropout(0.4)
        self.linear = nn.Linear(1024, num_classes)
        
        self.use_auxiliary = use_auxiliary
        if use_auxiliary:
            self.auxiliary4d = Auxiliary(528, num_classes)

    def forward(self, x):
        auxiliary_outputs = []

        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        
        x = self.inception3a(x)
        x = self.inception3b(x)
        x = self.maxpool3(x)
        
        x = self.inception4a(x)
        auxiliary_outputs.append(self.auxiliary4a(x))
        
        x = self.inception4b(x)
        x = self.inception4c(x)
        x = self.inception4d(x)
        if self.use_auxiliary:
            auxiliary_outputs.append(self.auxiliary4d(x))
        
        x = self.inception4e(x)
        x = self.maxpool4(x)
        
        x = self.inception5a(x)
        x = self.inception5b(x)
        
        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.dropout(x)
        
        x = self.linear(x)
        x = F.softmax(x, dim=1)
        if self.use_auxiliary:
            return x, auxiliary_outputs
        else:
            return x

In [3]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:05<00:00, 29709000.29it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [18]:
model = LeNet(in_channels=3, num_classes=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)  

In [19]:
def train_model(model, criterion, optimizer, train_loader, test_loader, num_epochs=5, use_auxiliary=True):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (training)", leave=False)
        running_loss = 0.0
        
        running_correct = 0
        running_total = 0
        
#         test_metrica = Metrica(num_classes=10)
        
        for i, (inputs, labels) in enumerate(train_pbar):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            if use_auxiliary:
                outputs, aux_outs = model(inputs)
                loss = criterion(outputs, labels) + 0.3 * criterion(aux_outs[0], labels) + 0.3 * criterion(aux_outs[1], labels)
            else:
                outputs, _, _ = model(inputs)
                loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_total += labels.size(0)
            running_correct += (preds == labels).sum().item()
            
            train_pbar.set_postfix({"loss": running_loss / (i+1),
                                    "accuracy": 100 * running_correct / running_total})
        model.eval()
        running_loss_test = 0.0
        running_total_test = 0
        running_correct_test = 0
        
        test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} (testing)", leave=False)

        for i, (inputs, labels) in enumerate(test_pbar):
            inputs, labels = inputs.to(device), labels.to(device)
            if use_auxiliary:
                outputs, aux_outs = model(inputs)
                loss = criterion(outputs, labels) + 0.3 * criterion(aux_outs[0], labels) + 0.3 * criterion(aux_outs[1], labels)
            else:
                outputs, _, _ = model(inputs)
                loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
#             test_metrica.upgrade(outputs, labels)

            running_loss_test += loss.item()
            running_total_test += labels.size(0)
            running_correct_test += (preds == labels).sum().item()
            
            test_pbar.set_postfix({"loss": running_loss_test / (i+1),
                                    "accuracy": 100 * running_correct_test / running_total_test})
        print(f'train acc = {100 * running_correct / running_total}, test acc: {100 * running_correct_test / running_total_test}')
#         test_metrica.print_metrics()
# metrica is a method to show classification scores but theres a problem there i will fix later since we save all the predicted values and the actual ones it takes alot of ram so something must be done about that

In [20]:
train_model(model, criterion, optimizer, trainloader, testloader)

                                                                                                 

train acc = 46.646, test acc: 58.25


                                                                                                 

train acc = 61.438, test acc: 59.1


                                                                                                 

train acc = 68.202, test acc: 68.38


                                                                                                 

train acc = 72.864, test acc: 70.73


                                                                                                 

train acc = 76.334, test acc: 71.4


![ResNet Architecture](https://www.researchgate.net/publication/354224999/figure/fig1/AS:1126083944562688@1645490742169/The-structure-of-the-ResNet34-CNN-Network-The-input-of-the-network-is-the-preprocessed.png)

### Solving the Vanishing Gradients Issue with ResNet:

![Residual connection](https://www.google.com/url?sa=i&url=https%3A%2F%2Fmedium.com%2F%40achronus%2Fexploring-residual-connections-in-transformers-2cd18b9e35eb&psig=AOvVaw2L2R1QdOpqJ-Jz9ybSbyhZ&ust=1701533098466000&source=images&cd=vfe&opi=89978449&ved=0CBIQjRxqFwoTCMDp94HP7oIDFQAAAAAdAAAAABAI)

ResNet50's architecture introduced skip connections, also termed residual connections, as a fundamental aspect. These connections were pivotal in enabling the network to delve into deeper architectures while mitigating the problem of vanishing gradients.

The issue of vanishing gradients arises during the training of deep neural networks when the gradients in the deeper layers diminish significantly, hindering the learning process for those layers. This challenge exacerbates as the network's depth increases.

Skip connections circumvent this challenge by allowing information to flow directly from the input to the output of the network, circumventing one or more intermediate layers. This facilitates the learning of residual functions that map the input to the desired output, alleviating the need to learn the entire mapping from scratch.



In the image above, alongside the standard connections, a direct connection bypassing certain layers in the model is depicted (skip connection). With this, the output transforms from h(x) = f(wx +b) to h(x) = f(x) + x. These skip connections act as alternate pathways for gradients to flow, providing valuable shortcuts for learning.


In [2]:
class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, **kwargs):
        super(ConvBlock, self).__init__()
        
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.batchnorm(x)
        x = self.relu(x)
        return x

    

class ResidualBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super(ResidualBlock, self).__init__()
        
        self.conv1 = ConvBlock(in_channels, out_channels, kernel_size = 3, stride = stride, padding = 1)
        self.conv2 = ConvBlock(out_channels, out_channels, kernel_size = 3, stride = 1, padding = 1)
        
        self.downsample = downsample
        
        self.relu = nn.ReLU()
        self.out_channels = out_channels
        
    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.conv2(out)
        if self.downsample:
            residual = self.downsample(x)
        out = out + residual
        out = self.relu(out)
        return out

In [3]:
class ResNet34(nn.Module):
    def __init__(self, block, layers, num_classes = 10):
        super(ResNet34, self).__init__()
        
        self.inplanes = 64
        self.conv1 = ConvBlock(3, 64, kernel_size = 7, stride = 2, padding = 3)
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride = 2, padding = 1)
        
        self.layer0 = self._make_layer(block, 64, layers[0], stride = 1)
        self.layer1 = self._make_layer(block, 128, layers[1], stride = 2)
        self.layer2 = self._make_layer(block, 256, layers[2], stride = 2)
        self.layer3 = self._make_layer(block, 512, layers[3], stride = 2)
        
        self.avgpool = nn.AvgPool2d(7, stride=1)
        self.fc = nn.Linear(512, num_classes)
        
    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes),)
            
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)
    
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool(x)
        x = self.layer0(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)

        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x


In [4]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),  
    transforms.ToTensor(),
    transforms.Normalize(
    mean=[0.4914, 0.4822, 0.4465],
    std=[0.2023, 0.1994, 0.2010],),
])


trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=32, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=32, shuffle=False) # set to 32 because of memory reasons


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:16<00:00, 10055922.46it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
model = ResNet34(ResidualBlock, [3, 4, 6, 3])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=3e-4)  

In [6]:
def train_resnet_model(model, criterion, optimizer, train_loader, test_loader, num_epochs=5):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    for epoch in range(num_epochs):
        model.train()
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} (training)", leave=False)
        running_loss = 0.0
        
        running_correct = 0
        running_total = 0
        
#         test_metrica = Metrica(num_classes=10)
        
        for i, (inputs, labels) in enumerate(train_pbar):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            _, preds = torch.max(outputs, 1)
            
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            running_total += labels.size(0)
            running_correct += (preds == labels).sum().item()
            
            train_pbar.set_postfix({"loss": running_loss / (i+1),
                                    "accuracy": 100 * running_correct / running_total})
        model.eval()
        running_loss_test = 0.0
        running_total_test = 0
        running_correct_test = 0
        
        test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{num_epochs} (testing)", leave=False)

        for i, (inputs, labels) in enumerate(test_pbar):
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            _, preds = torch.max(outputs, 1)
            
#             test_metrica.upgrade(outputs, labels)

            running_loss_test += loss.item()
            running_total_test += labels.size(0)
            running_correct_test += (preds == labels).sum().item()
            
            test_pbar.set_postfix({"loss": running_loss_test / (i+1),
                                    "accuracy": 100 * running_correct_test / running_total_test})
        print(f'train acc = {100 * running_correct / running_total}, test acc: {100 * running_correct_test / running_total_test}')

#         test_metrica.print_metrics()

In [7]:
train_resnet_model(model, criterion, optimizer, trainloader, testloader)

                                                                                                   

train acc = 50.45, test acc: 63.21


                                                                                                    

train acc = 70.654, test acc: 72.81


                                                                                                    

train acc = 78.396, test acc: 79.06


                                                                                                    

train acc = 82.612, test acc: 80.65


                                                                                                    

train acc = 85.654, test acc: 82.24


