In [None]:
import sys
sys.path.append("../../")

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from torchvision import transforms
from torchvision.datasets import CIFAR10

from tqdm import tqdm

from utils.plots import plot_losses, plot_confusion_matrix
from utils.train import train_one_epoch, evaluate

In [None]:
# Per-channel normalization statistics (the standard pre-computed values).
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std  = (0.2023, 0.1994, 0.2010)

# Shared settings for both splits.
CIFAR10_ROOT = "../../assets/cifar10"
BATCH_SIZE = 64

# Random augmentations are applied to the training split only; they operate on
# the image before it is converted to a tensor and normalized.
train_augmentations = [
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    transforms.RandomRotation(5),
]

train_transforms = transforms.Compose(
    train_augmentations
    + [
        transforms.ToTensor(),
        transforms.Normalize(mean=cifar10_mean, std=cifar10_std),
    ]
)

# Validation images are only converted and normalized (no augmentation).
val_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=cifar10_mean, std=cifar10_std),
    ]
)

# Both splits live under the same root and are downloaded on first use.
dataset_kwargs = dict(root=CIFAR10_ROOT, download=True)
train_dataset = CIFAR10(train=True, transform=train_transforms, **dataset_kwargs)
val_dataset = CIFAR10(train=False, transform=val_transforms, **dataset_kwargs)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def train(model, num_epochs, train_loader, val_loader, optimizer, loss_fn, device):
    """Train ``model`` for ``num_epochs`` epochs, then report and plot the results.

    Every epoch runs ``train_one_epoch`` on ``train_loader`` followed by
    ``evaluate`` on ``val_loader``. The per-epoch losses are collected and the
    latest metrics are shown on the progress bar. After the last epoch the final
    validation accuracy is printed and both loss curves are plotted.

    Args:
        model: Model forwarded to ``train_one_epoch`` and ``evaluate``.
        num_epochs: Number of epochs to run. If no epoch runs (``<= 0``), a
            notice is printed and nothing is plotted.
        train_loader: Loader passed to ``train_one_epoch``.
        val_loader: Loader passed to ``evaluate``.
        optimizer: Optimizer passed to ``train_one_epoch``.
        loss_fn: Loss function passed to both helpers.
        device: Device passed to both helpers.

    Returns:
        None. Results are reported via ``print`` and ``plot_losses``.
    """
    train_losses = []
    val_losses = []
    # Sentinel: stays None when the loop body never executes, so the summary
    # below cannot hit an unbound ``val_acc``.
    val_acc = None

    pbar = tqdm(range(num_epochs), desc="Training")

    for _ in pbar:
        train_loss = train_one_epoch(
            model, train_loader, optimizer, loss_fn, device
        )

        # ``val_acc`` is scaled by 100 for display, so it is presumably a
        # fraction in [0, 1] -- confirm against utils.train.evaluate.
        val_loss, val_acc = evaluate(
            model, val_loader, loss_fn, device
        )

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        pbar.set_postfix({
            "train_loss": f"{train_loss:.4f}",
            "val_loss": f"{val_loss:.4f}",
            "val_acc": f"{val_acc*100:.2f}%"
        })

    if val_acc is None:
        # No epoch ran: there is no accuracy to report and no curve to draw.
        print("No epochs were run; nothing to report or plot.")
        return

    print(f"Final Validation Accuracy: {val_acc*100:.2f}%")
    plot_losses([train_losses, val_losses], ["Train Loss", "Validation Loss"])

## Adam

Adam (Adaptive Moment Estimation) is an optimizer that combines momentum and adaptive learning rates to make training faster and more robust. It works well with minimal tuning and typically converges quickly.

Let $g_t$ be the gradient at time step $t$.

### 1. First moment (momentum)

$$
m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
$$

### 2. Second moment (RMS / variance)

$$
v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
$$

### 3. Bias correction

Because both moments are initialized at zero, Adam applies bias correction:

$$
\hat{m}_t = \frac{m_t}{1 - \beta_1^t}, \quad
\hat{v}_t = \frac{v_t}{1 - \beta_2^t}
$$

### 4. Parameter update

$$
\theta_t = \theta_{t-1} - \alpha \frac{\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}
$$

Where:
- $\alpha$ is the learning rate
- $\beta_1$ controls momentum (default: 0.9)
- $\beta_2$ controls variance smoothing (default: 0.999)
- $\epsilon$ is a small constant for numerical stability

In [None]:
def get_helpers(model):
    """Move ``model`` to the best available device and build its training helpers.

    Device preference is CUDA, then Apple MPS, then CPU. The model is moved
    in place *before* the optimizer is constructed, so the optimizer is built
    from the parameters as they exist on the target device.

    Args:
        model: ``nn.Module`` to train; it is moved to the selected device.

    Returns:
        Tuple ``(optimizer, loss_fn, device)``: an Adam optimizer
        (``lr=1e-3``, ``betas=(0.9, 0.999)``) over ``model.parameters()``,
        an ``nn.CrossEntropyLoss`` instance, and the selected ``torch.device``.
    """
    # ``torch.backends.mps`` does not exist on every PyTorch build, so look it
    # up defensively instead of raising AttributeError on those builds.
    mps_backend = getattr(torch.backends, "mps", None)

    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.999))
    loss_fn = nn.CrossEntropyLoss()
    return optimizer, loss_fn, device

## MaxPool2d

MaxPool2d downsamples a feature map by keeping only the strongest activation in each local window. It reduces spatial resolution while preserving the most salient features.

<p align="center">
  <img src="../../assets/img/accuracy/pooling_in_action.png" width="400">
</p>

In [None]:
# Feature extractor: two conv -> ReLU -> max-pool stages. Each 2x2 pool halves
# the height and width (shape comments assume 32x32 inputs).
feature_layers = [
    nn.Conv2d(3, 32, kernel_size=3, padding=1),   # (bsz, 32, 32, 32)
    nn.ReLU(),
    nn.MaxPool2d(2, 2),                           # (bsz, 32, 16, 16)
    nn.Conv2d(32, 64, kernel_size=3, padding=1),  # (bsz, 64, 16, 16)
    nn.ReLU(),
    nn.MaxPool2d(2, 2),                           # (bsz, 64, 8, 8)
]

# Classifier: flatten the feature maps and map them to 10 class logits.
classifier_layers = [
    nn.Flatten(),                                 # (bsz, 64*8*8)
    nn.Linear(64 * 8 * 8, 128),
    nn.ReLU(),
    nn.Linear(128, 10),                           # class logits
]

# Unpack both lists into one flat Sequential so the layers stay indexed 0-9.
model = nn.Sequential(*feature_layers, *classifier_layers)

Expect a final accuracy of **~76%**

In [None]:
# Build a fresh optimizer/loss for the current model, then train for 25 epochs.
optimizer, loss_fn, device = get_helpers(model)
train(
    model=model,
    num_epochs=25,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

## BatchNorm2d

BatchNorm2d normalizes convolutional feature maps channel-wise to stabilize and speed up training.

<p align="center">
    <img src="../../assets/img/accuracy/normalization_techs.png" width="400">
</p>

In [None]:
# Feature extractor: conv -> batch norm -> ReLU -> max-pool, twice. The convs
# are built without a bias term; each is followed directly by BatchNorm2d.
feature_layers = [
    nn.Conv2d(3, 32, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
    nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(2, 2),
]

# Classifier: flatten the feature maps and map them to 10 class logits.
classifier_layers = [
    nn.Flatten(),
    nn.Linear(64 * 8 * 8, 128),
    nn.ReLU(),
    nn.Linear(128, 10),
]

# Unpack both lists into one flat Sequential so the layers stay indexed 0-11.
model = nn.Sequential(*feature_layers, *classifier_layers)

Expect a final accuracy of **~76%**

In [None]:
# Build a fresh optimizer/loss for the current model, then train for 25 epochs.
optimizer, loss_fn, device = get_helpers(model)
train(
    model=model,
    num_epochs=25,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

In [None]:
def count_parameters(model, trainable_only=False):
    """Return the total number of scalar parameters in ``model``.

    Args:
        model: Module whose ``parameters()`` are counted.
        trainable_only: When True, count only parameters with
            ``requires_grad=True``. Defaults to False, which counts every
            parameter.

    Returns:
        int: Sum of ``numel()`` over the selected parameters (0 if there are none).
    """
    return sum(
        p.numel()
        for p in model.parameters()
        if p.requires_grad or not trainable_only
    )

# Report the size of the most recently defined model.
num_params = count_parameters(model)
print(f"Number of parameters: {num_params}")

## Dropout

Dropout is a regularization technique that randomly disables neurons during training to reduce overfitting.

Here $p$ is the probability that each activation is set to zero during training; at evaluation time dropout is disabled.

<p align="center">
    <img src="../../assets/img/accuracy/dropout.png" width="400">
</p>


In [None]:
class ConvBlock(nn.Module):
    """Conv -> BatchNorm -> ReLU -> 2x2 max-pool building block.

    The 3x3 convolution uses ``padding=1`` and therefore keeps the spatial
    size; the max-pool then halves height and width.

    Args:
        in_channels: Number of channels in the input feature map.
        out_channels: Number of channels produced by the convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.block = nn.Sequential(
            # No conv bias: BatchNorm2d follows immediately and has its own
            # learnable shift, so a conv bias would be redundant.
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2, 2)
        )

    def forward(self, x):
        """Apply the block to ``x`` and return the pooled feature map."""
        return self.block(x)
    
class SimpleCNN(nn.Module):
    """Configurable CNN classifier for 3-channel 32x32 images.

    The feature extractor stacks ``num_conv_layers`` ``ConvBlock`` modules.
    The channel count starts at ``base_channels`` and doubles after every
    block, while the tracked spatial size halves. The classifier flattens the
    result and applies Linear(128) -> ReLU -> Dropout -> Linear(num_classes).

    Args:
        num_conv_layers: Number of ``ConvBlock`` stages. Values ``<= 0`` give
            an empty feature extractor. At most 5 stages fit a 32x32 input.
        base_channels: Output channels of the first block.
        dropout_p: Drop probability of the classifier's dropout layer.
        num_classes: Number of output logits. Defaults to 10.

    Raises:
        ValueError: If ``num_conv_layers`` would halve the 32x32 input below
            1x1.
    """

    def __init__(self, num_conv_layers=3, base_channels=32, dropout_p=0.5, num_classes=10):
        super().__init__()

        layers = []
        in_channels = 3
        channels = base_channels
        spatial_size = 32

        for _ in range(num_conv_layers):
            if spatial_size < 2:
                # Another 2x2 pool would shrink the feature map to 0x0 and
                # leave the classifier with zero input features.
                raise ValueError(
                    f"num_conv_layers={num_conv_layers} is too large for 32x32 inputs: "
                    "each block halves the spatial size, so at most 5 blocks are supported"
                )
            layers.append(ConvBlock(in_channels, channels))
            in_channels = channels
            channels *= 2
            spatial_size //= 2

        self.features = nn.Sequential(*layers)

        # After the loop ``in_channels`` is the channel count of the last block
        # (or 3 when there are no blocks).
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_channels * spatial_size * spatial_size, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return the class logits for the input batch ``x``."""
        x = self.features(x)
        return self.classifier(x)

Expect a final accuracy of **~80%**

In [None]:
# Three conv blocks starting at 32 channels, with 50% dropout in the classifier.
model = SimpleCNN(num_conv_layers=3, base_channels=32, dropout_p=0.5)

# Build a fresh optimizer/loss for this model, then train for 25 epochs.
optimizer, loss_fn, device = get_helpers(model)
train(
    model=model,
    num_epochs=25,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    device=device,
)

In [None]:
# CIFAR-10 label names, listed in the dataset's class-index order.
class_names = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
plot_confusion_matrix(model, val_loader, device, class_names)