## On LR Schedulers

### StepLR
Decays the learning rate by a fixed factor every *N* epochs; simple and predictable.

### ExponentialLR
Multiplies the learning rate by a constant factor every epoch, resulting in smooth exponential decay.

### CosineAnnealingLR
 Smoothly decreases the learning rate following a cosine curve toward a minimum value.

### ReduceLROnPlateau
 Lowers the learning rate when a monitored metric (e.g. validation loss) stops improving.

### LambdaLR
Scales the initial learning rate by a user-defined function of the epoch index (the function returns a multiplicative factor, not the learning rate itself).

## On Activation Functions

<p align="center">
  <img src="../../assets/img/optimization/activation_functions.png" width="400">
</p>

## Other Parameters

Other hyperparameters to consider: the number of layers, whether (and how strongly) to apply regularization, dropout, and batch normalization, the choice of optimizer, the number of neurons per layer, the batch size, early stopping, etc.

## Where to Start?

Check the **literature**! Architectures and hyperparameter ranges reported for similar problems are the best starting point.

In [None]:
import sys
sys.path.append("../../")

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import optuna
from utils.train import train_one_epoch, evaluate
from torchvision import transforms
from torchvision.datasets import CIFAR10

In [None]:
class FlexibleCNN(nn.Module):
    """Configurable CNN: a stack of Conv-ReLU-MaxPool blocks plus an MLP head.

    The classifier head is built in ``__init__`` (from ``input_shape``) so that
    its weights are already part of ``model.parameters()`` when an optimizer is
    constructed. Building it lazily inside the first ``forward`` call would
    leave those weights out of any optimizer created beforehand, and the head
    would then never be trained.

    Args:
        n_layers: Number of convolutional blocks. Must equal
            ``len(n_filters)`` and ``len(kernel_sizes)``.
        n_filters: Output channels of each convolutional block, in order.
        kernel_sizes: Kernel size of each convolution, in order.
        dropout_rate: Dropout probability applied before each linear layer.
        fc_size: Width of the hidden linear layer of the classifier.
        num_classes: Number of output logits.
        input_shape: ``(channels, height, width)`` of one input sample, used
            to size the classifier at construction time. Pass ``None`` to get
            the legacy behaviour (3 input channels, classifier created on the
            first forward pass); in that case run one forward pass *before*
            creating the optimizer.

    Raises:
        AssertionError: If ``n_layers``, ``n_filters`` and ``kernel_sizes``
            disagree in length.
    """

    def __init__(
        self,
        n_layers,
        n_filters,
        kernel_sizes,
        dropout_rate=0.5,
        fc_size=128,
        num_classes=10,
        input_shape=(3, 32, 32),
    ):
        super().__init__()

        assert n_layers == len(n_filters) == len(kernel_sizes), \
            "n_layers, n_filters, and kernel_sizes must have the same length"

        self.dropout_rate = dropout_rate
        self.fc_size = fc_size
        self.num_classes = num_classes

        blocks = []
        # Legacy default is 3 channels (rgb) when no input shape is given.
        in_channels = 3 if input_shape is None else input_shape[0]

        for out_channels, kernel_size in zip(n_filters, kernel_sizes):
            # Size-preserving ('same') padding for odd kernel sizes.
            padding = (kernel_size - 1) // 2

            blocks.append(
                nn.Sequential(
                    nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(kernel_size=2, stride=2),
                )
            )

            in_channels = out_channels

        self.features = nn.Sequential(*blocks)
        self.classifier = None

        if input_shape is not None:
            # Probe the feature extractor once with a dummy sample to find the
            # flattened feature size; no gradients are needed for this.
            with torch.no_grad():
                probe = self.features(torch.zeros(1, *input_shape))
            self._create_classifier(torch.flatten(probe, 1).size(1), probe.device)

    def _create_classifier(self, flattened_size, device):
        """Build the Dropout-Linear-ReLU-Dropout-Linear head on ``device``."""
        self.classifier = nn.Sequential(
            nn.Dropout(self.dropout_rate),
            nn.Linear(flattened_size, self.fc_size),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout_rate),
            nn.Linear(self.fc_size, self.num_classes),
        ).to(device)

    def forward(self, x):
        """Return class logits for a batch ``x``.

        If the model was built with ``input_shape=None`` the classifier is
        created here on the first call, sized from the incoming batch.
        """
        x = self.features(x)
        x = torch.flatten(x, 1)

        if self.classifier is None:
            # Legacy lazy path (input_shape=None only).
            self._create_classifier(x.size(1), x.device)

        return self.classifier(x)

Defining the search space.

In [None]:
def objective(trial, device, train_loader, val_loader, loss_fn, lr, num_epochs):
    """Optuna objective: sample a CNN configuration, train it, score it.

    Sampled per trial: the number of conv blocks (1-3), the filter count
    (16-128) and kernel size (3 or 5) of each block, the dropout rate
    (0.1-0.5) and the hidden layer width (64-256). The learning rate and
    the number of epochs are fixed by the caller.

    Returns:
        The second value returned by ``evaluate`` on ``val_loader``, which
        is used here as the validation accuracy.
    """
    # ---- search space (suggestion order is part of the trial's trace) ----
    depth = trial.suggest_int("n_layers", 1, 3)

    filters_per_block = []
    for block_idx in range(depth):
        filters_per_block.append(
            trial.suggest_int(f"n_filters_{block_idx}", 16, 128)
        )

    kernel_per_block = []
    for block_idx in range(depth):
        kernel_per_block.append(
            trial.suggest_categorical(f"kernel_size_{block_idx}", [3, 5])
        )

    drop_prob = trial.suggest_float("dropout_rate", 0.1, 0.5)
    hidden_width = trial.suggest_int("fc_size", 64, 256)

    # ---- model and optimizer ----
    model_config = {
        "n_layers": depth,
        "n_filters": filters_per_block,
        "kernel_sizes": kernel_per_block,
        "dropout_rate": drop_prob,
        "fc_size": hidden_width,
        "num_classes": 10,
    }
    net = FlexibleCNN(**model_config).to(device)
    adam = torch.optim.Adam(net.parameters(), lr=lr)

    # ---- training ----
    for _epoch in range(num_epochs):
        train_one_epoch(net, train_loader, adam, loss_fn, device)

    # ---- validation score reported back to Optuna ----
    _val_loss, val_accuracy = evaluate(net, val_loader, loss_fn, device)
    return val_accuracy

Data loading and fixed hyperparameters (normalization statistics, batch size).

In [None]:
# Commonly used pre-computed per-channel statistics for CIFAR-10.
cifar10_mean = (0.4914, 0.4822, 0.4465)
cifar10_std = (0.2023, 0.1994, 0.2010)

# Shared preprocessing for both splits: to tensor, then per-channel normalization.
t = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=cifar10_mean, std=cifar10_std),
    ]
)

# Both splits live under the same root and are downloaded if missing;
# the train=False split is used as the validation set.
train_dataset = CIFAR10(root="../../assets/cifar10", train=True, download=True, transform=t)
val_dataset = CIFAR10(root="../../assets/cifar10", train=False, download=True, transform=t)

# Only the training batches are shuffled.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=64,
    shuffle=False,
)

In [None]:
# Pick the best available backend: CUDA first, then Apple MPS, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# The objective returns an accuracy, so the study maximizes it.
study = optuna.create_study(direction="maximize")

# Everything except the trial is fixed here; a fresh loss module is built per trial.
study.optimize(
    lambda trial: objective(
        trial,
        device,
        train_loader,
        val_loader,
        loss_fn=nn.CrossEntropyLoss(),
        lr=1e-3,
        num_epochs=10,
    ),
    n_trials=20,  # use more trials in practice
)

In [None]:
# Matplotlib backend of Optuna's visualization module.
import optuna.visualization.matplotlib as vis

# Plot the optimization history of the study run above.
vis.plot_optimization_history(study)

In [None]:
# Plot the hyperparameter importances for the same study.
vis.plot_param_importances(study)