# Tutorial 4

In this tutorial, we'll look at gradient descent, its variants and optimization algorithms.

In [1]:
import numpy as np
import math
from matplotlib import pyplot as plt

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms

# Show which PyTorch build this notebook is running against
print("Torch version:", torch.__version__)

Torch version: 2.8.0+cu128


Let's first load the MNIST dataset and split it into a train and test set:

In [2]:
# Mini-batch size shared by the train and test data loaders
batch_size = 64

In [3]:
# Both splits use the same preprocessing pipeline. Normalize(0, 1) uses mean 0 and
# standard deviation 1, so it leaves the values produced by ToTensor unchanged.
data_train = datasets.MNIST(
    root = "data",
    train = True,
    download = True,
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0, 1)])
)
data_test = datasets.MNIST(
    root = "data",
    train = False,
    download = True,
    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0, 1)])
)

# Training batches are reshuffled every epoch
dataloader_train = DataLoader(data_train, batch_size = batch_size, shuffle = True)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch-averaged
# test loss identical between two evaluations of the same model
dataloader_test = DataLoader(data_test, batch_size = batch_size, shuffle = False)

Let's define helper functions to train the model:

In [4]:
def calculate_accuracy(model, dataloader):
    """Return the fraction of samples in `dataloader` that `model` classifies correctly.

    The predicted class of a sample is the index of the largest model output
    along dimension 1. The number of correct predictions is divided by
    `len(dataloader.dataset)`. Gradients are not tracked during evaluation.
    """
    hits = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            predicted_labels = model(inputs).argmax(1)
            matches = (predicted_labels == targets).type(torch.float)
            hits += matches.sum().item()

    return hits / len(dataloader.dataset)

In [5]:
def calculate_loss_accuracy(model, dataloader, loss_fn):
    """Evaluate `model` on `dataloader` and return `(loss, accuracy)`.

    The loss is the sum of `loss_fn(pred, y)` over all batches divided by the
    number of batches, i.e. a mean of per-batch values. The accuracy is the
    number of correct argmax predictions divided by `len(dataloader.dataset)`.
    Gradients are not tracked during evaluation.
    """
    loss_sum = 0
    hits = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss_sum += loss_fn(outputs, targets).item()
            matches = (outputs.argmax(1) == targets).type(torch.float)
            hits += matches.sum().item()

    mean_loss = loss_sum / len(dataloader)
    return mean_loss, hits / len(dataloader.dataset)

In [6]:
def train_model(model, loss_fn, optimizer, epochs, dataloader_train, dataloader_test, early_stopper = None, log_period = 10000):
    """Train `model` for `epochs` passes over `dataloader_train`, printing progress periodically.

    Parameters:
        model: module that is called on each input batch.
        loss_fn: callable `loss_fn(pred, y)` returning a scalar tensor.
        optimizer: object providing `step()` and `zero_grad()`.
        epochs: number of passes over `dataloader_train`.
        dataloader_train: loader yielding `(X, y)` training batches; its
            `batch_size` and `dataset` attributes are used for progress tracking.
        dataloader_test: loader used to compute the test loss and accuracy at each log.
        early_stopper: accepted for interface compatibility; not used by this function.
        log_period: approximate number of processed samples between two log lines.

    Returns:
        None. Progress is reported through `print`.
    """
    # Take the dataset size from the loader that is actually being trained on,
    # not from a notebook-level variable, so the function works with any loader
    num_train_samples = len(dataloader_train.dataset)

    for epoch in range(epochs):
        processed_since_log = 0
        for batch, (X, y) in enumerate(dataloader_train):
            # Logging below switches to eval mode, so training mode is restored for every batch
            model.train()
            pred = model(X)
            loss = loss_fn(pred, y)

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            processed_since_log += dataloader_train.batch_size

            if processed_since_log >= log_period:
                # The last batch may be partial, so cap the counter at the dataset size
                current = min((batch + 1) * dataloader_train.batch_size, num_train_samples)
                train_loss = loss.item()
                model.eval()
                train_acc = calculate_accuracy(model, dataloader_train)
                test_loss, test_acc = calculate_loss_accuracy(model, dataloader_test, loss_fn)
                print(f"train loss: {train_loss:>7f}  test loss: {test_loss:>7f}  train accuracy: {train_acc:>3f}  test accuracy: {test_acc:>3f}  [sample {current:>5d}/{num_train_samples:>5d}] [epoch {epoch+1:>2d}/{epochs:>2d}]")
                # Keep the remainder so log lines stay roughly `log_period` samples apart
                processed_since_log -= log_period

Since we will be creating multiple identical models, let's also create a function for that:

In [7]:
def create_model():
    """Build a new, untrained classifier: Flatten -> Linear(·, 100) -> ReLU -> Linear(100, 10).

    The input width of the first linear layer is the product of dimensions 1
    and 2 of the notebook-level `data_train.data` tensor.
    """
    image_shape = data_train.data.shape
    num_inputs = image_shape[1] * image_shape[2]
    return nn.Sequential(
        nn.Flatten(),
        nn.Linear(num_inputs, 100),
        nn.ReLU(),
        nn.Linear(100, 10),
    )

## Gradient descent

In [8]:
class Simple_SGD(torch.optim.Optimizer):
    """Plain gradient descent: every parameter is updated as p <- p - lr * grad."""

    def __init__(self, params, lr = 1e-3):
        """Store the learning rate `lr` as the default for every parameter group."""
        defaults = dict(lr = lr)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one update to every parameter that has a gradient.

        Parameters:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is called once, with gradients enabled.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure has to run backward(), so autograd is re-enabled just for it
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                # Frozen or unused parameters have no gradient; leave them untouched
                if p.grad is None:
                    continue
                p.add_(p.grad, alpha = -group["lr"])

        return loss

In [9]:
# Train a fresh model with plain gradient descent (default learning rate) for 10 epochs
model = create_model()
optimizer = Simple_SGD(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

train loss: 2.263719  test loss: 2.276265  train accuracy: 0.158967  test accuracy: 0.166300  [sample 10048/60000] [epoch  1/10]
train loss: 2.236584  test loss: 2.245551  train accuracy: 0.233133  test accuracy: 0.240700  [sample 20032/60000] [epoch  1/10]
train loss: 2.205888  test loss: 2.213709  train accuracy: 0.315933  test accuracy: 0.322500  [sample 30016/60000] [epoch  1/10]
train loss: 2.185920  test loss: 2.180754  train accuracy: 0.411917  test accuracy: 0.418600  [sample 40000/60000] [epoch  1/10]
train loss: 2.167347  test loss: 2.145731  train accuracy: 0.506400  test accuracy: 0.518100  [sample 50048/60000] [epoch  1/10]
train loss: 2.135681  test loss: 2.108275  train accuracy: 0.575500  test accuracy: 0.582700  [sample 60000/60000] [epoch  1/10]
train loss: 2.111817  test loss: 2.067934  train accuracy: 0.613100  test accuracy: 0.622300  [sample 10048/60000] [epoch  2/10]
train loss: 1.998481  test loss: 2.026338  train accuracy: 0.649700  test accuracy: 0.656600  [sa

## Momentum

In [10]:
class Simple_Momentum(torch.optim.Optimizer):
    """Gradient descent with momentum.

    Each parameter keeps a velocity `v`. The first step sets v = grad; every
    later step updates v <- momentum_gamma * v + grad. The parameter is then
    moved by p <- p - lr * v.
    """

    def __init__(self, params, lr = 1e-3, momentum_gamma = 0.9):
        """Store `lr` and `momentum_gamma` as defaults for every parameter group."""
        defaults = dict(lr = lr, momentum_gamma = momentum_gamma)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one momentum update to every parameter that has a gradient.

        Parameters:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is called once, with gradients enabled.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure has to run backward(), so autograd is re-enabled just for it
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                # Frozen or unused parameters have no gradient; leave them untouched
                if p.grad is None:
                    continue

                state = self.state[p]
                momentum_v = state.get("momentum_v")
                if momentum_v is None:
                    # First step: the velocity starts as a copy of the gradient
                    momentum_v = torch.clone(p.grad).detach()
                    state["momentum_v"] = momentum_v
                else:
                    # Updated in place, so the tensor stored in the state stays current
                    momentum_v.mul_(group["momentum_gamma"]).add_(p.grad)

                p.add_(momentum_v, alpha = -group["lr"])

        return loss

In [11]:
# Train a fresh model with the momentum optimizer (default hyperparameters) for 10 epochs
model = create_model()
optimizer = Simple_Momentum(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

train loss: 2.024158  test loss: 2.006585  train accuracy: 0.639700  test accuracy: 0.653100  [sample 10048/60000] [epoch  1/10]
train loss: 1.495003  test loss: 1.511178  train accuracy: 0.723150  test accuracy: 0.734500  [sample 20032/60000] [epoch  1/10]
train loss: 1.102537  test loss: 1.073900  train accuracy: 0.801183  test accuracy: 0.808400  [sample 30016/60000] [epoch  1/10]
train loss: 1.017867  test loss: 0.820161  train accuracy: 0.828517  test accuracy: 0.836400  [sample 40000/60000] [epoch  1/10]
train loss: 0.618913  test loss: 0.679602  train accuracy: 0.842833  test accuracy: 0.848500  [sample 50048/60000] [epoch  1/10]
train loss: 0.492513  test loss: 0.593564  train accuracy: 0.858800  test accuracy: 0.865300  [sample 60000/60000] [epoch  1/10]
train loss: 0.685109  test loss: 0.536720  train accuracy: 0.864817  test accuracy: 0.872400  [sample 10048/60000] [epoch  2/10]
train loss: 0.375390  test loss: 0.496623  train accuracy: 0.870533  test accuracy: 0.878900  [sa

## Task 1

Create an optimizer for the Nesterov accelerated gradients (NAG) method.

In [12]:
class Simple_Nesterov(torch.optim.Optimizer):
    """Gradient descent with a Nesterov-style look-ahead momentum term.

    Each parameter keeps a velocity `v`. The first step sets v = grad; every
    later step updates v <- momentum_gamma * v + grad. The parameter is then
    moved along the look-ahead direction grad + momentum_gamma * v:
    p <- p - lr * (grad + momentum_gamma * v).
    """

    def __init__(self, params, lr = 1e-3, momentum_gamma = 0.9):
        """Store `lr` and `momentum_gamma` as defaults for every parameter group."""
        defaults = dict(lr = lr, momentum_gamma = momentum_gamma)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one Nesterov update to every parameter that has a gradient.

        Parameters:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is called once, with gradients enabled.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure has to run backward(), so autograd is re-enabled just for it
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                # Frozen or unused parameters have no gradient; leave them untouched
                if p.grad is None:
                    continue

                state = self.state[p]
                momentum_v = state.get("momentum_v")
                if momentum_v is None:
                    # First step: the velocity starts as a copy of the gradient
                    momentum_v = torch.clone(p.grad).detach()
                    state["momentum_v"] = momentum_v
                else:
                    # Updated in place, so the tensor stored in the state stays current
                    momentum_v.mul_(group["momentum_gamma"]).add_(p.grad)

                # Out-of-place add: p.grad itself must not be modified
                grad_like = p.grad.add(momentum_v, alpha = group["momentum_gamma"])
                p.add_(grad_like, alpha = -group["lr"])

        return loss

In [13]:
# Train a fresh model with the Nesterov optimizer (default hyperparameters) for 10 epochs
model = create_model()
optimizer = Simple_Nesterov(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

train loss: 2.028380  test loss: 1.973173  train accuracy: 0.672633  test accuracy: 0.675800  [sample 10048/60000] [epoch  1/10]
train loss: 1.602706  test loss: 1.466081  train accuracy: 0.766783  test accuracy: 0.771900  [sample 20032/60000] [epoch  1/10]
train loss: 0.922532  test loss: 1.040253  train accuracy: 0.796950  test accuracy: 0.802000  [sample 30016/60000] [epoch  1/10]
train loss: 0.791584  test loss: 0.797734  train accuracy: 0.828100  test accuracy: 0.837300  [sample 40000/60000] [epoch  1/10]
train loss: 0.634853  test loss: 0.665005  train accuracy: 0.845133  test accuracy: 0.852500  [sample 50048/60000] [epoch  1/10]
train loss: 0.494954  test loss: 0.581608  train accuracy: 0.855983  test accuracy: 0.864500  [sample 60000/60000] [epoch  1/10]
train loss: 0.603256  test loss: 0.527253  train accuracy: 0.863983  test accuracy: 0.871600  [sample 10048/60000] [epoch  2/10]
train loss: 0.518996  test loss: 0.487960  train accuracy: 0.870617  test accuracy: 0.878700  [sa

## Task 2

Create an optimizer for the ADAM method.

In [14]:
class Simple_ADAM(torch.optim.Optimizer):
    """ADAM: per-parameter step sizes from running averages of the gradient and its square.

    For every parameter the optimizer keeps
      * `grad_average`: exponential moving average of the gradient (decay `beta_1`),
      * `grad_square_average`: exponential moving average of the squared gradient (decay `beta_2`),
      * `step`: the 1-based index of the next update, used for bias correction.
    The update is p <- p - lr * m_hat / (sqrt(v_hat) + epsilon), where m_hat and
    v_hat are the bias-corrected averages.
    """

    def __init__(self, params, lr = 1e-3, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-8):
        """Store `lr`, `beta_1`, `beta_2` and `epsilon` as defaults for every parameter group."""
        defaults = dict(lr = lr, beta_1 = beta_1, beta_2 = beta_2, epsilon = epsilon)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure = None):
        """Apply one ADAM update to every parameter that has a gradient.

        Parameters:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is called once, with gradients enabled.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            # The closure has to run backward(), so autograd is re-enabled just for it
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            beta_1 = group["beta_1"]
            beta_2 = group["beta_2"]

            for p in group["params"]:
                # Frozen or unused parameters have no gradient; leave them (and their state) untouched
                if p.grad is None:
                    continue

                state = self.state[p]
                if len(state) == 0:
                    state["step"] = torch.tensor(1.0)
                    state["grad_average"] = torch.zeros_like(p)
                    state["grad_square_average"] = torch.zeros_like(p)

                # Both averages are updated in place, so the tensors held in the state stay current
                grad_average = state["grad_average"]
                grad_square_average = state["grad_square_average"]

                # Linear interpolation between current value and p.grad with weight as second argument
                grad_average.lerp_(p.grad, 1 - beta_1)

                # addcmul_ = element-wise multiply arg1 and arg2, multiply by scalar value and add the result to current tensor
                grad_square_average.mul_(beta_2).addcmul_(p.grad, p.grad.conj(), value = 1 - beta_2)

                # The averages start at zero; dividing by (1 - beta^step) removes that initial bias
                bias_correction1 = 1 - beta_1 ** state["step"]
                bias_correction2 = 1 - beta_2 ** state["step"]

                grad_average_corrected = grad_average.div(bias_correction1)
                grad_square_average_corrected = grad_square_average.div(bias_correction2)
                denominator = grad_square_average_corrected.sqrt().add(group["epsilon"])

                state["step"] += 1

                # addcdiv_ = element-wise divide arg1 by arg2, multiply by scalar value and add the result to current tensor
                p.addcdiv_(grad_average_corrected, denominator, value = -group["lr"])

        return loss

In [15]:
# Train a fresh model with the ADAM optimizer (default hyperparameters) for 10 epochs
model = create_model()
optimizer = Simple_ADAM(model.parameters())
train_model(
    model = model,
    loss_fn = nn.CrossEntropyLoss(),
    optimizer = optimizer,
    epochs = 10,
    dataloader_train = dataloader_train,
    dataloader_test = dataloader_test,
)

train loss: 0.358754  test loss: 0.393576  train accuracy: 0.886700  test accuracy: 0.892000  [sample 10048/60000] [epoch  1/10]
train loss: 0.263642  test loss: 0.296537  train accuracy: 0.912183  test accuracy: 0.916000  [sample 20032/60000] [epoch  1/10]
train loss: 0.157109  test loss: 0.262567  train accuracy: 0.926483  test accuracy: 0.927700  [sample 30016/60000] [epoch  1/10]
train loss: 0.195705  test loss: 0.235457  train accuracy: 0.932867  test accuracy: 0.935700  [sample 40000/60000] [epoch  1/10]
train loss: 0.305004  test loss: 0.207930  train accuracy: 0.939900  test accuracy: 0.938300  [sample 50048/60000] [epoch  1/10]
train loss: 0.359770  test loss: 0.194959  train accuracy: 0.944433  test accuracy: 0.944300  [sample 60000/60000] [epoch  1/10]
train loss: 0.154418  test loss: 0.177484  train accuracy: 0.950217  test accuracy: 0.949600  [sample 10048/60000] [epoch  2/10]
train loss: 0.171317  test loss: 0.161207  train accuracy: 0.954617  test accuracy: 0.952400  [sa