In [2]:
import torch
import torch.nn as nn
import torchvision as tv
import torchvision.transforms as transforms
from torchvision import datasets
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np
import random
import time
import os

In [3]:
# Experiment-wide constants.
SAMPLE_SIZE = 5_000  # how many leading training images are kept for the experiments
NUM_LABELS = 10      # width of the one-hot targets and of the network output

In [4]:
# Seed every random number generator the notebook relies on
# (Python's `random`, NumPy's legacy global RNG, and PyTorch).
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
torch.backends.cudnn.benchmark = True

In [5]:
# Load data
dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.ToTensor())

# Materialise images and labels in a single pass: every `dataset[i]` access
# runs the transform on the image, so indexing the dataset once per field
# (as two separate comprehensions would) doubles the loading work.
image_list, label_list = zip(*(dataset[i] for i in range(len(dataset))))
all_images = torch.stack(image_list)
all_labels = torch.tensor(label_list)
del image_list, label_list  # drop the per-sample copies once stacked

# Per-channel statistics over the whole training split (batch, height, width reduced)
cifar10_mean = np.array(all_images.mean(dim=(0, 2, 3)))
cifar10_std = np.array(all_images.std(dim=(0, 2, 3)))

# Reshape to (1, 3, 1, 1) so the statistics broadcast over [N, 3, H, W] images
mean_tensor = torch.tensor(cifar10_mean).view(1, 3, 1, 1)
std_tensor = torch.tensor(cifar10_std).view(1, 3, 1, 1)

normalized_images = (all_images - mean_tensor) / std_tensor

# Keep only the first SAMPLE_SIZE samples (normalized with full-split statistics)
images = normalized_images[:SAMPLE_SIZE]
subset_labels = all_labels[:SAMPLE_SIZE]

# Convert labels to one-hot encoding for MSE loss (one column per class)
one_hot_labels = torch.zeros(subset_labels.size(0), NUM_LABELS, device=subset_labels.device)
one_hot_labels.scatter_(1, subset_labels.unsqueeze(1), 1)

# Use one_hot_labels for MSE loss
labels = one_hot_labels

Files already downloaded and verified


In [6]:
class FullyConnectedNet(nn.Module):
    """Multi-layer perceptron: Flatten -> [Linear -> activation] x k -> Linear.

    Args
    ----
    input_size        : number of features after flattening one sample
    num_hidden_layers : how many (Linear, activation) pairs to stack (k)
    hidden_layer_size : width of every hidden Linear layer
    num_labels        : width of the final Linear layer
    activation        : zero-argument callable building the activation module
                        (a class such as nn.Tanh); called once per hidden layer

    The constructor arguments are kept as attributes; note that the hidden
    width is stored under the name ``hidden_layers_size``.
    """

    def __init__(self, input_size, num_hidden_layers, hidden_layer_size, num_labels, activation=nn.Tanh):
        super().__init__()

        # Remember the hyper-parameters so they can be read back from the model.
        self.input_size = input_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_layers_size = hidden_layer_size
        self.num_labels = num_labels
        self.activation = activation

        # widths[j] -> widths[j + 1] describes the j-th hidden Linear layer.
        widths = [input_size] + [hidden_layer_size] * num_hidden_layers

        stack = [nn.Flatten()]
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(activation())
        # Output layer: no activation after it.
        stack.append(nn.Linear(widths[-1], num_labels))

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run `x` through the flatten + linear stack and return the raw outputs."""
        return self.network(x)

In [34]:
def compute_sharpness(model, criterion, images, labels,
                      iters: int = 20, tol: float = 1e-3,
                      subsample: int | None = 1024, damping: float = 0.0) -> float:
    """
    Estimates λ_max(H) (sharpness) of the loss at current model parameters via
    power iteration with Hessian–vector products (Pearlmutter trick).

    Args
    ----
    model      : nn.Module (params require_grad=True)
    criterion  : callable(logits, targets) -> scalar loss (mean reduction)
    images     : tensor [N, ...] on correct device
    labels     : tensor [N] on correct device
    iters      : power-iteration steps (15–25 typical)
    tol        : relative convergence tolerance
    subsample  : if not None, randomly sample this many examples for speed
    damping    : computes eigenvalues of (H + damping * I)

    Returns
    -------
    float: estimated largest eigenvalue (sharpness)
    """
    was_training = model.training
    model.eval()  # stabilize stats (esp. BN/Dropout)

    # ---- choose subset (for speed/memory) ----
    if subsample is not None and images.size(0) > subsample:
        idx = torch.randperm(images.size(0), device=images.device)[:subsample]
        xb, yb = images[idx], labels[idx]
    else:
        xb, yb = images, labels

    params = [p for p in model.parameters() if p.requires_grad]
    n = sum(p.numel() for p in params)
    if n == 0:
        if was_training: model.train()
        return 0.0

    # ---- forward with graph for second-order autodiff ----
    # Important: no torch.no_grad() here
    model.zero_grad(set_to_none=True)
    logits = model(xb)
    loss = criterion(logits, yb)

    # ∇ℓ with graph so we can differentiate it again
    grads = torch.autograd.grad(loss, params, create_graph=True, retain_graph=True)
    g_flat = torch.cat([gi.reshape(-1) for gi in grads])

    # init v ~ unit vector
    with torch.no_grad():
        v = torch.randn(n, device=g_flat.device)
        v /= (v.norm() + 1e-12)

    lam_prev = None
    for _ in range(iters):
        # H v = ∇[(∇ℓ)·v]
        gv = (g_flat * v).sum()
        Hv_parts = torch.autograd.grad(gv, params, retain_graph=True)
        Hv = torch.cat([h.reshape(-1) for h in Hv_parts])
        if damping != 0.0:
            Hv = Hv + damping * v

        with torch.no_grad():
            Hv_norm = Hv.norm()
            if Hv_norm == 0 or torch.isnan(Hv_norm):
                lam = 0.0
                break
            v = Hv / (Hv_norm + 1e-12)
            lam = torch.dot(v, Hv).item()

            if lam_prev is not None:
                if abs(lam - lam_prev) / (abs(lam_prev) + 1e-12) < tol:
                    break
            lam_prev = lam

    # cleanup and restore mode
    del grads, g_flat, logits, loss
    if was_training: model.train()
    return float(lam_prev if lam_prev is not None else lam)

In [35]:
def setup_output_files(output_dir="output"):
    """
    Ensures `output_dir` exists and returns the two result tables.

    For each of ``metadata.csv`` (one row per trained model) and
    ``output.csv`` (one row per model and epoch), the file is read from
    `output_dir` if it exists there; otherwise an empty DataFrame with the
    expected columns is returned. Nothing is written except the directory.

    Args
    ----
    output_dir : directory holding (or that will hold) the CSV files

    Returns
    -------
    (metadata, output_data): tuple of pandas DataFrames
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    metadata_path = os.path.join(output_dir, "metadata.csv")
    output_data_path = os.path.join(output_dir, "output.csv")

    if os.path.exists(metadata_path):
        metadata = pd.read_csv(metadata_path)
    else:
        metadata = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "model_type": pd.Series(dtype="str"),
            "num_hidden_layers": pd.Series(dtype="int"),
            "hidden_layers_size": pd.Series(dtype="int"),
            "activation_function": pd.Series(dtype="str"),
            "optimizer": pd.Series(dtype="str"),
            "momentum": pd.Series(dtype="float"),
            "learning_rate": pd.Series(dtype="float"),
            "num_epochs": pd.Series(dtype="int"),
            "train_time": pd.Series(dtype="float")
        })

    if os.path.exists(output_data_path):
        output_data = pd.read_csv(output_data_path)
    else:
        output_data = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "epoch": pd.Series(dtype="int"),
            "loss": pd.Series(dtype="float"),
            "sharpness": pd.Series(dtype="float")
        })

    return metadata, output_data

In [9]:
def load_output_files(output_dir="output"):
    """
    Reads ``metadata.csv`` and ``output.csv`` from `output_dir`.

    Both files must exist; no fallback is provided here.

    Returns
    -------
    (metadata, output_data): tuple of pandas DataFrames, in that order
    """
    metadata, output_data = (
        pd.read_csv(os.path.join(output_dir, file_name))
        for file_name in ("metadata.csv", "output.csv")
    )
    return metadata, output_data

In [10]:
def save_output_files(metadata, output_data, output_dir="output"):
    """
    Writes both result tables as CSV (without the index) into `output_dir`.

    `metadata` goes to ``metadata.csv`` and `output_data` to ``output.csv``;
    existing files are overwritten. The directory itself is not created here.
    """
    for frame, file_name in ((metadata, "metadata.csv"), (output_data, "output.csv")):
        frame.to_csv(os.path.join(output_dir, file_name), index=False)

In [None]:
def train_model(model, optimizer, criterion, learning_rate, num_epochs, images, labels, num_sharpness_computations=100):
    """
    Trains `model` and appends the run to the CSV logs in ``output/``.

    Every epoch is a single optimizer step computed on the whole
    `images`/`labels` tensors. The loss of every epoch is recorded;
    sharpness (via `compute_sharpness` on a 512-example subsample) is
    recorded after the first epoch and then roughly
    `num_sharpness_computations` times at evenly spaced epochs. Epochs
    without a sharpness measurement hold NaN.

    Args
    ----
    model        : network to train; its `num_hidden_layers`,
                   `hidden_layers_size` and `activation` attributes are
                   written to the metadata log
    optimizer    : torch optimizer already bound to `model`'s parameters
    criterion    : callable(outputs, targets) -> scalar loss
    learning_rate: learning rate applied to every parameter group,
                   overriding whatever the optimizer was built with
    num_epochs   : number of optimizer steps
    images       : input tensor, moved to the global `device`
    labels       : target tensor, moved to the global `device`
    num_sharpness_computations : approximate number of sharpness
                   measurements; 0 or less disables them

    Side effects
    ------------
    Prints progress, and rewrites ``output/metadata.csv`` and
    ``output/output.csv`` with the new run appended.
    """
    # The "momentum-like" hyper-parameter that the sweep varies
    optimizer_name = optimizer.__class__.__name__
    if optimizer_name in ('SGD', 'RMSprop'):
        momentum = optimizer.defaults['momentum']
    elif optimizer_name == 'Adam':
        momentum = optimizer.defaults['betas'][0]
    else:
        momentum = None

    print(f"Model: {model.__class__.__name__}")
    print(f"Optimizer: {optimizer_name}")
    print(f"Learning Rate: {learning_rate}")
    print(f"Number of Epochs: {num_epochs}")
    print(f"Momentum: {momentum}")

    # Apply the learning rate to all groups, not only the first one
    for group in optimizer.param_groups:
        group['lr'] = learning_rate

    model = model.to(device)
    images = images.to(device)
    labels = labels.to(device)

    train_losses = np.empty(num_epochs)
    sharps = np.full(num_epochs, np.nan)

    # Spacing between sharpness measurements. Clamped to at least 1 so that
    # num_epochs < num_sharpness_computations cannot cause a modulo by zero.
    if num_sharpness_computations > 0:
        sharpness_interval = max(1, num_epochs // num_sharpness_computations)
    else:
        sharpness_interval = None

    model.train()

    start_time = time.time()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_losses[epoch] = loss.item()

        measure = sharpness_interval is not None and (
            (epoch + 1) % sharpness_interval == 0 or epoch == 0
        )
        if measure:
            sharpness = compute_sharpness(model, criterion, images, labels, iters=20,
                                          subsample=512)
            sharps[epoch] = sharpness
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Sharpness: {sharpness}")

    end_time = time.time()
    print(f"Training Time: {end_time - start_time}")

    metadata, output_data = setup_output_files("output")
    model_id = metadata.shape[0] + 1

    metadata.loc[metadata.shape[0]] = {
        "model_id": model_id,
        "model_type": model.__class__.__name__,
        "num_hidden_layers": model.num_hidden_layers,
        "hidden_layers_size": model.hidden_layers_size,
        "activation_function": model.activation,
        "optimizer": optimizer_name,
        "momentum": momentum if momentum is not None else 0,
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
        "train_time": end_time - start_time,
    }

    # One row per epoch; epochs are 1-based to match the printed progress
    run_log = pd.DataFrame({
        "model_id": np.full(num_epochs, model_id, dtype=int),
        "epoch": np.arange(1, num_epochs + 1),
        "loss": train_losses,
        "sharpness": sharps
    })
    # Concatenating onto an empty frame is deprecated in recent pandas
    if output_data.empty:
        output_data = run_log
    else:
        output_data = pd.concat([output_data, run_log], ignore_index=True)

    save_output_files(metadata, output_data, output_dir="output")
    print("")

In [None]:
# ---- sweep configuration ----
num_epochs = 5000
num_sharpness_computations = 100

learning_rates_sgd = [1/10, 1/20, 1/40, 1/80]
momentums_sgd = [0, 0.25, 0.5, 0.75, 0.9, 0.95]

learning_rates_adam = [1/333, 1/1000, 1/3333, 1/10000]
momentums_adam = [0.8, 0.85, 0.9, 0.950, 0.99, 0.999]

learning_rates_rmsprop = [1/33, 1/100, 1/333, 1/1000]
momentums_rmsprop = [0.8, 0.85, 0.9, 0.950, 0.99, 0.999]

# ---- network shape ----
input_size = np.prod(all_images.shape[1:])
num_hidden_layers = 2
hidden_layer_size = 200

# optimizer name -> (swept values, learning rates, optimizer factory).
# Dict order fixes the run order: SGD first, then Adam, then RMSProp.
sweep_plan = {
    'SGD': (
        momentums_sgd,
        learning_rates_sgd,
        lambda weights, value: torch.optim.SGD(weights, momentum = value),
    ),
    'Adam': (
        momentums_adam,
        learning_rates_adam,
        lambda weights, value: torch.optim.Adam(weights, betas = [value, 0.999]),
    ),
    'RMSProp': (
        momentums_rmsprop,
        learning_rates_rmsprop,
        lambda weights, value: torch.optim.RMSprop(weights, momentum = value),
    ),
}

for optimizer_type, (params, learning_rates, build_optimizer) in sweep_plan.items():
    for param in params:
        for lr in learning_rates:
            # Fresh network for every (swept value, learning rate) pair;
            # the learning rate itself is applied inside train_model.
            model = FullyConnectedNet(input_size, num_hidden_layers, hidden_layer_size, NUM_LABELS)
            optimizer = build_optimizer(model.parameters(), param)

            train_model(
                model=model,
                optimizer=optimizer,
                criterion=nn.MSELoss(),
                num_epochs=num_epochs,
                learning_rate=lr,
                images=images,
                labels=labels,
                num_sharpness_computations=num_sharpness_computations,
            )

Model: FullyConnectedNet
Optimizer: SGD
Learning Rate: 0.1
Number of Epochs: 5000
Momentum: 0
Beta1: None
Epoch [1/5000], Loss: 0.1233, Sharpness: 17.326448440551758
Epoch [50/5000], Loss: 0.0820, Sharpness: 14.364348411560059
Epoch [100/5000], Loss: 0.0784, Sharpness: 16.266252517700195
Epoch [150/5000], Loss: 0.0764, Sharpness: 15.799684524536133
Epoch [200/5000], Loss: 0.0750, Sharpness: 16.397724151611328
Epoch [250/5000], Loss: 0.0738, Sharpness: 16.487470626831055
Epoch [300/5000], Loss: 0.0727, Sharpness: 14.72199535369873
Epoch [350/5000], Loss: 0.0718, Sharpness: 14.423073768615723
Epoch [400/5000], Loss: 0.0709, Sharpness: 13.870201110839844
Epoch [450/5000], Loss: 0.0700, Sharpness: 18.19318389892578
Epoch [500/5000], Loss: 0.0691, Sharpness: 16.65302276611328
Epoch [550/5000], Loss: 0.0683, Sharpness: 15.520635604858398
Epoch [600/5000], Loss: 0.0675, Sharpness: 18.817853927612305
Epoch [650/5000], Loss: 0.0667, Sharpness: 19.3194637298584
Epoch [700/5000], Loss: 0.0660, Sh

In [30]:
def plot_output_data(metadata, output_data, num_rows, num_cols, title, model_ids):
    """
    Draws a grid of loss/sharpness curves, one subplot per model.

    Models are placed row-major in the order given by `model_ids`. Each
    subplot shows the training loss (blue line, left axis), the recorded
    sharpness values (red markers, right axis) and a dotted horizontal line
    at 2(1+β)/η, where η is the model's learning rate and β its
    momentum-like hyper-parameter.

    β is read from the ``momentum`` column. Logs written by an earlier
    version of the notebook keep it in a separate ``beta1`` column for
    non-SGD optimizers; that column is used when it is present and filled.

    Args
    ----
    metadata    : DataFrame with one row per model (needs `model_id`,
                  `optimizer`, `learning_rate`, `momentum`)
    output_data : DataFrame with per-epoch rows (needs `model_id`, `loss`,
                  `sharpness`)
    num_rows    : number of subplot rows
    num_cols    : number of subplot columns
    title       : figure title
    model_ids   : ids of the models to plot, in display order
    """
    metadata = metadata[metadata['model_id'].isin(model_ids)]
    output_data = output_data[output_data['model_id'].isin(model_ids)]

    # Titles are assigned to subplots in order, so these label the first cells
    learning_rates = metadata['learning_rate'].unique()
    subplot_titles = [f"η = {learning_rate}" for learning_rate in learning_rates]

    specs = [[{"secondary_y": True} for _ in range(num_cols)] for _ in range(num_rows)]
    fig = make_subplots(num_rows,
                        num_cols,
                        specs=specs,
                        horizontal_spacing=0.05,
                        vertical_spacing=0.02,
                        subplot_titles = subplot_titles)

    fig.update_annotations(font=dict(size=14), yanchor="bottom")

    # Shared loss-axis range: 110% of the largest first-row loss among the
    # selected models. Loop-invariant, so computed once.
    loss_axis_max = output_data.drop_duplicates(subset = 'model_id', keep = 'first')['loss'].max() * 1.1

    for i, model_id in enumerate(model_ids):
        row = i // num_cols + 1
        col = i % num_cols + 1

        model_data = output_data[output_data['model_id'] == model_id]
        model_info = metadata[metadata['model_id'] == model_id].iloc[0]

        lr = model_info['learning_rate']
        optimizer_name = model_info['optimizer']

        # Prefer the legacy 'beta1' column only when it exists and is filled;
        # the current logs keep every optimizer's value under 'momentum'.
        legacy_beta1 = model_info.get('beta1')
        if optimizer_name != "SGD" and legacy_beta1 is not None and pd.notna(legacy_beta1):
            momentum = legacy_beta1
        else:
            momentum = model_info['momentum']

        threshold = 2 * (1 + momentum) / lr

        x_vals = np.arange(1, len(model_data)+1)
        loss = model_data['loss']
        sharpness = model_data['sharpness']

        fig.add_trace(
            go.Scatter(x=x_vals, y=loss, mode='lines', name='Loss',
                       line=dict(color='blue')),
            row=row, col=col, secondary_y=False
        )

        fig.add_trace(
            go.Scatter(x=x_vals, y=sharpness, mode='markers', name='Sharpness',
                       marker=dict(size=4, color='red')),
            row=row, col=col, secondary_y=True
        )

        fig.add_hline(
            y=threshold,
            row=row, col=col,
            secondary_y=True,
            line=dict(color='black', dash='dot', width=1),
            opacity=1.0
        )

        # Only the outer subplots carry axis titles. The title is always
        # assigned, so optimizers other than SGD/Adam no longer hit an
        # unbound (or stale) variable.
        if col == 1:
            hyper_name = "beta 1" if optimizer_name == "Adam" else "momentum"
            y1_axis_title = f"Loss   ({hyper_name} = {momentum})"
        else:
            y1_axis_title = ""

        y2_axis_title = "Sharpness" if col == num_cols else ""
        x_axis_title = "Epoch" if row == num_rows else ""

        fig.update_xaxes(title_text=x_axis_title, row=row, col=col)

        fig.update_yaxes(title_text=y1_axis_title, row=row, col=col, secondary_y=False,
            range=[0, loss_axis_max])

        # A model without any recorded sharpness would give a NaN range;
        # leave the range unset (autorange) in that case.
        sharpness_max = sharpness.max()
        sharpness_range = [0, sharpness_max * 1.1] if pd.notna(sharpness_max) else None

        fig.update_yaxes(title_text=y2_axis_title,row = row, col = col, secondary_y=True,
            range=sharpness_range,
            showgrid=False,
            tickmode='array',
            tickvals=[threshold / 2, threshold, 1.5 * threshold, 2 * threshold],
            ticktext=["(1+β)/η", "2(1+β)/η", "3(1+β)/η", "4(1+β)/η"],
            ticks='outside',
            ticklen=6,
            tickwidth=1,
        )

    fig.update_layout(title = dict(text=title, x = 0.5), showlegend=False,
                      height = 300 * num_rows, width = 400 * num_cols)

    fig.show()

In [31]:
# Reload the logged runs from the default output directory
metadata, output_data = load_output_files(output_dir="output")

In [32]:
# First 24 logged models, laid out on a 6 x 4 grid.
# NOTE(review): presumably the SGD part of the sweep (6 momentum values x
# 4 learning rates) - this holds only if the log started out empty.
num_rows = 6
num_columns = 4
model_ids = list(range(1,25))
plot_output_data(metadata, output_data, num_rows, num_columns,
                 title="GD w/ Momentum: Training Loss and Sharpness for Combinations of Momentum and Learning Rate",
                 model_ids = model_ids)

In [33]:
# Models 25-48, laid out on a 6 x 4 grid.
# NOTE(review): in the sweep order (SGD, Adam, RMSProp; 24 runs each) these
# ids are the Adam runs, so the title names Adam/beta1 rather than repeating
# the SGD title - confirm against the `optimizer` column of `metadata`.
num_rows = 6
num_columns = 4
model_ids = list(range(25,49))
plot_output_data(metadata, output_data, num_rows, num_columns,
                 title="Adam: Training Loss and Sharpness for Combinations of Beta1 and Learning Rate",
                 model_ids = model_ids)

In [21]:
# Preview the run log instead of dumping every row into the notebook output
metadata.head(10)

Unnamed: 0,model_id,model_type,num_hidden_layers,hidden_layers_size,activation_function,optimizer,momentum,beta1,learning_rate,num_epochs,train_time
0,1,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.0,0.0,0.1,5000,21.009866
1,2,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.0,0.0,0.05,5000,19.617142
2,3,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.0,0.0,0.025,5000,19.668279
3,4,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.0,0.0,0.0125,5000,19.976518
4,5,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.25,0.0,0.1,5000,20.167397
5,6,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.25,0.0,0.05,5000,20.004448
6,7,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.25,0.0,0.025,5000,20.583166
7,8,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.25,0.0,0.0125,5000,20.813748
8,9,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.5,0.0,0.1,5000,21.229838
9,10,FullyConnectedNet,2,200,<class 'torch.nn.modules.activation.Tanh'>,SGD,0.5,0.0,0.05,5000,20.826769
