In [1]:
import os
# Determinism-related environment variables, set before torch is imported below.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes — confirm intent.
os.environ["PYTHONHASHSEED"] = "42"
# cuBLAS workspace configuration; presumably required because the seed cell
# calls torch.use_deterministic_algorithms(True) — see PyTorch reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
# Option string for PyTorch's CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import numpy as np
import random
import time
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.autograd.functional import hessian
import torch.nn as nn
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42
# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) global generators.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Request deterministic behaviour from cuDNN and from PyTorch ops in general.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)


# Compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seeded generator living on `device`; get_hessian_metrics reads this global
# for its data subsampling and for the power-iteration start vectors.
generator=torch.Generator(device=device).manual_seed(SEED)

In [3]:
def sample_data(X, y, num_per_class):
    """Draw a class-balanced random subset of a labelled dataset.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        y: Array-like of labels, one per sample.
        num_per_class: Number of samples to draw (without replacement) from
            each distinct label.

    Returns:
        Tuple ``(X_subset, y_subset)`` of NumPy arrays. Samples are grouped by
        label, in the sorted label order produced by ``np.unique``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when a class has fewer
            than ``num_per_class`` samples.

    Note:
        Sampling uses NumPy's global random state, so results depend on the
        most recent ``np.random.seed`` call and on prior draws.
    """
    features = np.asarray(X)
    labels = np.asarray(y)

    # One draw per class, in ascending label order.
    per_class_picks = [
        np.random.choice((labels == label).nonzero()[0], num_per_class, replace=False)
        for label in np.unique(labels)
    ]

    selected = np.concatenate(per_class_picks)
    return features[selected], labels[selected]

# Load raw CIFAR-10 (train and test splits) into ./data
train = datasets.CIFAR10(root="./data", train=True,  download=True)
test  = datasets.CIFAR10(root="./data", train=False, download=True)

# Subsample a class-balanced subset: 500 images per class for training and
# 100 per class for testing. sample_data draws from NumPy's global RNG, so the
# subset depends on the seed cell having run immediately before this one.
X, y  = sample_data(train.data, train.targets, 500)
X_test, y_test = sample_data(test.data, test.targets, 100)

# Convert to float32 and scale pixel values by 1/255
# (assumes the raw arrays hold 0-255 intensities — TODO confirm)
X  = X.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Normalize with statistics reduced over axes (0, 1, 2) of the *training*
# subset; the same mean/std are applied to the test subset.
# (assumes channel-last NHWC layout, consistent with the transpose below)
mean = X.mean(axis=(0,1,2),keepdims=True)
std = X.std(axis=(0,1,2),keepdims=True)
X = (X - mean) / std
X_test = (X_test - mean) / std

# Reorder axes from NHWC to NCHW
X = np.transpose(X, (0, 3, 1, 2))
X_test = np.transpose(X_test, (0, 3, 1, 2))

# Convert to torch tensors on the compute device
X = torch.tensor(X, dtype=torch.float32, device=device)
y = torch.tensor(y, dtype=torch.long, device=device)
X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

# One-hot encode labels for use with an MSE criterion.
# NOTE(review): train_model builds its own one-hot targets, so these two
# tensors are not consumed by the code visible in this notebook section.
y_onehot = torch.nn.functional.one_hot(y, num_classes=10).float().to(device)
y_test_onehot = torch.nn.functional.one_hot(y_test, num_classes=10).float().to(device)


In [4]:
class FullyConnectedNet(nn.Module):
    def __init__(self, input_size, num_hidden_layers, hidden_layer_size, 
                 num_labels, activation):
        super(FullyConnectedNet, self).__init__()

        self.input_size = input_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_layers_size = hidden_layer_size
        self.num_labels = num_labels
        self.activation = activation

        layers = [nn.Flatten()]

        in_size = input_size
        for _ in range(num_hidden_layers):
            layers += [nn.Linear(in_size, hidden_layer_size), activation()]
            in_size = hidden_layer_size

        layers.append(nn.Linear(in_size, num_labels))
        self.network = nn.Sequential(*layers)
        self.param_list = list(self.parameters())

        for m in self.network:
            if isinstance(m, nn.Linear):
                if activation == nn.ReLU:
                    nn.init.kaiming_normal_(
                        m.weight, 
                        generator=torch.Generator().manual_seed(SEED), 
                        nonlinearity='relu')
                elif activation == nn.Tanh:
                    nn.init.xavier_uniform_(
                        m.weight,
                        generator=torch.Generator().manual_seed(SEED)
                        )
                else:
                    nn.init.kaiming_normal_(
                        m.weight, 
                        generator=torch.Generator().manual_seed(SEED), 
                        nonlinearity='relu')
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.network(x)
    
class CNN(nn.Module):
    """Small VGG-style CNN: three (conv-conv-maxpool) stages and a two-layer head.

    Args:
        num_classes: Number of output units of the final linear layer.

    Attributes:
        activation: The activation class (``nn.ReLU``); train_model records its
            ``__name__`` in the run metadata.
        num_labels: Same value as ``num_classes``. Exposed under the name
            train_model reads when it one-hot encodes targets for
            ``nn.MSELoss``, mirroring FullyConnectedNet.
        param_list: ``list(self.parameters())``, cached for code that needs a
            stable parameter ordering.

    Note:
        The classifier expects 64 * 4 * 4 features, i.e. 32x32 inputs after
        three 2x max-pools. Weights are Kaiming-normal initialised from
        PyTorch's global RNG; biases are zero.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        self.activation = nn.ReLU
        self.num_labels = num_classes

        self.features = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
            nn.Conv2d(16, 16, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d(2),
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*4*4, 128), nn.ReLU(),
            nn.Linear(128, num_classes),
        )

        # Same initialisation for convolutional and linear layers.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                nn.init.zeros_(m.bias)

        self.param_list = list(self.parameters())

    def forward(self, x):
        """Return the raw output scores (logits) for a batch of images ``x``."""
        return self.classifier(self.features(x))

In [5]:
def setup_output_files(output_dir="output"): 
    """Return the run-metadata and per-epoch result tables, creating them if absent.

    Ensures ``output_dir`` exists. For each of the two CSV logs, the existing
    file is loaded when present; otherwise an empty, typed DataFrame with the
    expected columns is returned. Nothing is written to disk besides the
    directory itself.

    Args:
        output_dir: Directory holding ``metadata_rmsprop.csv`` and
            ``output_rmsprop.csv``.

    Returns:
        Tuple ``(metadata, output_data)`` of DataFrames.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    def _load_or_empty(filename, schema):
        # Prefer what is already on disk; fall back to an empty typed frame.
        path = os.path.join(output_dir, filename)
        if os.path.exists(path):
            return pd.read_csv(path)
        return pd.DataFrame(
            {column: pd.Series(dtype=kind) for column, kind in schema.items()}
        )

    metadata = _load_or_empty("metadata_rmsprop.csv", {
        "model_id": "int",
        "model_type": "str",
        "num_hidden_layers": "int",
        "hidden_layers_size": "int",
        "activation_function": "str",
        "optimizer": "str",
        "criterion": "str",
        "learning_rate": "float",
        "momentum": "float",
        "num_epochs": "int",
        "time_minutes": "float",
    })

    output_data = _load_or_empty("output_rmsprop.csv", {
        "model_id": "int",
        "epoch": "int",
        "train_loss": "float",
        "train_accuracy": "float",
        "test_accuracy": "float",
        "sharpness_H": "float",
        "sharpness_A": "float",
    })

    return metadata, output_data

def load_output_files(output_dir="output"):
    """Read both CSV logs from ``output_dir``.

    Args:
        output_dir: Directory holding ``metadata_rmsprop.csv`` and
            ``output_rmsprop.csv``.

    Returns:
        Tuple ``(metadata, output_data)`` of DataFrames.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when a file is
            missing (unlike setup_output_files, nothing is created here).
    """
    # Read in a fixed order: metadata first, then the per-epoch results.
    metadata, output_data = (
        pd.read_csv(os.path.join(output_dir, filename))
        for filename in ("metadata_rmsprop.csv", "output_rmsprop.csv")
    )
    return metadata, output_data

def save_output_files(metadata, output_data, output_dir="output"):
    """Write both log tables to ``output_dir`` as CSV, without the index.

    Args:
        metadata: DataFrame written to ``metadata_rmsprop.csv``.
        output_data: DataFrame written to ``output_rmsprop.csv``.
        output_dir: Destination directory; it is not created here.
    """
    targets = (
        (metadata, "metadata_rmsprop.csv"),
        (output_data, "output_rmsprop.csv"),
    )
    for frame, filename in targets:
        frame.to_csv(os.path.join(output_dir, filename), index=False)

def delete_model_data(model_ids, output_dir="output"):
    """Remove every row belonging to ``model_ids`` from both CSV logs.

    Args:
        model_ids: Collection of model ids to drop (anything accepted by
            ``Series.isin``).
        output_dir: Directory holding the two CSV logs.
    """
    metadata, output_data = load_output_files(output_dir)

    # Keep only the rows whose model_id is NOT in the deletion set.
    kept_metadata = metadata.loc[~metadata['model_id'].isin(model_ids)]
    kept_output = output_data.loc[~output_data['model_id'].isin(model_ids)]

    save_output_files(kept_metadata, kept_output, output_dir)


In [122]:
def get_hessian_metrics(model, optimizer, criterion, X, y, 
                        subsample_dim = 1024, iters=30, tol = 1e-4):
    """Estimate the sharpness of the loss at the model's current parameters.

    Power iteration on Hessian-vector products estimates the dominant
    eigenvalue of the loss Hessian ``H``. When ``optimizer`` is
    ``torch.optim.RMSprop`` the same is done for the preconditioned Hessian
    ``A = D^(1/2) H D^(1/2)`` with ``D = diag(1 / (sqrt(v) + eps))``, where
    ``v`` is the optimizer's running average of squared gradients. This is the
    per-coordinate scaling of the (non-centered) PyTorch RMSprop update.

    Args:
        model: Module exposing ``param_list`` (ordered list of parameters).
        optimizer: Optimizer training ``model``; only inspected for RMSprop state.
        criterion: Callable ``criterion(outputs, targets) -> scalar loss``.
        X: Inputs; a random subset of at most ``subsample_dim`` rows is used.
        y: Targets aligned with ``X``.
        subsample_dim: Maximum number of samples used to build the Hessian.
        iters: Maximum number of power-iteration steps.
        tol: Relative change of the eigenvalue estimate below which the
            iteration stops early.

    Returns:
        Tuple ``(lambda_H, lambda_A)`` of floats; ``lambda_A`` is ``None`` when
        the optimizer is not RMSprop.

    Raises:
        RuntimeError: If the optimizer is RMSprop but holds no ``square_avg``
            state for one of the model parameters (e.g. no step taken yet).

    Note:
        Random draws use the module-level ``generator``. Power iteration
        converges to the eigenvalue of largest magnitude.
    """
    
    # Subsample data for compute efficiency
    subsample_dim = min(subsample_dim, len(X))
    idx = torch.randperm(len(X), device=X.device, generator=generator)[:subsample_dim]
    X = X[idx]
    y = y[idx]
    
    # Build a differentiable gradient so it can be differentiated a second time
    outputs = model(X)
    loss = criterion(outputs, y)

    grads = torch.autograd.grad(
        loss, model.param_list,
        create_graph=True
    )
    g_flat = torch.cat([g.reshape(-1) for g in grads])
    dim    = g_flat.numel()
    device = g_flat.device

    # Hessian-vector product via the Pearlmutter trick: d(g . v)/d(theta) = H v
    def Hv(v):
        Hv_list = torch.autograd.grad(
            g_flat @ v,
            model.param_list,
            retain_graph=True  # the gradient graph is reused on every call
        )
        return torch.cat([h.reshape(-1) for h in Hv_list])
    
    # Power iteration: Rayleigh quotient of the dominant eigenvector estimate
    def power_iteration(matvec):
        v = torch.randn(dim, device=device, generator=generator)
        v /= v.norm()

        eig_old = 0.0
        for _ in range(iters):
            Hv_v = matvec(v)
            eig = (v @ Hv_v).item()   
            v = Hv_v / Hv_v.norm()

            if abs(eig - eig_old) / (abs(eig_old) + 1e-12) < tol:
                break
            eig_old = eig

        # Final Rayleigh quotient with the last normalised iterate
        Hv_v = matvec(v)
        eig = (v @ Hv_v).item()
        return eig

    lambda_H = power_iteration(Hv)
    
    if isinstance(optimizer, torch.optim.RMSprop):
        
        # Gather square_avg in model.param_list order so it lines up with the
        # flattened gradient, instead of relying on the state dict's order.
        square_avgs = []
        for p in model.param_list:
            state = optimizer.state.get(p)
            if not state or 'square_avg' not in state:
                raise RuntimeError(
                    "RMSprop has no 'square_avg' state for a model parameter; "
                    "call optimizer.step() before measuring sharpness."
                )
            square_avgs.append(state['square_avg'].reshape(-1))
        v_t = torch.cat(square_avgs).detach()

        # RMSprop divides the gradient by sqrt(v) + eps (eps is added OUTSIDE
        # the square root), so D = 1 / (sqrt(v) + eps) and D_sqrt = D ** 0.5.
        eps = optimizer.param_groups[0]['eps']
        D_sqrt = torch.rsqrt(torch.sqrt(v_t) + eps)

        # Preconditioned Hessian-vector product
        def Av(v):
            return D_sqrt * Hv(D_sqrt * v)
        
        lambda_A = power_iteration(Av)
    else:
        lambda_A = None

    return lambda_H, lambda_A

In [7]:
def train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test):
    """Train ``model`` with full-batch steps and append the run to the CSV logs.

    Every iteration takes one optimizer step on the whole of ``X``. Training
    stops when the training accuracy reaches ``accuracy`` or after ``epochs``
    iterations, whichever comes first. Sharpness is measured roughly 100 times
    per run via get_hessian_metrics. The run is then appended to the metadata
    and per-epoch tables under ``output/``.

    Args:
        model: Network exposing ``activation`` (a class with ``__name__``) and,
            when ``criterion`` is ``nn.MSELoss``, ``num_labels``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss module; ``nn.MSELoss`` gets one-hot targets, anything
            else gets the integer labels.
        epochs: Maximum number of full-batch steps.
        accuracy: Training accuracy at which the loop stops early.
        X, y: Training inputs and integer labels.
        X_test, y_test: Test inputs and integer labels.

    Notes:
        * The per-epoch arrays always have length ``epochs``; entries after an
          early stop, and sharpness entries between measurements, stay NaN.
        * The recorded training accuracy comes from the forward pass made
          before that iteration's optimizer step.
    """
    print(f"Training {model.__class__.__name__} with " +
          f"{optimizer.__class__.__name__} and learning rate " +
          f"{optimizer.param_groups[0]['lr']} for {epochs} epochs.")

    learning_rate = optimizer.param_groups[0]['lr']
    momentum = optimizer.param_groups[0].get('momentum', 0.0)

    model.to(device)
    model.train()

    train_losses = np.full(epochs, np.nan)
    train_accuracies = np.full(epochs, np.nan)
    test_accuracies = np.full(epochs, np.nan)
    H_sharps = np.full(epochs, np.nan)
    A_sharps = np.full(epochs, np.nan)

    if isinstance(criterion, nn.MSELoss):
        y_loss = torch.nn.functional.one_hot(
            y, num_classes=model.num_labels).float().to(device)
    else:
        y_loss = y.to(device)

    # Measure sharpness about 100 times per run. Clamp to 1 so that runs with
    # fewer than 100 epochs do not hit a modulo-by-zero.
    sharpness_every = max(1, epochs // 100)

    start = time.time()
    
    train_acc = 0.0
    epoch = 0

    while train_acc < accuracy and epoch < epochs :

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y_loss)
        loss.backward()
        optimizer.step()

        train_losses[epoch] = loss.item()

        if epoch % sharpness_every == 0:
            # lambda_A is None for non-RMSprop optimizers; it is stored as NaN.
            H_sharps[epoch], A_sharps[epoch] = get_hessian_metrics(
                model, optimizer, criterion, X, y_loss
            )

        with torch.no_grad():
            model.eval()
            train_preds = outputs.argmax(dim=1)
            test_preds = model(X_test).argmax(dim=1)
            train_acc = (train_preds == y).float().mean().item()
            test_acc = (test_preds == y_test).float().mean().item()
            train_accuracies[epoch] = train_acc
            test_accuracies[epoch] = test_acc
        model.train()

        if (epoch+1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, " +
                  f"Time: {round(((time.time() - start) / 60), 2)}, " +
                  f"Train Acc: {train_accuracies[epoch]:.4f}, " +
                  f"Test Acc: {test_accuracies[epoch]:.4f}, ")
        epoch += 1

    # Stop the clock before any file I/O so only training time is recorded.
    time_minutes = round((time.time() - start) / 60, 2)

    metadata, output_data = setup_output_files("output")

    # Use "largest existing id + 1" rather than the row count: after
    # delete_model_data has removed rows, the row count would hand out an id
    # that is still in use.
    existing_ids = metadata["model_id"].dropna()
    model_id = int(existing_ids.max()) + 1 if len(existing_ids) else 1

    metadata.loc[metadata.shape[0]] ={
        "model_id": model_id,
        "model_type": model.__class__.__name__,
        # Architecture columns exist in the schema; models without these
        # attributes (e.g. CNN) record NaN.
        "num_hidden_layers": getattr(model, "num_hidden_layers", np.nan),
        "hidden_layers_size": getattr(model, "hidden_layers_size", np.nan),
        "activation_function": model.activation.__name__,
        "optimizer": optimizer.__class__.__name__,
        "criterion": criterion.__class__.__name__,
        "learning_rate": learning_rate,
        "momentum": momentum,
        "num_epochs": epochs,
        "time_minutes": time_minutes,
    }

    output_data = pd.concat([output_data, pd.DataFrame({
        "model_id": np.full(epochs, model_id),
        "epoch": np.arange(1, epochs + 1),
        "train_loss": train_losses,
        "sharpness_H": H_sharps.round(4),
        "sharpness_A": A_sharps.round(4),
        "test_accuracy": test_accuracies,
        "train_accuracy": train_accuracies,
    })], ignore_index=True)

    save_output_files(metadata, output_data)

In [None]:
# Experiment: Tanh FCNN trained with full-batch SGD (no momentum) on MSE loss,
# sweeping the learning rate. Each run is appended to the CSV logs by train_model.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 20000
learning_rates = [0.13, 0.12, 0.1, 0.07, 0.06, 0.05, 0.04, 0.032]
# Training accuracy at which train_model stops a run early.
accuracy = 0.99

for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training FullyConnectedNet with SGD and learning rate 0.13 for 20000 epochs.
Epoch [100/20000], Loss: 0.0912, Time: 0.01, Train Acc: 0.3426, Test Acc: 0.2720, 
Epoch [200/20000], Loss: 0.0796, Time: 0.01, Train Acc: 0.4474, Test Acc: 0.2990, 
Epoch [300/20000], Loss: 0.0732, Time: 0.02, Train Acc: 0.5206, Test Acc: 0.3070, 
Epoch [400/20000], Loss: 0.0687, Time: 0.02, Train Acc: 0.5806, Test Acc: 0.3140, 
Epoch [500/20000], Loss: 0.0651, Time: 0.03, Train Acc: 0.6328, Test Acc: 0.3180, 
Epoch [600/20000], Loss: 0.0621, Time: 0.04, Train Acc: 0.6716, Test Acc: 0.3230, 
Epoch [700/20000], Loss: 0.0595, Time: 0.04, Train Acc: 0.7088, Test Acc: 0.3240, 
Epoch [800/20000], Loss: 0.0571, Time: 0.05, Train Acc: 0.7414, Test Acc: 0.3150, 
Epoch [900/20000], Loss: 0.0548, Time: 0.05, Train Acc: 0.7690, Test Acc: 0.3170, 
Epoch [1000/20000], Loss: 0.0528, Time: 0.06, Train Acc: 0.7904, Test Acc: 0.3140, 
Epoch [1100/20000], Loss: 0.0508, Time: 0.06, Train Acc: 0.8152, Test Acc: 0.3220, 
Epoch [1

In [135]:
# Experiment: Tanh FCNN trained with full-batch SGD (no momentum) on
# cross-entropy loss, sweeping the learning rate.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 4000
learning_rates = [0.032, 0.025, 0.02, 0.015, 0.01]
# Training accuracy at which train_model stops a run early.
accuracy = 0.999
for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)


Training FullyConnectedNet with SGD and learning rate 0.032 for 4000 epochs.
Epoch [100/4000], Loss: 1.5505, Time: 0.01, Train Acc: 0.4760, Test Acc: 0.3850, 
Epoch [200/4000], Loss: 1.3310, Time: 0.03, Train Acc: 0.5788, Test Acc: 0.3870, 
Epoch [300/4000], Loss: 1.1433, Time: 0.04, Train Acc: 0.6730, Test Acc: 0.4110, 
Epoch [400/4000], Loss: 0.9652, Time: 0.05, Train Acc: 0.7484, Test Acc: 0.3990, 
Epoch [500/4000], Loss: 0.8686, Time: 0.06, Train Acc: 0.7624, Test Acc: 0.3770, 
Epoch [600/4000], Loss: 0.6842, Time: 0.07, Train Acc: 0.8484, Test Acc: 0.3760, 
Epoch [700/4000], Loss: 0.5364, Time: 0.08, Train Acc: 0.9026, Test Acc: 0.3830, 
Epoch [800/4000], Loss: 0.4614, Time: 0.09, Train Acc: 0.9192, Test Acc: 0.3740, 
Epoch [900/4000], Loss: 0.3820, Time: 0.11, Train Acc: 0.9308, Test Acc: 0.3570, 
Epoch [1000/4000], Loss: 0.2595, Time: 0.11, Train Acc: 0.9800, Test Acc: 0.3660, 
Epoch [1100/4000], Loss: 0.2843, Time: 0.13, Train Acc: 0.9440, Test Acc: 0.3460, 
Epoch [1200/4000], 

In [67]:
# Experiment: Tanh FCNN trained with full-batch SGD with momentum on MSE loss,
# sweeping the learning rate at a fixed momentum.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 8000
learning_rates = [0.14, 0.12, 0.1]
momentum = 0.9
# Training accuracy at which train_model stops a run early.
accuracy = 0.9999

for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training FullyConnectedNet with SGD and learning rate 0.14 for 8000 epochs.
Epoch [100/8000], Loss: 0.0819, Time: 0.01, Train Acc: 0.3724, Test Acc: 0.2190, 
Epoch [200/8000], Loss: 0.0732, Time: 0.01, Train Acc: 0.4952, Test Acc: 0.2390, 
Epoch [300/8000], Loss: 0.0681, Time: 0.02, Train Acc: 0.5674, Test Acc: 0.2430, 
Epoch [400/8000], Loss: 0.0637, Time: 0.03, Train Acc: 0.6258, Test Acc: 0.2510, 
Epoch [500/8000], Loss: 0.0595, Time: 0.03, Train Acc: 0.6776, Test Acc: 0.2490, 
Epoch [600/8000], Loss: 0.0556, Time: 0.04, Train Acc: 0.7216, Test Acc: 0.2510, 
Epoch [700/8000], Loss: 0.0518, Time: 0.04, Train Acc: 0.7630, Test Acc: 0.2530, 
Epoch [800/8000], Loss: 0.0482, Time: 0.05, Train Acc: 0.8004, Test Acc: 0.2520, 
Epoch [900/8000], Loss: 0.0449, Time: 0.06, Train Acc: 0.8264, Test Acc: 0.2480, 
Epoch [1000/8000], Loss: 0.0418, Time: 0.06, Train Acc: 0.8512, Test Acc: 0.2490, 
Epoch [1100/8000], Loss: 0.0388, Time: 0.07, Train Acc: 0.8710, Test Acc: 0.2550, 
Epoch [1200/8000], L

In [8]:
# Experiment: Tanh FCNN trained with full-batch SGD with momentum on
# cross-entropy loss, sweeping the learning rate at a fixed momentum.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 500
learning_rates = [0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03]
momentum = 0.9
# Training accuracy at which train_model stops a run early.
accuracy = 0.9999

for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training FullyConnectedNet with SGD and learning rate 0.1 for 500 epochs.
Epoch [100/500], Loss: 0.1003, Time: 0.03, Train Acc: 0.9924, Test Acc: 0.3460, 
Training FullyConnectedNet with SGD and learning rate 0.09 for 500 epochs.
Epoch [100/500], Loss: 0.0709, Time: 0.03, Train Acc: 0.9966, Test Acc: 0.3490, 
Training FullyConnectedNet with SGD and learning rate 0.08 for 500 epochs.
Epoch [100/500], Loss: 0.1355, Time: 0.03, Train Acc: 0.9848, Test Acc: 0.3590, 
Training FullyConnectedNet with SGD and learning rate 0.07 for 500 epochs.
Epoch [100/500], Loss: 0.1436, Time: 0.03, Train Acc: 0.9876, Test Acc: 0.3620, 
Training FullyConnectedNet with SGD and learning rate 0.06 for 500 epochs.
Epoch [100/500], Loss: 0.2065, Time: 0.03, Train Acc: 0.9710, Test Acc: 0.3490, 
Training FullyConnectedNet with SGD and learning rate 0.05 for 500 epochs.
Epoch [100/500], Loss: 0.5167, Time: 0.03, Train Acc: 0.8574, Test Acc: 0.3590, 
Training FullyConnectedNet with SGD and learning rate 0.04 for 50

In [151]:
# Experiment: Tanh FCNN trained with full-batch RMSprop on MSE loss.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 4000
# Single learning rate for this run. (A previous list assigned here was
# immediately overwritten and never used, so it has been removed.)
learning_rates = [0.0001]
# Training accuracy at which train_model stops a run early.
accuracy = 0.99

for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.MSELoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training FullyConnectedNet with RMSprop and learning rate 0.0001 for 4000 epochs.
Epoch [100/4000], Loss: 0.1001, Time: 0.02, Train Acc: 0.3582, Test Acc: 0.2240, 
Epoch [200/4000], Loss: 0.0791, Time: 0.03, Train Acc: 0.5250, Test Acc: 0.2710, 
Epoch [300/4000], Loss: 0.0694, Time: 0.05, Train Acc: 0.6454, Test Acc: 0.2550, 
Epoch [400/4000], Loss: 0.0617, Time: 0.06, Train Acc: 0.7114, Test Acc: 0.2810, 
Epoch [500/4000], Loss: 0.0486, Time: 0.08, Train Acc: 0.8394, Test Acc: 0.3030, 
Epoch [600/4000], Loss: 0.0473, Time: 0.09, Train Acc: 0.8436, Test Acc: 0.3020, 
Epoch [700/4000], Loss: 0.0455, Time: 0.11, Train Acc: 0.8536, Test Acc: 0.2890, 
Epoch [800/4000], Loss: 0.0396, Time: 0.12, Train Acc: 0.8954, Test Acc: 0.2820, 
Epoch [900/4000], Loss: 0.0362, Time: 0.14, Train Acc: 0.9030, Test Acc: 0.2940, 
Epoch [1000/4000], Loss: 0.0366, Time: 0.15, Train Acc: 0.9174, Test Acc: 0.2960, 
Epoch [1100/4000], Loss: 0.0301, Time: 0.17, Train Acc: 0.9324, Test Acc: 0.2880, 
Epoch [1200/40

In [140]:
# Experiment: Tanh FCNN trained with full-batch RMSprop on cross-entropy loss,
# sweeping the learning rate.

# Flattened feature count per image (X is N x C x H x W at this point).
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Upper bound on training steps per run.
epochs = 2000
learning_rates = [0.00005, 0.00002, 0.00001]
# Training accuracy at which train_model stops a run early.
accuracy = 0.9999

for learning_rate in learning_rates:

    # A fresh model per learning rate; FullyConnectedNet seeds its own
    # initialisation from SEED, so every run starts from the same weights.
    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training FullyConnectedNet with RMSprop and learning rate 5e-05 for 2000 epochs.
Epoch [100/2000], Loss: 1.3158, Time: 0.02, Train Acc: 0.6078, Test Acc: 0.3710, 
Epoch [200/2000], Loss: 1.0129, Time: 0.04, Train Acc: 0.7462, Test Acc: 0.3850, 
Epoch [300/2000], Loss: 0.7949, Time: 0.06, Train Acc: 0.8316, Test Acc: 0.3840, 
Epoch [400/2000], Loss: 0.6193, Time: 0.08, Train Acc: 0.8932, Test Acc: 0.3820, 
Epoch [500/2000], Loss: 0.4961, Time: 0.1, Train Acc: 0.9232, Test Acc: 0.3710, 
Epoch [600/2000], Loss: 0.3708, Time: 0.12, Train Acc: 0.9634, Test Acc: 0.3690, 
Epoch [700/2000], Loss: 0.2750, Time: 0.14, Train Acc: 0.9818, Test Acc: 0.3700, 
Epoch [800/2000], Loss: 0.2119, Time: 0.16, Train Acc: 0.9880, Test Acc: 0.3680, 
Epoch [900/2000], Loss: 0.1562, Time: 0.18, Train Acc: 0.9946, Test Acc: 0.3610, 
Epoch [1000/2000], Loss: 0.1168, Time: 0.2, Train Acc: 0.9972, Test Acc: 0.3610, 
Epoch [1100/2000], Loss: 0.0888, Time: 0.22, Train Acc: 0.9986, Test Acc: 0.3530, 
Epoch [1200/2000]

Epoch [1400/2000], Loss: 0.4896, Time: 0.27, Train Acc: 0.9494, Test Acc: 0.3690, 
Epoch [1500/2000], Loss: 0.4388, Time: 0.29, Train Acc: 0.9604, Test Acc: 0.3680, 
Epoch [1600/2000], Loss: 0.3920, Time: 0.31, Train Acc: 0.9708, Test Acc: 0.3690, 
Epoch [1700/2000], Loss: 0.3492, Time: 0.33, Train Acc: 0.9784, Test Acc: 0.3720, 
Epoch [1800/2000], Loss: 0.3102, Time: 0.35, Train Acc: 0.9830, Test Acc: 0.3700, 
Epoch [1900/2000], Loss: 0.2749, Time: 0.37, Train Acc: 0.9880, Test Acc: 0.3670, 
Epoch [2000/2000], Loss: 0.2430, Time: 0.39, Train Acc: 0.9916, Test Acc: 0.3620, 


In [49]:
# Experiment: CNN trained with full-batch SGD (no momentum) on cross-entropy
# loss, sweeping the learning rate. CNN() takes no input size, so the unused
# `input_size` computation that used to open this cell has been dropped.

# Upper bound on training steps per run.
epochs = 3000
learning_rates = [0.02, 0.01, 0.005, 0.002, 0.001]
# Training accuracy at which train_model stops a run early.
accuracy = 0.99

for learning_rate in learning_rates:

    # NOTE(review): CNN initialises from PyTorch's global RNG, so each run in
    # this sweep presumably starts from different weights — confirm intent.
    model = CNN()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    train_model(model, optimizer, criterion, epochs, accuracy, X, y, X_test, y_test)

Training CNN with SGD and learning rate 0.002 for 3000 epochs.
Epoch [100/3000], Loss: 2.0261, Time: 0.18, Train Acc: 0.2706, Test Acc: 0.2680, 
Epoch [200/3000], Loss: 1.8972, Time: 0.34, Train Acc: 0.3226, Test Acc: 0.3070, 
Epoch [300/3000], Loss: 1.8144, Time: 0.48, Train Acc: 0.3596, Test Acc: 0.3320, 
Epoch [400/3000], Loss: 1.7487, Time: 0.67, Train Acc: 0.3826, Test Acc: 0.3620, 
Epoch [500/3000], Loss: 1.6923, Time: 0.83, Train Acc: 0.4038, Test Acc: 0.3730, 
Epoch [600/3000], Loss: 1.6403, Time: 0.98, Train Acc: 0.4232, Test Acc: 0.3830, 
Epoch [700/3000], Loss: 1.6038, Time: 1.14, Train Acc: 0.4326, Test Acc: 0.3830, 
Epoch [800/3000], Loss: 1.5647, Time: 1.3, Train Acc: 0.4480, Test Acc: 0.3770, 
Epoch [900/3000], Loss: 1.5252, Time: 1.45, Train Acc: 0.4612, Test Acc: 0.3860, 
Epoch [1000/3000], Loss: 1.4902, Time: 1.63, Train Acc: 0.4722, Test Acc: 0.3880, 
Epoch [1100/3000], Loss: 1.4539, Time: 1.79, Train Acc: 0.4884, Test Acc: 0.3920, 
Epoch [1200/3000], Loss: 1.4208, T

In [152]:
# Uncomment the next line to drop saved runs by model id (here ids 30-39)
# from both CSV logs before reloading them:
#delete_model_data(range(30,40))
# Reload the logs from disk and display the run metadata table.
md, out = load_output_files()
md


Unnamed: 0,model_id,model_type,num_hidden_layers,hidden_layers_size,activation_function,optimizer,criterion,learning_rate,momentum,num_epochs,time_minutes
0,1,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.13,0.0,20000,0.31
1,2,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.12,0.0,20000,0.31
2,3,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.1,0.0,20000,0.38
3,4,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.07,0.0,20000,0.5
4,5,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.06,0.0,20000,0.5
5,6,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.05,0.0,20000,0.54
6,7,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.04,0.0,20000,0.73
7,8,FullyConnectedNet,2.0,200.0,Tanh,SGD,MSELoss,0.032,0.0,20000,0.76
8,9,FullyConnectedNet,2.0,200.0,Tanh,SGD,CrossEntropyLoss,0.032,0.0,4000,0.17
9,10,FullyConnectedNet,2.0,200.0,Tanh,SGD,CrossEntropyLoss,0.025,0.0,4000,0.2


In [66]:
def plot_output_data(metadata, output, model_id):
    """Plot a single run: loss and preconditioned sharpness on top, test accuracy below.

    Args:
        metadata: Run-metadata table (one row per ``model_id``).
        output: Per-epoch results table.
        model_id: Id of the run to plot.

    The top panel shows the training loss (left axis) and the recorded
    ``sharpness_A`` values (right axis) together with a dashed reference line
    at 2. The bottom panel shows the test accuracy. The figure is displayed
    and nothing is returned.

    Raises:
        IndexError: If ``metadata`` has no row for ``model_id``.
    """
    run_meta = metadata[metadata['model_id']==model_id]
    run = output[output['model_id']==model_id]
    
    xs = np.arange(run_meta['num_epochs'].iloc[0])
    losses = run['train_loss']
    sharpness_A = run['sharpness_A']
    test_accuracy = run['test_accuracy']

    fig = make_subplots(rows = 2, cols = 1, 
                        specs=[[{"secondary_y": True}],
                               [{"secondary_y": True}]],
                        shared_xaxes=True,
                        vertical_spacing=0.1)
    
    fig.add_trace(
        go.Scatter(x=xs, y=losses, name="Training Loss",line=dict(width=2)),
        secondary_y=False, row=1, col=1
    )

    # Sharpness is only measured on a subset of epochs, hence markers.
    fig.add_trace(
        go.Scatter(x=xs, y=sharpness_A, name="Max Eigenvalue of A", mode='markers', line=dict(width=2)),
        secondary_y=True, row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=xs, y=test_accuracy, name="Test Accuracy", line=dict(width=2)),
        secondary_y=False, row=2, col=1
    )

    # Dashed reference line on the sharpness (secondary) axis.
    fig.add_hline(y=2, line_dash="dash", line_color="black", 
                  row=1, col=1, secondary_y=True)

    fig.update_yaxes(title_text="Training Loss", secondary_y=False, 
                     range = [0,0.5], showgrid=False,
                     row=1, col=1)
    fig.update_yaxes(title_text="Max Eigenvalue of A", secondary_y=True, 
                     range = [0, 5],
                     row=1, col=1)
    
    fig.update_xaxes(title_text="epoch",
                     range = [0,5000])
    fig.update_layout(height = 1000, width = 1000)
    
    fig.show()

In [143]:
def _sweep_epoch_axis(output, model_ids):
    """Return ``arange(max_epoch)`` over the last epoch with a recorded loss among ``model_ids``."""
    recorded = output[(output["train_loss"].notna()) & (output["model_id"].isin(model_ids))]
    return np.arange(recorded["epoch"].max())


def _add_sweep_column(fig, metadata, output, model_ids, col, legend, palette):
    """Add loss (row 1) and sharpness plus its 2/lr line (row 2) for each run to one subplot column."""
    xs = _sweep_epoch_axis(output, model_ids)

    for i, model_id in enumerate(model_ids):
        md = metadata[metadata['model_id']==model_id]
        out = output[output['model_id']==model_id]
        lr = md['learning_rate'].iloc[0]
        # Cycle through the palette so more runs than colours cannot overflow it.
        color = palette[i % len(palette)]

        fig.add_trace(
            go.Scatter(x=xs, y=out['train_loss'], name= f"η = {lr}",
                       line=dict(width=2.5), marker_color=color,
                       legend=legend,
                       showlegend=True), 
            row=1, col=col
        )

        fig.add_trace(
            go.Scatter(x=xs, y=out['sharpness_H'], name= "Sharpness of H", 
                       mode='markers', showlegend=False,
                       marker=dict(size=5), marker_color=color),
            row=2, col=col
        )

        # Dashed reference line at 2 / lr, in the run's colour.
        fig.add_hline(y=2 / lr, line_dash="dash", line_color=color, 
                      row=2, col=col)


def plot_sgd_fcnn_data(metadata, output, model_ids_mse, model_ids_ce, save=True):
    """Plot loss and Hessian sharpness of the FCNN gradient-descent sweeps.

    Builds a 2x2 figure: the left column shows the MSE runs, the right column
    the cross-entropy runs; the top row is training loss, the bottom row the
    recorded ``sharpness_H`` with one dashed ``2 / learning_rate`` line per run.

    Args:
        metadata: Run-metadata table.
        output: Per-epoch results table.
        model_ids_mse: Ordered model ids of the MSE runs.
        model_ids_ce: Ordered model ids of the cross-entropy runs.
        save: When true, also write the figure to
            ``output/images/gd_fcnn_cifar10.png`` (the directory is created if
            needed).

    Note:
        The sharpness axes are capped using the learning rate of the LAST id in
        each list, so callers presumably pass ids in decreasing learning-rate
        order — confirm against the call sites.
    """
    fig = make_subplots(rows = 2, cols = 2, 
                        vertical_spacing=0.1, shared_xaxes=True,
                        subplot_titles=["MSE Loss", "Cross-Entropy Loss"] )
    palette = px.colors.qualitative.D3

    _add_sweep_column(fig, metadata, output, model_ids_mse,
                      col=1, legend="legend", palette=palette)
    _add_sweep_column(fig, metadata, output, model_ids_ce,
                      col=2, legend="legend2", palette=palette)

    # Upper limits of the sharpness axes: a little above the largest 2 / lr line.
    mse_y_sharp_max = 2 / metadata[metadata["model_id"]==model_ids_mse[-1]]["learning_rate"].iloc[0]*1.1
    ce_y_sharp_max = 2 / metadata[metadata["model_id"]==model_ids_ce[-1]]["learning_rate"].iloc[0]*1.2

    fig.update_yaxes(title_text="Training Loss",
                    range = [0,0.08],
                    row=1, col=1)
    fig.update_yaxes(title_text="Sharpness",
                    range = [10, mse_y_sharp_max],
                    row=2, col=1)
    fig.update_yaxes(title_text="",
                    range = [0,1.5],
                    row=1, col=2)
    fig.update_yaxes(title_text="",
                    range = [20, ce_y_sharp_max],
                    row=2, col=2)
    
    fig.update_xaxes(title_text="", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)

    fig.update_layout(height = 400, width = 800, 
                      title = dict(text="FCNN with GD on CIFAR-10", x = 0.5),
                      legend=dict(x=0.29, y=0.99,
                                  bgcolor='rgba(255, 255, 255, 0.3)'),
                      legend2=dict(x=0.83, y=0.99,
                                   bgcolor='rgba(255, 255, 255, 0.3)')
                    )
    if save:
        # write_image does not create missing directories.
        os.makedirs("output/images", exist_ok=True)
        fig.write_image("output/images/gd_fcnn_cifar10.png",
                    width = 800, height = 400, scale = 4)
    fig.show()

In [144]:
def plot_sgdm_fcnn_data(metadata, output, model_ids_mse, model_ids_ce, save=True):
    """Plot training loss and Hessian sharpness for the GD-with-momentum runs.

    The figure is a 2x2 grid. The left column ("MSE Loss") shows the runs in
    ``model_ids_mse`` and the right column ("Cross-Entropy Loss") the runs in
    ``model_ids_ce``. Row 1 holds the ``train_loss`` curves, row 2 the
    ``sharpness_H`` markers plus, per run, a dashed horizontal line at
    ``2 * (1 + momentum) / learning_rate``.

    Args:
        metadata: DataFrame with ``model_id``, ``learning_rate`` and
            ``momentum`` columns (first matching row per model is used).
        output: DataFrame with ``model_id``, ``epoch``, ``train_loss`` and
            ``sharpness_H`` columns.
        model_ids_mse: Model ids drawn in the left column. Only three colours
            are taken from the palette, so at most three ids per list.
        model_ids_ce: Model ids drawn in the right column.
        save: If true, export the figure to
            ``output/images/gd_mom_fcnn_cifar10.png`` before showing it.
    """
    loss_recorded = output["train_loss"].notna()

    def epoch_axis(model_ids):
        # 0 .. max_epoch - 1, max_epoch being the largest epoch with a recorded loss
        rows = output[loss_recorded & (output["model_id"].isin(model_ids))]
        return np.arange(rows["epoch"].max())

    def axis_top(model_ids, headroom):
        # Limit value of the last listed run, scaled to leave room above its dashed line
        last_run = metadata[metadata["model_id"] == model_ids[-1]]
        return (
            2 * (1 + last_run["momentum"].iloc[0])
            / last_run["learning_rate"].iloc[0]
            * headroom
        )

    # (subplot column, model ids, shared x values, legend the loss curves belong to)
    columns = (
        (1, model_ids_mse, epoch_axis(model_ids_mse), "legend"),
        (2, model_ids_ce, epoch_axis(model_ids_ce), "legend2"),
    )

    fig = make_subplots(
        rows=2,
        cols=2,
        vertical_spacing=0.1,
        shared_xaxes=True,
        subplot_titles=["MSE Loss", "Cross-Entropy Loss"],
    )
    palette = px.colors.qualitative.D3[:3]

    for col, model_ids, epochs, legend_name in columns:
        for idx, model_id in enumerate(model_ids):
            run_meta = metadata[metadata["model_id"] == model_id]
            run_out = output[output["model_id"] == model_id]
            step_size = run_meta["learning_rate"].iloc[0]
            beta = run_meta["momentum"].iloc[0]

            train_loss = run_out["train_loss"]
            sharpness = run_out["sharpness_H"]
            limit = 2 * (1 + beta) / step_size
            color = palette[idx]

            loss_trace = go.Scatter(
                x=epochs,
                y=train_loss,
                name=f"η = {step_size}",
                line={"width": 2.5},
                marker_color=color,
                legend=legend_name,
                showlegend=True,
            )
            fig.add_trace(loss_trace, row=1, col=col)

            sharpness_trace = go.Scatter(
                x=epochs,
                y=sharpness,
                name="Sharpness of H",
                mode="markers",
                showlegend=False,
                marker={"size": 5},
                marker_color=color,
            )
            fig.add_trace(sharpness_trace, row=2, col=col)

            fig.add_hline(y=limit, line_dash="dash", line_color=color,
                          row=2, col=col)

    mse_y_sharp_max = axis_top(model_ids_mse, 1.1)
    ce_y_sharp_max = axis_top(model_ids_ce, 1.2)

    y_axes = (
        (1, 1, "Training Loss", [0, 0.08]),
        (2, 1, "Sharpness", [10, mse_y_sharp_max]),
        (1, 2, "", [0, 2]),
        (2, 2, "", [0, ce_y_sharp_max]),
    )
    for row, col, label, bounds in y_axes:
        fig.update_yaxes(title_text=label, range=bounds, row=row, col=col)

    # Only the bottom row carries an x-axis title
    x_axes = ((1, 1, ""), (1, 2, ""), (2, 1, "Epoch"), (2, 2, "Epoch"))
    for row, col, label in x_axes:
        fig.update_xaxes(title_text=label, row=row, col=col)

    translucent_white = "rgba(255, 255, 255, 0.3)"
    fig.update_layout(
        height=400,
        width=800,
        title={"text": "FCNN with GD and Momentum on CIFAR-10", "x": 0.5},
        legend={"x": 0.29, "y": 0.99, "bgcolor": translucent_white},
        legend2={"x": 0.83, "y": 0.99, "bgcolor": translucent_white},
    )
    if save:
        fig.write_image("output/images/gd_mom_fcnn_cifar10.png",
                        width=800, height=400, scale=4)
    fig.show()

In [155]:
def plot_rmsprop_fcnn_data(metadata, output, model_ids_mse, model_ids_ce, save=True):
    """Plot training loss and effective-Hessian sharpness for the RMSProp runs.

    The figure is a 2x2 grid. The left column ("MSE Loss") shows the runs in
    ``model_ids_mse`` and the right column ("Cross-Entropy Loss") the runs in
    ``model_ids_ce``. Row 1 holds the ``train_loss`` curves, row 2 the
    ``sharpness_A`` markers plus, per run, a dashed horizontal line at
    ``2 / learning_rate``.

    Args:
        metadata: DataFrame with ``model_id`` and ``learning_rate`` columns
            (first matching row per model is used).
        output: DataFrame with ``model_id``, ``epoch``, ``train_loss`` and
            ``sharpness_A`` columns.
        model_ids_mse: Model ids drawn in the left column. Only three colours
            are taken from the palette, so at most three ids per list.
        model_ids_ce: Model ids drawn in the right column.
        save: If true, export the figure to
            ``output/images/rmsprop_fcnn_cifar10.png`` before showing it.
    """
    loss_recorded = output["train_loss"].notna()

    def epoch_axis(model_ids):
        # 0 .. max_epoch - 1, max_epoch being the largest epoch with a recorded loss
        rows = output[loss_recorded & (output["model_id"].isin(model_ids))]
        return np.arange(rows["epoch"].max())

    def learning_rate_of(model_id):
        return metadata[metadata["model_id"] == model_id]["learning_rate"].iloc[0]

    # (subplot column, model ids, shared x values, legend the loss curves belong to)
    columns = (
        (1, model_ids_mse, epoch_axis(model_ids_mse), "legend"),
        (2, model_ids_ce, epoch_axis(model_ids_ce), "legend2"),
    )

    fig = make_subplots(rows=2, cols=2,
                        vertical_spacing=0.1, shared_xaxes=True,
                        subplot_titles=["MSE Loss", "Cross-Entropy Loss"])
    colors = px.colors.qualitative.D3[:3]

    for col, model_ids, xs, legend_name in columns:
        for i, model_id in enumerate(model_ids):
            run_out = output[output["model_id"] == model_id]
            lr = learning_rate_of(model_id)

            losses = run_out["train_loss"]
            sharpness_A = run_out["sharpness_A"]
            sharpness_A_lim = 2 / lr

            fig.add_trace(
                go.Scatter(x=xs, y=losses, name=f"η = {lr}",
                           line=dict(width=2.5), marker_color=colors[i],
                           legend=legend_name,
                           showlegend=True),
                row=1, col=col
            )

            fig.add_trace(
                go.Scatter(x=xs, y=sharpness_A, name="Sharpness of Effective Hessian",
                           mode='markers', showlegend=False,
                           marker=dict(size=5), marker_color=colors[i]),
                row=2, col=col
            )

            fig.add_hline(y=sharpness_A_lim, line_dash="dash", line_color=colors[i],
                          row=2, col=col)

    # Axis tops track the same 2 / lr value as the dashed line of the last
    # listed run (no momentum term here), with 10% / 20% headroom above it.
    mse_y_sharp_max = 2 / learning_rate_of(model_ids_mse[-1]) * 1.1
    ce_y_sharp_max = 2 / learning_rate_of(model_ids_ce[-1]) * 1.2

    fig.update_yaxes(title_text="Training Loss",
                     range=[0.01, 0.08],
                     row=1, col=1)
    fig.update_yaxes(title_text="Sharpness",
                     range=[0, mse_y_sharp_max],
                     row=2, col=1)
    fig.update_yaxes(title_text="",
                     range=[0, 1.75],
                     row=1, col=2)
    fig.update_yaxes(title_text="",
                     range=[0, ce_y_sharp_max],
                     row=2, col=2)

    # Only the bottom row carries an x-axis title
    fig.update_xaxes(title_text="", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)

    fig.update_layout(height=400, width=800,
                      title=dict(text="FCNN with RMSProp on CIFAR-10", x=0.5),
                      legend=dict(x=0.29, y=0.99,
                                  bgcolor='rgba(255, 255, 255, 0.3)'),
                      legend2=dict(x=0.83, y=0.99,
                                   bgcolor='rgba(255, 255, 255, 0.3)')
                      )
    if save:
        # Dedicated file name so this export does not overwrite the
        # momentum figure (gd_mom_fcnn_cifar10.png).
        fig.write_image("output/images/rmsprop_fcnn_cifar10.png",
                        width=800, height=400, scale=4)
    fig.show()

In [146]:
# Model ids for the plain-GD figure: the first list is passed in the MSE slot,
# the second in the cross-entropy slot.
model_ids_mse, model_ids_ce = [1, 4, 7], [9, 10, 11]
plot_sgd_fcnn_data(md, out, model_ids_mse, model_ids_ce)



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




In [147]:
# Model ids for the GD-with-momentum figure; save=False shows the figure
# without exporting a PNG.
model_ids_mse, model_ids_ce = [14, 15, 16], [17, 21, 23]
plot_sgdm_fcnn_data(
    metadata=md,
    output=out,
    model_ids_mse=model_ids_mse,
    model_ids_ce=model_ids_ce,
    save=False,
)

In [157]:
# Model ids for the RMSProp figure; save=True also exports the figure to disk.
model_ids_mse, model_ids_ce = [30, 31, 32], [33, 34, 35]
plot_rmsprop_fcnn_data(
    metadata=md,
    output=out,
    model_ids_mse=model_ids_mse,
    model_ids_ce=model_ids_ce,
    save=True,
)



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




In [137]:
# Inspect a single run (model 33) on its own; presumably plot_output_data draws
# the per-model diagnostics from `md`/`out` — its definition is outside this section.
plot_output_data(md, out, model_id=33)

In [32]:
model_id = 14
plot_output_data(md, out, model_id=model_id)
loss = out[out['model_id']==model_id]['train_loss'].values

# True at step t when the loss went up between consecutive entries
# (loss[t+1] > loss[t]); one element shorter than `loss`.
# NOTE(review): the trace/axis are labelled "Stability" although True marks an
# increase in loss — confirm the intended reading of the label.
loss_increased = (loss[:-1] - loss[1:]) < 0

fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(loss)-1), y=loss_increased,
                        mode='lines', name='Stability',line=dict(width=.5)))
# update_layout returns the figure, so as the last expression it is also what the cell displays
fig.update_layout(
    xaxis_title='Epoch',
    yaxis_title='Stability',
    height=600,
    width=1200,
    showlegend=True
)

In [23]:
# Symmetric positive-definite matrix defining the quadratic 0.5 * p^T A p
A = np.array([[1, 1],
              [1, 8]])

def f(x, y):
    """Evaluate the quadratic 0.5 * p^T A p at the point p = (x, y)."""
    point = np.array([x, y])
    return 0.5 * point.T @ A @ point

def grad(x):
    """Gradient of the quadratic at ``x``: ``A @ x`` (A is symmetric)."""
    return A @ x

lambda_max = np.linalg.eigvalsh(A).max()

# One step size just below and one just above 2 / lambda_max
eta_conv = 1.8 / lambda_max
eta_div = 2.05 / lambda_max
steps = 20

# Record both gradient-descent trajectories from the same starting point
xs_conv = []
xs_div = []
x_conv = np.array([-2.5, 1.5])
x_div = np.array([-2.5, 1.5])

for _ in range(steps):
    xs_conv.append(x_conv.copy())
    xs_div.append(x_div.copy())
    x_conv = x_conv - eta_conv * grad(x_conv)
    x_div = x_div - eta_div * grad(x_div)

xs_conv = np.array(xs_conv)
xs_div = np.array(xs_div)

# Contour grid. Named grid_* so the notebook-level `X` (the training tensor
# created in the data-loading cell) is not rebound by this demo.
gx = np.linspace(-3, 3, 200)
gy = np.linspace(-3, 3, 200)
grid_x, grid_y = np.meshgrid(gx, gy)
grid_z = 0.5*(A[0,0]*grid_x**2 + 2*A[0,1]*grid_x*grid_y + A[1,1]*grid_y**2)

fig = make_subplots(rows = 1, cols = 2, horizontal_spacing=0.05,
                     subplot_titles=("η < 2 / λ_max", "η > 2 / λ_max"))

# Same contour background in both panels
contour_style = dict(
    x=gx, y=gy, z=grid_z,
    contours=dict(
        coloring="lines",
        showlabels=False
    ),
    line_width=1,
    colorscale="Viridis",
    showscale=False
)
fig.add_trace(go.Contour(**contour_style), row=1, col=1)
fig.add_trace(go.Contour(**contour_style), row=1, col=2)

# Same styling for both trajectories; only the path data differs
path_style = dict(
    mode="lines+markers",
    line=dict(width=2, color="red"),
    marker=dict(size=5, color="red"),
    name="GD Path"
)
fig.add_trace(go.Scatter(x=xs_conv[:,0], y=xs_conv[:,1], **path_style), row=1, col=1)
fig.add_trace(go.Scatter(x=xs_div[:,0], y=xs_div[:,1], **path_style), row=1, col=2)

fig.update_yaxes(showticklabels=True, ticks="", row=1, col=1)
fig.update_yaxes(showticklabels=False, ticks="", row=1, col=2)

fig.update_layout(
    title=dict(text="Gradient Descent on a Quadratic", x =0.5),
    xaxis1_title="x₁",
    yaxis1_title="x₂",
    xaxis2_title="x₁",
    width=600,
    height=300,
    showlegend=False,
    margin=dict(l=15, r=60, t=80, b=30)
)

fig.show()
fig.write_image("output/images/gd_quadratic.png",
                width = 600, height = 300, scale = 4)



Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).


