In [26]:
import os
os.environ["PYTHONHASHSEED"] = "42"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import pandas as pd
import numpy as np
import random
import time
import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.autograd.functional import hessian
import torch.nn as nn
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [27]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator=torch.Generator(device=device).manual_seed(SEED)

In [28]:
def get_cropped_images(X, y, n):
    """Returns images that can be cropped by n pixels on all sides without
    losing any information (all cropped pixels = 0).

    Args:
        X np.ndarray: Input images.
        y np.ndarray: Corresponding labels.
        n (int): Number of pixels to crop from each side.

    Returns:
        np.ndarray: Cropped images.
    """

    mask = np.zeros((X.shape[1], X.shape[2]), dtype=bool)
    mask[:n, :] = True
    mask[-n:, :] = True
    mask[:, :n] = True
    mask[:, -n:] = True

    border_pixels = X[:, mask]
    croppable_mask = (border_pixels.sum(axis=1) == 0)
    return X[croppable_mask, n:-n, n:-n], y[croppable_mask]

In [29]:
def sample_data(X, y, num_per_class):
    X = np.asarray(X)
    y = np.asarray(y)

    classes = np.unique(y)
    indices = []

    for c in classes:
        cls_idx = np.where(y == c)[0]
        chosen = np.random.choice(cls_idx, num_per_class, replace=False)
        indices.append(chosen)

    indices = np.concatenate(indices)
    return X[indices], y[indices]

# Load raw CIFAR-10 
train = datasets.CIFAR10(root="./data", train=True,  download=True)
test  = datasets.CIFAR10(root="./data", train=False, download=True)

# # Subsample
X, y  = sample_data(train.data, train.targets, 500)
X_test, y_test = sample_data(test.data, test.targets, 100)

# Convert to float and scale
X  = X.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Normalize
mean = X.mean(axis=(0,1,2),keepdims=True)
std = X.std(axis=(0,1,2),keepdims=True)
X = (X - mean) / std
X_test = (X_test - mean) / std

# Reshape to NCHW
X = np.transpose(X, (0, 3, 1, 2))
X_test = np.transpose(X_test, (0, 3, 1, 2))

# Convert to torch
X = torch.tensor(X, dtype=torch.float32, device=device)
y = torch.tensor(y, dtype=torch.long, device=device)
X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

# One hot encode labels for RMSE criterion
y_onehot = torch.nn.functional.one_hot(y, num_classes=10).float().to(device)
y_test_onehot = torch.nn.functional.one_hot(y_test, num_classes=10).float().to(device)


Files already downloaded and verified
Files already downloaded and verified


In [15]:
# Preprocessing

# Load MNIST dataset
train = datasets.MNIST(root="./data", train=True,  download=True)
test  = datasets.MNIST(root="./data", train=False, download=True)
X_full = torch.cat([train.data, test.data], dim=0).numpy()
y_full = torch.cat([train.targets, test.targets], dim=0).numpy()

# Crop images by n pixels on each side
X_cropped, y_cropped = get_cropped_images(X_full, y_full, 4)
y_cropped = pd.Series(y_cropped)

# Select most frequent classes
num_labels = 7
classes = y_cropped.value_counts().index[:num_labels]

# Create train/test split with 800 train / 200 test samples per class
X_train_list = []
y_train_list = []
X_test_list = []
y_test_list = []

for clss in classes:
    indices = y_cropped[y_cropped == clss].sample(1000, random_state=SEED).index
    X_train_list.append(X_cropped[indices[:800]])
    y_train_list.append(y_cropped[indices[:800]])
    X_test_list.append(X_cropped[indices[800:]])
    y_test_list.append(y_cropped[indices[800:]])

X = np.concatenate(X_train_list, axis=0)
y = pd.concat(y_train_list, axis=0).reset_index(drop=True)
X_test = np.concatenate(X_test_list, axis=0)
y_test = pd.concat(y_test_list, axis=0).reset_index(drop=True)

# 0 mean normalize
X_mean, X_std = X.mean(), X.std()
X = (X - X_mean) / X_std
X_test = (X_test - X_mean) / X_std

# Convert y from image label to class label (see classes variable)
class_map = {clss: idx for idx, clss in enumerate(classes)}
y = y.map(class_map).to_numpy()
y_test = y_test.map(class_map).to_numpy()

# Convert to torch
X = torch.tensor(X, dtype=torch.float32, device=device)
y = torch.tensor(y, dtype=torch.long, device=device)
X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test = torch.tensor(y_test, dtype=torch.long, device=device)

# One hot encode labels for RMSE criterion
y_onehot = torch.nn.functional.one_hot(y, num_classes=7).float().to(device)
y_test_onehot = torch.nn.functional.one_hot(y_test, num_classes=7).float().to(device)

In [30]:
class FullyConnectedNet(nn.Module):
    def __init__(self, input_size, num_hidden_layers, hidden_layer_size, 
                 num_labels, activation=nn.ReLU):
        super(FullyConnectedNet, self).__init__()

        self.input_size = input_size
        self.num_hidden_layers = num_hidden_layers
        self.hidden_layers_size = hidden_layer_size
        self.num_labels = num_labels
        self.activation = activation

        layers = [nn.Flatten()]

        in_size = input_size
        for _ in range(num_hidden_layers):
            layers += [nn.Linear(in_size, hidden_layer_size), activation()]
            in_size = hidden_layer_size

        layers.append(nn.Linear(in_size, num_labels))
        self.network = nn.Sequential(*layers)
        self.param_list = list(self.parameters())

        for m in self.network:
            if isinstance(m, nn.Linear):
                if activation == nn.ReLU:
                    nn.init.kaiming_normal_(m.weight, generator=torch.Generator().manual_seed(SEED), nonlinearity='relu')
                elif activation == nn.Tanh:
                    nn.init.xavier_uniform_(m.weight)
                else:
                    nn.init.kaiming_normal_(m.weight, generator=torch.Generator().manual_seed(SEED), nonlinearity='relu')
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.network(x)

In [31]:
def setup_output_files(output_dir="output"): 

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    metadata_path = os.path.join(output_dir, "metadata_rmsprop.csv")
    output_data_path = os.path.join(output_dir, "output_rmsprop.csv")

    if os.path.exists(metadata_path):
            metadata = pd.read_csv(metadata_path)
    else:
        metadata = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "model_type": pd.Series(dtype="str"),
            "num_hidden_layers": pd.Series(dtype="int"),
            "hidden_layers_size": pd.Series(dtype="int"),
            "activation_function": pd.Series(dtype="str"),
            "optimizer": pd.Series(dtype="str"),
            "learning_rate": pd.Series(dtype="float"),
            "num_epochs": pd.Series(dtype="int"),
            "time_minutes": pd.Series(dtype="float"),
        })

    if os.path.exists(output_data_path):
        output_data = pd.read_csv(output_data_path)
    else:
        output_data = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "epoch": pd.Series(dtype="int"),
            "train_loss": pd.Series(dtype="float"),
            "train_accuracy": pd.Series(dtype="float"),
            "test_accuracy": pd.Series(dtype="float"),
            "sharpness_H": pd.Series(dtype="float"),
            "sharpness_A": pd.Series(dtype="float"),
        })

    return metadata, output_data

def load_output_files(output_dir="output"):
    metadata_path = os.path.join(output_dir, "metadata_rmsprop.csv")
    output_data_path = os.path.join(output_dir, "output_rmsprop.csv")

    metadata = pd.read_csv(metadata_path)
    output_data = pd.read_csv(output_data_path)

    return metadata, output_data

def save_output_files(metadata, output_data, output_dir="output"):

    metadata_path = os.path.join(output_dir, "metadata_rmsprop.csv")
    output_data_path = os.path.join(output_dir, "output_rmsprop.csv")

    metadata.to_csv(metadata_path, index=False)
    output_data.to_csv(output_data_path, index=False)

def delete_model_data(model_ids, output_dir="output"):
    metadata, output_data = load_output_files(output_dir)
    metadata = metadata[~metadata['model_id'].isin(model_ids)]
    output_data = output_data[~output_data['model_id'].isin(model_ids)]
    save_output_files(metadata, output_data, output_dir)


In [32]:
def get_hessian_metrics(model, optimizer, criterion, X, y, 
                        subsample_dim = 1024, iters=30, tol = 1e-4):
    
    # Subsample data for compute efficiency
    subsample_dim = min(subsample_dim, len(X))
    idx = torch.randperm(len(X), device=X.device, generator=generator)[:subsample_dim]
    X = X[idx]
    y = y[idx]
    
    # Build graph for gradient
    outputs = model(X)
    loss = criterion(outputs, y)

    grads = torch.autograd.grad(
        loss, model.param_list,
        create_graph=True
    )
    g_flat = torch.cat([g.reshape(-1) for g in grads])
    dim    = g_flat.numel()
    device = g_flat.device

    # Computes Hessian-vector product with Pearlmutter trick
    def Hv(v):
        Hv_list = torch.autograd.grad(
            g_flat @ v,
            model.param_list,
            retain_graph=True
        )
        return torch.cat([h.reshape(-1) for h in Hv_list])
    
    # Performs power iteration to estimate largest eigenvalue
    def power_iteration(matvec):
        v = torch.randn(dim, device=device, generator=generator)
        v /= v.norm()

        eig_old = 0.0
        for _ in range(iters):
            Hv_v = matvec(v)
            eig = (v @ Hv_v).item()   
            v = Hv_v / Hv_v.norm()

            if abs(eig - eig_old) / (abs(eig_old) + 1e-12) < tol:
                break
            eig_old = eig

        Hv_v = matvec(v)
        eig = (v @ Hv_v).item()
        return eig

    lambda_H = power_iteration(Hv)
    
    if isinstance(optimizer, torch.optim.RMSprop):
        
        # Compute adaptive scaling matrix D (sqrt) for effective Hessian
        v_t = torch.cat([state['square_avg'].reshape(-1)
                        for state in optimizer.state.values()]
                        ).detach()
        eta = optimizer.param_groups[0]['lr']
        eps = optimizer.param_groups[0]['eps']
        D_sqrt = torch.sqrt(eta / torch.sqrt(v_t + eps))

        # Compute effective Hessian-vector product
        def Av(v):
            return D_sqrt * Hv(D_sqrt * v)
        
        lambda_A = power_iteration(Av)
    else:
        lambda_A = None

    return lambda_H, lambda_A

In [33]:
def train_model(model, optimizer, criterion, epochs, X, y, X_test, y_test):
    print(f"Training {model.__class__.__name__} with " +
          f"{optimizer.__class__.__name__} and learning rate " +
          f"{optimizer.param_groups[0]['lr']} for {epochs} epochs.")

    learning_rate = optimizer.param_groups[0]['lr']

    model.to(device)
    model.train()

    train_losses = np.full(epochs, np.nan)
    train_accuracies = np.full(epochs, np.nan)
    test_accuracies = np.full(epochs, np.nan)
    H_sharps = np.full(epochs, np.nan)
    A_sharps = np.full(epochs, np.nan)

    if isinstance(criterion, nn.MSELoss):
        y_loss = torch.nn.functional.one_hot(
            y, num_classes=model.num_labels).float().to(device)
       
    else:
        y_loss = y.to(device)

    start = time.time()
    
    train_acc = 0.0
    epoch = 0

    while train_acc < 0.99 and epoch < epochs :

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y_loss)
        loss.backward()
        optimizer.step()

        train_losses[epoch] = loss.item()

        if epoch % (epochs // 100) == 0:
            H_sharps[epoch], A_sharps[epoch] = get_hessian_metrics(
                model, optimizer, criterion, X, y_loss
            )

        with torch.no_grad():
            model.eval()
            train_preds = outputs.argmax(dim=1)
            test_preds = model(X_test).argmax(dim=1)
            train_acc = (train_preds == y).float().mean().item()
            test_acc = (test_preds == y_test).float().mean().item()
            train_accuracies[epoch] = train_acc
            test_accuracies[epoch] = test_acc
        model.train()

        if (epoch+1) % 100 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, " +
                  f"Time: {round(((time.time() - start) / 60), 2)}, " +
                  f"Train Acc: {train_accuracies[epoch]:.4f}, " +
                  f"Test Acc: {test_accuracies[epoch]:.4f}, ")
        epoch += 1

    metadata, output_data = setup_output_files("output")
    model_id = metadata.shape[0] + 1

    metadata.loc[metadata.shape[0]] ={
        "model_id": model_id,
        "model_type": model.__class__.__name__,
        "num_hidden_layers": model.num_hidden_layers,
        "hidden_layers_size": model.hidden_layers_size,
        "activation_function": model.activation.__name__,
        "optimizer": optimizer.__class__.__name__,
        "learning_rate": learning_rate,
        "num_epochs": epochs,
        "time_minutes": round((time.time() - start) / 60, 2),
    }

    output_data = pd.concat([output_data, pd.DataFrame({
        "model_id": np.ones_like(train_losses) * model_id,
        "epoch": np.arange(1, epochs + 1),
        "train_loss": train_losses,
        "sharpness_H": H_sharps.round(4),
        "sharpness_A": A_sharps.round(4),
        "test_accuracy": test_accuracies,
        "train_accuracy": train_accuracies,
    })], ignore_index=True)

    save_output_files(metadata, output_data)

In [None]:
input_size = X.shape[1] * X.shape[2] # 400
num_hidden_layers = 2
hidden_layer_size = 21
learning_rates = [0.01, 0.003, 0.001, 0.0003, 0.0001]
epochs = 5000

for learning_rate in learning_rates:

    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=num_labels,
        activation=nn.Tanh
    )

    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    train_model(model, optimizer, criterion, epochs, X, y, X_test, y_test)


NameError: name 'num_labels' is not defined

In [9]:
input_size = X.shape[1] * X.shape[2] # 400
num_hidden_layers = 2
hidden_layer_size = 21
learning_rates = [0.01, 0.003, 0.001, 0.0003, 0.0001]
epochs = 5000

for learning_rate in learning_rates:

    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=num_labels,
        activation=nn.Tanh
    )

    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

    criterion = nn.MSELoss()

    train_model(model, optimizer, criterion, epochs, X, y, X_test, y_test)

Epoch [20/5000], Loss: 0.0581, Time: 0.08
Epoch [40/5000], Loss: 0.0403, Time: 0.17
Epoch [60/5000], Loss: 0.0356, Time: 0.25
Epoch [80/5000], Loss: 0.0313, Time: 0.33
Epoch [100/5000], Loss: 0.0255, Time: 0.41
Epoch [120/5000], Loss: 0.0193, Time: 0.49
Epoch [140/5000], Loss: 0.0217, Time: 0.57
Epoch [160/5000], Loss: 0.0175, Time: 0.66
Epoch [180/5000], Loss: 0.0183, Time: 0.74
Epoch [200/5000], Loss: 0.0178, Time: 0.82
Epoch [220/5000], Loss: 0.0158, Time: 0.9
Epoch [240/5000], Loss: 0.0126, Time: 0.98
Epoch [260/5000], Loss: 0.0130, Time: 1.07
Epoch [280/5000], Loss: 0.0111, Time: 1.15
Epoch [300/5000], Loss: 0.0125, Time: 1.23
Epoch [320/5000], Loss: 0.0137, Time: 1.31
Epoch [340/5000], Loss: 0.0088, Time: 1.39
Epoch [360/5000], Loss: 0.0105, Time: 1.47
Epoch [380/5000], Loss: 0.0084, Time: 1.56
Epoch [400/5000], Loss: 0.0171, Time: 1.64
Epoch [420/5000], Loss: 0.0093, Time: 1.72
Epoch [440/5000], Loss: 0.0079, Time: 1.8
Epoch [460/5000], Loss: 0.0114, Time: 1.87
Epoch [480/5000],

In [38]:
input_size = X.shape[1] * X.shape[2] * X.shape[3] # 400
num_hidden_layers = 2
hidden_layer_size = 200
learning_rates = [0.12, 0.1, 0.07, 0.06, 0.05, 0.04, 0.032]
epochs = 20000

for learning_rate in learning_rates:

    model = FullyConnectedNet(
        input_size=input_size,
        num_hidden_layers=num_hidden_layers,
        hidden_layer_size=hidden_layer_size,
        num_labels=10,
        activation=nn.Tanh
    )

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    criterion = nn.MSELoss()

    train_model(model, optimizer, criterion, epochs, X, y, X_test, y_test)

Training FullyConnectedNet with SGD and learning rate 0.12 for 20000 epochs.
Epoch [100/20000], Loss: 0.0929, Time: 0.01, Train Acc: 0.3288, Test Acc: 0.2390, 
Epoch [200/20000], Loss: 0.0803, Time: 0.02, Train Acc: 0.4388, Test Acc: 0.2650, 
Epoch [300/20000], Loss: 0.0736, Time: 0.03, Train Acc: 0.5202, Test Acc: 0.2670, 
Epoch [400/20000], Loss: 0.0689, Time: 0.03, Train Acc: 0.5874, Test Acc: 0.2770, 
Epoch [500/20000], Loss: 0.0652, Time: 0.04, Train Acc: 0.6432, Test Acc: 0.2820, 
Epoch [600/20000], Loss: 0.0621, Time: 0.05, Train Acc: 0.6830, Test Acc: 0.2860, 
Epoch [700/20000], Loss: 0.0594, Time: 0.06, Train Acc: 0.7186, Test Acc: 0.2880, 
Epoch [800/20000], Loss: 0.0570, Time: 0.07, Train Acc: 0.7532, Test Acc: 0.2970, 
Epoch [900/20000], Loss: 0.0547, Time: 0.07, Train Acc: 0.7790, Test Acc: 0.2960, 
Epoch [1000/20000], Loss: 0.0527, Time: 0.08, Train Acc: 0.7972, Test Acc: 0.2950, 
Epoch [1100/20000], Loss: 0.0507, Time: 0.09, Train Acc: 0.8140, Test Acc: 0.2960, 
Epoch [1

In [39]:

md, out = load_output_files()
md


Unnamed: 0,model_id,model_type,num_hidden_layers,hidden_layers_size,activation_function,optimizer,learning_rate,num_epochs,time_minutes
0,1,FullyConnectedNet,2,200,Tanh,SGD,0.12,20000,0.42
1,2,FullyConnectedNet,2,200,Tanh,SGD,0.1,20000,0.62
2,3,FullyConnectedNet,2,200,Tanh,SGD,0.07,20000,0.74
3,4,FullyConnectedNet,2,200,Tanh,SGD,0.06,20000,0.79
4,5,FullyConnectedNet,2,200,Tanh,SGD,0.05,20000,0.89
5,6,FullyConnectedNet,2,200,Tanh,SGD,0.04,20000,0.97
6,7,FullyConnectedNet,2,200,Tanh,SGD,0.032,20000,1.07


In [209]:
def plot_output_data(metadata, output, model_id):
    metadata = metadata[metadata['model_id']==model_id]
    output = output[output['model_id']==model_id]
    
    xs = np.arange(metadata['num_epochs'].iloc[0])
    losses = output['train_loss']
    sharpness_H = output['sharpness_H']
    sharpness_A = output['sharpness_A']
    train_accuracy = output['train_accuracy']
    test_accuracy = output['test_accuracy']
    sharpness_H_lim = 2 / metadata['learning_rate'].iloc[0]

    fig = make_subplots(rows = 2, cols = 1, 
                        specs=[[{"secondary_y": True}],
                               [{"secondary_y": True}]],
                        vertical_spacing=0.1)
    
    fig.add_trace(
        go.Scatter(x=xs, y=losses, name="Training Loss",line=dict(width=2)),
        secondary_y=False, row=1, col=1
    )

    fig.add_trace(
        go.Scatter(x=xs, y=sharpness_H, name="Max Eigenvalue of H", mode='markers', line=dict(width=2)),
        secondary_y=True, row=1, col=1
    )

    # fig.add_trace(
    #     go.Scatter(x=xs, y=sharpness_A, name="Max Eigenvalue of A", line=dict(width=2)),
    #     secondary_y=True, row=1, col=1
    # )

    fig.add_trace(
        go.Scatter(x=xs, y=test_accuracy, name="Test Accuracy", line=dict(width=2)),
        secondary_y=False, row=2, col=1
    )

    fig.add_hline(y=sharpness_H_lim, line_dash="dash", line_color="black", 
                  row=1, col=1, secondary_y=True)

    

    fig.update_yaxes(title_text="Training Loss", secondary_y=False, 
                     range = [0,0.1], showgrid=False,
                     row=1, col=1)
    fig.update_yaxes(title_text="Max Eigenvalue of A", secondary_y=True, 
                     range = [0, sharpness_H_lim*1.5],
                     row=1, col=1)
    
    fig.update_xaxes(title_text="epoch")
    fig.update_layout(height = 1000, width = 1000)
    
    fig.show()

In [93]:
def plot_mse_data(metadata, output, model_ids):

    max_epoch = output[output["train_loss"].notna()]["epoch"].max()
    xs = np.arange(max_epoch)

    fig = make_subplots(rows = 2, cols = 1, 
                        vertical_spacing=0.1, shared_xaxes=True )
    colors = px.colors.qualitative.D3[:3]

    for i, model_id in enumerate(model_ids):
        md = metadata[metadata['model_id']==model_id]
        out = output[output['model_id']==model_id]
        lr = md['learning_rate'].iloc[0]
        
        losses = out['train_loss']
        sharpness_H = out['sharpness_H']
    
        sharpness_H_lim = 2 / lr
        
        fig.add_trace(
            go.Scatter(x=xs, y=losses, name= f"η = {lr}",
                       line=dict(width=2.5), marker_color=colors[i]), 
            row=1, col=1
        )

        fig.add_trace(
            go.Scatter(x=xs, y=sharpness_H, name= "Sharpness of H", 
                       mode='markers', showlegend=False,
                       marker=dict(size=5), marker_color=colors[i]),
            row=2, col=1
        )

        fig.add_hline(y=sharpness_H_lim, line_dash="dash", line_color=colors[i], 
                        row=2, col=1)
        
    fig.update_yaxes(title_text="Training Loss",
                    range = [0,0.08],
                    row=1, col=1)
    fig.update_yaxes(title_text="Sharpness",
                    range = [0, sharpness_H_lim*1.2],
                    row=2, col=1)
    
    fig.update_xaxes(title_text="Epoch")
    fig.update_layout(height = 600, width = 600, 
                      title = dict(text="FCNN on CIFAR-10 with MSE Loss", x = 0.5),
                      legend=dict( x=0.75, bgcolor='rgba(255, 255, 255, 0.6)')
                    )
        
    fig.show()

    print(max_epoch)

In [94]:
model_ids = [1,3,6]

plot_mse_data(md, out, model_ids)

12869


In [None]:
loss = out[out['model_id']==model_id]['train_loss'].values
((loss[:-1] - loss[1:]) < 0)

fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(loss)-1), y=((loss[:-1] - loss[1:]) < 0),
                        mode='lines', name='Stability',line=dict(width=.5)))
fig.update_layout(
    xaxis_title='Epoch',
    yaxis_title='Stability',
    height=600,
    width=1200,
    showlegend=True
)

In [30]:
model.eval()
with torch.no_grad():
    outputs = model(X_test.to(device))
    preds = outputs.argmax(dim=1)
    correct = (preds == y_test.to(device)).sum().item()
    accuracy = correct / len(y_test)

print(f"Test accuracy: {accuracy:.4f}")

NameError: name 'model' is not defined