In [1]:
import torch
import torch.nn as nn
import torchvision as tv
import torchvision.transforms as transforms
from torchvision import datasets
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import random
import time
import os

In [2]:
SAMPLE_SIZE = 5000
NUM_LABELS = 10

In [3]:
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Load data
dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.ToTensor())

# Normalize images
all_images = torch.stack([dataset[i][0] for i in range(len(dataset))])
all_labels = torch.tensor([dataset[i][1] for i in range(len(dataset))])

cifar10_mean = np.array(all_images.mean(dim=(0, 2, 3)))
cifar10_std = np.array(all_images.std(dim=(0, 2, 3)))

mean_tensor = torch.tensor(cifar10_mean).view(1, 3, 1, 1)
std_tensor = torch.tensor(cifar10_std).view(1, 3, 1, 1)

normalized_images = (all_images - mean_tensor) / std_tensor

# Subset first 5k samples
images = normalized_images[:SAMPLE_SIZE]  # Use normalized images
subset_labels = all_labels[:SAMPLE_SIZE]

# Convert labels to one-hot encoding for MSE loss (CIFAR-10 has 10 classes)
labels = torch.zeros(subset_labels.size(0), NUM_LABELS, device=subset_labels.device)
labels.scatter_(1, subset_labels.unsqueeze(1), 1)

Files already downloaded and verified


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [5]:
class FullyConnectedNet(nn.Module):
    def __init__(self, input_size, hidden_layer_1_size, hidden_layer_2_size, num_labels):
        super(FullyConnectedNet, self).__init__()
        self.network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_size, hidden_layer_1_size, bias=True),
            nn.Tanh(),
            nn.Linear(hidden_layer_1_size, hidden_layer_2_size, bias=True),
            nn.Tanh(),
            nn.Linear(hidden_layer_2_size, num_labels, bias=True)
        )
    
    def forward(self, x):
        return self.network(x)

In [6]:
def compute_sharpness(model, criterion, images, labels,
                      iters: int = 20, tol: float = 1e-3,
                      subsample: int | None = 1024, damping: float = 0.0) -> float:
    """
    Estimates λ_max(H) (sharpness) of the loss at current model parameters via
    power iteration with Hessian–vector products (Pearlmutter trick).

    Args
    ----
    model      : nn.Module (params require_grad=True)
    criterion  : callable(logits, targets) -> scalar loss (mean reduction)
    images     : tensor [N, ...] on correct device
    labels     : tensor [N] on correct device
    iters      : power-iteration steps (15–25 typical)
    tol        : relative convergence tolerance
    subsample  : if not None, randomly sample this many examples for speed
    damping    : computes eigenvalues of (H + damping * I)

    Returns
    -------
    float: estimated largest eigenvalue (sharpness)
    """
    was_training = model.training
    model.eval()  # stabilize stats (esp. BN/Dropout)

    # ---- choose subset (for speed/memory) ----
    if subsample is not None and images.size(0) > subsample:
        idx = torch.randperm(images.size(0), device=images.device)[:subsample]
        xb, yb = images[idx], labels[idx]
    else:
        xb, yb = images, labels

    params = [p for p in model.parameters() if p.requires_grad]
    n = sum(p.numel() for p in params)
    if n == 0:
        if was_training: model.train()
        return 0.0

    # ---- forward with graph for second-order autodiff ----
    # Important: no torch.no_grad() here
    model.zero_grad(set_to_none=True)
    logits = model(xb)
    loss = criterion(logits, yb)

    # ∇ℓ with graph so we can differentiate it again
    grads = torch.autograd.grad(loss, params, create_graph=True, retain_graph=True)
    g_flat = torch.cat([gi.reshape(-1) for gi in grads])

    # init v ~ unit vector
    with torch.no_grad():
        v = torch.randn(n, device=g_flat.device)
        v /= (v.norm() + 1e-12)

    lam_prev = None
    for _ in range(iters):
        # H v = ∇[(∇ℓ)·v]
        gv = (g_flat * v).sum()
        Hv_parts = torch.autograd.grad(gv, params, retain_graph=True)
        Hv = torch.cat([h.reshape(-1) for h in Hv_parts])
        if damping != 0.0:
            Hv = Hv + damping * v

        with torch.no_grad():
            Hv_norm = Hv.norm()
            if Hv_norm == 0 or torch.isnan(Hv_norm):
                lam = 0.0
                break
            v = Hv / (Hv_norm + 1e-12)
            lam = torch.dot(v, Hv).item()

            if lam_prev is not None:
                if abs(lam - lam_prev) / (abs(lam_prev) + 1e-12) < tol:
                    break
            lam_prev = lam

    # cleanup and restore mode
    del grads, g_flat, logits, loss
    if was_training: model.train()
    return float(lam_prev if lam_prev is not None else lam)

In [7]:
def setup_output_files(output_dir="output"): 

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    metadata_path = os.path.join(output_dir, "metadata.csv")
    output_data_path = os.path.join(output_dir, "output.csv")

    if os.path.exists(metadata_path):
            metadata = pd.read_csv(metadata_path)
    else:
        metadata = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "model_type": pd.Series(dtype="str"),
            "optimizer": pd.Series(dtype="str"),
            "learning_rate": pd.Series(dtype="float"),
            "num_epochs": pd.Series(dtype="int"),
            "train_time": pd.Series(dtype="float")
        })

    if os.path.exists(output_data_path):
        output_data = pd.read_csv(output_data_path)
    else:
        output_data = pd.DataFrame({
            "model_id": pd.Series(dtype="int"),
            "loss": pd.Series(dtype="float"),
            "sharpness": pd.Series(dtype="float")
        })

    return metadata, output_data

In [8]:
def save_output_files(metadata, output_data, output_dir="output"):

    metadata_path = os.path.join(output_dir, "metadata.csv")
    output_data_path = os.path.join(output_dir, "output.csv")
    
    metadata.to_csv(metadata_path, index=False)
    output_data.to_csv(output_data_path, index=False)

In [9]:
def train_model(model, optimizer, criterion, learning_rate, num_epochs, images, labels, num_sharpness_computations=100):
    
    optimizer.param_groups[0]['lr'] = learning_rate

    model = model.to(device)
    images = images.to(device)
    labels = labels.to(device)

    train_losses = np.empty(num_epochs)
    sharps = np.full(num_epochs, np.nan)

    model.train()

    print(f"Model: {model.__class__.__name__}")
    print(f"Optimizer: {optimizer.__class__.__name__}")
    print(f"Learning Rate: {learning_rate}")
    print(f"Number of Epochs: {num_epochs}")

    start_time = time.time()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_losses[epoch] = loss.item()
        
        if (epoch + 1) % (num_epochs // num_sharpness_computations) == 0 or epoch == 0:
            sharpness = compute_sharpness(model, criterion, images, labels, iters=10, 
                                    subsample=256)
            sharps[epoch] = sharpness
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Sharpness: {sharpness}")
            
    end_time = time.time()

    metadata, output_data, = setup_output_files("output")
    model_id = metadata.shape[0] + 1
    
    metadata.loc[metadata.shape[0]] ={
        "model_id": model_id,
        "model_type": model.__class__.__name__,
        "optimizer": optimizer.__class__.__name__,
        "learning_rate": learning_rate,
        "num_epochs": num_epochs,
        "train_time": end_time - start_time,
    }

    output_data = pd.concat([output_data, pd.DataFrame({
        "model_id": np.ones_like(train_losses) * model_id,
        "loss": train_losses,
        "sharpness": sharps
    })], ignore_index=True)

    save_output_files(metadata, output_data, output_dir="output")
    print("")

In [10]:
input_size = np.prod(all_images.shape[1:])
hidden_layer_1_size = 200
hidden_layer_2_size = 200
model = FullyConnectedNet(input_size, hidden_layer_1_size, hidden_layer_2_size, NUM_LABELS)

learning_rates = [2/20, 2/50, 2/80, 2/110]
num_epochs = 50
    
for lr in learning_rates:
    train_model(model=model, 
                optimizer=torch.optim.SGD(model.parameters()), 
                criterion=nn.MSELoss(), 
                num_epochs=num_epochs,
                learning_rate=lr,
                images=images, 
                labels=labels,
                num_sharpness_computations=10
    )

Model: FullyConnectedNet
Optimizer: SGD
Learning Rate: 0.1
Number of Epochs: 50
Epoch [1/50], Loss: 0.1233, Sharpness: 16.111536026000977
Epoch [5/50], Loss: 0.1006, Sharpness: 17.81403923034668
Epoch [10/50], Loss: 0.0945, Sharpness: 17.709705352783203
Epoch [15/50], Loss: 0.0909, Sharpness: 17.921512603759766
Epoch [20/50], Loss: 0.0885, Sharpness: 18.23446273803711
Epoch [25/50], Loss: 0.0867, Sharpness: 19.218605041503906
Epoch [30/50], Loss: 0.0853, Sharpness: 14.314674377441406
Epoch [35/50], Loss: 0.0843, Sharpness: 16.506935119628906
Epoch [40/50], Loss: 0.0834, Sharpness: 15.81524658203125
Epoch [45/50], Loss: 0.0826, Sharpness: 14.947366714477539
Epoch [50/50], Loss: 0.0820, Sharpness: 15.68829345703125

Model: FullyConnectedNet
Optimizer: SGD
Learning Rate: 0.04
Number of Epochs: 50
Epoch [1/50], Loss: 0.0819, Sharpness: 16.21432876586914
Epoch [5/50], Loss: 0.0817, Sharpness: 14.808934211730957
Epoch [10/50], Loss: 0.0815, Sharpness: 16.29054069519043
Epoch [15/50], Loss: 0

In [11]:
images = images.to(device)
labels = labels.to(device)
model = FullyConnectedNet().to(device)

criterion = nn.MSELoss()
learning_rate = 0.1
num_epochs = 5000
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

train_losses = np.empty(5000)
sharps = np.empty(5000)

model.train()
start_time = time.time()
for epoch in range(num_epochs):

    optimizer.zero_grad()
    outputs = model(images)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    
    train_losses.append(loss.item())
    
    if (epoch + 1) % 20 == 0 or epoch == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.6f}")
        sharpness = compute_sharpness(model, criterion, images, labels, iters=10, 
                                      subsample=256)
        sharps.append(sharpness)

end_time = time.time()
print(f"\nTraining completed in {end_time - start_time:.2f} seconds")
print(f"Final loss: {train_losses[-1]:.6f}")

TypeError: FullyConnectedNet.__init__() missing 4 required positional arguments: 'input_size', 'hidden_layer_1_size', 'hidden_layer_2_size', and 'num_labels'

In [None]:
sharpness_epochs = [20 * i for i in range(len(sharps))]

plt.figure(figsize=(10, 6))

# Create main plot with loss on left y-axis
ax1 = plt.gca()
line1 = ax1.plot(range(1, len(train_losses) + 1), train_losses, 'b-', linewidth=2, alpha=0.7, label='Training Loss')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('MSE Loss', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
ax1.grid(True, alpha=0.3)

# Create second y-axis for sharpness
ax2 = ax1.twinx()
line2 = ax2.plot(sharpness_epochs, sharps, 'ro', linewidth=2, markersize=4, label='Sharpness')
ax2.set_ylabel('Sharpness (Largest Eigenvalue)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Set y-axis to start at 0 for sharpness with 20% padding at top
threshold = 2 / learning_rate
ax2.set_ylim(bottom=0, top=threshold * 1.5)

# Add horizontal line at 2/lr (theoretical threshold) on sharpness axis

line3 = ax2.axhline(y=threshold, color='k', linestyle='--', linewidth=1.5, alpha=0.7, 
                    label=f'2/η = {threshold:.1f}')

plt.title('Training Loss and Sharpness Over Time')

# Combine legends from both axes
lines = line1 + line2 + [line3]
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, loc='upper right')

plt.tight_layout()
plt.show()

# Print some statistics (using filtered data)
print(f"\nSharpness Statistics (excluding first 2 measurements):")
print(f"Initial sharpness (3rd measurement): {sharps[0]:.4f}")
print(f"Final sharpness: {sharps[-1]:.4f}")
print(f"Maximum sharpness: {max(sharps):.4f}")
print(f"Minimum sharpness: {min(sharps):.4f}")
print(f"Average sharpness: {np.mean(sharps):.4f}")

In [None]:
# Import plotly for interactive plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Create interactive Plotly version of the training plot
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add training loss line (primary y-axis)
fig.add_trace(
    go.Scatter(
        x=list(range(1, len(train_losses) + 1)),
        y=train_losses,
        mode='lines',
        name='Training Loss',
        line=dict(color='blue', width=3),
        opacity=0.7,
        hovertemplate='<b>Epoch:</b> %{x}<br><b>Training Loss:</b> %{y:.6f}<extra></extra>'
    ),
    secondary_y=False,
)

# Add sharpness points (secondary y-axis)
fig.add_trace(
    go.Scatter(
        x=sharpness_epochs,
        y=sharps,
        mode='markers+lines',
        name='Sharpness',
        line=dict(color='red', width=2),
        marker=dict(color='red', size=8),
        hovertemplate='<b>Epoch:</b> %{x}<br><b>Sharpness:</b> %{y:.6f}<extra></extra>'
    ),
    secondary_y=True,
)

# Add theoretical threshold line (secondary y-axis)
threshold = 2 / learning_rate
fig.add_trace(
    go.Scatter(
        x=[1, len(train_losses)],
        y=[threshold, threshold],
        mode='lines',
        name=f'2/η = {threshold:.1f}',
        line=dict(color='black', width=2, dash='dash'),
        opacity=0.7,
        hovertemplate='<b>Theoretical Threshold:</b> %{y:.1f}<extra></extra>'
    ),
    secondary_y=True,
)

# Update layout and axes
fig.update_layout(
    title={
        'text': 'Training Loss and Sharpness Over Time (Interactive)',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'size': 16}
    },
    xaxis_title='Epoch',
    width=900,
    height=600,
    hovermode='x unified',
    legend=dict(
        orientation="v",
        yanchor="top",
        y=1,
        xanchor="left",
        x=1.02
    ),
    margin=dict(r=150)  # Add right margin for legend
)

# Set x-axis properties
fig.update_xaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
    zeroline=False
)

# Set y-axes properties
fig.update_yaxes(
    title_text="MSE Loss", 
    secondary_y=False,
    color='blue',
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray'
)

fig.update_yaxes(
    title_text="Sharpness (Largest Eigenvalue)", 
    secondary_y=True,
    color='red',
    range=[0, threshold * 1.5],  # Match the matplotlib version
    showgrid=False
)

# Show the plot
fig.show()

# Print the same statistics as before
print(f"\nSharpness Statistics:")
print(f"Initial sharpness: {sharps[0]:.4f}")
print(f"Final sharpness: {sharps[-1]:.4f}")
print(f"Maximum sharpness: {max(sharps):.4f}")
print(f"Minimum sharpness: {min(sharps):.4f}")
print(f"Average sharpness: {np.mean(sharps):.4f}")
print(f"Theoretical threshold (2/η): {threshold:.1f}")