In [1]:
import sys
from pathlib import Path

# Make the repository root (the parent of the working directory) importable so
# that the `src` package can be imported further down in this cell.
project_root = Path("..").resolve()
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

import src.seed as seed
import src.models as models
import src.functions as fn

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import time
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Re-export the device and random generator configured in src.seed under
# short notebook-level names.
device, generator = seed.device, seed.generator

In [2]:
# Load CIFAR-10 through the project helper. The four results are later passed,
# in this order, to fn.train_model; X is indexed as a 4-D array when the model
# input size is computed (shape[1] * shape[2] * shape[3]).
X, y, X_test, y_test = fn.load_cifar_10()

  entry = pickle.load(f, encoding="latin1")


In [36]:
# Learning-rate sweep: for every Adam step size below, build a new tanh
# FullyConnectedNet and hand it to fn.train_model, which receives the same
# output directory for every run.
output_dir = "eos/adam_MJ"

# Architecture. The input width is the product of dimensions 1-3 of X.
input_size = X.shape[1] * X.shape[2] * X.shape[3]
num_hidden_layers = 2
hidden_layer_size = 200

# Optimisation settings: Adam on an MSE loss.
epochs = 4000
learning_rates = [3e-3, 1e-3, 3e-4, 1e-4, 3e-5]
# NOTE(review): forwarded positionally to fn.train_model after `epochs`.
# 1.1 is above any reachable accuracy, so this presumably disables an
# accuracy-based stopping rule - confirm against fn.train_model.
accuracy = 1.1

# Constructor arguments shared by every model in the sweep.
model_config = dict(
    input_size=input_size,
    num_hidden_layers=num_hidden_layers,
    hidden_layer_size=hidden_layer_size,
    num_labels=10,
    activation=nn.Tanh,
)

for learning_rate in learning_rates:
    # Each step size gets its own model, loss object and optimizer.
    model = models.FullyConnectedNet(**model_config)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    fn.train_model(
        model, optimizer, criterion, epochs, accuracy,
        X, y, X_test, y_test, output_dir,
    )

Training FullyConnectedNet with Adam and learning rate 0.003 for 4000 epochs.
Epoch [1000/4000], Loss: 0.0166, Time: 0.13, Train Acc: 0.9644, Test Acc: 0.2410, 
Epoch [2000/4000], Loss: 0.0090, Time: 0.26, Train Acc: 0.9886, Test Acc: 0.2250, 
Epoch [3000/4000], Loss: 0.0050, Time: 0.39, Train Acc: 0.9946, Test Acc: 0.2170, 
Epoch [4000/4000], Loss: 0.0034, Time: 0.52, Train Acc: 0.9966, Test Acc: 0.2130, 
Training FullyConnectedNet with Adam and learning rate 0.001 for 4000 epochs.



The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



Epoch [1000/4000], Loss: 0.0080, Time: 0.13, Train Acc: 0.9928, Test Acc: 0.2300, 
Epoch [2000/4000], Loss: 0.0044, Time: 0.26, Train Acc: 0.9994, Test Acc: 0.2300, 
Epoch [3000/4000], Loss: 0.0021, Time: 0.39, Train Acc: 1.0000, Test Acc: 0.2270, 
Epoch [4000/4000], Loss: 0.0013, Time: 0.52, Train Acc: 1.0000, Test Acc: 0.2330, 
Training FullyConnectedNet with Adam and learning rate 0.0003 for 4000 epochs.
Epoch [1000/4000], Loss: 0.0065, Time: 0.12, Train Acc: 0.9974, Test Acc: 0.2170, 
Epoch [2000/4000], Loss: 0.0034, Time: 0.25, Train Acc: 0.9996, Test Acc: 0.2030, 
Epoch [3000/4000], Loss: 0.0008, Time: 0.38, Train Acc: 1.0000, Test Acc: 0.2070, 
Epoch [4000/4000], Loss: 0.0005, Time: 0.52, Train Acc: 1.0000, Test Acc: 0.2080, 
Training FullyConnectedNet with Adam and learning rate 0.0001 for 4000 epochs.
Epoch [1000/4000], Loss: 0.0145, Time: 0.12, Train Acc: 0.9948, Test Acc: 0.2510, 
Epoch [2000/4000], Loss: 0.0020, Time: 0.25, Train Acc: 0.9998, Test Acc: 0.2360, 
Epoch [3000/

In [35]:
# NOTE(review): this cell is placed after the training cell, but its execution
# count (35) is lower than the training cell's (36), i.e. it was last run
# BEFORE training. Presumably fn.delete_model_data removes stored results for
# model ids 0-9 in output_dir - if so, a top-to-bottom "Run All" would discard
# the runs that were just trained. Confirm, and consider moving this cell
# above the training cell.
output_dir = "eos/adam_MJ"
fn.delete_model_data(range(10),output_dir=output_dir)

In [2]:
# Read the stored results for this experiment directory back into memory.
# `md` and `out` are the two objects later passed to plot_output_data as its
# `metadata` and `output` arguments (both are filtered there by 'model_id').
output_dir = "eos/adam_MJ"
md, out = fn.load_output_files(output_dir)

In [3]:
def plot_output_data(metadata, output, model_id, beta1=0.9):
    """Plot loss, sharpness and test accuracy for one recorded training run.

    The figure has two stacked subplots sharing the x-axis:

    * Row 1: training loss on the left axis; the two sharpness series
      (``sharpness_H`` and ``sharpness_A``) as markers on the right axis,
      together with a dashed horizontal line at
      ``2 * (1 + beta1) / ((1 - beta1) * learning_rate)``.
    * Row 2: test accuracy.

    Args:
        metadata: DataFrame with one row per run. The columns ``model_id``,
            ``num_epochs`` and ``learning_rate`` are read; ``optimizer`` is
            used for the figure title when the column exists.
        output: DataFrame of per-epoch records. The columns ``model_id``,
            ``train_loss``, ``sharpness_H``, ``sharpness_A`` and
            ``test_accuracy`` are read.
        model_id: Value of ``model_id`` selecting the run to plot.
        beta1: Momentum coefficient used in the threshold line. Defaults to
            0.9, the value that used to be hard-coded here.

    Raises:
        ValueError: If ``metadata`` has no row for ``model_id``.
    """
    run_meta = metadata[metadata['model_id'] == model_id]
    if run_meta.empty:
        # Without this guard the .iloc[0] lookups below fail with an opaque
        # "single positional indexer is out-of-bounds" IndexError.
        raise ValueError(f"No metadata row found for model_id={model_id!r}")
    run_output = output[output['model_id'] == model_id]

    learning_rate = run_meta['learning_rate'].iloc[0]
    # Label the figure with the optimizer recorded for this run instead of a
    # fixed name, so the title cannot disagree with the data being shown.
    if 'optimizer' in run_meta.columns:
        optimizer_name = run_meta['optimizer'].iloc[0]
    else:
        optimizer_name = "optimizer"

    epochs_axis = np.arange(run_meta['num_epochs'].iloc[0])
    sharpness_limit = 2 * (1 + beta1) / ((1 - beta1) * learning_rate)
    # Only epochs with a recorded (non-NaN) training loss are shown.
    recorded_epochs = run_output['train_loss'].notna().sum()

    fig = make_subplots(
        rows=2, cols=1,
        specs=[[{"secondary_y": True}],
               [{"secondary_y": True}]],
        shared_xaxes=True,
        vertical_spacing=0.1,
    )

    # Row 1, left axis: training loss.
    fig.add_trace(
        go.Scatter(x=epochs_axis, y=run_output['train_loss'],
                   name="Training Loss", line=dict(width=2)),
        secondary_y=False, row=1, col=1
    )

    # Row 1, right axis: both sharpness series as markers.
    fig.add_trace(
        go.Scatter(x=epochs_axis, y=run_output['sharpness_H'],
                   name="Sharpness of Hessian", mode='markers', line=dict(width=2)),
        secondary_y=True, row=1, col=1
    )
    fig.add_trace(
        go.Scatter(x=epochs_axis, y=run_output['sharpness_A'],
                   name="Sharpness of Effective Hessian", mode='markers', line=dict(width=2)),
        secondary_y=True, row=1, col=1
    )

    # Row 2: test accuracy.
    fig.add_trace(
        go.Scatter(x=epochs_axis, y=run_output['test_accuracy'],
                   name="Test Accuracy", line=dict(width=2)),
        secondary_y=False, row=2, col=1
    )

    # Dashed threshold line on the sharpness axis.
    fig.add_hline(y=sharpness_limit, line_dash="dash", line_color="black",
                  row=1, col=1, secondary_y=True)

    fig.update_yaxes(title_text="Training Loss", secondary_y=False,
                     range=[0, 0.1], showgrid=False,
                     row=1, col=1)
    fig.update_yaxes(title_text="Sharpness", secondary_y=True,
                     range=[0, sharpness_limit * 1.1], showgrid=False,
                     row=1, col=1)
    fig.update_yaxes(title_text="Test Accuracy", secondary_y=False,
                     row=2, col=1)

    fig.update_xaxes(title_text="epoch",
                     range=[0, recorded_epochs])
    fig.update_layout(
        title_text=f"Stability of {optimizer_name} ; learning rate = {learning_rate}",
        height=1000, width=1000,
    )

    fig.show()


In [12]:
# Render the loss / sharpness / accuracy figure for the run stored as model_id 2.
plot_output_data(md, out, model_id=2)

In [20]:
# Display the run metadata table returned by fn.load_output_files.
md

Unnamed: 0,model_id,model_type,activation_function,optimizer,criterion,learning_rate,beta1,beta2,num_epochs,time_minutes
0,1,FullyConnectedNet,Tanh,Adam,MSELoss,0.001,,,2000,0.38
1,2,FullyConnectedNet,Tanh,Adam,MSELoss,0.0001,,,2000,0.26
2,3,FullyConnectedNet,Tanh,Adam,MSELoss,1e-05,,,2000,0.32
