In [1]:
import numpy as np
import pandas as pd
import os 

# Relative path of the results summary CSV.
summary_fname = "./results/summary.csv"

# Read the summary into a DataFrame and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a saved
# index -- consider `index_col=0` after confirming nothing relies on that column.
summary_df = pd.read_csv(filepath_or_buffer=summary_fname)
display(summary_df.head(n=5))


Unnamed: 0,exp_id,params_id,model,hidden_sizes,mc_samples,batch_size,prior_prec,record_idx,epoch,num_evals,elbo_neg_ave,test_pred_accuracy,test_pred_logloss,train_pred_accuracy,train_pred_logloss
0,0,0,Vadam,"[64, 32, 16]",10,100,0.01,0,0.166667,1200,8.400536,0.1067,2.312136,0.100517,2.329934
1,0,0,Vadam,"[64, 32, 16]",10,100,0.01,1,0.333333,1200,8.341419,0.0997,2.313617,0.1034,2.309945
2,0,0,Vadam,"[64, 32, 16]",10,100,0.01,2,0.5,1200,8.304572,0.1097,2.305187,0.087417,2.312705
3,0,0,Vadam,"[64, 32, 16]",10,100,0.01,3,0.666667,1200,8.265943,0.1036,2.31418,0.1048,2.29788
4,0,0,Vadam,"[64, 32, 16]",10,100,0.01,4,0.833333,1200,8.241416,0.1036,2.304528,0.096883,2.304286


In [2]:
# Lookup table of the distinct hyper-parameter combinations, indexed by params_id.
params_df = (
    summary_df
    .loc[:, ['params_id', 'hidden_sizes', 'mc_samples', 'batch_size', 'prior_prec']]
    .drop_duplicates()
    .set_index('params_id')
)
params_df

Unnamed: 0_level_0,hidden_sizes,mc_samples,batch_size,prior_prec
params_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[64, 32, 16]",10,100,0.01
1,"[64, 32, 16]",10,100,0.1
2,"[64, 32, 16]",10,100,1.0
3,"[64, 32, 16]",10,100,10.0
4,"[64, 32, 16]",10,100,100.0
5,"[64, 32, 16]",10,10,0.01
6,"[64, 32, 16]",10,10,0.1
7,"[64, 32, 16]",10,10,1.0
8,"[64, 32, 16]",10,10,10.0
9,"[64, 32, 16]",10,10,100.0


# Bad ELBO when batch size is one

In [3]:

# Lowest elbo_neg_ave recorded within each experiment (indexed by exp_id).
min_elbo = summary_df.groupby('exp_id')[['elbo_neg_ave']].min()

# Attach that per-experiment minimum to every record as `elbo_neg_ave_min`.
summary_df_min_elbo = pd.merge(
    summary_df,
    min_elbo,
    how='left',
    on=['exp_id'],
    suffixes=('', '_min'),
)


In [4]:
# When batch size is 1 and precision is 0.01
params_id = 10
params_selected = params_df.loc[params_id]
params_str = ", ".join(f"{k}={v}" for k, v in params_selected.to_dict().items())

# Per-experiment minimum of elbo_neg_ave for this configuration, de-duplicated to
# one row per distinct (model, elbo_neg_ave_min) pair.
best_elbo_bs1_prec001 = summary_df_min_elbo.loc[
    summary_df_min_elbo['params_id'] == params_id,
    ['model', 'elbo_neg_ave_min'],
].drop_duplicates()

# Reduce the numeric column only: a frame-wide max()/min() would also reduce the
# string `model` column, which is wasted work and fragile on mixed-type data.
best_elbo_values = best_elbo_bs1_prec001['elbo_neg_ave_min']
# Relative gap between the largest and the smallest best value (0 means identical).
difference_ratio = best_elbo_values.max() / best_elbo_values.min() - 1
print(f"{difference_ratio:.2%} difference in ELBO between Vadam and VadaMuon for params: {params_str}")

111.06% difference in ELBO between Vadam and VadaMuon for params: hidden_sizes=[64, 32, 16], mc_samples=10, batch_size=1, prior_prec=0.01


In [5]:
# When batch size is 1 and precision is 100
params_id = 14
params_selected = params_df.loc[params_id]
params_str = ", ".join(f"{k}={v}" for k, v in params_selected.to_dict().items())

# Per-experiment minimum of elbo_neg_ave for this configuration, de-duplicated to
# one row per distinct (model, elbo_neg_ave_min) pair.
# The variable is named for prior_prec=100 so it does not masquerade as the
# prior_prec=0.01 result.
best_elbo_bs1_prec100 = summary_df_min_elbo.loc[
    summary_df_min_elbo['params_id'] == params_id,
    ['model', 'elbo_neg_ave_min'],
].drop_duplicates()

# Reduce the numeric column only: a frame-wide max()/min() would also reduce the
# string `model` column, which is wasted work and fragile on mixed-type data.
best_elbo_values = best_elbo_bs1_prec100['elbo_neg_ave_min']
# Relative gap between the largest and the smallest best value (0 means identical).
difference_ratio = best_elbo_values.max() / best_elbo_values.min() - 1
print(f"{difference_ratio:.2%} difference in ELBO between Vadam and VadaMuon for params: {params_str}")

18.80% difference in ELBO between Vadam and VadaMuon for params: hidden_sizes=[64, 32, 16], mc_samples=10, batch_size=1, prior_prec=100.0


# Convergence speed

In [6]:
# When batch size is 100 and precision is 10
params_id = 3
params_selected = params_df.loc[params_id]
params_str = ", ".join(f"{k}={v}" for k, v in params_selected.to_dict().items())
# Fraction of an experiment's total test log-loss improvement used as the threshold.
threshold_progress = .95

# Lowest and highest test log loss of each experiment, attached to every record
# as `test_pred_logloss_min` / `test_pred_logloss_max`.
summary_df_best = summary_df.groupby('exp_id')[['test_pred_logloss']].min()
summary_df_worse = summary_df.groupby('exp_id')[['test_pred_logloss']].max()
summary_df_enriched = (
    summary_df
    .merge(summary_df_best, on=['exp_id'], how='left', suffixes=('', '_min'))
    .merge(summary_df_worse, on=['exp_id'], how='left', suffixes=('', '_max'))
)

# Progress = share of the experiment's max-to-min log-loss drop achieved at each record
# (0 at the worst value, ~1 at the best). The 1e-8 guards against division by zero
# when an experiment's max and min coincide.
logloss_drop = summary_df_enriched["test_pred_logloss_max"] - summary_df_enriched["test_pred_logloss"]
logloss_range = summary_df_enriched["test_pred_logloss_max"] - summary_df_enriched["test_pred_logloss_min"]
summary_df_enriched["test_pred_progress"] = logloss_drop / (logloss_range + 1e-8)

# One column of progress per model, one row per (params_id, record_idx).
progress_model_comp = pd.pivot_table(
    summary_df_enriched,
    index=['params_id', 'record_idx'],
    columns='model',
    values='test_pred_progress',
).reset_index()
progress_model_comp["diff"] = progress_model_comp["VadaMuon"] - progress_model_comp["Vadam"]
progress_model_comp_selected = progress_model_comp[progress_model_comp['params_id'] == params_id]

# First record index at which each model's progress meets the threshold
# (NaN if it never does).
vadam_reaches_threshold = progress_model_comp_selected.loc[
    progress_model_comp_selected["Vadam"] >= threshold_progress, 'record_idx'
].min()
vadamuon_reaches_threshold = progress_model_comp_selected.loc[
    progress_model_comp_selected["VadaMuon"] >= threshold_progress, 'record_idx'
].min()
difference_ratio = (vadam_reaches_threshold / vadamuon_reaches_threshold) - 1

print(f"For parameters: {params_str}")
# The progress metric is derived from test_pred_logloss, so the message reports
# log-loss improvement (it previously said "test accuracy").
print(f"Vadam reaches {threshold_progress*100:.1f}% of test log-loss improvement at evaluation step {vadam_reaches_threshold}, and VadaMuon at evaluation step {vadamuon_reaches_threshold} ({difference_ratio:.2%} difference).")

For parameters: hidden_sizes=[64, 32, 16], mc_samples=10, batch_size=100, prior_prec=10.0
Vadam reaches 95.0% of test accuracy improvement at evaluation step 489, and VadaMuon at evaluation step 55 (789.09% difference).


# Log loss comparison table

For each batch size, we take the best (lowest) test log loss across epochs and prior precisions and compare the two models.

In [7]:
# Lowest test log loss per (batch_size, model), reshaped to one column per model.
best_losses = (
    summary_df
    .groupby(['batch_size', 'model'])['test_pred_logloss']
    .min()
    .unstack(level='model')
)
best_losses

model,VadaMuon,Vadam
batch_size,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.199324,0.148325
10,0.136991,0.142687
100,0.122433,0.151005


In [8]:
# Relative gap between the worse and the better model's best loss, per batch size
# (row-wise max divided by row-wise min, minus one).
best_losses_ratio = (
    best_losses.max(axis="columns")
    .div(best_losses.min(axis="columns"))
    .sub(1)
)
best_losses_ratio

batch_size
1      0.343831
10     0.041580
100    0.233365
dtype: float64