This notebook analyzes the dynamics of learning across multiple runs with varying hidden state initializations and neuronal activations. It loads the runs, computes and aggregates per-run metrics, plots and saves the mean/std loss across runs, and aggregates the metrics time series and gradients.

In [1]:
from pathlib import Path
import sys

# Add the parent directory to the module search path before the
# AggregateMultiruns import that follows. The entry is relative, so it
# resolves against the kernel's current working directory.
# NOTE(review): presumably AggregateMultiruns.py lives one level up — confirm.
sys.path.append("..")
import AggregateMultiruns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# CONFIG
# Dataset directory and root of the trained-model tree, both given relative
# to the notebook's working directory.
data_dir = Path("../data/Ns100_SeqN100/")
model_root = Path("../Elman_SGD/Remap_predloss/N100T100/")

# Hidden-weight initialisation schemes to aggregate over.
# Every name must appear exactly once: a repeated entry (the list previously
# contained "shift" twice) makes any loop over it process that condition twice.
hidden_weights_inits = [
    "he",
    "cyclic-shift",
    "shift",
    "cmh",
    "mh",
    "ctridiag",
    "tridiag",
    "orthog",
]
# Input types to aggregate over.
input_types = ["gaussian", "onehot", "khot", "small-gaussian"]

# Directory and file naming conventions. The run paths logged during
# collection have the form
#   <model_root>/<hidden_init>/<input_type>/multiruns/run_NN/<MODEL_FNAME>
# NOTE(review): none of these constants are passed to
# AggregateMultiruns.collect_all in the visible cells — presumably the module
# keeps its own copies; confirm the two stay in sync.
SINGLE_DIR = "single-run"
MULTIRUNS_DIR = "multiruns"
RUN_PREFIX = "run_"
MODEL_FNAME = "Ns100_SeqN100_predloss_full.pth.tar"
HIDDEN_WEIGHTS_SUBDIR = "hidden-weights"

In [3]:
# Run the collection for a single condition only: the first hidden-weight
# init paired with the first input type.
selected_inits = hidden_weights_inits[:1]
selected_inputs = input_types[:1]
per_run_df, agg_df, ts_bucket = AggregateMultiruns.collect_all(
    h_inits=selected_inits,
    in_types=selected_inputs,
)

Collecting for (hidden_init=he, input_type=gaussian)
Run 00: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_00/Ns100_SeqN100_predloss_full.pth.tar
Run 01: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_01/Ns100_SeqN100_predloss_full.pth.tar
Run 02: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_02/Ns100_SeqN100_predloss_full.pth.tar
Run 03: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_03/Ns100_SeqN100_predloss_full.pth.tar
Run 04: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_04/Ns100_SeqN100_predloss_full.pth.tar
Run 05: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_05/Ns100_SeqN100_predloss_full.pth.tar
Run 06: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_06/Ns100_SeqN100_predloss_full.pth.tar
Run 07: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/run_07/Ns100_SeqN100_predloss_full.pth.tar
Run 08: ../Elman_SGD/Remap_predloss/N100T100/he/gaussian/multiruns/

In [4]:
# List the per-run metric columns (for a DataFrame, .columns is what .keys() returns).
per_run_df.columns

Index(['hidden_init', 'input_type', 'run_kind', 'run_id', 'path', 'final_loss',
       'best_loss', 'best_epoch', 'loss_auc', 'time_to_110pct_best',
       'final_loss_batch_mean', 'final_loss_batch_std', 'final_frob',
       'final_drift_from_init', 'final_spectral_radius', 'final_spectral_norm',
       'final_min_singular', 'final_cond_num', 'final_orth_err',
       'final_w_max_abs', 'final_w_sparsity', 'final_act_mean',
       'final_act_std', 'final_tanh_sat', 'loss_mean', 'loss_std', 'loss_max',
       'loss_min', 'loss_batch_mean_mean', 'loss_batch_mean_std',
       'loss_batch_mean_max', 'loss_batch_mean_min', 'loss_batch_std_mean',
       'loss_batch_std_std', 'loss_batch_std_max', 'loss_batch_std_min',
       'frob_mean', 'frob_std', 'frob_max', 'frob_min', 'drift_from_init_mean',
       'drift_from_init_std', 'drift_from_init_max', 'drift_from_init_min',
       'spectral_radius_mean', 'spectral_radius_std', 'spectral_radius_max',
       'spectral_radius_min', 'spectral_norm_

In [5]:
# Display the aggregated table returned by collect_all.
# NOTE(review): appears to hold one row per (hidden_init, input_type) pair
# with *_mean / *_std columns taken across runs — confirm against collect_all.
agg_df

Unnamed: 0,hidden_init,input_type,num_runs,final_loss_mean,final_loss_std,best_loss_mean,best_loss_std,best_epoch_mean,best_epoch_std,loss_auc_mean,...,act_std_min_mean,act_std_min_std,tanh_sat_mean_mean,tanh_sat_mean_std,tanh_sat_std_mean,tanh_sat_std_std,tanh_sat_max_mean,tanh_sat_max_std,tanh_sat_min_mean,tanh_sat_min_std
0,he,gaussian,10,0.004767,7.5e-05,0.00469,7.3e-05,49999.0,0.0,1125.896652,...,0.372175,0.00674,0.030082,0.001839,0.024532,0.002282,0.071798,0.006045,0.0,0.0


In [6]:
# List the aggregated column names (for a DataFrame, .columns is what .keys() returns).
agg_df.columns

Index(['hidden_init', 'input_type', 'num_runs', 'final_loss_mean',
       'final_loss_std', 'best_loss_mean', 'best_loss_std', 'best_epoch_mean',
       'best_epoch_std', 'loss_auc_mean',
       ...
       'act_std_min_mean', 'act_std_min_std', 'tanh_sat_mean_mean',
       'tanh_sat_mean_std', 'tanh_sat_std_mean', 'tanh_sat_std_std',
       'tanh_sat_max_mean', 'tanh_sat_max_std', 'tanh_sat_min_mean',
       'tanh_sat_min_std'],
      dtype='object', length=161)

In [7]:
# ts_bucket is keyed by (hidden_init, input_type) tuples; show which
# time-series collections are stored for the (he, gaussian) condition.
ts_bucket[("he", "gaussian")].keys()

dict_keys(['losses', 'metrics_df_list', 'grad_df_list', 'history_df_list'])

In [8]:
# Loss curves stored for the (he, gaussian) condition: print the number of
# curves, the length of the first one, and its first five values.
he_gauss_losses = ts_bucket["he", "gaussian"]["losses"]
print(len(he_gauss_losses))
first_run_losses = he_gauss_losses[0]
print(len(first_run_losses))
print(first_run_losses[:5])

10
50000
[0.17805549502372742, 0.17802384495735168, 0.17799217998981476, 0.17796054482460022, 0.17792890965938568]


In [9]:
# Metrics frames stored for the (he, gaussian) condition: print the number
# of frames and the row count of the first one, then preview it.
he_gauss_metrics = ts_bucket["he", "gaussian"]["metrics_df_list"]
print(len(he_gauss_metrics))
first_run_metrics = he_gauss_metrics[0]
print(len(first_run_metrics))
display(first_run_metrics.head())

10
50


Unnamed: 0,epoch,loss,loss_batch_mean,loss_batch_std,frob,drift_from_init,spectral_radius,spectral_norm,min_singular,cond_num,orth_err,w_max_abs,w_sparsity,act_mean,act_std,tanh_sat,run_id
0,0,0.178055,0.178055,0.0,5.774087,0.000262,0.61031,0.350312,-0.393485,-0.89028,7.434097,0.099981,0.0,0.280204,0.375673,0.0,0
1,1000,0.149493,0.149493,0.0,5.780917,0.245461,0.60999,0.373429,-0.395207,-0.944895,7.43145,0.107955,0.0,0.274906,0.385883,0.0,0
2,2000,0.123094,0.123094,0.0,5.806212,0.49052,0.629674,0.342245,-0.404419,-0.846265,7.422012,0.114865,0.0,0.264183,0.428688,0.0,0
3,3000,0.096362,0.096362,0.0,5.849654,0.755729,0.743869,0.372078,-0.447476,-0.831504,7.417208,0.121608,0.0,0.243764,0.498146,0.0,0
4,4000,0.072888,0.072888,0.0,5.902051,1.003481,0.93106,0.456371,-0.37712,-1.210149,7.432401,0.126674,0.0,0.220508,0.568742,0.0,0


In [10]:
# Gradient-statistics frames stored for the (he, gaussian) condition: print
# the number of frames and the row count of the first one, then preview it.
he_gauss_grads = ts_bucket["he", "gaussian"]["grad_df_list"]
print(len(he_gauss_grads))
first_run_grads = he_gauss_grads[0]
print(len(first_run_grads))
display(first_run_grads.head())

10
50


Unnamed: 0,grad_mean_sum,grad_std_sum,grad_l2_norm_sum,grad_mean_sq_sum,grad_max_abs_sum,grad_sparsity_sum,grad_mean_max,grad_std_max,grad_l2_norm_max,grad_mean_sq_max,...,grad_mean_mean,grad_std_mean,grad_l2_norm_mean,grad_mean_sq_mean,grad_max_abs_mean,grad_sparsity_mean,grad_group_rnn_l2_norm,grad_group_linear_l2_norm,epoch,run_id
0,0.001112,0.001398,0.08826,1.230034e-06,0.005423,0.0,0.000796,0.000488,0.048836,6.790813e-07,...,0.000278,0.00035,0.022065,3.075085e-07,0.001356,0.0,0.031183,0.057077,0,0
1,0.000886,0.001245,0.080102,8.926828e-07,0.004783,0.0001,0.000661,0.000408,0.044679,4.739137e-07,...,0.000221,0.000311,0.020026,2.231707e-07,0.001196,2.5e-05,0.02854,0.051563,1000,0
2,0.000756,0.001168,0.079808,6.8638e-07,0.004386,0.0,0.000541,0.000404,0.042872,3.217041e-07,...,0.000189,0.000292,0.019952,1.71595e-07,0.001097,0.0,0.031264,0.048544,2000,0
3,0.000662,0.001081,0.078157,5.552886e-07,0.003903,0.0,0.00046,0.000396,0.041159,2.323179e-07,...,0.000165,0.00027,0.019539,1.388221e-07,0.000976,0.0,0.032178,0.045979,3000,0
4,0.000555,0.000941,0.068685,4.387545e-07,0.003383,0.0,0.000422,0.000359,0.037028,1.91444e-07,...,0.000139,0.000235,0.017171,1.096886e-07,0.000846,0.0,0.027282,0.041404,4000,0


In [11]:
# Count, per column, how many rows of the first gradient frame are non-zero.
ts_bucket["he", "gaussian"]["grad_df_list"][0].ne(0).sum()

grad_mean_sum                50
grad_std_sum                 50
grad_l2_norm_sum             50
grad_mean_sq_sum             50
grad_max_abs_sum             50
grad_sparsity_sum            43
grad_mean_max                50
grad_std_max                 50
grad_l2_norm_max             50
grad_mean_sq_max             50
grad_max_abs_max             50
grad_sparsity_max            43
grad_mean_mean               50
grad_std_mean                50
grad_l2_norm_mean            50
grad_mean_sq_mean            50
grad_max_abs_mean            50
grad_sparsity_mean           43
grad_group_rnn_l2_norm       50
grad_group_linear_l2_norm    50
epoch                        49
run_id                       50
dtype: int64

In [12]:
# History frames stored for the (he, gaussian) condition: print the number
# of frames and the row count of the first one, then preview it.
he_gauss_history = ts_bucket["he", "gaussian"]["history_df_list"]
print(len(he_gauss_history))
first_run_history = he_gauss_history[0]
print(len(first_run_history))
display(first_run_history.head())

10
50


Unnamed: 0,epoch,loss,grad_norm,run_id
0,0,0.178055,0.056269,0
1,1000,0.149493,0.051598,0
2,2000,0.123094,0.051655,0
3,3000,0.096362,0.050998,0
4,4000,0.072888,0.045005,0
