In [1]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path
from pprint import pprint
import copy
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# mycode
from repo import REPO
from ml_utilities.output_loader.result_loader import SweepResult, JobResult
from ml_utilities.output_loader import create_job_output_loader
from ml_utilities.output_loader.plot import plot_sweep_summary, plot_data_log_values

# 15 CIFAR10 Instability Analysis debug

I use this notebook to debug the resume-training functionality of the trainer.

In [21]:
# Full experiment config for the instability analysis, kept as an inline YAML string.
# Everything lives under the top-level `config` key:
#   run_script_name                                 -> name of the run script (train_instability_analysis)
#   run_script_kwargs.run_config                    -> execution settings (exec_type, hostname, gpu_ids, wandb)
#   run_script_kwargs.job_config                    -> experiment metadata, wandb, model, trainer and data settings
#   run_script_kwargs.instability_analysis_config   -> score function, interpolation factors, checkpoint idxes
# `${...}` values are interpolations that reference other keys of this same config.
# Entries annotated with `#!` inside the YAML are marked by the author as values to override.
config_yaml = """
config:
  run_script_name: train_instability_analysis
  run_script_kwargs:
    run_config:
      exec_type: parallel
      hostname: dragonfly
      gpu_ids: [0, 1]
      runs_per_gpu: 3
      wandb:
        init:
          tags:
          - ${config.run_script_kwargs.job_config.experiment_data.experiment_tag}_exps
          - run_handler
          notes: null
          group: ${config.run_script_kwargs.job_config.experiment_data.experiment_tag}
          job_type: run_handler

    start_num: 0

    job_config:
      experiment_data:
        entity: jkuiml-fsl
        project_name: tflearning
        experiment_tag: '15.0'
        experiment_type: startnum_${config.run_script_kwargs.start_num}
        experiment_name: cifar10-${config.run_script_kwargs.job_config.experiment_data.experiment_tag}.${config.run_script_kwargs.start_num}-lenet #! override this in script, add prefix 'IA-X-'
        experiment_dir: null
        experiment_notes: 
        seed: 0
        gpu_id: 1
      wandb:
        init:
          tags:
          - ${config.run_script_kwargs.job_config.experiment_data.experiment_tag}_exps
          notes: ${config.run_script_kwargs.job_config.experiment_data.experiment_notes}
          group: ${config.run_script_kwargs.job_config.experiment_data.experiment_tag}
          job_type: ${config.run_script_kwargs.job_config.experiment_data.experiment_type}

      model:
        model_cfg: lenet_300_100_relu_cifar10 #resnet20-cifar10-B

      trainer:
        training_setup: supervised
        n_steps: 64e3
        log_train_step_every: 1
        log_additional_train_step_every_multiplier: 1
        log_additional_logs: true
        val_every: 500
        save_every: 10000 # CHECK
        batch_size: 128
        optimizer_scheduler:
          optimizer_name: AdamW
          optimizer_kwargs:
            lr: 0.001
            weight_decay: 0.0
          lr_scheduler_name: MultiStepLR
          lr_scheduler_kwargs:
            milestones: [32e3, 48e3]
            gamma: 0.1
        loss: crossentropy
        metrics:
        - Accuracy
        num_workers: 4

      data:
        dataset: cifar10
        dataset_kwargs:
          data_root_path: /system/user/beck/pwbeck/data
        dataset_split:
          train_val_split: 0.9
          # restrict_n_samples_train_task: 100
        train_split_transforms:
          image_transforms:
          - RandomHorizontalFlip
          - RandomCrop:
              size: 32
              padding: 4
          tensor_transforms: 
          joint_tensor_transforms: 
          enable_transforms: True

    instability_analysis_config: 
      score_fn: TError #TAccuracy
      interpolation_factors: [-0.1000,  0.0000,  0.1000,  0.3000,  0.5000,  0.7000,  0.9000,  1.0000, 1.1000]
      device: 1 #! override from config
      interpolate_linear_kwargs: 
        dataloader_kwargs: 
          batch_size: 128 #! override from config
      init_model_idxes_ks_or_every: [0,50,100,250,500,1000,2000,5000,10000,15000] # show instability at these checkpoint idxes
"""
# Parse the YAML text into an OmegaConf config object used by the cells below.
cfg = OmegaConf.create(config_yaml)

In [2]:
# Location of the pickled instability-analysis result (default hyperparameters)
# inside the output directory of sweep IA-B-cifar10-15.1.0-lenet.
# The literal is split only for readability; adjacent string literals are
# concatenated, so the resulting path is unchanged.
cf10_lenet_instability_path = (
    '/system/user/beck/pwbeck/projects/regularization/erank/outputs/'
    'IA-B-cifar10-15.1.0-lenet--230117_084806/'
    'instability_analysis/hp_result_dfs/default_params.p'
)

In [6]:
# Load the pickled instability-analysis result from disk.
# NOTE(review): pd.read_pickle unpickles arbitrary objects — only use it on files from a trusted source.
cf10_lenet_instability = pd.read_pickle(cf10_lenet_instability_path)
# Inspect the top-level entries; the recorded output shows the dict keys 'datasets' and 'distances'.
cf10_lenet_instability.keys()

dict_keys(['datasets', 'distances'])

In [10]:
# Pick the 'datasets' entry of the loaded result and display it.
# In the recorded output this is a table with one row per init_model_idx_k (0 ... 15000);
# for each of the val and train splits it holds the interpolation scores at
# alpha = 0.00, 0.25, 0.50, 0.75, 1.00 plus an instability value.
res_df = cf10_lenet_instability['datasets']
res_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,datasets,val,val,val,val,val,val,train,train,train,train,train,train
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,score,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,instability,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,instability
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,alpha,0.00,0.25,0.50,0.75,1.00,NaN,0.00,0.25,0.50,0.75,1.00,NaN
init_model_idx_k,job,seeds,model_idxes,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
0,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-0,"(1, 2)","(64002, 64002)",0.453711,0.564648,0.641406,0.561133,0.45918,0.184961,0.37318,0.509013,0.650849,0.496589,0.364909,0.281805
50,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-50,"(1, 2)","(64001, 64001)",0.453516,0.509766,0.555078,0.513672,0.454297,0.101172,0.369158,0.452079,0.515482,0.452797,0.370938,0.145434
100,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-100,"(1, 2)","(64001, 64001)",0.445312,0.506641,0.544141,0.500586,0.450781,0.096094,0.366734,0.427956,0.476964,0.428906,0.366924,0.110136
250,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-250,"(1, 2)","(64001, 64001)",0.449219,0.479297,0.498828,0.47793,0.455469,0.046484,0.367427,0.416588,0.460506,0.419828,0.364941,0.094322
500,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-500,"(1, 2)","(64001, 64001)",0.448047,0.470117,0.489258,0.472852,0.45625,0.037109,0.367686,0.404701,0.43117,0.404497,0.368329,0.063162
1000,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-1000,"(1, 2)","(64002, 64002)",0.452148,0.461133,0.484766,0.475977,0.455469,0.030957,0.366388,0.392376,0.410102,0.389811,0.363168,0.045324
2000,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-2000,"(1, 2)","(64001, 64001)",0.443359,0.444922,0.457422,0.454688,0.444336,0.013574,0.36722,0.379027,0.390581,0.379878,0.364608,0.024667
5000,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-5000,"(1, 2)","(64001, 64001)",0.448438,0.447656,0.453125,0.448633,0.451367,0.003223,0.36248,0.36922,0.3761,0.369787,0.365067,0.012327
10000,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-10000,"(1, 2)","(64001, 64001)",0.444922,0.44668,0.444531,0.444727,0.443164,0.002637,0.365464,0.366731,0.370544,0.367291,0.367728,0.003948
15000,IA-B-cifar10-15.1.0-lenet--checkpoint_idx-15000,"(1, 2)","(64001, 64001)",0.443945,0.448438,0.452539,0.451562,0.447656,0.006738,0.365937,0.365668,0.366073,0.366344,0.366798,0.00043


## 15.0 Standard training run from which two seeds will later be started

In [37]:
# Call REPO.create_experiment with `cfg`; per the recorded output the return value is a
# shell command string, which is printed here.
# `override=False` presumably keeps an already existing experiment untouched — TODO confirm.
# NOTE(review): the recorded output names `cifar10-14.0.8-resnet-B.yaml`, which does not match
# the lenet / tag '15.0' config defined in this notebook — presumably a stale output produced
# with a different `cfg`; re-run to confirm.
run_command = REPO.create_experiment(cfg, override=False)
print(run_command)

python run.py --config-name cifar10-14.0.8-resnet-B.yaml


In [38]:
# Fetch the output loader that REPO associates with `cfg`.
# Per the recorded output the returned object is a JobResult (despite the `sweepr` name):
# print() shows its directory path, the bare expression shows its repr.
sweepr = REPO.get_output_loader(cfg)
print(sweepr)
sweepr

/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-14.0.8-resnet-B--230116_112841


JobResult(/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-14.0.8-resnet-B--230116_112841)

In [41]:
# Show which model checkpoint indices the loader reports as available
# (recorded output: [0, 50, 100, 250, 500, 750, 2000, 2500, 10000]).
sweepr.available_model_checkpoint_indices

[0, 50, 100, 250, 500, 750, 2000, 2500, 10000]

In [21]:
# Open the instability-analysis sweep IA-B-cifar10-15.1.0-lenet from its output directory.
# The path literal is split only for readability; the concatenated value is unchanged.
swr = SweepResult(
    '/system/user/beck/pwbeck/projects/regularization/erank/outputs/'
    'IA-B-cifar10-15.1.0-lenet--230117_084806'
)

In [24]:
swr.seeds

[1, 2]

In [28]:
# Query the sweep's jobs by seed. Per the recorded outputs, query_jobs returns a summary
# table (`df`: best_train_step, best_val_score, checkpoint_idx, seed per job) and a list of
# JobResult objects (`js`).
# NOTE(review): `swr.seeds` reports [1, 2] in this notebook, so seed 3 matches nothing and
# only seed-1 jobs come back — confirm whether [1, 2] was intended.
df, js = swr.query_jobs({'seed': [1, 3]})
df

Unnamed: 0,best_train_step,best_val_score,trainer.resume_training.checkpoint_idx,seed
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-0-seed-1--230117_084826,63500,0.5512,0,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-100-seed-1--230117_090107,41500,0.5644,100,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-1000-seed-1--230117_090213,45500,0.5598,1000,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-10000-seed-1--230117_091712,64000,0.5596,10000,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-15000-seed-1--230117_090203,64000,0.5604,15000,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-2000-seed-1--230117_091513,64000,0.564,2000,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-250-seed-1--230117_090157,55500,0.555,250,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-50-seed-1--230117_091727,56500,0.5544,50,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-500-seed-1--230117_091352,64000,0.5562,500,1
IA-B-cifar10-15.1.0-lenet--checkpoint_idx-5000-seed-1--230117_090220,59500,0.5636,5000,1


In [29]:
# Display the JobResult objects returned by the seed query (one per matching job directory).
js

[JobResult(/system/user/beck/pwbeck/projects/regularization/erank/outputs/IA-B-cifar10-15.1.0-lenet--230117_084806/outputs/IA-B-cifar10-15.1.0-lenet--checkpoint_idx-0-seed-1--230117_084826),
 JobResult(/system/user/beck/pwbeck/projects/regularization/erank/outputs/IA-B-cifar10-15.1.0-lenet--230117_084806/outputs/IA-B-cifar10-15.1.0-lenet--checkpoint_idx-100-seed-1--230117_090107),
 JobResult(/system/user/beck/pwbeck/projects/regularization/erank/outputs/IA-B-cifar10-15.1.0-lenet--230117_084806/outputs/IA-B-cifar10-15.1.0-lenet--checkpoint_idx-1000-seed-1--230117_090213),
 JobResult(/system/user/beck/pwbeck/projects/regularization/erank/outputs/IA-B-cifar10-15.1.0-lenet--230117_084806/outputs/IA-B-cifar10-15.1.0-lenet--checkpoint_idx-10000-seed-1--230117_091712),
 JobResult(/system/user/beck/pwbeck/projects/regularization/erank/outputs/IA-B-cifar10-15.1.0-lenet--230117_084806/outputs/IA-B-cifar10-15.1.0-lenet--checkpoint_idx-15000-seed-1--230117_090203),
 JobResult(/system/user/beck/pwb