In [2]:
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path
from pprint import pprint
from torchinfo import summary
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# mycode
from repo import REPO
from ml_utilities.output_loader.job_output import SweepResult, JobResult
from ml_utilities.output_loader import create_job_output_loader
from ml_utilities.output_loader.plot import plot_sweep_summary, plot_data_log_values

from ml_utilities.torch_models.fc import FC
from erank.mode_connectivity.instability_analysis import InstabilityAnalyzer

# 13.3 CIFAR10 Lenet pretraining

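The initial hyperparameter search for LeNet (experiment 13.2, kept below as a commented-out config) selected lr=0.001 and weight_decay=0.0. This section pretrains the model with these settings on three seeds.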


In [3]:
# Build the fully connected model used in this notebook and display its layer summary.
# input_size 3072 = 3 * 32 * 32, which matches the (3, 32, 32) dummy images passed to `summary`.
model_cfg_yaml = """
# model_cfg: fc-cifar10
name: fc
model_kwargs:
  input_size: 3072
  hidden_sizes:
    - 300
    - 100
  output_size: 10
  flatten_input: True
  dropout: null
  act_fn: relu
  model_initializer: kaiming_normal
  batchnorm_initializer: uniform
"""
# Keep the raw YAML and the parsed config under separate names so that
# `model_cfg` never holds a plain string.
model_cfg = OmegaConf.create(model_cfg_yaml)
fc_model = FC(**model_cfg.model_kwargs)
# The random batch of 128 images is only used to trace output shapes for the summary.
summary(fc_model, input_data=torch.randn(128, 3, 32, 32), depth=4)

Layer (type:depth-idx)                   Output Shape              Param #
FC                                       [128, 10]                 --
├─Sequential: 1-1                        [128, 10]                 --
│    └─Flatten: 2-1                      [128, 3072]               --
│    └─Linear: 2-2                       [128, 300]                921,900
│    └─ReLU: 2-3                         [128, 300]                --
│    └─Linear: 2-4                       [128, 100]                30,100
│    └─ReLU: 2-5                         [128, 100]                --
│    └─Linear: 2-6                       [128, 10]                 1,010
Total params: 953,010
Trainable params: 953,010
Non-trainable params: 0
Total mult-adds (M): 121.99
Input size (MB): 1.57
Forward/backward pass size (MB): 0.42
Params size (MB): 3.81
Estimated Total Size (MB): 5.80

In [4]:
# sgd = torch.optim.SGD(fc_model.parameters(), lr=0.1)
# len(sgd.param_groups)

In [5]:
# config_yaml = """
# run_config:
#   exec_type: parallel
#   hostname: dragonfly
#   gpu_ids: [2,3]
#   runs_per_gpu: 2
#   wandb:
#     init:
#       tags:
#       - ${config.experiment_data.experiment_tag}_exps
#       - run_handler
#       notes: null
#       group: ${config.experiment_data.experiment_tag}
#       job_type: run_handler

# seeds: [1]

# sweep:
#   type: grid
#   axes: 
#   - parameter: trainer.optimizer_scheduler.optimizer_kwargs.lr
#     vals: [1e-4, 1e-3, 1e-2]
#   - parameter: trainer.optimizer_scheduler.optimizer_kwargs.weight_decay
#     vals: [0.0, 0.001, 0.0001]
#   # - parameter: trainer.batch_size
#   #   vals: [64, 256, 512]
    
# start_num: 0

# config:
#   experiment_data:
#     entity: jkuiml-fsl
#     project_name: tflearning
#     experiment_tag: '13.2'
#     experiment_type: startnum_${start_num}
#     experiment_name: cifar10-${config.experiment_data.experiment_tag}.${start_num}-lenet-hyps
#     experiment_dir: null
#     experiment_notes: 
#     job_name: null
#     seed: 0
#     hostname: null
#     gpu_id: 0
#   wandb:
#     init:
#       tags:
#       - ${config.experiment_data.experiment_tag}_exps
#       notes: ${config.experiment_data.experiment_notes}
#       group: ${config.experiment_data.experiment_tag}
#       job_type: ${config.experiment_data.experiment_type}
#     watch:
#       log: null
#       log_freq:

#   model:
#     name: fc
#     model_kwargs:
#       input_size: 3072
#       hidden_sizes:
#         - 300
#         - 100
#       output_size: 10
#       flatten_input: True
#       dropout: null
#       act_fn: relu
#       model_initializer: kaiming_normal
#       batchnorm_initializer: uniform
#     init_model: null

#   trainer:
#     training_setup: supervised
#     n_steps: 64e3
#     log_train_step_every: 1
#     log_additional_train_step_every_multiplier: 1
#     log_additional_logs: true
#     val_every: 500
#     save_every: 64000
#     early_stopping_patience: 20e3 #64e3
#     batch_size: 128
#     optimizer_scheduler:
#       optimizer_name: AdamW
#       optimizer_kwargs:
#         lr: 0.001 
#         # momentum: 0.9
#         weight_decay: 0.0
#       lr_scheduler_name: MultiStepLR
#       lr_scheduler_kwargs:
#         milestones: [32e3, 48e3]
#         gamma: 0.1

#     loss: crossentropy
#     metrics:
#     - Accuracy
#     num_workers: 4
#     verbose: false
#   data:
#     dataset: cifar10
#     dataset_kwargs:
#       data_root_path: /system/user/beck/pwbeck/data
#     dataset_split:
#       train_val_split: 0.9
#       # restrict_n_samples_train_task: 100
#     dataset_transforms:
#       image_transforms:
#       - RandomHorizontalFlip
#       - RandomCrop:
#           size: 32
#           padding: 4
#       tensor_transforms: 
#       joint_tensor_transforms: 
#       enable_transforms: True
# """
# # cfg = OmegaConf.create(config_yaml)

In [6]:
# run_command = REPO.create_experiment(cfg, override=False)
# print(run_command)

In [7]:
# Best hyperparameters found by the 13.2 sweep above: lr=0.001, weight_decay=0.0

In [8]:
# Run configuration for the 13.3 pretraining experiment: the `fc` model is trained on
# cifar10 with AdamW (lr=0.001, weight_decay=0.0) for the seeds 33, 21 and 6.
# The `sweep` section inside the YAML is commented out, so no hyperparameter axes are swept.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [2,3]
  runs_per_gpu: 2
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [33,21,6]

# sweep:
#   type: skip
#   axes: 
#   - parameter: trainer.optimizer_scheduler.optimizer_kwargs.lr
#     vals: [1e-4, 1e-3, 1e-2]
#   - parameter: trainer.optimizer_scheduler.optimizer_kwargs.weight_decay
#     vals: [0.0, 0.001, 0.0001]
#   - parameter: trainer.batch_size
#     vals: [64, 256, 512]
    
start_num: 0

config:
  experiment_data:
    entity: jkuiml-fsl
    project_name: tflearning
    experiment_tag: '13.3'
    experiment_type: startnum_${start_num}
    experiment_name: cifar10-${config.experiment_data.experiment_tag}.${start_num}-lenet-pretrain
    experiment_dir: null
    experiment_notes: 
    job_name: null
    seed: 0
    hostname: null
    gpu_id: 0
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      notes: ${config.experiment_data.experiment_notes}
      group: ${config.experiment_data.experiment_tag}
      job_type: ${config.experiment_data.experiment_type}
    watch:
      log: null
      log_freq:

  model:
    name: fc
    model_kwargs:
      input_size: 3072
      hidden_sizes:
        - 300
        - 100
      output_size: 10
      flatten_input: True
      dropout: null
      act_fn: relu
      model_initializer: kaiming_normal
      batchnorm_initializer: uniform
    init_model: null

  trainer:
    training_setup: supervised
    n_steps: 64e3
    log_train_step_every: 1
    log_additional_train_step_every_multiplier: 1
    log_additional_logs: true
    val_every: 500
    save_every: 1000
    early_stopping_patience: 20e3 #64e3
    batch_size: 128
    optimizer_scheduler:
      optimizer_name: AdamW
      optimizer_kwargs:
        lr: 0.001 
        # momentum: 0.9
        weight_decay: 0.0
      lr_scheduler_name: MultiStepLR
      lr_scheduler_kwargs:
        milestones: [32e3, 48e3]
        gamma: 0.1

    loss: crossentropy
    metrics:
    - Accuracy
    num_workers: 4
    verbose: false
  data:
    dataset: cifar10
    dataset_kwargs:
      data_root_path: /system/user/beck/pwbeck/data
    dataset_split:
      train_val_split: 0.9
      # restrict_n_samples_train_task: 100
    dataset_transforms:
      image_transforms:
      - RandomHorizontalFlip
      - RandomCrop:
          size: 32
          padding: 4
      tensor_transforms: 
      joint_tensor_transforms: 
      enable_transforms: True
"""
# Parse the YAML into an OmegaConf config; the `${...}` entries are OmegaConf interpolations.
cfg = OmegaConf.create(config_yaml)

In [9]:
# run_command = REPO.create_experiment(cfg, override=False)
# print(run_command)

### Sweep result

In [10]:
# Load the stored results of the sweep described by `cfg` and print its overview.
sweepr = REPO.get_output_loader(cfg)
print(sweepr)

Exp. Tag(start_num): 13.3(0)
Exp. Name: cifar10-13.3.0-lenet-pretrain
Training setup: supervised
Model name: fc
Dataset name: cifar10
Sweep type: skip
  No sweep axes.
Seeds: [33, 21, 6]
Num. jobs: 3
Config updated: 2023-01-12 16:56:38
Sweep started:  2023-01-12 16:57:06



In [11]:
# Collect the jobs of the pretraining sweep that failed; the displayed result is empty.
failed_jobs = sweepr.get_failed_jobs()
failed_jobs # No failed jobs

Collecting failed jobs: 100%|██████████| 3/3 [00:00<00:00, 55.20it/s]


([], {})

In [12]:
# Show which columns were logged, grouped by log source (train, train_step, val).
pprint(sweepr.available_log_columns)

{'_common_cols': ['epoch', 'train_step', 'log_step'],
 'train': ['lr',
           'loss_CrossEntropyLoss',
           'time_last_train_epoch_in_s',
           'Accuracy'],
 'train_step': ['weight_norm', 'lr', 'loss_CrossEntropyLoss', 'Accuracy'],
 'val': ['loss_CrossEntropyLoss', 'time_last_val_epoch_in_s', 'Accuracy']}


In [13]:
# One summary row per pretraining job: best step, best validation score and seed.
sw_summary = sweepr.get_summary()
sw_summary

Collecting summaries: 100%|██████████| 3/3 [00:00<00:00, 90.46it/s]


Unnamed: 0,best_step,best_val_score,seed
cifar10-13.3.0-lenet-pretrain---seed-21--230112_165724,42500,0.5536,21
cifar10-13.3.0-lenet-pretrain---seed-33--230112_165727,61500,0.5546,33
cifar10-13.3.0-lenet-pretrain---seed-6--230112_165726,63500,0.5546,6


# 13.4 CIFAR10 Lenet finetuning


### Finetuning for Instability analysis

In [14]:
# get a single pretraining job
df, jobs = sweepr.query_jobs({'seed':6})
df

Collecting summaries: 100%|██████████| 3/3 [00:00<00:00, 3448.32it/s]


Unnamed: 0,best_step,best_val_score,seed
cifar10-13.3.0-lenet-pretrain---seed-6--230112_165726,63500,0.5546,6


In [15]:
# The seed query matched a single job (one-row summary); use it as the pretraining run
# whose checkpoints initialise the finetuning experiments.
pretrain_job = jobs[0]
pretrain_job

JobResult(cifar10-13.3.0-lenet-pretrain---seed-6--230112_165726)

In [16]:
# Output directory of the pretraining job. The same path is hard-coded as the prefix of
# `init_model` in the finetuning config, so keep the two in sync.
print(pretrain_job.directory)

/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-13.3.0-lenet-pretrain--230112_165706/outputs/cifar10-13.3.0-lenet-pretrain---seed-6--230112_165726


In [17]:
# determine pretrain indices, remaining training time and lr schedule milestones
pretrain_idxes = np.array(pretrain_job.available_model_checkpoint_indices[::6])
pretrain_idxes.tolist()

[0, 6000, 12000, 18000, 24000, 30000, 36000, 42000, 48000, 54000, 60000]

In [18]:
# Steps left of the full training budget when starting from each selected checkpoint.
total_steps = cfg.config.trainer.n_steps
remaining_training_steps = np.subtract(total_steps, pretrain_idxes)
# Display as plain ints so the values can be used as sweep values for `trainer.n_steps`.
[int(n_remaining) for n_remaining in remaining_training_steps]

[64000, 58000, 52000, 46000, 40000, 34000, 28000, 22000, 16000, 10000, 4000]

In [19]:
# Run configuration for the 13.4 finetuning experiment. Each job is initialised from a
# checkpoint of the pretraining run (`model.pretrain_idx`) and trained for the remaining
# steps (`trainer.n_steps`); the two axes are swept together (sweep type `line`) for the
# seeds 1 and 2. `last_epoch` of the lr scheduler is set to the same checkpoint index.
# The `init_model` path is hard-coded: its prefix must equal `pretrain_job.directory`.
fconfig_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [2,3]
  runs_per_gpu: 3
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [1,2]

sweep:
  type: line
  axes: 
  - parameter: trainer.n_steps # remaining steps to train
    vals: [64000, 58000, 52000, 46000, 40000, 34000, 28000, 22000, 16000, 10000, 4000]
  - parameter: model.pretrain_idx
    vals: [0, 6000, 12000, 18000, 24000, 30000, 36000, 42000, 48000, 54000, 60000]
    
start_num: 0

config:
  experiment_data:
    entity: jkuiml-fsl
    project_name: tflearning
    experiment_tag: '13.4'
    experiment_type: startnum_${start_num}
    experiment_name: cifar10-${config.experiment_data.experiment_tag}.${start_num}-lenet-finetune
    experiment_dir: null
    experiment_notes: 
    job_name: null
    seed: 0
    hostname: null
    gpu_id: 0
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      notes: ${config.experiment_data.experiment_notes}
      group: ${config.experiment_data.experiment_tag}
      job_type: ${config.experiment_data.experiment_type}
    watch:
      log: null
      log_freq:

  model:
    name: fc
    model_kwargs:
      input_size: 3072
      hidden_sizes:
        - 300
        - 100
      output_size: 10
      flatten_input: True
      dropout: null
      act_fn: relu
      model_initializer: kaiming_normal
      batchnorm_initializer: uniform
    pretrain_idx: 0
    init_model: /system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-13.3.0-lenet-pretrain--230112_165706/outputs/cifar10-13.3.0-lenet-pretrain---seed-6--230112_165726/model_step_${config.model.pretrain_idx}.p

  trainer:
    training_setup: supervised
    n_steps: 64e3
    log_train_step_every: 1
    log_additional_train_step_every_multiplier: 1
    log_additional_logs: true
    val_every: 500
    save_every: 1000
    early_stopping_patience: 64e3
    batch_size: 128
    optimizer_scheduler:
      optimizer_name: AdamW
      optimizer_kwargs:
        lr: 0.001 
        # momentum: 0.9
        weight_decay: 0.0
      lr_scheduler_name: MultiStepLR
      lr_scheduler_kwargs:
        milestones: [32e3, 48e3]
        gamma: 0.1
        last_epoch: ${config.model.pretrain_idx}

    loss: crossentropy
    metrics:
    - Accuracy
    num_workers: 4
    verbose: false
  data:
    dataset: cifar10
    dataset_kwargs:
      data_root_path: /system/user/beck/pwbeck/data
    dataset_split:
      train_val_split: 0.9
      # restrict_n_samples_train_task: 100
    dataset_transforms:
      image_transforms:
      - RandomHorizontalFlip
      - RandomCrop:
          size: 32
          padding: 4
      tensor_transforms: 
      joint_tensor_transforms: 
      enable_transforms: True
"""
# NOTE: this rebinds `cfg`, which held the pretraining config up to this cell.
cfg = OmegaConf.create(fconfig_yaml)

# The sweep values in the YAML are copied by hand from the checkpoint indices and the
# remaining step counts computed earlier in this notebook. Fail loudly if they drift apart,
# otherwise jobs would silently train for the wrong number of steps.
_sweep_vals = {axis.parameter: list(axis.vals) for axis in cfg.sweep.axes}
assert _sweep_vals['model.pretrain_idx'] == pretrain_idxes.tolist(), \
    'sweep axis model.pretrain_idx does not match the selected pretraining checkpoints'
assert _sweep_vals['trainer.n_steps'] == remaining_training_steps.astype(int).tolist(), \
    'sweep axis trainer.n_steps does not match the remaining training steps'

In [20]:
# run_command = REPO.create_experiment(cfg, override=False)
# print(run_command)

### Instability analysis

In [21]:
# Load the results of the finetuning sweep. '095719' is the time part of the sweep
# directory's timestamp suffix (`--230113_095719`); presumably it disambiguates between
# several sweep directories with the same experiment name - TODO confirm.
finetunesw = REPO.get_output_loader(cfg, additional_search_pattern='095719')
finetunesw

SweepResult(cifar10-13.4.0-lenet-finetune--230113_095719)

In [22]:
# Check the 22 finetuning jobs for failures; the displayed result is empty.
finetunesw.get_failed_jobs()

Collecting failed jobs: 100%|██████████| 22/22 [00:00<00:00, 701.13it/s]


([], {})

In [23]:
# Collect the per-job summary of the finetuning sweep (display intentionally disabled).
fsw_summary = finetunesw.get_summary()
# fsw_summary

Collecting summaries: 100%|██████████| 22/22 [00:00<00:00, 732.35it/s]


In [24]:
# Values taken by each swept parameter (including the seed) across the finetuning jobs.
finetunesw.get_sweep_param_values()

{'seed': [1, 2],
 'trainer.n_steps': [4000,
  10000,
  16000,
  22000,
  28000,
  34000,
  40000,
  46000,
  52000,
  58000,
  64000],
 'model.pretrain_idx': [0,
  6000,
  12000,
  18000,
  24000,
  30000,
  36000,
  42000,
  48000,
  54000,
  60000]}

In [25]:
# NOTE(review): despite its name this is the first job returned by `get_jobs()`, not a
# random sample; it is not used in the cells shown below.
random_job = finetunesw.get_jobs()[0]

In [26]:
# Directory of the finetuning sweep; the same path is hard-coded as `instability_sweep`
# in the instability analysis config.
print(finetunesw.directory)

/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-13.4.0-lenet-finetune--230113_095719


In [27]:
# Configuration for the `instability_analyzer` run script. `instability_sweep` is the
# hard-coded directory of the finetuning sweep (same path as `finetunesw.directory`), and
# `init_model_idx_k_param_name` names the sweep parameter `model.pretrain_idx`.
# NOTE(review): this cell only builds the config; nothing in the visible part of the
# notebook consumes it - presumably it is used to launch the script separately.
instability_cfg = """
config:
  run_script_name: instability_analyzer
  run_script_kwargs:
    instability_sweep: /system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-13.4.0-lenet-finetune--230113_095719
    score_fn: TAccuracy
    interpolation_factors: [0.0000, 0.2500, 0.5000, 0.7500, 1.0000]
    device: 3
    interpolate_linear_kwargs: 
      interpolation_on_train_data: True
      dataloader_kwargs:
        batch_size: 1024
      compute_model_distances: True
    override_files: True
    num_seed_combinations: 1
    save_folder_suffix: 0
    float_eps_query_job: 1e-3
    init_model_idx_k_param_name: 'model.pretrain_idx'
    init_model_idxes_ks_or_every: 0
    train_model_idxes: [-1, -2]
"""
# Rebinds `instability_cfg` from the YAML string to the parsed OmegaConf object.
instability_cfg = OmegaConf.create(instability_cfg)

In [28]:
# Inspect the validation log of a single finetuning job. The name pattern is expected to
# select the run started from pretrain_idx 0 with seed 1 - TODO confirm how `get_jobs`
# matches the pattern.
j = finetunesw.get_jobs('idx-0-seed-1')[0]
j.get_data_log('val')

Unnamed: 0,log_step,epoch,train_step,loss_CrossEntropyLoss,Accuracy,time_last_val_epoch_in_s
0,0,0,0,2.316737,0.0926,0.000000
1,501,2,500,1.722792,0.3818,1.205302
2,1003,3,1000,1.680621,0.4108,0.312282
3,1506,5,1500,1.605122,0.4246,0.292940
4,2008,6,2000,1.628264,0.4386,0.275300
...,...,...,...,...,...,...
124,62299,177,62000,1.281602,0.5480,0.277399
125,62801,178,62500,1.293068,0.5466,0.272901
126,63303,179,63000,1.309321,0.5496,0.284872
127,63806,181,63500,1.300245,0.5476,0.273378


In [27]:
# Reload the stored instability analysis of the finetuning sweep. The folder suffix '0'
# corresponds to `save_folder_suffix: 0` in `instability_cfg`.
insta = InstabilityAnalyzer.reload(finetunesw.directory, instability_folder_suffix='0')

Collecting failed jobs: 100%|██████████| 22/22 [00:00<00:00, 904.78it/s]


In [28]:
# Interpolation scores per alpha and the resulting instability for the 'val' and 'train'
# datasets, one row per (init_model_idx_k, job, seeds, model_idxes) combination.
insta.combined_results_dfs['datasets']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,datasets,val,val,val,val,val,val,train,train,train,train,train,train
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,score,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,instability,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,interpolation_scores,instability
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,alpha,0.00,0.25,0.50,0.75,1.00,NaN,0.00,0.25,0.50,0.75,1.00,NaN
default_params,init_model_idx_k,job,seeds,model_idxes,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
default_params,0,cifar10-13.4.0-lenet-finetune--n_steps-64000-pretrain_idx-0,"(1, 2)","(63000, 62500)",0.549176,0.432327,0.338705,0.428538,0.549411,-0.210588,0.630976,0.486714,0.34981,0.477133,0.627348,-0.279352
default_params,0,cifar10-13.4.0-lenet-finetune--n_steps-64000-pretrain_idx-0,"(1, 2)","(64000, 64000)",0.549176,0.432327,0.338705,0.428538,0.549411,-0.210588,0.631404,0.487841,0.349405,0.475269,0.629233,-0.280913
default_params,6000,cifar10-13.4.0-lenet-finetune--n_steps-58000-pretrain_idx-6000,"(1, 2)","(55000, 57000)",0.550228,0.550541,0.550228,0.55485,0.559811,-0.004791,0.627652,0.627017,0.624732,0.62675,0.63076,-0.004474
default_params,6000,cifar10-13.4.0-lenet-finetune--n_steps-58000-pretrain_idx-6000,"(1, 2)","(58000, 58000)",0.550228,0.550541,0.550228,0.55485,0.559811,-0.004791,0.631826,0.62892,0.624136,0.629182,0.634096,-0.008825
default_params,12000,cifar10-13.4.0-lenet-finetune--n_steps-52000-pretrain_idx-12000,"(1, 2)","(42500, 46000)",0.556153,0.550882,0.551922,0.548966,0.547731,-0.004211,0.631899,0.630774,0.631905,0.631146,0.630698,-0.000601
default_params,12000,cifar10-13.4.0-lenet-finetune--n_steps-52000-pretrain_idx-12000,"(1, 2)","(52000, 52000)",0.556153,0.550882,0.551922,0.548966,0.547731,-0.004211,0.629831,0.632195,0.633868,0.633155,0.631434,-0.000801
default_params,18000,cifar10-13.4.0-lenet-finetune--n_steps-46000-pretrain_idx-18000,"(1, 2)","(26000, 21500)",0.550712,0.549215,0.552613,0.54794,0.545295,-0.002708,0.633141,0.632588,0.634793,0.633226,0.632284,-0.000428
default_params,18000,cifar10-13.4.0-lenet-finetune--n_steps-46000-pretrain_idx-18000,"(1, 2)","(46000, 46000)",0.550712,0.549215,0.552613,0.54794,0.545295,-0.002708,0.629632,0.630323,0.633617,0.632812,0.632106,-0.001237
default_params,24000,cifar10-13.4.0-lenet-finetune--n_steps-40000-pretrain_idx-24000,"(1, 2)","(24000, 40000)",0.5554,0.556051,0.55704,0.558459,0.555555,-7.8e-05,0.633217,0.632156,0.635677,0.632057,0.630902,-0.001157
default_params,24000,cifar10-13.4.0-lenet-finetune--n_steps-40000-pretrain_idx-24000,"(1, 2)","(40000, 40000)",0.5554,0.556051,0.55704,0.558459,0.555555,-7.8e-05,0.630231,0.635892,0.634778,0.635803,0.634677,-0.002223
