In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to the local project code imported below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import torch
import numpy as np
import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path
from pprint import pprint
# plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
# mycode
from repo import REPO
from ml_utilities.output_loader.result_loader import SweepResult, JobResult
from ml_utilities.output_loader import create_job_output_loader
from ml_utilities.output_loader.plot import plot_sweep_summary, plot_data_log_values

# 14 CIFAR10 Resume Training

I use this notebook for debugging the resume training functionality of the trainer.

## 14.0 Standard training run (the run that training is later resumed from)

In [6]:
# Experiment 14.0: baseline CIFAR-10 training run (model_cfg resnet20-cifar10-B)
# whose checkpoints serve as the starting point for the resume experiment in 14.1.
#
# Things worth knowing when reading the YAML below:
# - `${...}` expressions are OmegaConf interpolations referring to other keys of
#   this same config (e.g. the experiment name is built from the tag and start_num).
# - Training is configured for 64e3 steps with SGD (lr 0.01) and a MultiStepLR
#   schedule with milestones at 32e3 and 48e3 steps (gamma 0.1).
# - Checkpointing is configured via `save_every: 10000` plus the explicit early
#   indices in `save_every_idxes`.
# - The `sweep` section is commented out, so only the single seed in `seeds` is listed.
# - NOTE(review): `data_root_path` is an absolute, machine-specific path — adjust
#   it when running on a different host.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [0, 1]
  runs_per_gpu: 1
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [0]

# sweep:
#   type: grid
#   axes: 
#   - parameter: data.dataset_transforms.enable_transforms
#     vals: [True, False]
    
start_num: 7

config:
  experiment_data:
    entity: jkuiml-fsl
    project_name: tflearning
    experiment_tag: '14.0'
    experiment_type: startnum_${start_num}
    experiment_name: cifar10-${config.experiment_data.experiment_tag}.${start_num}-resnet-B
    experiment_dir: null
    experiment_notes: 
    job_name: null
    seed: 0
    hostname: null
    gpu_id: 1
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      notes: ${config.experiment_data.experiment_notes}
      group: ${config.experiment_data.experiment_tag}
      job_type: ${config.experiment_data.experiment_type}
    watch:
      log: null
      log_freq:

  model:
    model_cfg: resnet20-cifar10-B

  trainer:
    training_setup: supervised
    n_steps: 64e3
    log_train_step_every: 1
    log_additional_train_step_every_multiplier: 1
    log_additional_logs: true
    val_every: 500
    save_every: 10000
    save_every_idxes: [50,100,250,500,750,2000,2500]
    early_stopping_patience: 64e3
    batch_size: 128
    optimizer_scheduler:
      optimizer_name: SGD
      optimizer_kwargs:
        lr: 0.01 #0.1
        momentum: 0.9
        weight_decay: 0.0001
      lr_scheduler_name: MultiStepLR
      lr_scheduler_kwargs:
        milestones: [32e3, 48e3]
        gamma: 0.1
    loss: crossentropy
    metrics:
    - Accuracy
    num_workers: 4
    verbose: false

  data:
    dataset: cifar10
    dataset_kwargs:
      data_root_path: /system/user/beck/pwbeck/data
    dataset_split:
      train_val_split: 0.9
      # restrict_n_samples_train_task: 100
    train_split_transforms:
      image_transforms:
      - RandomHorizontalFlip
      - RandomCrop:
          size: 32
          padding: 4
      tensor_transforms: 
      joint_tensor_transforms: 
      enable_transforms: True
"""
# Parse the YAML text into an OmegaConf config object used by the cells below.
cfg = OmegaConf.create(config_yaml)

In [7]:
# Launch step for the 14.0 experiment, intentionally left commented out —
# presumably because the run already exists on disk (its output directory is
# loaded in the following cell). Uncomment only to re-create the experiment.
# run_command = REPO.create_experiment(cfg, override=False)
# print(run_command)

In [8]:
# Look up the stored results of the 14.0 run described by `cfg`.
# For this single-job config the loader is a JobResult (see the repr output).
sweepr = REPO.get_output_loader(cfg)
# print() shows the plain output directory path; the bare expression on the
# last line additionally displays the object's repr as the cell result.
print(sweepr)
sweepr

/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-14.0.7-resnet-B--230116_104528


JobResult(/system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-14.0.7-resnet-B--230116_104528)

In [9]:
# Display the per-train-step log of the 14.0 run (loss, lr, accuracy,
# weight norm, epoch and step counters) to confirm the run reached ~64k steps.
sweepr.get_data_log('train_step')

Unnamed: 0,log_step,loss_CrossEntropyLoss,lr,Accuracy,weight_norm,epoch,train_step
0,3,2.770440,0.0100,0.093750,43.065948,1,1
1,5,2.779710,0.0100,0.093750,43.065132,1,2
2,7,2.654719,0.0100,0.054688,43.064083,1,3
3,9,2.476277,0.0100,0.125000,43.062958,1,4
4,11,2.562410,0.0100,0.078125,43.061771,1,5
...,...,...,...,...,...,...,...
63994,128613,0.114891,0.0001,0.968750,52.543095,182,63995
63995,128615,0.086893,0.0001,0.984375,52.543091,182,63996
63996,128617,0.182929,0.0001,0.937500,52.543087,182,63997
63997,128619,0.098812,0.0001,0.968750,52.543087,182,63998


In [41]:
# List the checkpoint indices actually stored for the 14.0 run; a resume
# config must reference one of these.
# NOTE(review): this cell's execution count is out of order relative to the
# cells below — re-run the notebook top to bottom before trusting the output.
sweepr.available_model_checkpoint_indices

[0, 50, 100, 250, 500, 750, 2000, 2500, 10000]

## 14.1 Resume training from the earlier run (14.0)

In [33]:
# Experiment 14.1: resume training from a checkpoint of the 14.0 run.
#
# Compared with the 14.0 config this one
# - runs on a single GPU (`gpu_ids: [0]`, `gpu_id: 0`),
# - saves a checkpoint every 1000 steps and has no `save_every_idxes`,
# - adds the `trainer.resume_training` section (marked `#! NEW`) that points at
#   the 14.0 job directory and the checkpoint index to resume from.
#
# `checkpoint_idx` must be one of the checkpoints that exist in `job_dir`.
# The 14.0 run only provides [0, 50, 100, 250, 500, 750, 2000, 2500, 10000]
# (see `sweepr.available_model_checkpoint_indices` above), so the previously
# configured index 30000 did not exist; the latest available checkpoint (10000)
# is used instead.
# NOTE(review): `job_dir` and `data_root_path` are absolute, machine-specific
# paths — adjust them when running on a different host.
config_yaml = """
run_config:
  exec_type: parallel
  hostname: dragonfly
  gpu_ids: [0]
  runs_per_gpu: 1
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      - run_handler
      notes: null
      group: ${config.experiment_data.experiment_tag}
      job_type: run_handler

seeds: [0]

# sweep:
#   type: grid
#   axes: 
#   - parameter: data.dataset_transforms.enable_transforms
#     vals: [True, False]
    
start_num: 1

config:
  experiment_data:
    entity: jkuiml-fsl
    project_name: tflearning
    experiment_tag: '14.1'
    experiment_type: startnum_${start_num}
    experiment_name: cifar10-${config.experiment_data.experiment_tag}.${start_num}-resnet-B-resume
    experiment_dir: null
    experiment_notes: 
    job_name: null
    seed: 0
    hostname: null
    gpu_id: 0
  wandb:
    init:
      tags:
      - ${config.experiment_data.experiment_tag}_exps
      notes: ${config.experiment_data.experiment_notes}
      group: ${config.experiment_data.experiment_tag}
      job_type: ${config.experiment_data.experiment_type}
    watch:
      log: null
      log_freq:

  model:
    model_cfg: resnet20-cifar10-B

  trainer:
    training_setup: supervised
    n_steps: 64e3
    log_train_step_every: 1
    log_additional_logs: true
    val_every: 500
    save_every: 1000
    early_stopping_patience: 64e3
    batch_size: 128
    optimizer_scheduler:
      optimizer_name: SGD
      optimizer_kwargs:
        lr: 0.01 #0.1
        momentum: 0.9
        weight_decay: 0.0001
      lr_scheduler_name: MultiStepLR
      lr_scheduler_kwargs:
        milestones: [32e3, 48e3]
        gamma: 0.1
    loss: crossentropy
    metrics:
    - Accuracy
    num_workers: 4
    #! NEW 
    resume_training:
      job_dir: /system/user/publicwork/beck/projects/regularization/erank/outputs/cifar10-14.0.7-resnet-B--230116_104528
      checkpoint_idx: 10000

  data:
    dataset: cifar10
    dataset_kwargs:
      data_root_path: /system/user/beck/pwbeck/data
    dataset_split:
      train_val_split: 0.9
      # restrict_n_samples_train_task: 100
    train_split_transforms:
      image_transforms:
      - RandomHorizontalFlip
      - RandomCrop:
          size: 32
          padding: 4
      tensor_transforms: 
      joint_tensor_transforms: 
      enable_transforms: True
"""
# Parse the YAML text into an OmegaConf config object (replaces the 14.0 `cfg`).
cfg = OmegaConf.create(config_yaml)

In [34]:
# Create the 14.1 resume experiment from `cfg` and show the shell command
# that starts it (a `python run.py --config-name ...` invocation, see output).
# NOTE(review): `override=True` presumably replaces an already existing
# experiment config of the same name — confirm before re-running.
run_command = REPO.create_experiment(cfg, override=True)
print(run_command)

python run.py --config-name cifar10-14.1.1-resnet-B-resume.yaml
