In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join('../../../src/'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Tutorial: Hyperparameter Search

In [3]:
import torch
import pytorch_lightning as pl
import optuna
from optuna.integration import PyTorchLightningPruningCallback

from classification.models.M5 import M5PLModule
from classification.trainer.HyperParamSearch import MetricsCallback, save_model

### Objective

In [4]:
def objective(trial):
    metrics_callback = MetricsCallback()  
    
    # create a trainer
    trainer = pl.Trainer(
        logger=None, #loggers.TensorBoardLogger(config.LOG_DIR, name="M5"),
        max_epochs=2,                                                               
        gpus=1 if torch.cuda.is_available() else None,
        callbacks=[metrics_callback],                                                  # save latest accuracy
        early_stop_callback=PyTorchLightningPruningCallback(trial, monitor="val_acc"), # early stopping
    )
    
    #trial.logger_version = trainer.logger.version

    # here we sample the hyper params, similar as in our old random search
    trial_hparams = {"batch_size": trial.suggest_categorical('batch_size', [64]),
                     "learning_rate": trial.suggest_loguniform("learning_rate", 1e-6, 1e-1),
                     "p_drop": trial.suggest_float("p_drop", 0, 0.),
                     "lr_decay": trial.suggest_float("lr_decay", 0.8, 1),
                     "weight_decay": trial.suggest_float("weight_decay", 1e-6, 1e-1)
                    }    

    model = M5PLModule(trial_hparams)
    trainer.fit(model)

    # save model
    save_model(model, '{}.p'.format(trial.number), "saved_models")

    # return validation accuracy from latest model, as that's what we want to minimize by our hyper param search
    return metrics_callback.metrics[-1]["val_acc"]

### Run Search

In [5]:
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="maximize", pruner=pruner)
study.optimize(objective, n_trials=100, timeout=21600) #6h

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
best_trial = study.best_trial

print("  Value: {}".format(best_trial.value))

print("  Params: ")
for key, value in best_trial.params.items():
    print("    {}: {}".format(key, value))

GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


Loading cached train data from /nfs/students/summer-term-2020/project-4/data/data_8k
Loading cached val data from /nfs/students/summer-term-2020/project-4/data/data_8k


Set SLURM handle signals.

   | Name           | Type         | Params
--------------------------------------------
0  | model          | M5           | 555 K 
1  | model.model    | Sequential   | 555 K 
2  | model.model.0  | Conv1d       | 10 K  
3  | model.model.1  | BatchNorm1d  | 256   
4  | model.model.2  | MaxPool1d    | 0     
5  | model.model.3  | Dropout      | 0     
6  | model.model.4  | Conv1d       | 49 K  
7  | model.model.5  | BatchNorm1d  | 256   
8  | model.model.6  | MaxPool1d    | 0     
9  | model.model.7  | Dropout      | 0     
10 | model.model.8  | Conv1d       | 98 K  
11 | model.model.9  | BatchNorm1d  | 512   
12 | model.model.10 | MaxPool1d    | 0     
13 | model.model.11 | Dropout      | 0     
14 | model.model.12 | Conv1d       | 393 K 
15 | model.model.13 | BatchNorm1d  | 1 K   
16 | model.model.14 | MaxPool1d    | 0     
17 | model.model.15 | AvgPool1d    | 0     
18 | model.model.16 | PermuteLayer | 0     
19 | model.model.17 | Linear       | 1 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Val-Acc=0.03793716656787196




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Val-Acc=0.8103141671606402


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

[32m[I 2020-06-09 15:20:45,014][0m Finished trial#0 with value: 0.8251333728512151 with parameters: {'batch_size': 64, 'learning_rate': 0.0005526446899718101, 'p_drop': 0, 'lr_decay': 0.8004671484264645, 'weight_decay': 0.013578867935547207}. Best is trial#0 with value: 0.8251333728512151.[0m
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


Val-Acc=0.8251333728512151

Loading cached train data from /nfs/students/summer-term-2020/project-4/data/data_8k
Loading cached val data from /nfs/students/summer-term-2020/project-4/data/data_8k


Set SLURM handle signals.

   | Name           | Type         | Params
--------------------------------------------
0  | model          | M5           | 555 K 
1  | model.model    | Sequential   | 555 K 
2  | model.model.0  | Conv1d       | 10 K  
3  | model.model.1  | BatchNorm1d  | 256   
4  | model.model.2  | MaxPool1d    | 0     
5  | model.model.3  | Dropout      | 0     
6  | model.model.4  | Conv1d       | 49 K  
7  | model.model.5  | BatchNorm1d  | 256   
8  | model.model.6  | MaxPool1d    | 0     
9  | model.model.7  | Dropout      | 0     
10 | model.model.8  | Conv1d       | 98 K  
11 | model.model.9  | BatchNorm1d  | 512   
12 | model.model.10 | MaxPool1d    | 0     
13 | model.model.11 | Dropout      | 0     
14 | model.model.12 | Conv1d       | 393 K 
15 | model.model.13 | BatchNorm1d  | 1 K   
16 | model.model.14 | MaxPool1d    | 0     
17 | model.model.15 | AvgPool1d    | 0     
18 | model.model.16 | PermuteLayer | 0     
19 | model.model.17 | Linear       | 1 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Val-Acc=0.03793716656787196


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Val-Acc=0.7990515708358032


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

[32m[I 2020-06-09 15:21:08,166][0m Finished trial#1 with value: 0.8180201541197392 with parameters: {'batch_size': 64, 'learning_rate': 3.657479021972008e-05, 'p_drop': 0, 'lr_decay': 0.9985837395665875, 'weight_decay': 0.08198542382441497}. Best is trial#0 with value: 0.8251333728512151.[0m
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


Val-Acc=0.8180201541197392

Loading cached train data from /nfs/students/summer-term-2020/project-4/data/data_8k
Loading cached val data from /nfs/students/summer-term-2020/project-4/data/data_8k


Set SLURM handle signals.

   | Name           | Type         | Params
--------------------------------------------
0  | model          | M5           | 555 K 
1  | model.model    | Sequential   | 555 K 
2  | model.model.0  | Conv1d       | 10 K  
3  | model.model.1  | BatchNorm1d  | 256   
4  | model.model.2  | MaxPool1d    | 0     
5  | model.model.3  | Dropout      | 0     
6  | model.model.4  | Conv1d       | 49 K  
7  | model.model.5  | BatchNorm1d  | 256   
8  | model.model.6  | MaxPool1d    | 0     
9  | model.model.7  | Dropout      | 0     
10 | model.model.8  | Conv1d       | 98 K  
11 | model.model.9  | BatchNorm1d  | 512   
12 | model.model.10 | MaxPool1d    | 0     
13 | model.model.11 | Dropout      | 0     
14 | model.model.12 | Conv1d       | 393 K 
15 | model.model.13 | BatchNorm1d  | 1 K   
16 | model.model.14 | MaxPool1d    | 0     
17 | model.model.15 | AvgPool1d    | 0     
18 | model.model.16 | PermuteLayer | 0     
19 | model.model.17 | Linear       | 1 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Val-Acc=0.03023117960877297


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

Val-Acc=0.6147006520450504


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

[32m[I 2020-06-09 15:21:30,096][0m Finished trial#2 with value: 0.5631298162418494 with parameters: {'batch_size': 64, 'learning_rate': 0.011553366485236343, 'p_drop': 0, 'lr_decay': 0.9363953371402909, 'weight_decay': 0.05749691282766589}. Best is trial#0 with value: 0.8251333728512151.[0m
GPU available: True, used: True
No environment variable for node rank defined. Set as 0.
CUDA_VISIBLE_DEVICES: [0]


Val-Acc=0.5631298162418494

Loading cached train data from /nfs/students/summer-term-2020/project-4/data/data_8k


Set SLURM handle signals.

   | Name           | Type         | Params
--------------------------------------------
0  | model          | M5           | 555 K 
1  | model.model    | Sequential   | 555 K 
2  | model.model.0  | Conv1d       | 10 K  
3  | model.model.1  | BatchNorm1d  | 256   
4  | model.model.2  | MaxPool1d    | 0     
5  | model.model.3  | Dropout      | 0     
6  | model.model.4  | Conv1d       | 49 K  
7  | model.model.5  | BatchNorm1d  | 256   
8  | model.model.6  | MaxPool1d    | 0     
9  | model.model.7  | Dropout      | 0     
10 | model.model.8  | Conv1d       | 98 K  
11 | model.model.9  | BatchNorm1d  | 512   
12 | model.model.10 | MaxPool1d    | 0     
13 | model.model.11 | Dropout      | 0     
14 | model.model.12 | Conv1d       | 393 K 
15 | model.model.13 | BatchNorm1d  | 1 K   
16 | model.model.14 | MaxPool1d    | 0     
17 | model.model.15 | AvgPool1d    | 0     
18 | model.model.16 | PermuteLayer | 0     
19 | model.model.17 | Linear       | 1 K   


Loading cached val data from /nfs/students/summer-term-2020/project-4/data/data_8k


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

Val-Acc=0.03793716656787196


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

Detected KeyboardInterrupt, attempting graceful shutdown...
[33m[W 2020-06-09 15:21:37,779][0m Setting status of trial#3 as TrialState.FAIL because of the following error: IndexError('list index out of range')
Traceback (most recent call last):
  File "/nfs/homedirs/herrmanp/miniconda3/envs/ml/lib/python3.8/site-packages/optuna/study.py", line 734, in _run_trial
    result = func(trial)
  File "<ipython-input-4-b7a8d267ec64>", line 30, in objective
    return metrics_callback.metrics[-1]["val_acc"]
IndexError: list index out of range[0m





IndexError: list index out of range