In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torchvision
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# manage ray's relative imports

import ray

from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
from ray.tune import JupyterNotebookReporter

# manage beams's relative imports
import sys
sys.path.append('..')

from src.beam import beam_arguments, Experiment, Study
from src.beam import UniversalDataset, UniversalBatchSampler
from src.beam import Algorithm
from src.beam import LinearNet
from torchvision import transforms
import matplotlib.pyplot as plt

from src.beam import DataTensor
from src.beam.utils import is_notebook
from examples.cifar10_example import Cifar10Network, CIFAR10Algorithm, CIFAR10Dataset
# from cifar10_example import Cifar10Network, CIFAR10Algorithm, CIFAR10Dataset

from ray.tune.suggest.hebo import HEBOSearch

In [3]:
# path_to_data = '/localdata/elads/data/datasets/cifar10'
# root_dir = '/localdata/elads/data/cifar10'

path_to_data = '/home/shared/data/dataset/cifar10'
root_dir = '/home/shared/data/results/cifar10'

## Training with a single worker

In [14]:
# here you put all actions which are performed only once before initializing the workers
# for example, setti`ng running arguments and experiment:

args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=1 --amp --lr-d=1e-2 --batch-size=512",
                      "--n-epochs=40 --clip-gradient=1000 --parallel=0 --accumulate=1 --no-deterministic",
                      "--weight-decay=3e-5 --beta1=0.9 --enable-tqdm --no-print-results", 
                      path_to_data=path_to_data, dropout=.0, activation='relu', channels=1024, label_smoothing=.2,
                      padding=6, gain=.2, turn_point=512, final_point=4096, minimal_gain=.05, temperature=0.05)

experiment = Experiment(args)

[32m2022-08-11 09:38:43[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 09:38:43[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0019_20220811_093843[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-08-11 09:38:44[0m | [1mINFO[0m | [1moverride: False[0m

In [16]:
alg = experiment.fit(CIFAR10Algorithm, CIFAR10Dataset, tensorboard_arguments={'images': {'sample': {'dataformats': 'NCHW'}}})

[32m2022-08-11 09:41:06[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 09:41:06[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 09:41:06[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:1[0m




[32m2022-08-11 09:41:40[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, Worker terminates[0m
[32m2022-08-11 09:41:40[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, reloads last checkpoint[0m
[32m2022-08-11 09:41:40[0m | [1mINFO[0m | [1mReload experiment from checkpoint: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0019_20220811_093843/checkpoints/checkpoint_000010[0m
[32m2022-08-11 09:41:40[0m | [1mINFO[0m | [1mLoading network state from: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0019_20220811_093843/checkpoints/checkpoint_000010[0m


## Reload experiment from path

In [6]:
experiment = Experiment.reload_from_path('/home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719')

[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mReload experiment from path: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mResuming existing experiment[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mreload: True[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mresume: 0001_20220718_103719[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m

continue training

In [7]:
alg = experiment.fit(CIFAR10Algorithm, CIFAR10Dataset, tensorboard_arguments={'images': {'sample': {'dataformats': 'NCHW'}}})

[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mLoading network state from: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719/checkpoints/checkpoint_000004[0m




[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mFinished epoch 4/40:[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mseconds:  5.869 | batches: 73 | samples:  3.738e+04 | batch_rate:  12.44 [iter/sec] | sample_rate:  6.368e+03 [iter/sec] [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mloss:        | avg: 590.8     | std: 14.87     | min: 563.8     | 25%: 580.4     | 50%: 588.5     | 75%: 601.9     | max: 627.2     [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1macc:         | avg: 0.8721    | std: 0.01807   | min: 0.8262    | 25%: 0.8594    | 50%: 0.8711    | 75%: 0.8848    | max: 0.9102    [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mlr:          | avg: 0.0005    | std: nan       | min: 0.0005    | 25%: 0.0005    | 50%: 0.0005    | 75%: 0.0005    | max: 0.0005    [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2

## Training with 2 workers

In [8]:
# here you put all actions which are performed only once before initializing the workers
# for example, setti`ng running arguments and experiment:

args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --half --lr-d=1e-3 --batch-size=512",
                      "--n-epochs=40 --clip-gradient=1000 --parallel=2 --accumulate=1 --no-deterministic",
                      "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9", 
                      path_to_data=path_to_data, dropout=.0, activation='relu', channels=512, label_smoothing=.2,
                      padding=6, gain=.2, turn_point=1024, final_point=3000, minimal_gain=.05, temperature=1)

experiment = Experiment(args)

[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1moverride: False[0m

In [9]:
alg = experiment.fit(CIFAR10Algorithm, CIFAR10Dataset, tensorboard_arguments={'images': {'sample': {'dataformats': 'NCHW'}}})

[32m2022-07-18 10:48:58[0m | [1mINFO[0m | [1mInitializing 2 parallel workers[0m
[32m2022-07-18 10:48:58[0m | [1mINFO[0m | [1mMultiprocessing port is: 57141[0m
[32m2022-07-18 10:49:02[0m | [1mINFO[0m | [1mWorker: 1/2 is running...[0m
[32m2022-07-18 10:49:02[0m | [1mINFO[0m | [1mWorker: 2/2 is running...[0m


train: 100%|██████████| 73/73 [00:13<00:00,  5.37it/s]

[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mFinished epoch 1/40:[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mseconds:  14.71 | batches: 73 | samples:  3.738e+04 | batch_rate:  4.964 [iter/sec] | sample_rate:  2.542e+03 [iter/sec] [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mloss:        | avg: 1.647e+03 | std: 991.6     | min: 1.017e+03 | 25%: 1.173e+03 | 50%: 1.307e+03 | 75%: 1.567e+03 | max: 6.148e+03 [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1macc:         | avg: 0.2553    | std: 0.06811   | min: 0.09375   | 25%: 0.1992    | 50%: 0.252     | 75%: 0.3066    | max: 0.4121    [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:12<00:00,  5.50it/s]

[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mFinished epoch 2/40:[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mseconds:  13.3 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.489 [iter/sec] | sample_rate:  2.81e+03 [iter/sec] [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mloss:        | avg: 950.7     | std: 62.33     | min: 832.0     | 25%: 900.0     | 50%: 946.0     | 75%: 992.0     | max: 1.14e+03  [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1macc:         | avg: 0.4747    | std: 0.06133   | min: 0.3281    | 25%: 0.4277    | 50%: 0.4805    | 75%: 0.5215    | max: 0.5938    [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mvalidation:[0m
[32m202


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:12<00:00,  5.50it/s]

[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mFinished epoch 3/40:[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mseconds:  13.27 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.502 [iter/sec] | sample_rate:  2.817e+03 [iter/sec] [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mloss:        | avg: 829.4     | std: 32.64     | min: 776.0     | 25%: 807.0     | 50%: 825.0     | 75%: 851.5     | max: 922.0     [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1macc:         | avg: 0.6103    | std: 0.03629   | min: 0.5215    | 25%: 0.582     | 50%: 0.6152    | 75%: 0.6367    | max: 0.6797    [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:11<00:00,  5.69it/s]

[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mFinished epoch 4/40:[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mseconds:  12.88 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.669 [iter/sec] | sample_rate:  2.903e+03 [iter/sec] [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mloss:        | avg: 777.1     | std: 24.64     | min: 727.0     | 25%: 763.0     | 50%: 774.0     | 75%: 791.0     | max: 859.0     [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1macc:         | avg: 0.6695    | std: 0.02791   | min: 0.5957    | 25%: 0.6504    | 50%: 0.6738    | 75%: 0.6895    | max: 0.7441    [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mlr:          | avg: 5.703e-05 | std: nan       | min: 5.703e-05 | 25%: 5.703e-05 | 50%: 5.703e-05 | 75%: 5.703e-05 | max: 5.703e-05 [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train:  21%|██        | 15/73 [00:01<00:11,  4.98it/s]

[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, reloads last checkpoint[0m


train:  22%|██▏       | 16/73 [00:02<00:12,  4.60it/s]

[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, Worker terminates[0m
[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, Worker terminates[0m
[32m2022-07-18 10:50:19[0m | [1mINFO[0m | [1mReload experiment from checkpoint: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847/checkpoints/checkpoint_000004[0m
[32m2022-07-18 10:50:19[0m | [1mINFO[0m | [1mLoading network state from: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847/checkpoints/checkpoint_000004[0m


In [14]:
predictions = alg.evaluate('test')

In [16]:
predictions.statistics['metrics']

defaultdict(list,
            {'precision': array([0.76832151, 0.92334495, 0.87545788, 0.53495441, 0.53638254,
                    0.5632    , 0.68841202, 0.85095541, 0.71588188, 0.77132806]),
             'recall': array([0.65 , 0.795, 0.239, 0.528, 0.774, 0.704, 0.802, 0.668, 0.897,
                    0.877]),
             'fscore': array([0.70422535, 0.85437937, 0.37549097, 0.53145445, 0.63364716,
                    0.62577778, 0.7408776 , 0.74845938, 0.79627164, 0.82077679]),
             'support': array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])})

In [17]:
predictions

KeyError: 0

## Show tensorboard

In [10]:
%load_ext tensorboard

In [11]:
%tensorboard --logdir /localdata/elads/data/cifar10/cifar10/CIFAR10Algorithm/debug/0147_20220706_122801 --port=13067 --bind_all

## Hyperparameter search with native optuna

In [52]:
args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --amp --lr-d=1e-3 --batch-size=512",
                      "--n-epochs=2 --clip-gradient=1000 --parallel=4 --accumulate=1 --no-deterministic",
                      "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9", 
                      path_to_data=path_to_data, dropout=.0, activation='relu', channels=512, label_smoothing=.2,
                      padding=6, gain=.2, turn_point=1024, final_point=3000, minimal_gain=.05, temperature=1)

study = Study(args, Alg=CIFAR10Algorithm, Dataset=CIFAR10Dataset)

[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1midentifier: debug_hp_optimization_20220811_133355[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1moverride: False[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mcpu_workers: 0[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mdevice: 0[0m
[32m2022-08-11 13:33:55[0m | [1mINFO[0m | [1mdevice_list: None[0m
[32m

In [46]:
def suggest(trial):
    lr = trial.suggest_loguniform("lr", 1e-3, 2e-2)
    print('My suggestion')
    print(lr)
    return {'lr_dense': lr}
    

In [None]:
summary = study.grid_search(explode_parameters={'lr-dense': [1e-3, 1e-4], "weight_decay": [1e-6, 1e-2]},  direction='maximize')

  lr-dense weight_decay
0    0.001     0.000001
0    0.001         0.01
0   0.0001     0.000001
0   0.0001         0.01


[32m[I 2022-08-11 13:39:14,041][0m A new study created in memory with name: cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_133355[0m


[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mNext Hyperparameter suggestion:[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mlr-dense: 0.001[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mweight_decay: 1e-06[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_133355/0002_20220811_133914[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 13:39:14[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:0[0m


  def dummy_suggest(trial):
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  self.add_trial(


In [47]:
summary = study.optuna(suggest, direction='maximize', n_jobs=1, n_trials=4)

[32m[I 2022-08-11 11:47:55,116][0m A new study created in memory with name: cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_114751[0m


My suggestion
0.0014562113817765687
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mNext Hyperparameter suggestion:[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mlr_dense: 0.0014562113817765687[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_114751/0000_20220811_114755[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 11:47:55[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:0[0m


[32m[I 2022-08-11 11:48:07,663][0m Trial 0 finished with value: 0.37508491847826086 and parameters: {'lr': 0.0014562113817765687}. Best is trial 0 with value: 0.37508491847826086.[0m


My suggestion
0.00544479618713628
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mNext Hyperparameter suggestion:[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mlr_dense: 0.00544479618713628[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_114751/0001_20220811_114807[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 11:48:07[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:0[0m


[32m[I 2022-08-11 11:48:21,547][0m Trial 1 finished with value: 0.10793138586956522 and parameters: {'lr': 0.00544479618713628}. Best is trial 0 with value: 0.37508491847826086.[0m


My suggestion
0.016051414950411205
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mNext Hyperparameter suggestion:[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mlr_dense: 0.016051414950411205[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_114751/0002_20220811_114821[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 11:48:21[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:0[0m


NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
NaN or Inf found in input tensor.
[32m[I 2022-08-11 11:48:40,182][0m Trial 2 finished with value: 0.10105298913043478 and parameters: {'lr': 0.016051414950411205}. Best is trial 0 with value: 0.37508491847826086.[0m


My suggestion
0.00314494444474515
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mNext Hyperparameter suggestion:[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mlr_dense: 0.00314494444474515[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_114751/0003_20220811_114840[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-08-11 11:48:40[0m | [1mINFO[0m | [1mWorker 1 will be running on device=cuda:0[0m


[32m[I 2022-08-11 11:48:58,701][0m Trial 3 finished with value: 0.14580502717391305 and parameters: {'lr': 0.00314494444474515}. Best is trial 0 with value: 0.37508491847826086.[0m


In [48]:
summary.get_trials()

[FrozenTrial(number=0, values=[0.37508491847826086], datetime_start=datetime.datetime(2022, 8, 11, 11, 47, 55, 119527), datetime_complete=datetime.datetime(2022, 8, 11, 11, 48, 7, 662923), params={'lr': 0.0014562113817765687}, distributions={'lr': LogUniformDistribution(high=0.02, low=0.001)}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.19964334239130435, 1: 0.37508491847826086}, trial_id=0, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=1, values=[0.10793138586956522], datetime_start=datetime.datetime(2022, 8, 11, 11, 48, 7, 665521), datetime_complete=datetime.datetime(2022, 8, 11, 11, 48, 21, 547694), params={'lr': 0.00544479618713628}, distributions={'lr': LogUniformDistribution(high=0.02, low=0.001)}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.08220108695652174, 1: 0.10793138586956522}, trial_id=1, state=TrialState.COMPLETE, value=None),
 FrozenTrial(number=2, values=[0.10105298913043478], datetime_start=datetime.datetime(2022, 8, 11, 11, 4

## Hyperparameter search with ray-tune and optuna

In [10]:
args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --amp --lr-d=1e-3 --batch-size=512",
                      "--n-epochs=2 --clip-gradient=1000 --parallel=0 --accumulate=1 --no-deterministic",
                      "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9", 
                      path_to_data=path_to_data, dropout=.0, activation='relu', channels=512, label_smoothing=.2,
                      padding=6, gain=.2, turn_point=1024, final_point=3000, minimal_gain=.05, temperature=1)

study = Study(args, Alg=CIFAR10Algorithm, Dataset=CIFAR10Dataset, print_results=False)

[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1midentifier: debug_hp_optimization_20220811_124353[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1moverride: False[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mcpu_workers: 0[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mdevice: 0[0m
[32m2022-08-11 12:43:53[0m | [1mINFO[0m | [1mdevice_list: None[0m
[32m

In [4]:
import inspect

In [5]:
inspect.getfile(CIFAR10Algorithm)

'/home/elad/docker/beamds/notebooks/../cifar10_example.py'

In [13]:
# hebo = HEBOSearch(metric="mean_accuracy", mode="max")

# analysis = study.tune(config={"lr_dense": tune.loguniform(1e-3, 2e-2),
#                               "weight_decay": tune.loguniform(1e-6, 1e-4),
#                               "gamma": tune.loguniform(.1, .9),
#                               "dropout": tune.uniform(0, .75),
#                               "scale_down": tune.uniform(0.4, .7),
#                               "scale_up": tune.uniform(0.9, 1.2),
#                               "ratio_down": tune.uniform(0.7, .95),
#                               "ratio_up": tune.uniform(1.05, 1.4),
#                               "channels": tune.choice([128, 256, 512]),
#                               "batch_size": tune.choice([512, 1024, 2048]),},
#                        metric="mean_accuracy",
#                        max_concurrent_trials=4,
#                        resources_per_trial={"gpu": 1},
#                        mode="max",
#                        search_alg=hebo,
#                       progress_reporter=JupyterNotebookReporter(overwrite=True),
#                        num_samples=400)

# runtime_env = {"working_dir": "../examples/" }
runtime_env = {"working_dir": ".." }


analysis = study.tune(config={"lr_dense": tune.loguniform(1e-4, 2e-2),
                              "weight_decay": tune.loguniform(1e-6, 1e-2),
                              "beta1": tune.loguniform(.85, .95),
                              "gain": tune.uniform(0.2, .2),
                              "temperature": tune.uniform(0.05, 2.),
                              "minimal_gain": tune.loguniform(.01, .1),
                              "channels": tune.choice([256, 512, 1024]),
                              "padding": tune.choice([4, 6, 8]),
                              "turn_point": tune.choice([256, 512, 1024]),
                              "final_point": tune.choice([2048, 2048*1.5, 2048*2]),
                              "batch_size": tune.choice([256, 512, 1024]),
                              "activation": tune.choice(['relu', 'celu', 'gelu']),},
                       metric="mean_accuracy",
                       max_concurrent_trials=8,
                       resources_per_trial={"gpu": 1},
                       mode="max",
                       search_alg=OptunaSearch(),
                      progress_reporter=JupyterNotebookReporter(overwrite=True),
                       num_samples=4, runtime_env=runtime_env)


Trial name,status,loc,activation,batch_size,beta1,channels,final_point,gain,lr_dense,minimal_gain,padding,temperature,turn_point,weight_decay,acc,iter,total time (s)
runner_tune_85504352,TERMINATED,172.17.0.7:24241,gelu,256,0.94688,256,4096,0.2,0.00113188,0.0112082,4,0.0706069,1024,3.45423e-06,0.78626,2,19.5684
runner_tune_8849ae72,TERMINATED,172.17.0.7:24279,celu,1024,0.944153,1024,2048,0.2,0.000663368,0.0851956,4,0.101959,256,0.000248386,0.551225,2,27.3594
runner_tune_884cae24,TERMINATED,172.17.0.7:24337,relu,512,0.858114,256,2048,0.2,0.000847372,0.0180671,8,0.565499,1024,0.00557459,0.438774,2,14.4375
runner_tune_949db2c2,TERMINATED,172.17.0.7:24387,gelu,1024,0.891607,1024,2048,0.2,0.000527338,0.0509203,8,1.35825,256,0.000871987,0.15696,2,26.5331


2022-08-11 12:46:55,332	INFO tune.py:701 -- Total run time: 68.65 seconds (68.53 seconds for the tuning loop).


In [18]:
import pandas as pd

In [38]:
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})

In [39]:
df_e = pd.DataFrame({'c': [11, 22], 'd': [33, 44]})

In [41]:
df = df.merge(df_e, how='cross')

In [43]:
[it[1].to_dict() for it in list(df.iterrows())]

[{'a': 1, 'b': 4, 'c': 11, 'd': 33},
 {'a': 1, 'b': 4, 'c': 22, 'd': 44},
 {'a': 2, 'b': 5, 'c': 11, 'd': 33},
 {'a': 2, 'b': 5, 'c': 22, 'd': 44},
 {'a': 3, 'b': 6, 'c': 11, 'd': 33},
 {'a': 3, 'b': 6, 'c': 22, 'd': 44}]

In [33]:
df['exp'] = pd.Series([{'c': 11, 'd': 22}] * len(df))

In [34]:
df

Unnamed: 0,a,b,exp
0,1,4,"{'c': 11, 'd': 22}"
1,2,5,"{'c': 11, 'd': 22}"
2,3,6,"{'c': 11, 'd': 22}"


In [31]:
df.explode('exp')

Unnamed: 0,a,b,exp
0,1,4,c
0,1,4,d
1,2,5,c
1,2,5,d
2,3,6,c
2,3,6,d


In [14]:
1+1

2

In [15]:
analysis.best_logdir

'/home/shared/data/results/cifar10/ray_results/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220811_124353/runner_tune_2022-08-11_12-45-46/runner_tune_85504352_1_activation=gelu,batch_size=256,beta1=0.94688,channels=256,final_point=4096,gain=0.2,lr_dense=0.0011319,mini_2022-08-11_12-45-48'

In [15]:
analysis.trial_dataframes

{'/root/ray_results/runner_tune_2022-08-11_12-18-52/runner_tune_c3099c74_1_activation=gelu,batch_size=256,beta1=0.88145,channels=256,final_point=2048,gain=0.2,lr_dense=0.0063656,mini_2022-08-11_12-18-53':    mean_accuracy  time_this_iter_s   done  timesteps_total  episodes_total  \
 0       0.173658         10.918643  False              NaN             NaN   
 1       0.103176          6.174871  False              NaN             NaN   
 
    training_iteration  trial_id                     experiment_id  \
 0                   1  c3099c74  5fe21e28835c4d31b14762d0575a5f10   
 1                   2  c3099c74  5fe21e28835c4d31b14762d0575a5f10   
 
                   date   timestamp  time_total_s    pid      hostname  \
 0  2022-08-11_12-19-08  1660220348     10.918643  14217  217a4b387abd   
 1  2022-08-11_12-19-14  1660220354     17.093514  14217  217a4b387abd   
 
       node_ip  time_since_restore  timesteps_since_restore  \
 0  172.17.0.7           10.918643                        