In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torchvision
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# manage ray's relative imports

import ray
# Package the parent directory as Ray's working directory so that the remote
# workers see the same project files as this notebook.
runtime_env = dict(working_dir="..")
# Start Ray with its dashboard enabled on port 13065, listening on 0.0.0.0.
ray.init(
    runtime_env=runtime_env,
    include_dashboard=True,
    dashboard_host="0.0.0.0",
    dashboard_port=13065,
)

from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
from ray.tune import JupyterNotebookReporter

# manage beam's relative imports
import sys
sys.path.append('..')

from src.beam import beam_arguments, Experiment, Study
from src.beam import UniversalDataset, UniversalBatchSampler
from src.beam import Algorithm
from src.beam import LinearNet
from torchvision import transforms
import matplotlib.pyplot as plt

from src.beam import DataTensor
from src.beam.utils import is_notebook
from cifar10_example import Cifar10Network, CIFAR10Algorithm, CIFAR10Dataset

from ray.tune.suggest.hebo import HEBOSearch

2022-07-18 10:37:17,586	INFO services.py:1456 -- View the Ray dashboard at [1m[32mhttp://172.17.0.5:13065[39m[22m
2022-07-18 10:37:18,164	INFO packaging.py:388 -- Creating a file package for local directory '..'.
2022-07-18 10:37:18,290	INFO packaging.py:241 -- Pushing file package 'gcs://_ray_pkg_6c539cca0b885c24.zip' (9.95MiB) to Ray cluster...
2022-07-18 10:37:18,435	INFO packaging.py:243 -- Successfully pushed file package 'gcs://_ray_pkg_6c539cca0b885c24.zip'.


In [2]:
# Alternative storage locations, currently disabled:
# path_to_data = '/localdata/elads/data/datasets/cifar10'
# root_dir = '/localdata/elads/data/cifar10'

# Handed to beam_arguments as the `path_to_data` keyword in the cells below.
path_to_data = '/home/shared/data/dataset/cifar10'
# Interpolated into the --root-dir option; experiment directories are created beneath it.
root_dir = '/home/shared/data/results/cifar10'

## Training with a single worker

In [3]:
# One-time setup that has to happen before any worker is initialized:
# build the run arguments and create the experiment from them.

args = beam_arguments(
    # command-line style options, passed as separate string chunks
    f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=1 --amp --lr-d=1e-2 --batch-size=512",
    "--n-epochs=40 --clip-gradient=1000 --parallel=0 --accumulate=1 --no-deterministic",
    "--weight-decay=3e-5 --beta1=0.9",
    # additional hyperparameters supplied as keywords
    path_to_data=path_to_data,
    dropout=0.0,
    activation='relu',
    channels=1024,
    label_smoothing=0.2,
    padding=6,
    gain=0.2,
    turn_point=512,
    final_point=4096,
    minimal_gain=0.05,
    temperature=0.05,
)

experiment = Experiment(args)

[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-07-18 10:37:19[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-07-18 10:37:20[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-07-18 10:37:20[0m | [1mINFO[0m | [1moverride: False[0m

In [4]:
# Fit the CIFAR10 algorithm on the CIFAR10 dataset and keep the returned
# object in `alg`. The nested dict is passed through as tensorboard_arguments
# and sets dataformats='NCHW' for the 'sample' images entry.
alg = experiment.fit(
    CIFAR10Algorithm,
    CIFAR10Dataset,
    tensorboard_arguments={
        'images': {'sample': {'dataformats': 'NCHW'}},
    },
)

[32m2022-07-18 10:37:20[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-07-18 10:37:20[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m




train:   1%|1         | 1/73 [00:00<?, ?it/s]

[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mFinished epoch 1/40:[0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mseconds:  7.83 | batches: 73 | samples:  3.738e+04 | batch_rate:  9.323 [iter/sec] | sample_rate:  4.773e+03 [iter/sec] [0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mloss:        | avg: 930.3     | std: 113.4     | min: 776.4     | 25%: 840.4     | 50%: 897.0     | 75%: 1.001e+03 | max: 1.181e+03 [0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1macc:         | avg: 0.4534    | std: 0.1464    | min: 0.0625    | 25%: 0.3613    | 50%: 0.4844    | 75%: 0.5566    | max: 0.668     [0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mlr:          | avg: 0.0005    | std: nan       | min: 0.0005    | 25%: 0.0005    | 50%: 0.0005    | 75%: 0.0005    | max: 0.0005    [0m
[32m2022-07-18 10:37:34[0m | [1mINFO[0m | [1mvalidation:[0m
[32m20

## Reload experiment from path

In [6]:
# Rebuild the Experiment object from the directory written by the
# single-worker run above.
experiment = Experiment.reload_from_path(
    '/home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719'
)

[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mReload experiment from path: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mResuming existing experiment[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mreload: True[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m | [1mresume: 0001_20220718_103719[0m
[32m2022-07-18 10:38:57[0m | [1mINFO[0m

Continue training the reloaded experiment from its last saved checkpoint.

In [7]:
# Call fit again on the reloaded experiment; the recorded log shows the
# network state being loaded from the saved checkpoint before training resumes.
alg = experiment.fit(
    CIFAR10Algorithm,
    CIFAR10Dataset,
    tensorboard_arguments={
        'images': {'sample': {'dataformats': 'NCHW'}},
    },
)

[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mSingle worker mode[0m
[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mWorker: 1/1 is running...[0m
[32m2022-07-18 10:39:12[0m | [1mINFO[0m | [1mLoading network state from: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220718_103719/checkpoints/checkpoint_000004[0m




[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mFinished epoch 4/40:[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mseconds:  5.869 | batches: 73 | samples:  3.738e+04 | batch_rate:  12.44 [iter/sec] | sample_rate:  6.368e+03 [iter/sec] [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mloss:        | avg: 590.8     | std: 14.87     | min: 563.8     | 25%: 580.4     | 50%: 588.5     | 75%: 601.9     | max: 627.2     [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1macc:         | avg: 0.8721    | std: 0.01807   | min: 0.8262    | 25%: 0.8594    | 50%: 0.8711    | 75%: 0.8848    | max: 0.9102    [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mlr:          | avg: 0.0005    | std: nan       | min: 0.0005    | 25%: 0.0005    | 50%: 0.0005    | 75%: 0.0005    | max: 0.0005    [0m
[32m2022-07-18 10:39:19[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2

## Training with 2 workers

In [8]:
# One-time setup that has to happen before the workers are initialized:
# build the run arguments (this time with --parallel=2) and create the experiment.

args = beam_arguments(
    # command-line style options, passed as separate string chunks
    f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --half --lr-d=1e-3 --batch-size=512",
    "--n-epochs=40 --clip-gradient=1000 --parallel=2 --accumulate=1 --no-deterministic",
    "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9",
    # additional hyperparameters supplied as keywords
    path_to_data=path_to_data,
    dropout=0.0,
    activation='relu',
    channels=512,
    label_smoothing=0.2,
    padding=6,
    gain=0.2,
    turn_point=1024,
    final_point=3000,
    minimal_gain=0.05,
    temperature=1,
)

experiment = Experiment(args)

[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mCreating new experiment[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mbeam project: cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mExperiment Hyperparameters[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mproject_name: cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1midentifier: debug[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mmp_port: random[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mroot_dir: /home/shared/data/results/cifar10[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mreload: False[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1mresume: -1[0m
[32m2022-07-18 10:48:47[0m | [1mINFO[0m | [1moverride: False[0m

In [9]:
# Fit with the two-worker experiment configured in the previous cell; the
# returned object is kept in `alg` for the evaluation cells below.
alg = experiment.fit(
    CIFAR10Algorithm,
    CIFAR10Dataset,
    tensorboard_arguments={
        'images': {'sample': {'dataformats': 'NCHW'}},
    },
)

[32m2022-07-18 10:48:58[0m | [1mINFO[0m | [1mInitializing 2 parallel workers[0m
[32m2022-07-18 10:48:58[0m | [1mINFO[0m | [1mMultiprocessing port is: 57141[0m
[32m2022-07-18 10:49:02[0m | [1mINFO[0m | [1mWorker: 1/2 is running...[0m
[32m2022-07-18 10:49:02[0m | [1mINFO[0m | [1mWorker: 2/2 is running...[0m


train: 100%|██████████| 73/73 [00:13<00:00,  5.37it/s]

[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mFinished epoch 1/40:[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mseconds:  14.71 | batches: 73 | samples:  3.738e+04 | batch_rate:  4.964 [iter/sec] | sample_rate:  2.542e+03 [iter/sec] [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mloss:        | avg: 1.647e+03 | std: 991.6     | min: 1.017e+03 | 25%: 1.173e+03 | 50%: 1.307e+03 | 75%: 1.567e+03 | max: 6.148e+03 [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1macc:         | avg: 0.2553    | std: 0.06811   | min: 0.09375   | 25%: 0.1992    | 50%: 0.252     | 75%: 0.3066    | max: 0.4121    [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:26[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:12<00:00,  5.50it/s]

[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mFinished epoch 2/40:[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mseconds:  13.3 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.489 [iter/sec] | sample_rate:  2.81e+03 [iter/sec] [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mloss:        | avg: 950.7     | std: 62.33     | min: 832.0     | 25%: 900.0     | 50%: 946.0     | 75%: 992.0     | max: 1.14e+03  [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1macc:         | avg: 0.4747    | std: 0.06133   | min: 0.3281    | 25%: 0.4277    | 50%: 0.4805    | 75%: 0.5215    | max: 0.5938    [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:40[0m | [1mINFO[0m | [1mvalidation:[0m
[32m202


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:12<00:00,  5.50it/s]

[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mFinished epoch 3/40:[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mseconds:  13.27 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.502 [iter/sec] | sample_rate:  2.817e+03 [iter/sec] [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mloss:        | avg: 829.4     | std: 32.64     | min: 776.0     | 25%: 807.0     | 50%: 825.0     | 75%: 851.5     | max: 922.0     [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1macc:         | avg: 0.6103    | std: 0.03629   | min: 0.5215    | 25%: 0.582     | 50%: 0.6152    | 75%: 0.6367    | max: 0.6797    [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mlr:          | avg: 5e-05     | std: nan       | min: 5e-05     | 25%: 5e-05     | 50%: 5e-05     | 75%: 5e-05     | max: 5e-05     [0m
[32m2022-07-18 10:49:55[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train: 100%|██████████| 73/73 [00:11<00:00,  5.69it/s]

[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1m[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mFinished epoch 4/40:[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mtrain:[0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mseconds:  12.88 | batches: 73 | samples:  3.738e+04 | batch_rate:  5.669 [iter/sec] | sample_rate:  2.903e+03 [iter/sec] [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mloss:        | avg: 777.1     | std: 24.64     | min: 727.0     | 25%: 763.0     | 50%: 774.0     | 75%: 791.0     | max: 859.0     [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1macc:         | avg: 0.6695    | std: 0.02791   | min: 0.5957    | 25%: 0.6504    | 50%: 0.6738    | 75%: 0.6895    | max: 0.7441    [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mlr:          | avg: 5.703e-05 | std: nan       | min: 5.703e-05 | 25%: 5.703e-05 | 50%: 5.703e-05 | 75%: 5.703e-05 | max: 5.703e-05 [0m
[32m2022-07-18 10:50:09[0m | [1mINFO[0m | [1mvalidation:[0m
[32m2


train:   8%|▊         | 6/73 [00:00<?, ?it/s]
train:  21%|██        | 15/73 [00:01<00:11,  4.98it/s]

[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, reloads last checkpoint[0m


train:  22%|██▏       | 16/73 [00:02<00:12,  4.60it/s]

[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, Worker terminates[0m
[32m2022-07-18 10:50:13[0m | [31m[1mERROR[0m | [31m[1mKeyboardInterrupt: Training was interrupted, Worker terminates[0m
[32m2022-07-18 10:50:19[0m | [1mINFO[0m | [1mReload experiment from checkpoint: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847/checkpoints/checkpoint_000004[0m
[32m2022-07-18 10:50:19[0m | [1mINFO[0m | [1mLoading network state from: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0002_20220718_104847/checkpoints/checkpoint_000004[0m


In [14]:
# Evaluate the fitted algorithm with the 'test' argument (presumably the test
# subset) and keep the returned object for the inspection cells below.
predictions = alg.evaluate('test')

In [16]:
# Display the 'metrics' entry of the evaluation statistics; in the recorded run
# it holds per-class precision, recall, fscore and support arrays.
predictions.statistics['metrics']

defaultdict(list,
            {'precision': array([0.76832151, 0.92334495, 0.87545788, 0.53495441, 0.53638254,
                    0.5632    , 0.68841202, 0.85095541, 0.71588188, 0.77132806]),
             'recall': array([0.65 , 0.795, 0.239, 0.528, 0.774, 0.704, 0.802, 0.668, 0.897,
                    0.877]),
             'fscore': array([0.70422535, 0.85437937, 0.37549097, 0.53145445, 0.63364716,
                    0.62577778, 0.7408776 , 0.74845938, 0.79627164, 0.82077679]),
             'support': array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])})

In [17]:
# NOTE(review): in the recorded run, displaying this object ended with
# `KeyError: 0`. Until the cause is confirmed, inspect specific fields
# (e.g. predictions.statistics) rather than the bare object.
predictions

KeyError: 0

## Show TensorBoard

In [10]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [11]:
%tensorboard --logdir /localdata/elads/data/cifar10/cifar10/CIFAR10Algorithm/debug/0147_20220706_122801 --port=13067 --bind_all

## Hyperparameter search with native Optuna

In [None]:
# Arguments shared by every trial of the native-Optuna search.
args = beam_arguments(
    f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --half --lr-d=1e-3 --batch-size=512",
    "--n-epochs=40 --clip-gradient=1000 --parallel=4 --accumulate=1 --no-deterministic",
    "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9",
    path_to_data=path_to_data,
    dropout=.0,
    activation='relu',
    channels=512,
    label_smoothing=.2,
    padding=6,
    gain=.2,
    turn_point=1024,
    final_point=3000,
    minimal_gain=.05,
    temperature=1,
)

# `cifar10_algorithm_generator` is not defined or imported anywhere in this
# notebook, so the earlier `Study(cifar10_algorithm_generator, args)` call
# raised NameError on a fresh kernel. Build the Study from the imported
# algorithm and dataset classes instead, matching the Ray Tune section below.
# NOTE(review): confirm that study.optuna() accepts a Study built this way.
study = Study(args, Alg=CIFAR10Algorithm, Dataset=CIFAR10Dataset)

In [14]:
def suggest(trial):
    """Propose the hyperparameters for a single Optuna trial.

    Samples a learning rate log-uniformly from [1e-3, 2e-2] under the Optuna
    parameter name "lr", prints it to stdout, and returns it keyed as
    'lr_dense'.

    Args:
        trial: Optuna trial object used to draw the sample.

    Returns:
        dict: {'lr_dense': <sampled learning rate>}.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and draws from the same log-uniform distribution.
    lr = trial.suggest_float("lr", 1e-3, 2e-2, log=True)
    print('My suggestion')
    print(lr)
    return {'lr_dense': lr}
    

In [15]:
# Run the native Optuna search with `suggest` as the sampler callback:
# 10 trials, a single job, maximizing the objective.
study.optuna(
    suggest,
    n_trials=10,
    n_jobs=1,
    direction='maximize',
)

[32m[I 2022-06-20 08:46:21,221][0m A new study created in memory with name: cifar10/CIFAR10Algorithm/debug_hp_optimization_20220620_084618[0m


My suggestion
0.009781018192999246
[32m2022-06-20 08:46:21.224[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrunner_optuna[0m:[36m114[0m - [1mNext Hyperparameter suggestion:[0m
[32m2022-06-20 08:46:21.225[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrunner_optuna[0m:[36m116[0m - [1mlr_dense: 0.009781018192999246[0m
[32m2022-06-20 08:46:21.229[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m262[0m - [1mCreating new experiment[0m
[32m2022-06-20 08:46:21.229[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m277[0m - [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220620_084618/0000_20220620_084621[0m
[32m2022-06-20 08:46:21.239[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun[0m:[36m554[0m - [1mSingle worker mode[0m
[32m2022-06-20 08:46:21.240[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun

KeyboardInterrupt: 

## Hyperparameter search with Ray Tune and Optuna

In [3]:
# Base arguments for the Ray Tune search; values sampled by the tuner are
# expected to override the matching entries per trial.
args = beam_arguments(
    # command-line style options, passed as separate string chunks
    f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --amp --lr-d=1e-3 --batch-size=512",
    "--n-epochs=40 --clip-gradient=1000 --parallel=0 --accumulate=1 --no-deterministic",
    "--weight-decay=1e-5 --beta1=0.9 --beta2=0.9",
    # additional hyperparameters supplied as keywords
    path_to_data=path_to_data,
    dropout=0.0,
    activation='relu',
    channels=512,
    label_smoothing=0.2,
    padding=6,
    gain=0.2,
    turn_point=1024,
    final_point=3000,
    minimal_gain=0.05,
    temperature=1,
)

# The study wraps the arguments together with the algorithm and dataset classes.
study = Study(
    args,
    Alg=CIFAR10Algorithm,
    Dataset=CIFAR10Dataset,
    print_results=True,
)

[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mHyperparameter Optimization[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mbeam project: cifar10[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mExperiment Hyperparameters[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mproject_name: cifar10[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1malgorithm: CIFAR10Algorithm[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1midentifier: debug_hp_optimization_20220706_162020[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mmp_port: random[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mroot_dir: /localdata/elads/data/cifar10[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mreload: False[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mresume: -1[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1moverride: False[0m
[32m2022-07-06 16:20:20[0m | [1mINFO    [0m | [1mcpu_workers: 0[0m
[32m2022-07-06

In [None]:
# Alternative search using HEBO, kept disabled for reference:
# hebo = HEBOSearch(metric="mean_accuracy", mode="max")

# analysis = study.tune(config={"lr_dense": tune.loguniform(1e-3, 2e-2),
#                               "weight_decay": tune.loguniform(1e-6, 1e-4),
#                               "gamma": tune.loguniform(.1, .9),
#                               "dropout": tune.uniform(0, .75),
#                               "scale_down": tune.uniform(0.4, .7),
#                               "scale_up": tune.uniform(0.9, 1.2),
#                               "ratio_down": tune.uniform(0.7, .95),
#                               "ratio_up": tune.uniform(1.05, 1.4),
#                               "channels": tune.choice([128, 256, 512]),
#                               "batch_size": tune.choice([512, 1024, 2048]),},
#                        metric="mean_accuracy",
#                        max_concurrent_trials=4,
#                        resources_per_trial={"gpu": 1},
#                        mode="max",
#                        search_alg=hebo,
#                       progress_reporter=JupyterNotebookReporter(overwrite=True),
#                        num_samples=400)


# Ray Tune search driven by Optuna: 400 samples, at most 8 trials running
# concurrently, one GPU per trial, maximizing "mean_accuracy".
analysis = study.tune(
    config={
        "lr_dense": tune.loguniform(1e-4, 2e-2),
        "weight_decay": tune.loguniform(1e-6, 1e-2),
        "beta1": tune.loguniform(.85, .95),
        # degenerate range: both bounds are 0.2, so gain is effectively fixed
        "gain": tune.uniform(0.2, .2),
        "temperature": tune.uniform(0.05, 2.),
        "minimal_gain": tune.loguniform(.01, .1),
        "channels": tune.choice([256, 512, 1024]),
        "padding": tune.choice([4, 6, 8]),
        "turn_point": tune.choice([256, 512, 1024]),
        # All choices are integers, like the final_point values passed to
        # beam_arguments elsewhere in this notebook; the former `2048*1.5`
        # evaluated to the float 3072.0.
        "final_point": tune.choice([2048, 3072, 4096]),
        "batch_size": tune.choice([256, 512, 1024]),
        "activation": tune.choice(['relu', 'celu', 'gelu']),
    },
    metric="mean_accuracy",
    max_concurrent_trials=8,
    resources_per_trial={"gpu": 1},
    mode="max",
    search_alg=OptunaSearch(),
    # overwrite=True refreshes the status table in place in the notebook
    progress_reporter=JupyterNotebookReporter(overwrite=True),
    num_samples=400,
)


Trial name,status,loc,activation,batch_size,beta1,channels,final_point,gain,lr_dense,minimal_gain,padding,temperature,turn_point,weight_decay,acc,iter,total time (s)
runner_tune_ec1791a4,RUNNING,172.17.0.2:39938,relu,512,0.928183,1024,4096,0.2,0.00671536,0.0357185,4,0.062858,512,4.63743e-05,0.934482,35.0,454.552
runner_tune_3d2a6602,RUNNING,172.17.0.2:40084,relu,512,0.911644,1024,4096,0.2,0.0071578,0.0349454,4,0.0630833,512,3.41031e-05,0.930131,25.0,308.486
runner_tune_93f28ee2,RUNNING,172.17.0.2:40184,relu,512,0.911794,1024,4096,0.2,0.011317,0.0469718,4,0.0575666,512,2.46689e-05,0.925337,17.0,217.496
runner_tune_c8115550,RUNNING,172.17.0.2:40344,relu,256,0.927557,1024,4096,0.2,0.0106841,0.0470998,4,0.0569684,512,3.21594e-05,0.763281,2.0,34.0246
runner_tune_38247390,PENDING,,relu,512,0.927516,1024,4096,0.2,0.00689768,0.0473985,4,0.0550966,512,2.76028e-05,,,
runner_tune_c944283c,TERMINATED,172.17.0.2:23692,celu,256,0.904816,256,2048,0.2,0.0099276,0.0735093,8,1.58846,512,3.5821e-06,0.880295,40.0,204.667
runner_tune_cc0e4228,TERMINATED,172.17.0.2:23729,relu,512,0.883576,512,2048,0.2,0.000335789,0.01734,4,0.221518,256,6.57065e-05,0.908114,40.0,233.51
runner_tune_cc12cf64,TERMINATED,172.17.0.2:23731,gelu,1024,0.933654,512,4096,0.2,0.00266318,0.014136,6,1.3378,512,0.00157602,0.910742,40.0,230.826
runner_tune_cc172366,TERMINATED,172.17.0.2:23733,gelu,512,0.943031,1024,2048,0.2,0.000101818,0.0812117,6,1.14841,1024,2.45449e-06,0.872603,40.0,497.553
runner_tune_cc1b3c94,TERMINATED,172.17.0.2:24077,relu,256,0.856223,256,4096,0.2,0.00219286,0.0769723,4,0.668636,512,0.00776539,0.907031,40.0,188.601


In [None]:
# NOTE(review): leftover scratch expression with no role in the workflow; safe to delete.
1+1