In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to the project sources are picked up without a restart.
%load_ext autoreload
%autoreload 2

import torch
import torchvision
import torch.nn.functional as F
from torch import nn
from sklearn.metrics import precision_recall_fscore_support
import numpy as np

# manage ray's relative imports:
# the runtime_env working_dir packages the parent directory ('..') and pushes it
# to the Ray cluster, so the workers used by the ray-tune search further down
# can resolve the project's modules.
import ray
runtime_env = {"working_dir": ".." }
# ignore_reinit_error=True keeps this cell re-runnable: without it, executing
# ray.init() a second time in the same kernel raises instead of reusing the
# instance that is already running.
ray.init(runtime_env=runtime_env, dashboard_port=13065, include_dashboard=True,
         ignore_reinit_error=True)

from ray import tune
from ray.tune.suggest.optuna import OptunaSearch
from ray.tune import JupyterNotebookReporter

# manage beam's relative imports: make the parent directory importable so that
# the `src.beam` package imported below can be resolved from this notebook.
import sys
# Guard against duplicates: re-running this cell must not keep growing sys.path.
if '..' not in sys.path:
    sys.path.append('..')

from src.beam import beam_arguments, Experiment, Study
from src.beam import UniversalDataset, UniversalBatchSampler
from src.beam import Algorithm
from src.beam import LinearNet
from torchvision import transforms
import matplotlib.pyplot as plt

from src.beam import DataTensor
from src.beam.utils import is_notebook
from cifar10_example import cifar10_algorithm_generator, Cifar10Network

from ray.tune.suggest.hebo import HEBOSearch

2022-06-21 11:59:31,313	INFO services.py:1456 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:13065[39m[22m
2022-06-21 11:59:31,818	INFO packaging.py:388 -- Creating a file package for local directory '..'.
2022-06-21 11:59:31,841	INFO packaging.py:241 -- Pushing file package 'gcs://_ray_pkg_6ed689020eaf0264.zip' (0.78MiB) to Ray cluster...
2022-06-21 11:59:31,856	INFO packaging.py:243 -- Successfully pushed file package 'gcs://_ray_pkg_6ed689020eaf0264.zip'.


In [2]:
# Alternative locations (disabled):
# path_to_data = '/localdata/elads/data/datasets/cifar10'
# root_dir = '/localdata/elads/data/cifar10'

# path_to_data: forwarded to beam_arguments(path_to_data=...); presumably where the CIFAR-10 data lives.
# root_dir: interpolated into the --root-dir flag; the experiment directories reported in the logs are created beneath it.
# NOTE(review): both are absolute, machine-specific paths - adjust them before running on another host.
path_to_data = '/home/shared/data/dataset/cifar10'
root_dir = '/home/shared/data/results/cifar10'

## Training with a single worker

In [5]:
# Put here every action that is performed only once, before the workers are initialized,
# for example setting the run arguments and creating the experiment.
#
# beam_arguments receives command-line style flag strings followed by additional
# hyperparameters as keyword arguments. This configuration is a short run:
# --n-epochs=4, --device=1, --parallel=1.

args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=1 --half --lr-d=1e-4 --batch-size=512",
                      "--n-epochs=4 --clip=0 --parallel=1 --accumulate=1 --cudnn-benchmark",
                      "--weight-decay=.00256 --beta1=0.9 --beta2=0.9", 
                      path_to_data=path_to_data, gamma=1., dropout=.0, activation='celu', channels=512,
                      padding=4, gain=.4, turn_point=500, final_point=3000, minimal_gain=.02)

# The Experiment object is built from the parsed arguments; calling it (next code cell) runs the training.
experiment = Experiment(args)

[32m2022-06-21 11:59:53.684[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m184[0m - [1mbeam project: cifar10[0m
[32m2022-06-21 11:59:53.685[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m185[0m - [1mExperiment Hyperparameters[0m
[32m2022-06-21 11:59:53.685[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m190[0m - [1mproject_name: cifar10[0m
[32m2022-06-21 11:59:53.686[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m190[0m - [1midentifier: debug[0m
[32m2022-06-21 11:59:53.686[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m190[0m - [1malgorithm: CIFAR10Algorithm[0m
[32m2022-06-21 11:59:53.687[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m190[0m - [1mmp_port: None[0m
[32m2022-06-21 11:59:53.687[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m190[0m - [1mroot_

## Train with single or multiple workers

In [6]:
# Calling the experiment with the algorithm generator runs the training (the log
# below reports single worker mode and 4 epochs) and returns the algorithm object
# that the following cell evaluates on the test subset.
# tensorboard_arguments: presumably tells the logger that the 'sample' images are NCHW - TODO confirm
alg = experiment(cifar10_algorithm_generator, tensorboard_arguments={'images': {'sample': {'dataformats': 'NCHW'}}})

[32m2022-06-21 11:59:55.760[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun[0m:[36m554[0m - [1mSingle worker mode[0m
[32m2022-06-21 11:59:55.761[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun_worker[0m:[36m46[0m - [1mWorker: 1/1 is running...[0m


train:   0%|          | 0/73 [00:00<?, ?it/s]

validation:   0%|          | 0/22 [00:00<?, ?it/s]


[32m2022-06-21 11:59:59.544[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36msave_model_results[0m:[36m398[0m - [1mFinished epoch 1/4:[0m
[32m2022-06-21 11:59:59.547[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mtrain:[0m
[32m2022-06-21 11:59:59.548[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 1104.08 	|[0m
[32m2022-06-21 11:59:59.549[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1macc 0.290909 	|[0m
[32m2022-06-21 11:59:59.550[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mlr 5.84e-06 	|[0m
[32m2022-06-21 11:59:59.551[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mvalidation:[0m
[32m2022-06-21 11:59:59.552[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 1033.11 	|[0m
[32m2022-06-21 11:

train:   0%|          | 0/73 [00:00<?, ?it/s]

validation:   0%|          | 0/22 [00:00<?, ?it/s]


[32m2022-06-21 12:00:04.401[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36msave_model_results[0m:[36m398[0m - [1mFinished epoch 2/4:[0m
[32m2022-06-21 12:00:04.409[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mtrain:[0m
[32m2022-06-21 12:00:04.410[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 977.63 	|[0m
[32m2022-06-21 12:00:04.411[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1macc 0.425969 	|[0m
[32m2022-06-21 12:00:04.412[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mlr 1.168e-05 	|[0m
[32m2022-06-21 12:00:04.413[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mvalidation:[0m
[32m2022-06-21 12:00:04.414[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 924.091 	|[0m
[32m2022-06-21 12:

train:   0%|          | 0/73 [00:00<?, ?it/s]

validation:   0%|          | 0/22 [00:00<?, ?it/s]


[32m2022-06-21 12:00:09.629[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36msave_model_results[0m:[36m398[0m - [1mFinished epoch 3/4:[0m
[32m2022-06-21 12:00:09.635[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mtrain:[0m
[32m2022-06-21 12:00:09.636[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 882.582 	|[0m
[32m2022-06-21 12:00:09.639[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1macc 0.534675 	|[0m
[32m2022-06-21 12:00:09.640[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mlr 1.752e-05 	|[0m
[32m2022-06-21 12:00:09.641[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mvalidation:[0m
[32m2022-06-21 12:00:09.642[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 841.068 	|[0m
[32m2022-06-21 12

train:   0%|          | 0/73 [00:00<?, ?it/s]

validation:   0%|          | 0/22 [00:00<?, ?it/s]


[32m2022-06-21 12:00:14.072[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36msave_model_results[0m:[36m398[0m - [1mFinished epoch 4/4:[0m
[32m2022-06-21 12:00:14.085[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mtrain:[0m
[32m2022-06-21 12:00:14.086[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 807.829 	|[0m
[32m2022-06-21 12:00:14.087[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1macc 0.62286 	|[0m
[32m2022-06-21 12:00:14.088[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mlr 2.336e-05 	|[0m
[32m2022-06-21 12:00:14.089[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m444[0m - [1mvalidation:[0m
[32m2022-06-21 12:00:14.089[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mlog_data[0m:[36m448[0m - [1mloss 797.955 	|[0m
[32m2022-06-21 12:

In [7]:
# Average the 'acc' scalar values returned by evaluating the trained algorithm on the 'test' subset
np.mean(alg.evaluate('test')['scalar']['acc'])

test:   0%|          | 0/20 [00:00<?, ?it/s]

0.6234949439764023

## Show tensorboard

In [8]:
# Load the TensorBoard notebook extension, which provides the %tensorboard magic used below
%load_ext tensorboard

In [11]:
# NOTE(review): --logdir is hard-coded to one specific run directory (0001_20220621_115953);
# point it at the directory of the experiment you want to inspect before re-running.
%tensorboard --logdir /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug/0001_20220621_115953 --port=17066 --bind_all

## Hyperparameter search with native optuna

In [None]:
# Base arguments for the native optuna study: --n-epochs=40 with explicit epoch
# lengths (--epoch-length-train=50000, --epoch-length-eval=10000).
# NOTE(review): this call passes scale_down/scale_up/ratio_down/ratio_up, whereas the
# other beam_arguments calls in this notebook pass padding/gain/turn_point/final_point/
# minimal_gain - confirm that cifar10_algorithm_generator still reads these keywords.
args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=1 --half --lr-d=1e-4 --batch-size=512",
                      "--n-epochs=40 --epoch-length-train=50000 --epoch-length-eval=10000 --clip=0 --parallel=1 --accumulate=1 --cudnn-benchmark",
                      "--weight-decay=.00256 --beta1=0.9 --beta2=0.9", 
                      path_to_data=path_to_data, gamma=1., dropout=.0, activation='celu', channels=512,
                      scale_down=.7, scale_up=1.4, ratio_down=.7, ratio_up=1.4)

# Study is built from the algorithm generator and the base arguments; the search itself is started by study.optuna(...) below.
study = Study(cifar10_algorithm_generator, args)

In [14]:
def suggest(trial):
    """Sample the hyperparameters for a single optuna trial.

    Args:
        trial: the optuna trial object supplied by the study for this trial.

    Returns:
        dict: maps the hyperparameter name 'lr_dense' to the sampled learning rate.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform; it draws from the same log-uniform range [1e-3, 2e-2].
    lr = trial.suggest_float("lr", 1e-3, 2e-2, log=True)
    print('My suggestion')
    print(lr)
    return {'lr_dense': lr}
    

In [15]:
# Start the native optuna search: 10 trials with n_jobs=1, direction 'maximize';
# `suggest` supplies the hyperparameters of each trial.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e. the run was stopped manually.
study.optuna(suggest, direction='maximize', n_jobs=1, n_trials=10)

[32m[I 2022-06-20 08:46:21,221][0m A new study created in memory with name: cifar10/CIFAR10Algorithm/debug_hp_optimization_20220620_084618[0m


My suggestion
0.009781018192999246
[32m2022-06-20 08:46:21.224[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrunner_optuna[0m:[36m114[0m - [1mNext Hyperparameter suggestion:[0m
[32m2022-06-20 08:46:21.225[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrunner_optuna[0m:[36m116[0m - [1mlr_dense: 0.009781018192999246[0m
[32m2022-06-20 08:46:21.229[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m262[0m - [1mCreating new experiment[0m
[32m2022-06-20 08:46:21.229[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m277[0m - [1mExperiment directory is: /home/shared/data/results/cifar10/cifar10/CIFAR10Algorithm/debug_hp_optimization_20220620_084618/0000_20220620_084621[0m
[32m2022-06-20 08:46:21.239[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun[0m:[36m554[0m - [1mSingle worker mode[0m
[32m2022-06-20 08:46:21.240[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36mrun

KeyboardInterrupt: 

## Hyperparameter search with ray-tune and optuna

In [3]:
# Base arguments for the ray-tune search (--device=0, --lr-d=1e-3, --n-epochs=40).
# Several of these names (weight_decay, beta1, gain, channels, padding, ...) also appear
# in the tune config below; presumably the sampled values replace them per trial - TODO confirm.
# NOTE(review): 'temprature' is spelled this way both here and in the tune config; it is a
# runtime key, so it must match whatever name the algorithm reads - do not rename it on one side only.
args = beam_arguments(f"--project-name=cifar10 --root-dir={root_dir} --algorithm=CIFAR10Algorithm --device=0 --half --lr-d=1e-3 --batch-size=512",
                      "--n-epochs=40 --clip=0 --parallel=1 --accumulate=1 --cudnn-benchmark",
                      "--weight-decay=.00256 --beta1=0.9 --beta2=0.9 ", 
                      path_to_data=path_to_data, activation='celu', channels=512, dropout=.0,
                      padding=4, gain=.4, turn_point=500, final_point=3000, minimal_gain=.02, temprature=.125)

# Study is built from the algorithm generator and the base arguments; study.tune(...) below launches the search.
study = Study(cifar10_algorithm_generator, args)

[32m2022-06-20 21:04:15.526[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m85[0m - [1mHyperparameter Optimization[0m
[32m2022-06-20 21:04:15.527[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m86[0m - [1mbeam project: cifar10[0m
[32m2022-06-20 21:04:15.529[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m87[0m - [1mExperiment Hyperparameters[0m
[32m2022-06-20 21:04:15.530[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m90[0m - [1mproject_name: cifar10[0m
[32m2022-06-20 21:04:15.530[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m90[0m - [1midentifier: debug_hp_optimization_20220620_210415[0m
[32m2022-06-20 21:04:15.531[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[36m__init__[0m:[36m90[0m - [1malgorithm: CIFAR10Algorithm[0m
[32m2022-06-20 21:04:15.532[0m | [1mINFO    [0m | [36msrc.beam.experiment[0m:[

In [None]:
# Disabled alternative: the same kind of search driven by HEBO over a different search space.
# hebo = HEBOSearch(metric="mean_accuracy", mode="max")

# analysis = study.tune(config={"lr_dense": tune.loguniform(1e-3, 2e-2),
#                               "weight_decay": tune.loguniform(1e-6, 1e-4),
#                               "gamma": tune.loguniform(.1, .9),
#                               "dropout": tune.uniform(0, .75),
#                               "scale_down": tune.uniform(0.4, .7),
#                               "scale_up": tune.uniform(0.9, 1.2),
#                               "ratio_down": tune.uniform(0.7, .95),
#                               "ratio_up": tune.uniform(1.05, 1.4),
#                               "channels": tune.choice([128, 256, 512]),
#                               "batch_size": tune.choice([512, 1024, 2048]),},
#                        metric="mean_accuracy",
#                        max_concurrent_trials=4,
#                        resources_per_trial={"gpu": 1},
#                        mode="max",
#                        search_alg=hebo,
#                       progress_reporter=JupyterNotebookReporter(overwrite=True),
#                        num_samples=400)


# Active search: ray-tune with the Optuna search algorithm, 400 samples, at most
# 8 concurrent trials, one GPU per trial, maximizing the "mean_accuracy" metric.
# The progress table is rendered in place by JupyterNotebookReporter(overwrite=True).
analysis = study.tune(config={"lr_dense": tune.loguniform(1e-4, 2e-2),
                              "weight_decay": tune.loguniform(1e-6, 1e-2),
                              "beta1": tune.loguniform(.85, .95),
                              # NOTE(review): lower == upper, so gain is effectively pinned to 0.2
                              # (the gain column of the saved output is 0.2 for every trial) - confirm this is intended
                              "gain": tune.uniform(0.2, .2),
                              "temprature": tune.uniform(0.05, 5.),
                              "minimal_gain": tune.loguniform(.01, .1),
                              "channels": tune.choice([256, 512, 1024]),
                              "padding": tune.choice([4, 6, 8]),
                              "turn_point": tune.choice([256, 512, 1024]),
                              # NOTE(review): 2048*1.5 evaluates to the float 3072.0 while the other two choices are ints
                              "final_point": tune.choice([2048, 2048*1.5, 2048*2]),
                              "batch_size": tune.choice([512, 1024, 2048]),
                              "activation": tune.choice(['relu', 'celu', 'gelu']),},
                       metric="mean_accuracy",
                       max_concurrent_trials=8,
                       resources_per_trial={"gpu": 1},
                       mode="max",
                       search_alg=OptunaSearch(),
                      progress_reporter=JupyterNotebookReporter(overwrite=True),
                       num_samples=400)


Trial name,status,loc,activation,batch_size,beta1,channels,final_point,gain,lr_dense,minimal_gain,padding,temprature,turn_point,weight_decay,acc,iter,total time (s)
runner_tune_6c7e62ca,RUNNING,172.17.0.2:24720,relu,512,0.850039,1024,2048,0.2,0.00517562,0.0308595,6,1.82263,1024,0.000333747,0.929599,27.0,326.542
runner_tune_c7b45aa0,RUNNING,172.17.0.2:24775,relu,512,0.852321,1024,2048,0.2,0.00717215,0.0351799,6,1.71532,512,0.000459183,0.907049,21.0,261.575
runner_tune_cf1b69ba,RUNNING,172.17.0.2:24640,relu,512,0.850407,1024,2048,0.2,0.00499638,0.0301177,6,2.03082,1024,0.000346404,0.928622,39.0,476.122
runner_tune_ea99898c,RUNNING,172.17.0.2:24832,relu,512,0.850901,1024,2048,0.2,0.0102751,0.037915,6,1.89948,1024,0.000533225,0.844993,15.0,204.735
runner_tune_0fd686f0,PENDING,,relu,512,0.850103,1024,2048,0.2,0.0102876,0.045788,6,3.40604,1024,1.42559e-05,,,
runner_tune_0e347ab2,TERMINATED,172.17.0.2:24035,gelu,512,0.868141,512,3072,0.2,0.000143155,0.0144721,8,0.553062,512,1.0415e-05,0.886452,40.0,244.701
runner_tune_12a098fc,TERMINATED,172.17.0.2:23678,gelu,1024,0.948043,512,4096,0.2,0.00237786,0.0175715,6,0.279561,1024,0.00780286,0.917969,40.0,245.937
runner_tune_1502fc2a,TERMINATED,172.17.0.2:23718,celu,2048,0.873043,1024,4096,0.2,0.000252165,0.0135683,4,4.74368,256,0.00322105,0.891724,40.0,454.348
runner_tune_155ba23e,TERMINATED,172.17.0.2:24087,relu,512,0.859167,1024,3072,0.2,0.00177057,0.0546031,6,2.41898,1024,1.29446e-06,0.932351,40.0,488.752
runner_tune_17656412,TERMINATED,172.17.0.2:23804,gelu,2048,0.895943,1024,2048,0.2,0.00604309,0.0103357,6,3.49494,1024,0.00253636,0.867065,40.0,459.089
