# How to use bayesian optimization

In [1]:
import pandas as pd
import numpy as np
from src.experimenting.hopt_experiment import HoptExperiment
from src.pipeline_optimization.bayesian_hopt import BayesianHopt
from src.pipeline_optimization.bayesian_hopt import Config
from src.utils.thesis_utils import thesis_lookup_objective, thesis_search_space
from src.visualization.visualize import visualize_search_performance, compare_search_performance

In [2]:
objective = thesis_lookup_objective('WEST_diff_17520')
search_space = thesis_search_space()

In [3]:
max_evals = 5

In [4]:
bay_opt = BayesianHopt(
    identifier='Bayesian optimization',
    search_space=search_space,
    objective=objective,
    max_evals=max_evals,
    algo='tpe'
)

In [5]:
bay_opt.run_bayesian_hopt()

100%|██████████| 5/5 [00:00<00:00, 13.86it/s, best loss: 13.704639055387075]


Unnamed: 0_level_0,results,results,results,results,configs,configs,configs,configs,configs
Unnamed: 0_level_1,loss,status,walltime,crossval,num_trees,learning_rate,max_depth,min_child_weight,subsample
0,13.704639,ok,62.880725,13.685483,800.0,0.068129,14.0,40.0,0.5
1,16.926628,ok,29.373307,16.907129,380.0,0.040842,11.0,22.5,0.5
2,16.953856,ok,34.754487,16.873757,380.0,0.316228,18.0,40.0,1.0
3,15.495866,ok,26.541053,15.607463,520.0,0.008799,7.0,22.5,1.0
4,14.780277,ok,59.52143,14.773443,520.0,0.014678,16.0,5.0,0.75


# How to use HoptExperiment

In [6]:
rand = BayesianHopt(
    identifier='Random search',
    search_space=search_space,
    objective=objective,
    max_evals=max_evals,
    algo='random'
)
bay_opt = BayesianHopt(
    identifier='Bayesian optimization',
    search_space=search_space,
    objective=objective,
    max_evals=max_evals,
    algo='tpe'
)

In [7]:
hopt_exp = HoptExperiment(
    hopts=[bay_opt,rand],
    iterations=5
)

In [18]:
results = hopt_exp.run_hopt_experiment()

100%|██████████| 5/5 [00:00<00:00, 11.25it/s, best loss: 13.961606657080207]
100%|██████████| 5/5 [00:00<00:00, 13.45it/s, best loss: 14.781152083031301]
100%|██████████| 5/5 [00:00<00:00, 14.48it/s, best loss: 13.898100938310876]
100%|██████████| 5/5 [00:00<00:00, 12.42it/s, best loss: 14.785963630653185]
100%|██████████| 5/5 [00:00<00:00, 14.52it/s, best loss: 13.677601669380046]
100%|██████████| 5/5 [00:00<00:00, 15.10it/s, best loss: 13.995307960722695]
100%|██████████| 5/5 [00:00<00:00, 12.10it/s, best loss: 13.560237872094975]
100%|██████████| 5/5 [00:00<00:00, 15.48it/s, best loss: 14.79231625869484] 
100%|██████████| 5/5 [00:00<00:00, 14.68it/s, best loss: 13.634611630952325]
100%|██████████| 5/5 [00:00<00:00, 16.33it/s, best loss: 13.593005612988376]


In [20]:
rand.__dict__.keys()

dict_keys(['_identifier', '_search_space', '_objective', '_max_evals', '_algo', 'results'])

In [21]:
hopt_exp._hopts[0].results

Unnamed: 0_level_0,results,results,results,results,configs,configs,configs,configs,configs
Unnamed: 0_level_1,loss,status,walltime,crossval,num_trees,learning_rate,max_depth,min_child_weight,subsample
0,13.940648,ok,53.191596,13.905583,660.0,0.040842,11.0,22.5,0.75
1,14.080106,ok,42.261532,14.127091,800.0,0.014678,7.0,5.0,0.75
2,16.953856,ok,34.754487,16.873757,380.0,0.008799,18.0,40.0,1.0
3,13.634612,ok,67.979447,13.568974,800.0,0.113646,18.0,40.0,0.75
4,28.557951,ok,9.889011,28.317811,100.0,0.113646,18.0,5.0,1.0


# Get averaged results

In [6]:
# parameters
n_average = 3
analyses = [bay_opt, rand]

In [7]:
from tqdm import tqdm

In [13]:
a = [[analysis.run_analysis()['results']['loss'] for analysis in analyses] for j in range(n_average)]

100%|██████████| 5/5 [00:00<00:00, 15.13it/s, best loss: 14.189351130990136]
100%|██████████| 5/5 [00:00<00:00, 17.29it/s, best loss: 13.764959979985221]
100%|██████████| 5/5 [00:00<00:00, 10.09it/s, best loss: 14.750710606556805]
100%|██████████| 5/5 [00:00<00:00, 18.54it/s, best loss: 13.581010795375526]
100%|██████████| 5/5 [00:00<00:00, 13.85it/s, best loss: 14.785963630653185]
100%|██████████| 5/5 [00:00<00:00, 17.94it/s, best loss: 13.965246985454133]


In [14]:
all_ranks = list()
for results in tqdm(a):
    rolling_mins = pd.DataFrame()
    for result in results:
        rolling_min = pd.Series([result[:j+1].min() for j in range(len(result))])
        rolling_mins = rolling_mins.append(rolling_min, ignore_index=True)
    ranks = rolling_mins.rank()
    print(ranks)
    all_ranks.append(ranks)
avg_ranks = pd.concat(all_ranks).groupby(level=0).mean()

100%|██████████| 3/3 [00:00<00:00, 43.02it/s]

     0    1    2    3    4
0  2.0  2.0  2.0  2.0  2.0
1  1.0  1.0  1.0  1.0  1.0
     0    1    2    3    4
0  2.0  2.0  2.0  2.0  2.0
1  1.0  1.0  1.0  1.0  1.0
     0    1    2    3    4
0  2.0  2.0  1.0  2.0  2.0
1  1.0  1.0  2.0  1.0  1.0





In [12]:
avg_ranks

Unnamed: 0,0,1,2,3,4
0,1.0,1.333333,1.333333,1.333333,1.666667
1,2.0,1.666667,1.666667,1.666667,1.333333


In [17]:
def analyses_ranks(n_average, analyses):
    all_ranks = list()
    for i in tqdm(range(n_average)):
        rolling_mins = pd.DataFrame()
        for analysis in analyses:
            result = analysis.run_analysis()['results']['loss']
            rolling_min = pd.Series([result[:j+1].min() for j in range(len(result))], name=analysis.identifier)
            rolling_mins = rolling_mins.append(rolling_min)
        ranks = rolling_mins.rank()
        all_ranks.append(ranks)
    avg_ranks = pd.concat(all_ranks).groupby(level=0).mean()
    return avg_ranks

In [18]:
analyses_ranks(20,analyses)

100%|██████████| 20/20 [05:12<00:00, 15.60s/it]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
Bayesian optimization,1.5,1.55,1.6,1.65,1.55,1.6,1.65,1.5,1.475,1.425,...,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2,1.2
Random search,1.5,1.45,1.4,1.35,1.45,1.4,1.35,1.5,1.525,1.575,...,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8,1.8


# Averaged search results
- why ontkoppelen: aparte results creater and visualizer
Wat zijn de opties:
1. run_experiment(avg=10) argument, output = averaged results
    - Why not? NOT possible to compare separate iterations of different search algo's
2. run_experiment(avg=10) argument, output = list of lists of results
    - Possible, but pretty unclear output format.
    - All 
3. run_experiment(avg=10) argument, output = experiment collection class
    - Rather not, but could be the best option
4. List of hyperoptimization_analyses? And run_search() and run_searches_analysis()?

# Visualize averaged rankings

In [15]:
import plotly.graph_objects as go

fig = go.Figure()
idx = avg_ranks.columns
for analysis in analyses:
    fig.add_trace(go.Scatter(x=idx, y=avg_ranks.loc[analysis.identifier,:], mode='lines', name=analysis.identifier))
fig.show()

KeyError: 'Bayesian optimization'