# Fusing Bayesian optimization explorations at different sample sizes

In [17]:
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import Matern, WhiteKernel

from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors, cm 

from math import log, floor, sqrt

# Shared colour mapping used by the scatter plots: target values are
# normalised over the range [-1, 0] and rendered with the "copper" colormap.
copper = cm.copper
cNorm = colors.Normalize(vmin=-1, vmax=0)
scalarMap = cm.ScalarMappable(cmap=copper, norm=cNorm)

Utilities for compute time allocation

In [18]:
def cost_per_model(pct, algo='svm'):
    """Return how many times cheaper a model is at ``pct`` of the data.

    The result is ``cost(100%) / cost(pct)`` under a simple complexity model
    evaluated on whole percentages 1..100:

    * ``'rf'``  -- cost proportional to ``n * log(n)``
    * ``'svm'`` -- cost proportional to ``n * n``

    Parameters
    ----------
    pct : float
        Fraction of the full sample, in ``(0, 1]``. It is converted to a whole
        percentage by truncation (``0.305`` -> 30%).
    algo : {'svm', 'rf'}
        Which cost model to use.

    Returns
    -------
    float
        Cost ratio; ``1.0`` when ``pct == 1``.

    Raises
    ------
    ValueError
        If ``algo`` is unknown, if ``pct`` does not map to a percentage in
        1..100, or if the ``'rf'`` model is asked for 1% (its cost is zero
        there, so the ratio is undefined).
    """
    # The small epsilon absorbs binary floating-point error before truncating:
    # e.g. 0.29 * 100 == 28.999999999999996, which a bare int() turns into 28.
    percent = int(pct * 100 + 1e-9)
    if not 1 <= percent <= 100:
        raise ValueError(
            f"pct must map to a whole percentage in 1..100, got pct={pct!r}"
        )

    if algo == 'rf':
        reference = 100 * log(100)
        cost = percent * log(percent)
        if cost == 0:
            # log(1) == 0, so the n*log(n) model has zero cost at 1%.
            raise ValueError("the 'rf' cost model is undefined at 1% of the data")
        return reference / cost
    if algo == 'svm':
        return (100 * 100) / (percent * percent)

    raise ValueError(f"unknown algo {algo!r}; expected 'rf' or 'svm'")
    


def budget_division(budget, how='equal', steps=3, lower=0.4):
    """Split a total compute budget into ``steps`` per-level budgets.

    Parameters
    ----------
    budget : number
        Total budget to distribute.
    how : {'equal', 'linear_asc', 'linear_desc'}
        * ``'equal'``       -- every level gets ``int(budget / steps)``.
        * ``'linear_asc'``  -- weights ``1, 1/2, 1/3, ...`` rescaled so the
          shares sum to ``budget``; returned in that order.
        * ``'linear_desc'`` -- the same shares in reverse order.
    steps : int
        Number of levels.
    lower : float
        Currently unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list
        ``steps`` budgets (ints for ``'equal'``, floats otherwise).

    Raises
    ------
    ValueError
        If ``how`` is not one of the supported strategies (previously this
        silently returned ``None``).
    """
    if how == 'equal':
        return [int(budget / steps) for _ in range(steps)]

    if how not in ('linear_asc', 'linear_desc'):
        raise ValueError(
            f"unknown how {how!r}; expected 'equal', 'linear_asc' or 'linear_desc'"
        )

    # Unnormalised shares budget, budget/2, budget/3, ...; rescale them so
    # that they add up to exactly `budget`.
    slices = [budget / (1 + s) for s in range(steps)]
    norm_factor = budget / sum(slices)
    normalized_slices = [norm_factor * s for s in slices]

    # NOTE(review): the un-reversed list is largest-first, so 'linear_asc'
    # yields decreasing budgets and 'linear_desc' increasing ones -- confirm
    # that this naming is intended before relying on it.
    if how == 'linear_asc':
        return normalized_slices
    return normalized_slices[::-1]
        

def models_at_sample_size(budget, sample_size, algo):
    """Number of models affordable with ``budget`` at a given sample fraction.

    ``budget`` is scaled by the cost ratio reported by ``cost_per_model`` for
    ``sample_size`` and ``algo``, then truncated to an integer.
    """
    cost_ratio = cost_per_model(sample_size, algo)
    affordable = budget * cost_ratio
    return int(affordable)


def size(i, lower=0.4, steps=3):
    """Sample fraction for level ``i`` on an evenly spaced grid.

    Level 0 maps to ``lower`` and level ``steps - 1`` maps to 1; the levels
    in between are spaced evenly.
    """
    span = 1 - lower
    offset = i * span / (steps - 1)
    return lower + offset

Data generation

In [19]:
def get_data():
    """Generate the synthetic binary classification dataset.

    Returns the ``(data, targets)`` pair produced by ``make_classification``
    with a fixed ``random_state``, so every call yields the same dataset.
    """
    generator_options = dict(
        n_samples=5_000,
        n_features=22,
        n_informative=12,
        n_redundant=4,
        random_state=0,
    )
    return make_classification(**generator_options)

Function to optimize: a random forest classifier scored with negative log loss.

In [20]:
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Score a random forest configuration by cross-validation.

    Builds a ``RandomForestClassifier`` with the given hyper-parameters (and
    a fixed ``random_state``) and returns the mean of its 3-fold
    cross-validated ``neg_log_loss`` on ``data`` / ``targets``.
    """
    forest = RFC(
        random_state=2,
        max_features=max_features,
        min_samples_split=min_samples_split,
        n_estimators=n_estimators,
    )
    fold_scores = cross_val_score(
        forest, data, targets, cv=3, scoring='neg_log_loss'
    )
    return fold_scores.mean()
 

In [21]:
# Number of best-scoring observations that points_to_explore returns, i.e.
# how many points one optimization level hands over to the next.
n_points = 5 # points to probe in next level

def points_to_explore(optimizer):
    """Plot the optimizer's observations and return its best-scoring points.

    Side effects: shows a 3-panel figure with pairwise scatter plots of the
    probed hyper-parameters (coloured by target via ``scalarMap``) and a
    ``matshow`` of ``optimizer._gp.L_``.

    Returns
    -------
    list
        Up to ``n_points`` entries, best target first. Each entry is
        ``[max_features, min_samples_split, n_estimators]`` where every
        element is a length-1 array taken from the observation columns.
    """
    history = optimizer.res

    def column(name):
        # One row per observation with a single column -> shape (n_obs, 1).
        return np.array([[entry["params"][name]] for entry in history])

    x0_obs = column("max_features")
    x1_obs = column("min_samples_split")
    x2_obs = column("n_estimators")
    y_obs = np.array([entry["target"] for entry in history])

    # (x values, y values, x label, y label) for each of the three panels.
    panels = (
        (x0_obs, x1_obs, "max_features", "min_samples_split"),
        (x0_obs, x2_obs, "max_features", "n_estimators"),
        (x1_obs, x2_obs, "min_samples_split", "n_estimators"),
    )
    fig, axes = plt.subplots(3, 1, figsize=(3, 9))
    for axis, (xs, ys, xlabel, ylabel) in zip(axes, panels):
        axis.scatter(xs, ys, c=scalarMap.to_rgba(y_obs))
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()

    # NOTE(review): L_ is presumably the Cholesky factor stored by the fitted
    # Gaussian process -- confirm against the GP implementation in use.
    plt.matshow(optimizer._gp.L_)
    plt.show()

    # Indices of the n_points highest targets, highest first.
    ranked = y_obs.argsort()
    best_first = ranked[-n_points:][::-1]

    to_explore = []
    for k in best_first:
        to_explore.append([x0_obs[k], x1_obs[k], x2_obs[k]])
    return to_explore



def optimize_rfc(data, targets, level, n_iter=0, bounds=None, probe=None):
    """Apply Bayesian Optimization to Random Forest parameters.

    Parameters
    ----------
    data, targets : array-like
        Training sample passed to ``rfc_cv`` for every evaluation.
    level : int
        1-based index of the current optimization level. The GP white-noise
        level (``0.1 / level``) and the UCB ``kappa`` (``20 / level``) both
        shrink as the level grows; level 1 additionally starts with two
        random initial points.
    n_iter : int
        Total number of model evaluations allowed for this level. Initial
        random points and probed points are counted against it.
    bounds : object, optional
        Currently unused; accepted so existing call sites keep working.
    probe : iterable, optional
        Points to evaluate before the optimizer proposes its own, each in a
        form accepted by ``BayesianOptimization.probe``. ``None`` (default)
        means no points.

    Returns
    -------
    list
        The result of ``points_to_explore`` for the finished optimizer.
    """
    # Use the function's own argument rather than a notebook-level global,
    # so the function also works on a fresh kernel and with the default None.
    probe = [] if probe is None else list(probe)

    def rfc_crossval(n_estimators, min_samples_split, max_features):
        # The optimizer proposes floats: cast the tree count to int and keep
        # the fractional parameters strictly inside (0, 1).
        return rfc_cv(
            n_estimators=int(n_estimators),
            min_samples_split=float(min_samples_split),
            max_features=max(min(max_features, 0.999), 1e-3),
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=rfc_crossval,
        pbounds={
            "n_estimators": (10, 250),
            "min_samples_split": (0.01, 0.999),
            "max_features": (0.1, 0.999),
        },
        random_state=1234,
        verbose=2
    )
    optimizer._gp.kernel = Matern(nu=2.5) + WhiteKernel(noise_level=0.1/level)

    # lazy=True queues the points; they are evaluated by maximize() below.
    for p in probe:
        optimizer.probe(
            params=p,
            lazy=True,
        )

    init_points = 2 if level == 1 else 0
    # Charge the evaluations that are already committed (random initial points
    # and queued probes) against the budget, never going below zero.
    n_iter = max(0, n_iter - init_points - len(probe))
    optimizer.maximize(init_points=init_points, n_iter=n_iter, acq="ucb", kappa=20/level)

    return points_to_explore(optimizer)

In [None]:
# Generate the dataset once; get_data() is deterministic, so there is no
# need to rebuild it on every level.
data, targets = get_data()

lower = 0.3   # sample fraction used at the first level
steps = 3     # number of levels (sample sizes)
budget = 50   # total budget shared between the levels

bounds = None
params = []   # best points of the previous level, probed first at the next

# Seeded generator so the row subsampling is reproducible across runs.
sampler = np.random.RandomState(0)

level = 1
for i, b in enumerate(budget_division(budget, how='equal', steps=steps, lower=lower)):
    sample_size = size(i, lower, steps)
    n_iter = models_at_sample_size(b, sample_size, 'rf')

    # Draw this level's training subsample without replacement.
    rows = int(len(data) * sample_size)
    idx = sampler.choice(len(data), rows, replace=False)
    sampled_X = data[idx, :]
    sampled_y = targets[idx]

    print(Colours.green(f"--- Optimizing Random Forest: {n_iter} models; budget:{b} --- "))
    params = optimize_rfc(sampled_X, sampled_y, level, n_iter, bounds, params)
    level += 1

[92m--- Optimizing Random Forest: 72 models; budget:16 --- [0m
|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-0.649   [0m | [0m 0.2722  [0m | [0m 0.6253  [0m | [0m 115.1   [0m |
| [0m 2       [0m | [0m-0.6927  [0m | [0m 0.806   [0m | [0m 0.7814  [0m | [0m 75.42   [0m |
| [95m 3       [0m | [95m-0.3945  [0m | [95m 0.2883  [0m | [95m 0.03816 [0m | [95m 115.3   [0m |
| [0m 4       [0m | [0m-0.6131  [0m | [0m 0.2434  [0m | [0m 0.4403  [0m | [0m 153.6   [0m |
| [0m 5       [0m | [0m-0.6928  [0m | [0m 0.7878  [0m | [0m 0.904   [0m | [0m 19.49   [0m |
| [0m 6       [0m | [0m-0.6927  [0m | [0m 0.85    [0m | [0m 0.9176  [0m | [0m 119.1   [0m |
