In [4]:
from src.tests import test_dataset
from sklearn.datasets import load_breast_cancer, fetch_covtype
import numpy as np
import warnings
# Install a blanket "ignore" filter: no message, category, or module is given,
# so every warning raised after this point is hidden, not only the ones coming
# from the optimisation runs below.
# NOTE(review): consider narrowing this to the specific warning category once
# it has been identified, so unrelated warnings stay visible.
warnings.filterwarnings('ignore')

In [5]:
# 1. Breast cancer dataset (the easy example)
data = load_breast_cancer()
X, y = data.data, data.target

# Number of repetitions of the comparison. `trials`, `sample` and `random` are
# kept as notebook-level names because the Covertype cell further down reads
# them as well.
trials = 5
sample = []  # result[0] of each trial, reported below as "sample" accuracy
random = []  # result[1] of each trial, reported below as "random" accuracy

# Each trial hands a different seed (40, 41, ...) to test_dataset so that the
# runs are reproducible.
# Per the original author's note, the warning that may be triggered here is due
# to a lack of variance of the Y outcomes during Bayesian optimisation.
for i in range(trials):
    print(f'------Trial number {i + 1}------')
    result = test_dataset(X, y, seed=40 + i)
    sample.append(result[0])
    random.append(result[1])

# Summary statistics across all trials.
print(f'The average of best sample accuracy is {np.mean(sample)}')
print(f'The std of best sample accuracy is {np.std(sample)}')

print(f'The average of best random accuracy is {np.mean(random)}')
print(f'The std of best random accuracy is {np.std(random)}')

------Trial number 1------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:15<00:00,  1.31iter/s, Best Score=0.9626]


Best Hyperparameters (Random Init): {'n_estimators': 54, 'min_samples_split': 5, 'min_samples_leaf': 1}
Best accuracy: 0.9626373626373628



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:11<00:00,  1.71iter/s, Best Score=0.9626]


Best Hyperparameters (Sample Init): {'n_estimators': 59, 'min_samples_split': 6, 'min_samples_leaf': 1}
Best accuracy: 0.9626373626373628

------Trial number 2------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:15<00:00,  1.33iter/s, Best Score=0.9648]


Best Hyperparameters (Random Init): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.9648351648351647



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:14<00:00,  1.35iter/s, Best Score=0.9670]


Best Hyperparameters (Sample Init): {'n_estimators': 98, 'min_samples_split': 3, 'min_samples_leaf': 1}
Best accuracy: 0.9670329670329672

------Trial number 3------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:18<00:00,  1.08iter/s, Best Score=0.9626]


Best Hyperparameters (Random Init): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.9626373626373628



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:17<00:00,  1.16iter/s, Best Score=0.9626]


Best Hyperparameters (Sample Init): {'n_estimators': 177, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.9626373626373628

------Trial number 4------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:18<00:00,  1.08iter/s, Best Score=0.9582]


Best Hyperparameters (Random Init): {'n_estimators': 101, 'min_samples_split': 2, 'min_samples_leaf': 3}
Best accuracy: 0.9582417582417582



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:19<00:00,  1.05iter/s, Best Score=0.9604]


Best Hyperparameters (Sample Init): {'n_estimators': 123, 'min_samples_split': 10, 'min_samples_leaf': 4}
Best accuracy: 0.9604395604395604

------Trial number 5------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:21<00:00,  1.07s/iter, Best Score=0.9714]


Best Hyperparameters (Random Init): {'n_estimators': 182, 'min_samples_split': 4, 'min_samples_leaf': 1}
Best accuracy: 0.9714285714285713



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:17<00:00,  1.14iter/s, Best Score=0.9758]

Best Hyperparameters (Sample Init): {'n_estimators': 97, 'min_samples_split': 3, 'min_samples_leaf': 1}
Best accuracy: 0.9758241758241757

The average of best sample accuracy is 0.9657142857142859
The std of best sample accuracy is 0.005490108130460071
The average of best random accuracy is 0.9639560439560441
The std of best random accuracy is 0.004306795152146205





In [6]:
# 2. Covertype dataset (a little more complicated, restricted to few data points)
data = fetch_covtype()

# Restrict the experiment to the first `subset_size` rows.
# NOTE(review): this is a leading slice, not a random sample — confirm that the
# first rows are representative enough for this comparison.
subset_size = 10 ** 4
X = data.data[:subset_size]
y = data.target[:subset_size]

# Fresh accumulators for this dataset. Building them here (instead of writing
# into the lists left behind by the breast cancer cell) keeps the cell
# re-runnable and independent of the length of any previously created list.
sample = []  # result[0] of each trial, reported below as "sample" accuracy
random = []  # result[1] of each trial, reported below as "random" accuracy

# Repeat the comparison `trials` times (`trials` is set in the breast cancer
# cell), using a different seed for every trial.
for i in range(trials):
    print(f'------Trial number {i + 1}------')
    result = test_dataset(X, y, seed=40 + i)
    sample.append(result[0])
    random.append(result[1])

# Summary statistics across all trials.
print(f'The average of best sample accuracy is {np.mean(sample)}')
print(f'The std of best sample accuracy is {np.std(sample)}')

print(f'The average of best random accuracy is {np.mean(random)}')
print(f'The std of best random accuracy is {np.std(random)}')


------Trial number 1------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:49<00:00,  2.48s/iter, Best Score=0.8448]


Best Hyperparameters (Random Init): {'n_estimators': 200, 'min_samples_split': 3, 'min_samples_leaf': 1}
Best accuracy: 0.8447500000000001



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:41<00:00,  2.09s/iter, Best Score=0.8471]


Best Hyperparameters (Sample Init): {'n_estimators': 145, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.8471250000000001

------Trial number 2------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:46<00:00,  2.31s/iter, Best Score=0.8435]


Best Hyperparameters (Random Init): {'n_estimators': 190, 'min_samples_split': 4, 'min_samples_leaf': 1}
Best accuracy: 0.8435



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:40<00:00,  2.04s/iter, Best Score=0.8444]


Best Hyperparameters (Sample Init): {'n_estimators': 133, 'min_samples_split': 4, 'min_samples_leaf': 1}
Best accuracy: 0.844375

------Trial number 3------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:56<00:00,  2.81s/iter, Best Score=0.8480]


Best Hyperparameters (Random Init): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.8480000000000001



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:55<00:00,  2.77s/iter, Best Score=0.8480]


Best Hyperparameters (Sample Init): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.8480000000000001

------Trial number 4------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:50<00:00,  2.52s/iter, Best Score=0.8421]


Best Hyperparameters (Random Init): {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.842125



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:53<00:00,  2.66s/iter, Best Score=0.8430]


Best Hyperparameters (Sample Init): {'n_estimators': 171, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.843

------Trial number 5------


Bayesian Optimization Progress: 100%|██████████| 20/20 [00:50<00:00,  2.50s/iter, Best Score=0.8425]


Best Hyperparameters (Random Init): {'n_estimators': 165, 'min_samples_split': 2, 'min_samples_leaf': 1}
Best accuracy: 0.8425



Bayesian Optimization Progress: 100%|██████████| 20/20 [00:50<00:00,  2.52s/iter, Best Score=0.8432]

Best Hyperparameters (Sample Init): {'n_estimators': 200, 'min_samples_split': 3, 'min_samples_leaf': 1}
Best accuracy: 0.8432499999999999

The average of best sample accuracy is 0.8451500000000001
The std of best sample accuracy is 0.0020423638265500764
The average of best random accuracy is 0.8441750000000001
The std of best random accuracy is 0.002117781858454766



