#### Import Libraries & Dependencies

In [13]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
import time

from sklearn.preprocessing import LabelEncoder


#### Defined Utility Methods

In [14]:
def load_data_set(filename):
    """
    Read a CSV file into a Pandas DataFrame, tolerating read failures.

    The file at 'filename' is parsed with `pd.read_csv`. Any exception raised
    while reading (for example a missing file or an unparsable format) is
    caught, its message is printed, and None is returned instead of a
    DataFrame. Callers must therefore check the result for None before use.

    Parameters:
    - filename (str): The path to the CSV file that needs to be loaded.

    Returns:
    - pandas.DataFrame: The parsed contents of the CSV file, or None if the
                         file could not be read.
    """

    frame = None
    try:
        frame = pd.read_csv(filename)
    except Exception as err:
        # Best-effort loader: report the problem and fall through to return None.
        print(err)
    return frame


def gbm_cl_bo(max_depth, max_features, learning_rate, n_estimators, subsample):
    """
    Objective function: cross-validated accuracy of a Gradient Boosting Classifier.

    Builds a `GradientBoostingClassifier` from the given hyperparameters and
    evaluates it with 5-fold cross-validation on the training data. The mean
    accuracy over the folds is returned, which makes this function suitable as
    the target of a maximising optimizer.

    Parameters:
    - max_depth (float): The maximum depth of the individual regression estimators.
                         Rounded to the nearest integer before use.
    - max_features (float): Passed to the classifier unchanged. scikit-learn
                            interprets a float as the fraction of features to
                            consider when looking for the best split.
    - learning_rate (float): Learning rate shrinks the contribution of each tree by
                             `learning_rate`.
    - n_estimators (float): The number of boosting stages to be run. Rounded to
                            the nearest integer before use.
    - subsample (float): The fraction of samples to be used for fitting the individual
                         base learners. If smaller than 1.0, this results in Stochastic
                         Gradient Boosting.

    Returns:
    - score (float): The mean accuracy score of the model computed over the
                     cross-validation folds.

    Note: This function reads the notebook-level `x_train` and `y_train`
          datasets, which must be defined before it is called, and uses a
          fixed random state for reproducibility.
    """

    # The optimizer proposes continuous values, while tree depth and estimator
    # count are integer-valued. Cast explicitly: depending on the NumPy version,
    # round() on a NumPy float can return a float rather than a Python int.
    params_gbm = {
        'max_depth': int(round(max_depth)),
        'max_features': max_features,
        'learning_rate': learning_rate,
        'n_estimators': int(round(n_estimators)),
        'subsample': subsample,
    }

    model = GradientBoostingClassifier(random_state=123, **params_gbm)

    # One accuracy value per fold; average them once to get the score.
    fold_scores = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=5)
    return fold_scores.mean()


#### Read Data Source (Iris Flowers Dataset)

In [15]:
# Load the Iris flowers data set from the local CSV file
# (load_data_set prints the error and returns None if the file cannot be read)
test_dataset = load_data_set("iris_dataset.csv")
test_dataset.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


#### Retrieve Dataset Components, Encode the Dependent Variable & Create Splits using Random Distributions

In [16]:
# Instantiate the label encoder and replace the string class labels with
# encoded values (note: this overwrites the "class" column of test_dataset in place)
label_encoder = LabelEncoder()
test_dataset["class"] = label_encoder.fit_transform(test_dataset["class"])

# Retrieve dataset components: the four measurement columns (X) and the target (y)
X = test_dataset[['sepallength', 'sepalwidth', 'petallength', 'petalwidth']]
y = test_dataset["class"]

# Create training and test sets using a random split with a fixed random_state
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state = 12)

#### Define the Search Space Parameters and Instantiate the Optimizer

In [17]:
# Time the whole Bayesian Optimization run
start = time.time()

# (lower, upper) bounds explored for each hyperparameter
params_gbm = dict(
    max_depth=(3, 10),
    max_features=(0.8, 1),
    learning_rate=(0.01, 1),
    n_estimators=(80, 150),
    subsample=(0.8, 1),
)

# Build the optimizer around the cross-validation objective, then run it
# with 20 initial points followed by 4 further iterations
optimizer = BayesianOptimization(f=gbm_cl_bo, pbounds=params_gbm, random_state=111)
optimizer.maximize(init_points=20, n_iter=4)

print('It takes %s minutes' % round((time.time() - start) / 60, 2))

|   iter    |  target   | learni... | max_depth | max_fe... | n_esti... | subsample |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.9285   [0m | [0m0.616    [0m | [0m4.183    [0m | [0m0.8872   [0m | [0m133.8    [0m | [0m0.8591   [0m |
| [0m2        [0m | [0m0.9198   [0m | [0m0.1577   [0m | [0m3.157    [0m | [0m0.884    [0m | [0m96.71    [0m | [0m0.8675   [0m |
| [95m3        [0m | [95m0.9289   [0m | [95m0.9908   [0m | [95m4.664    [0m | [95m0.8162   [0m | [95m126.9    [0m | [95m0.9242   [0m |
| [0m4        [0m | [0m0.9198   [0m | [0m0.2815   [0m | [0m6.264    [0m | [0m0.8237   [0m | [0m85.18    [0m | [0m0.9802   [0m |
| [0m5        [0m | [0m0.9198   [0m | [0m0.796    [0m | [0m8.884    [0m | [0m0.963    [0m | [0m149.4    [0m | [0m0.9155   [0m |
| [0m6        [0m | [0m0.9285   [0m | [0m0.8156   [0m | [0m5.949    [0m | [0m0.8055   [0m | [0m111.8

#### Retrieve Best Hyperparameters

In [19]:
# Best hyperparameters obtained by the optimizer.
# Note: this rebinds params_gbm, which previously held the search-space bounds.
# The values are the raw floats from the search; max_depth and n_estimators
# still need rounding (as gbm_cl_bo does) before they are used to build a model.
params_gbm = optimizer.max['params']

In [20]:
# Display the best hyperparameters found
params_gbm

{'learning_rate': 0.9908053399845699,
 'max_depth': 4.6640851773001595,
 'max_features': 0.8162385318902423,
 'n_estimators': 126.8720166772641,
 'subsample': 0.9242485838801394}