<a href="https://colab.research.google.com/github/nunocesarsa/SENSECO_School_2021/blob/main/ColabNotebooks/SENSECO_01_OptimizingRandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#updating the sklearn version
!pip install scikit-learn==0.23.2 

!pip install scikit-optimize

## Importing packages

In [2]:
#the test data
import sklearn
from sklearn.datasets import load_breast_cancer

# import the random forest classifier
from sklearn.ensemble import RandomForestClassifier

# hp holds the hyperopt functions used to define the search space
from hyperopt import hp

# the minimizing function (fmin), the Tree-structured Parzen Estimator (tpe), space_eval to map a result back to parameter values, the Trials log and the status indicators
from hyperopt import fmin, tpe, space_eval, Trials, STATUS_OK, STATUS_FAIL
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

#our friends
import numpy as np

# The data

In [None]:
# Load the dataset once and take both the feature table and the labels from it
breast_cancer = load_breast_cancer(as_frame=True)
data = breast_cancer['data']
target = breast_cancer['target']

# Preview the first rows only instead of rendering the whole table
data.head()

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data = data.sample(frac=0.8, random_state=200)
test_data = data.drop(train_data.index)

# Select the labels by index so every label lines up with its feature row.
# `sample` returns the rows in shuffled order and scikit-learn pairs X and y by
# position (not by index), so the label order has to follow the feature order.
train_target = target.loc[train_data.index]
test_target = target.loc[test_data.index]

# Guard against any future misalignment between features and labels
assert train_data.index.equals(train_target.index)
assert test_data.index.equals(test_target.index)



# General parameters

In [12]:
# Settings shared by both optimizers so that their results are comparable
n_times, n_cv, rnd_state = (
    10,  # number of hyperparameter configurations each optimizer evaluates
    3,   # number of cross-validation folds
    42,  # random seed
)

# Bayesian grid Search

- This is a more "basic" method; it is shown here only for comparison.

Adapted from: https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html



## Hyperparameter space

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Tunes the same three hyperparameters as the TPE search further down
rfr_BGridCV = BayesSearchCV(
    # Seed the forest as well, so that repeated runs score each candidate identically
    RandomForestClassifier(random_state=rnd_state),
    {
        'n_estimators': Integer(50, 1000),      # number of trees
        'min_samples_split': Real(0.001, .5),   # float => fraction of the samples
        'min_samples_leaf': Real(0.001, .5),    # float => fraction of the samples
    },
    cv=n_cv,                 # cross-validation folds per candidate
    n_iter=n_times,          # number of parameter settings that are tried
    random_state=rnd_state,  # seed of the search itself
)


rfr_BGridCV

## Optimizing

In [None]:
# Run the Bayesian search on the training split
rfr_BGridCV.fit(X=train_data, y=train_target)

# Tree-structured Parzen Estimator

Described in: 
https://papers.nips.cc/paper/2011/file/86e8f7ab32cfd12577bc2619bc635690-Paper.pdf

hyperopt package: http://hyperopt.github.io/hyperopt/

Example of regular grid search: https://scikit-optimize.github.io/stable/auto_examples/hyperparameter-optimization.html

## Hyperparameter space

This is just a showcase, but any parameter of the random forest could be used here:

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

The description of how to define the hyperparameter spaces is here:

http://hyperopt.github.io/hyperopt/getting-started/search_spaces/

In [15]:
# Search space explored by the TPE optimizer
space = {
    # One of 50, 100, ..., 1000; starting at 50 avoids errors from too few trees
    'n_estimators': hp.choice('n_estimators', range(50, 1001, 50)),
    # Both are given as a fraction of the samples. The bounds match the
    # Real(0.001, .5) ranges of the BayesSearchCV space, so both optimizers explore
    # the same domain, and the lower bound keeps the invalid value 0 out of reach.
    'min_samples_split': hp.uniform('min_samples_split', 0.001, .5),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.001, .5),
}

## Objective function

The objective function returns the prediction error (here, 1 − the mean cross-validated accuracy). The optimizer uses this error to judge whether a new set of hyperparameters improved the model.

In [16]:
# Aliases of the shared settings: the fold count used by the objective function
# and the evaluation budget handed to fmin
N_FOLDS, MAX_EVALS = n_cv, n_times

def objective(params, n_folds = N_FOLDS):
    """Loss function minimized by hyperopt while tuning the random forest.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    n_folds : int, optional
        Number of cross-validation folds. Defaults to ``N_FOLDS``.

    Returns
    -------
    dict
        ``'loss'`` (1 - mean cross-validated accuracy), the evaluated
        ``'params'`` and ``'status'`` (always ``STATUS_OK``).
    """
    # Fixed seed: differences in the loss then come from the hyperparameters only.
    # No out-of-bag score is requested because nothing reads it.
    rf = RandomForestClassifier(**params, random_state=rnd_state)

    # Cross-validate with the requested number of folds and score by accuracy
    scores = cross_val_score(rf, X=train_data, y=train_target, cv=n_folds, scoring='accuracy')

    # Average accuracy over the folds
    mean_accuracy = np.mean(scores)

    # fmin looks for the minimum, while a high accuracy is good, so the accuracy
    # is inverted into a loss. Recover the accuracy afterwards as 1 - loss.
    loss = 1 - mean_accuracy

    # Returns the loss, the current parameters and the status
    return {'loss': loss, 'params': params, 'status': STATUS_OK}



In [None]:
# Smoke test: evaluate the objective for a few forest sizes
for n_trees in (50, 150, 500):
    print(objective({'n_estimators': n_trees}, n_folds=N_FOLDS))

## Optimizing

Now we call the actual optimization function, using the Tree-structured Parzen Estimator (TPE) algorithm:

http://hyperopt.github.io/hyperopt/#algorithms

In [None]:
# Trials object handed to fmin to record the evaluations
bayes_trials = Trials()

# Minimize the objective over the search space with the TPE algorithm
best_rfr = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=MAX_EVALS,
    trials=bayes_trials,
    verbose=1,
)

In [None]:
# Raw result of fmin.
# NOTE(review): hyperopt reports hp.choice parameters (here n_estimators) as the
# index of the selected option, not its value; space_eval(space, best_rfr) gives the values.
best_rfr

# Comparing


In [None]:
# Best parameters found by the Bayesian search (BayesSearchCV)
rfr_BGridCV.best_params_

In [None]:
# Best parameters on TPE.
# fmin reports hp.choice parameters as the index of the selected option, so the
# result is mapped back onto the search space to show the real values.
space_eval(space, best_rfr)

In [None]:
# Baseline: random forest with default settings
RF_DEF = RandomForestClassifier(random_state=rnd_state)

# Random forest using the parameters found by the Bayesian search. They are read
# from the fitted search object instead of being copied by hand from an earlier run.
RF_BGS = RandomForestClassifier(**rfr_BGridCV.best_params_, random_state=rnd_state)

# Random forest using the parameters found by TPE. best_rfr stores the hp.choice
# index for n_estimators, so space_eval converts it back into the actual value.
# All three models share the same seed so the comparison is fair.
RF_TPE = RandomForestClassifier(**space_eval(space, best_rfr), random_state=rnd_state)
RF_TPE

## Train the models and measure the accuracy

In [None]:
# Train every model on the same training split
for forest in (RF_DEF, RF_BGS, RF_TPE):
    forest.fit(train_data, train_target)

# Score on the held-out test split.
# Despite the `error_` prefix these are accuracies: higher is better.
error_def = accuracy_score(test_target, RF_DEF.predict(test_data))
error_bgs = accuracy_score(test_target, RF_BGS.predict(test_data))
error_tpe = accuracy_score(test_target, RF_TPE.predict(test_data))

# Report the three scores side by side
for label, score in (
    ("Default: ", error_def),
    ("BGS Optimized: ", error_bgs),
    ("TPE Optimized: ", error_tpe),
):
    print(label + str(score))

In [None]:
#rfr_BGridCV = BayesSearchCV(
#    RandomForestClassifier(),{
#        'n_estimators': Categorical([10,50,100,250,500,750,1000]), 
#        'min_samples_split': Categorical([0.1,0.2,0.3,0.4,0.5]),
#        'min_samples_leaf': Categorical([0.1,0.2,0.3,0.4,0.5]),
#    },
#    cv=5,
#    n_iter=10,
#    random_state=42
#)
