# Hyper Parameter Optimization

First we import modules from sklearn and load a dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
breast_cancer = datasets.load_breast_cancer()
data = pd.DataFrame(breast_cancer.data[:, :],columns = breast_cancer.feature_names)
data['target'] = breast_cancer.target
data.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [3]:
X = data[['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension']]
y = data['target']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, 
                                                    random_state = 42)

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Random Forest Classifier with default settings

In [6]:
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("f1_score (test): " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score (train): " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score (test):  0.956140350877193
f1_score (train):  0.9978021978021978




## Randomized Search

In [7]:
params = {'criterion': ['entropy', 'gini'],
        'max_depth': list(np.linspace(10, 600, 10, dtype = int)) + [None],
        'max_features': ['auto', 'sqrt','log2', None],
        'min_samples_leaf': [1, 4, 6, 8, 12],
        'min_samples_split': [2, 5, 7, 10, 14],
        'n_estimators': list(np.linspace(100, 1000, 10, dtype = int))}

model = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=params,
                           n_iter=50, scoring='f1', verbose=3, cv=3, n_jobs=-1,
                           random_state=42)

In [8]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   22.6s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [9]:
model.best_params_

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'criterion': 'entropy'}

In [10]:
rf = RandomForestClassifier(**model.best_params_,
                            random_state=42).fit(X_train, y_train)
print("f1_score (test): " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score (train): " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score (test):  0.9649122807017544
f1_score (train):  1.0


## Grid Search

In [11]:
grid = {'criterion': ['entropy'],
        'max_depth': [395, 397, 400, 403],
        'max_features': ['sqrt'],
        'min_samples_leaf': [1, 2],
        'min_samples_split': [4, 5, 6],
        'n_estimators': [580, 600, 620]}

model2 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid,
                      scoring='f1', verbose=3, cv=3, n_jobs=-1)

In [12]:
model2.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   15.0s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:   28.6s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [13]:
model2.best_params_

{'criterion': 'entropy',
 'max_depth': 395,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 580}

In [14]:
rf = RandomForestClassifier(**model2.best_params_,
                            random_state=42).fit(X_train, y_train)
print("f1_score (test): " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score (train): " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score (test):  0.9649122807017544
f1_score (train):  1.0


## Gradient Boosting Classifier with default settings

In [15]:
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.956140350877193
f1_score:  1.0


In [16]:
params = {'loss': ['deviance', 'exponential'],
         'learning_rate': [0.05, 0.1, 0.15, 0.2],
         'n_estimators': list(np.linspace(30, 100, 7, dtype=int)),
         'subsample': [0.5, 0.7, 0.9 ,1],
         'min_samples_split': [3, 4, 5, 6, 7],
         'min_samples_leaf': list(np.linspace(1, 100, 10, dtype=int)),
         'max_depth': [2,4,6,8,10],
         'max_features': ['sqrt'],
         }

model = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42), 
                           param_distributions=params, scoring='f1', n_iter=300, 
                           verbose=3, cv=3, n_jobs=-1, random_state=42)

In [17]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:    7.2s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                     

In [18]:
model.best_params_

{'subsample': 0.7,
 'n_estimators': 65,
 'min_samples_split': 3,
 'min_samples_leaf': 34,
 'max_features': 'sqrt',
 'max_depth': 10,
 'loss': 'deviance',
 'learning_rate': 0.2}

In [19]:
gb = GradientBoostingClassifier(**model.best_params_, 
                                random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9824561403508771
f1_score:  1.0


In [20]:
grid = {'loss': ['deviance'],
         'learning_rate': [0.175, 0.2, 0.225],
         'n_estimators': [60, 65, 70],
         'subsample': [0.6, 0.7, 0.8],
         'min_samples_split': [2, 3, 4],
         'min_samples_leaf': [30, 34, 40],
         'max_depth': [9,10,11],
         'max_features': ['sqrt'],
         }

model2 = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=grid,
                      scoring='f1', verbose=3, cv=3, n_jobs=-1)

In [21]:
model2.fit(X_train, y_train)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 592 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 1552 tasks      | elapsed:   12.6s
[Parallel(n_jobs=-1)]: Done 2187 out of 2187 | elapsed:   17.5s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no...
                                                  verbose=0, warm_st

In [22]:
model2.best_params_

{'learning_rate': 0.2,
 'loss': 'deviance',
 'max_depth': 9,
 'max_features': 'sqrt',
 'min_samples_leaf': 30,
 'min_samples_split': 2,
 'n_estimators': 70,
 'subsample': 0.8}

In [23]:
gb = GradientBoostingClassifier(**model2.best_params_, 
                                random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9736842105263158
f1_score:  1.0


In [6]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [34]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 1000, 10),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.choice('min_samples_leaf', [10, 20, 25, 30, 35, 40, 45, 50, 55, 60]),
        'min_samples_split' : hp.choice('min_samples_split', [2, 3, 4, 5, 6, 7, 8, 9, 10]),
        'n_estimators' : hp.choice('n_estimators', [100, 150, 200, 250, 300, 350, 400, 450, 500, 
                                                    550, 600, 650, 700, 750, 800, 850, 900, 950,
                                                    1000])
    }

In [35]:
def objective(space):
    model = RandomForestClassifier(criterion = space['criterion'], max_depth = space['max_depth'],
                                 max_features = space['max_features'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X_train, y_train, cv=4).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [36]:
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|██████████| 80/80 [02:39<00:00,  1.99s/it, best loss: -0.9560031523595445]


{'criterion': 0,
 'max_depth': 370.0,
 'max_features': 0,
 'min_samples_leaf': 0,
 'min_samples_split': 5,
 'n_estimators': 10}

In [39]:
rf = RandomForestClassifier(criterion='entropy',max_depth=370, max_features='auto',
                            min_samples_leaf=10, min_samples_split=7, n_estimators=600,
                            random_state=42).fit(X_train, y_train)
print("f1_score (test): " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score (train): " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score (test):  0.9649122807017544
f1_score (train):  0.9758241758241758
