# Hyper Parameter Optimization

First we import modules from sklearn and load a dataset

In [2]:
import pandas as pd
import numpy as np

from sklearn import datasets

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
df = pd.read_csv('/Users/luken2/Documents/GitHub/hyper-parameter-tuning/data/height_weight_gender.csv')

df.columns

Index(['Gender', 'Height', 'Weight'], dtype='object')

In [4]:
binary_class = []
for gender in df['Gender']:
    if gender == 'Male':
        binary_class.append(1)
    if gender == 'Female':
        binary_class.append(0)
df['target'] = pd.DataFrame(binary_class)

In [5]:
X = df[['Height', 'Weight']]
y = df['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, 
                                                    random_state = 42)

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest Classifier with default settings

In [33]:
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score:  0.9045
f1_score:  0.99175




## Randomized Search

Formulating a randomized search we need 4 things:
1. Estimator: Model to be optimized
2. param_distribution: space of params to randomly search
3. n_iter: number of iterations to perform
4. scoring: metric to be maximized

In [34]:
params = {'criterion': ['entropy', 'gini'],
        'max_depth': list(np.linspace(10, 600, 10, dtype = int)) + [None],
        'max_features': ['auto', 'sqrt','log2', None],
        'min_samples_leaf': [1, 4, 6, 8, 12],
        'min_samples_split': [2, 5, 7, 10, 14],
        'n_estimators': list(np.linspace(100, 1000, 10, dtype = int))}

model = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=params,
                           n_iter=50, scoring='f1', verbose=3, cv=3, n_jobs=-1,
                           random_state=42)

In [35]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

The function `.best_params_` will give you the parameters it found and can be plugged directly into the original model

In [36]:
model.best_params_

{'n_estimators': 800,
 'min_samples_split': 14,
 'min_samples_leaf': 12,
 'max_features': 'log2',
 'max_depth': 141,
 'criterion': 'entropy'}

In [37]:
rf = RandomForestClassifier(**model.best_params_,
                            random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score:  0.925
f1_score:  0.923375


## Grid Search

Formulating a grid search we need 3 things:
1. Estimator: Model to be optimized
2. param_grid: Grid of parameters to be tested (Avoid Large Grids!)
3. scoring: Metric to be maximized

In [38]:
grid = {'criterion': ['entropy'],
        'max_depth': [70, 80, 90, 100],
        'max_features': ['log2'],
        'min_samples_leaf': [13],
        'min_samples_split': [9, 10, 11],
        'n_estimators': [830]}

model2 = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid,
                      scoring='f1', verbose=3, cv=3, n_jobs=-1)

In [39]:
model2.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done  34 out of  36 | elapsed:   26.0s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   26.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [40]:
model2.best_params_

{'criterion': 'entropy',
 'max_depth': 100,
 'max_features': 'log2',
 'min_samples_leaf': 13,
 'min_samples_split': 11,
 'n_estimators': 830}

In [41]:
rf = RandomForestClassifier(**model2.best_params_,
                            random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score:  0.9245
f1_score:  0.923375


# Bayes Optimization

Formulating a bayesian optimization problem in hyperopt we need 4 parts:
1. Objective Function: Takes in an input and returns a loss to minimize
2. Space: Range of values to test
3. Optimization Algorithm: Method to construct the surragate function and choose next values to evaluate
4. Results: Score, Value pairs algorithm used

### hyperopt library
`pip install hyperopt`
* hp: Gives functions to create probability distibutions of our range of space
* fmin: Minimizes objective function given range of values to test and optimization algorithm
* tpe: Optimization algorithm (Tree-Structured Partizan Estimator)
* STATUS_OK: Checks status?
* Trials: Results history if we want to see what is going on behind the scenes

In [42]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [43]:
MAX_EVALS = 50

#### To make parameter space use hyperopt.hp
1. `hp.choice` : Uniform distribution over each value specified
2. `hp.uniform` : Continuous uniform distribution (Floats)
3. `hp.quniform` : Discrete uniform distribution (Integers
4. `hp.loguniform` : Countinuous log distribution (Floats)
5. `hp.qloguniform` : Discrete log distribution (Integers)

In [44]:
space = {'criterion': hp.choice('criterion', ['entropy', 'gini']),
        'max_depth': hp.quniform('max_depth', 10, 200, 5),
        'max_features': hp.choice('max_features', ['auto', 'sqrt','log2', None]),
        'min_samples_leaf': hp.choice('min_samples_leaf', range(2,20)),
        'min_samples_split': hp.choice('min_samples_split', range(2,50)),
        'n_estimators': hp.choice('n_estimators', range(100, 1000, 10))}

In [78]:
def objective(params):
    
    # Model to be hyper-optimized
    
    model = RandomForestClassifier(**params, random_state=42)
    
    # Score to be used in evaluation
    
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean()
    
    # Loss must be minimized
    
    loss = 1 - score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }

In [51]:
trials = Trials()
# We initialize trials object here to be able to see our results after algorithm is complete
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

100%|██████████| 50/50 [10:07<00:00, 12.16s/it, best loss: 0.08225312228074633]


{'criterion': 1,
 'max_depth': 180.0,
 'max_features': 1,
 'min_samples_leaf': 11,
 'min_samples_split': 10,
 'n_estimators': 30}

In [79]:
# To see which results were best
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

{'loss': 0.08050118811690454,
 'params': {'learning_rate': 0.011273254429806057,
  'loss': 'exponential',
  'max_depth': 4.0,
  'max_features': 'auto',
  'min_samples_leaf': 60,
  'min_samples_split': 8,
  'n_estimators': 470,
  'subsample': 0.5181493292615545},
 'status': 'ok'}

In [60]:
rf = RandomForestClassifier(criterion='gini', max_depth=180, max_features='sqrt',
                            min_samples_leaf=13, min_samples_split=12, n_estimators=400,
                            random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(rf.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(rf.predict(X_train), y_train, average='micro'))

f1_score:  0.924
f1_score:  0.923375


# Gradient Boosting Classifier with default settings

In [61]:
gb = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9235
f1_score:  0.925


## Randomized Search

In [64]:
params = {'loss': ['deviance', 'exponential'],
         'learning_rate': [0.05, 0.1, 0.15, 0.2],
         'n_estimators': list(np.linspace(30, 300, 7, dtype=int)),
         'subsample': [0.5, 0.7, 0.9 ,1],
         'min_samples_split': [3, 4, 5, 6, 7],
         'min_samples_leaf': list(np.linspace(1, 100, 10, dtype=int)),
         'max_depth': [2,4,6,8,10],
         'max_features': ['auto', 'sqrt', 'log2'],
         }

model = RandomizedSearchCV(estimator=GradientBoostingClassifier(random_state=42), 
                           param_distributions=params, scoring='f1', n_iter=50, 
                           verbose=3, cv=3, n_jobs=-1, random_state=42)

In [65]:
model.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   17.5s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                        init=None,
                                                        learning_rate=0.1,
                                                        loss='deviance',
                                                        max_depth=3,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                     

In [66]:
model.best_params_

{'subsample': 0.7,
 'n_estimators': 165,
 'min_samples_split': 5,
 'min_samples_leaf': 56,
 'max_features': 'auto',
 'max_depth': 2,
 'loss': 'deviance',
 'learning_rate': 0.05}

In [67]:
gb = GradientBoostingClassifier(**model.best_params_, 
                                random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9235
f1_score:  0.920875


## Grid Search

In [68]:
grid = {'loss': ['deviance'],
         'learning_rate': [0.05, 0.1],
         'n_estimators': [100, 150],
         'subsample': [0.7, 0.8],
         'min_samples_split': [5],
         'min_samples_leaf': [40, 50, 60],
         'max_depth': [2, 3, 4],
         'max_features': ['sqrt'],
         }

model2 = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=grid,
                      scoring='f1', verbose=3, cv=3, n_jobs=-1)

In [69]:
model2.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 216 out of 216 | elapsed:   11.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no...
                                                  validation_fractio

In [70]:
model2.best_params_

{'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': 'sqrt',
 'min_samples_leaf': 60,
 'min_samples_split': 5,
 'n_estimators': 100,
 'subsample': 0.7}

In [71]:
gb = GradientBoostingClassifier(**model2.best_params_, 
                                random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9225
f1_score:  0.92275


## Bayes Optimization

In [6]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [21]:
MAX_EVALS = 50

In [73]:
space = {'loss': hp.choice('loss', ['deviance', 'exponential']),
         'learning_rate': hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
         'n_estimators': hp.choice('n_estimators', range(50,1000,10)),
         'subsample': hp.uniform('subsample', 0.4, 1),
         'min_samples_split': hp.choice('min_samples_split', range(2, 20, 2)),
         'min_samples_leaf': hp.choice('min_samples_leaf', range(10,100,5)),
         'max_depth': hp.quniform('max_depth', 2, 12, 1),
         'max_features': hp.choice('max_features', ['auto', 'sqrt', 'log2'])
         }

In [74]:
def objective(params):
    
    model = GradientBoostingClassifier(**params, random_state=42, verbose=0)
    
    score = cross_val_score(model, X, y, cv=5, scoring='f1_macro').mean()
    
    loss = 1 - score
    
    return {'loss': loss, 'params': params, 'status': STATUS_OK }

In [75]:
tpe.algorithm = tpe.suggest

trials = Trials()

best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = MAX_EVALS,
            trials= trials)
best

100%|██████████| 50/50 [07:57<00:00,  9.54s/it, best loss: 0.08050118811690454]


{'learning_rate': 0.011273254429806057,
 'loss': 1,
 'max_depth': 4.0,
 'max_features': 0,
 'min_samples_leaf': 10,
 'min_samples_split': 3,
 'n_estimators': 42,
 'subsample': 0.5181493292615545}

In [76]:
best_results = sorted(trials.results, key = lambda x: x['loss'])
best_results[0]

{'loss': 0.08050118811690454,
 'params': {'learning_rate': 0.011273254429806057,
  'loss': 'exponential',
  'max_depth': 4.0,
  'max_features': 'auto',
  'min_samples_leaf': 60,
  'min_samples_split': 8,
  'n_estimators': 470,
  'subsample': 0.5181493292615545},
 'status': 'ok'}

In [77]:
gb = GradientBoostingClassifier(learning_rate=0.011, loss='exponential', max_depth=4,
                                max_features='auto', min_samples_leaf=60, min_samples_split=8,
                                n_estimators=470, subsample=0.5,
                                random_state=42).fit(X_train, y_train)
print("f1_score: " , f1_score(gb.predict(X_test), y_test, average='micro'))
print("f1_score: " , f1_score(gb.predict(X_train), y_train, average='micro'))

f1_score:  0.9235
f1_score:  0.921125
