# Random Forest

In [24]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, SparkTrials
from hyperopt.pyll.stochastic import sample

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100
pd.set_option('max_colwidth', 200)

### Import features
scikit-learn pipeline generated the features

In [29]:
titanic_features, label_col, features_cols = pickle.load(open( "data/titanic/features_df.p", "rb" ))

In [30]:
titanic_features.head()

Unnamed: 0,name_prefix_Master.,name_prefix_Miss.,name_prefix_Mr.,name_prefix_Mrs.,name_prefix_None,name_parenths_no,name_parenths_yes,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Pclass_1,Pclass_2,Pclass_3,ticket_text_1,ticket_text_2,ticket_text_3,ticket_text_4,ticket_text_5,ticket_text_6,ticket_text_7,ticket_text_8,ticket_length_3,ticket_length_4,ticket_length_5,ticket_length_6,ticket_length_7,cabin_chars_A,cabin_chars_B,cabin_chars_C,cabin_chars_D,cabin_chars_E,cabin_chars_F,cabin_chars_INFREQ,cabin_chars_NONE,Age,Fare,SibSp,Parch,Survived
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,22.0,7.25,1.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,38.0,71.2833,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,26.0,7.925,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,35.0,53.1,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.0,8.05,0.0,0.0,0.0


### Fit a default model

In [27]:
clf = RandomForestClassifier(n_jobs=-1)

In [28]:
model_score = cross_val_score(clf, 
                              titanic_features[feature_cols], 
                              titanic_features[label_col],
                              cv=5, 
                              scoring="roc_auc")
model_score.mean()

0.8684341864170696

### Fit a model with grid search

In [13]:
param_grid = [{'n_estimators':      [100, 150, 200, 250], 
               'min_samples_split': [2, 5, 10, 15, 20],
               'max_features':      [2, 5, 8, 10, 12]}]

grid_search = GridSearchCV(clf, param_grid, scoring="roc_auc", n_jobs=-1, cv=5)

grid_search.fit(titanic_features[feature_cols], titanic_features[label_col])

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False,
                                              random_

In [14]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=12,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [15]:
grid_search.best_score_

0.8830553487667686

### Fit a model with Hyperopt grid search

Search space

In [19]:
space = {'n_estimators':      hp.quniform('n_estimators', 10, 300, 1),
         'min_samples_split': hp.quniform('min_samples_split', 2, 20, 1),
         'max_features':      hp.quniform('max_features', 2, 20, 1)}

Objective function

In [20]:
def objective(params, 
              n_folds=5, 
              scoring='roc_auc', 
              features_df=titanic_features, 
              features=feature_cols, 
              label=label_col):
   
    params['n_estimators'] = int(params['n_estimators'])
    params['min_samples_split'] = int(params['min_samples_split'])
    params['max_features'] = int(params['max_features'])
    params['random_state'] = 1
  
    clf = RandomForestClassifier(**params)

    scores = cross_val_score(clf, 
                             features_df[features], 
                             features_df[label], 
                             cv=n_folds, 
                             scoring=scoring, 
                             n_jobs=-1)
  
    mean_score = scores.mean()
    loss = 1 - mean_score
  
    return {'loss': loss, 'status': STATUS_OK}

In [21]:
bayes_trials = Trials()

best = fmin(fn=objective, 
            space=space, 
            algo=tpe.suggest,
            max_evals=300, 
            trials=bayes_trials, 
            rstate=np.random.RandomState(50))

100%|██████████| 300/300 [01:58<00:00,  2.53trial/s, best loss: 0.11722918964155282]


In [22]:
best

{'max_features': 14.0, 'min_samples_split': 10.0, 'n_estimators': 160.0}

In [23]:
print(1 - bayes_trials.best_trial['result']['loss'])

0.8827708103584472
