###  HyperParameter Searching
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74


In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix

url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
data = pd.read_csv(url)
data.head(10)

# setup method for missing data using a median imputer for age and fair
numeric_features = ['Age', 'Fare', 'Siblings/Spouses Aboard', 'Parents/Children Aboard']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
     ('scaler', StandardScaler())])
# setup one hot enoding for catagorical features
categorical_features = [ 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# combine numeric and catagorical transformation into one column transformer object
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# drop rows where survived is unknown
y = data['Survived']
X = pipe.fit_transform(data)

# normalizes the feature names
feature_names = numeric_features +  pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names().tolist()
feature_names = list(map(lambda x: x.replace('x0',categorical_features[0]), feature_names))
feature_names = list(map(lambda x: x.replace('x1',categorical_features[1]), feature_names))

#### Create a baseline model using CrossValidation to see what gain is made from Random Fro
A single Decision Tree Model is used here as a baseline

In [7]:
import sklearn
# show all the difference metrix tp 
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [28]:
# base line model with one Tree
from sklearn.model_selection import cross_val_predict
model = DecisionTreeClassifier()
scoring = 'accuracy'
# estimates accuracy
scores = cross_validate(model, X, y, scoring=[scoring] , cv=10)
best_score = np.max(scores['test_' + scoring])
print('Random Tree Best Classifier')
print('best {0} score : {1}'.format(scoring, best_score))
# create predictions 
y_pred = cross_val_predict(model, X, y, cv = 10)
print('true', confusion_matrix(y, y_pred), '\n predicted')

Random Tree Best Classifier
best accuracy score : 0.8409090909090909
true [[456  89]
 [ 97 245]] 
 predicted


In [34]:
### Is randpom forest really better ?

from sklearn.model_selection import cross_val_predict
model = RandomForestClassifier(n_estimators=10)
scoring = 'accuracy'
# estimates accuracy
scores = cross_validate(model, X, y, scoring=[scoring] , cv=5)
best_score = np.max(scores['test_' + scoring])
print('Random Tree Best Classifier')
print('best {0} score : {1}'.format(scoring, best_score))
# create predictions 
y_pred = cross_val_predict(model, X, y, cv = 5)
print('true', confusion_matrix(y, y_pred), '\n predicted')

Random Tree Best Classifier
best accuracy score : 0.847457627118644
true [[472  73]
 [104 238]] 
 predicted


In [67]:
# Number of trees in random forest
n_estimators  = np.arange(10,30, 10)
# Number of features to consider at every split
# Maximum number of levels in tree
max_depth = [3,4,5]
# Minimum number of samples required to split a node
min_samples_split = np.arange(2, 6, 2)
# Minimum number of samples required at each leaf node
min_samples_leaf = np.arange(2, 6,2)
# Method of selecting samples for training each tree
param_grid= {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(param_grid)


{'n_estimators': array([10, 20]), 'max_depth': [3, 4, 5], 'min_samples_split': array([2, 4]), 'min_samples_leaf': array([2, 4])}


In [86]:
model = RandomForestClassifier(n_estimators=10)
model = GridSearchCV(model, param_grid = param_grid,cv = 3,scoring=scoring, n_jobs = -1,  refit=True)

model = model.fit(X, y)


In [87]:
print('Random Tree Best Classifier on Grid Cearh ')
from sklearn.metrics import accuracy_score
# create predictions 
y_pred = model.predict(X)
print('true', confusion_matrix(y, y_pred), '\n predicted')
print('acc: {} '.format(accuracy_score(y, y_pred)))

Random Tree Best Classifier on Grid Cearh 
true [[503  42]
 [ 95 247]] 
 predicted
acc: 0.8455467869222097 


In [None]:
model.pre