In [1]:
import numpy as np
import pandas as pd
import re
import os
import pickle
from pandas.plotting import scatter_matrix
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

get_ipython().magic(u'env OMP_NUM_THREADS=2')

from IPython.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt

#get_ipython().magic(u'matplotlib')
#get_ipython().magic(u'matplotlib inline')

# Set the ransom seed used for the whole program to allow reprocibility
np.random.seed(3214412)

from sklearn.model_selection import cross_val_score

env: OMP_NUM_THREADS=2


In [2]:
data_dir = '../data/processed'
feature_filename = os.path.join(data_dir, 'feature_set.csv')

In [3]:
feature_df = pd.read_csv(feature_filename)
feature_df.head()

Unnamed: 0,Survived,Sex,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Cabin_nan,Cabin_B,Cabin_C,Cabin_D,Cabin_E
0,0,1,7.25,0,0,1,0,0,1,1,0,0,0,0
1,1,0,71.2833,1,0,0,1,0,0,0,0,1,0,0
2,1,0,7.925,0,0,1,0,0,1,1,0,0,0,0
3,1,0,53.1,1,0,0,0,0,1,0,0,1,0,0
4,0,1,8.05,0,0,1,0,0,1,1,0,0,0,0


In [4]:
# Remove the dependent variable, and remove the Pclass_2 as it's highly correlated with other
# varialbles as and not a very useful predictor per feature_analysis
ind_df = feature_df.drop(labels=['Survived', 'Pclass_2'], axis=1)
dep_df = feature_df.Survived

# Model Exploration

## Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
lrcv = LogisticRegression()
cv = cross_val_score(lrcv, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.759776536313, 0.787709497207, 0.775280898876, 0.769662921348, 0.790960451977
Best Score:  0.790960451977


## KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
knn = KNeighborsClassifier(n_neighbors=3)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.77094972067, 0.720670391061, 0.808988764045, 0.775280898876, 0.774011299435
Best Score:  0.808988764045


In [8]:
knn = KNeighborsClassifier(n_neighbors=5)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.715083798883, 0.703910614525, 0.808988764045, 0.769662921348, 0.774011299435
Best Score:  0.808988764045


In [9]:
knn = KNeighborsClassifier(n_neighbors=25)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.553072625698, 0.754189944134, 0.691011235955, 0.707865168539, 0.734463276836
Best Score:  0.754189944134


In [10]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(knn, {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 100]})
gs.fit(ind_df, dep_df)
print(gs.best_estimator_.n_neighbors)
print(gs.best_score_)

7
0.749719416386


## Support Vector Machine
TODO not sure what the value of C should be

In [11]:
from sklearn.svm import SVC
svc = SVC(C=0.025)
cv = cross_val_score(svc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.614525139665, 0.614525139665, 0.61797752809, 0.61797752809, 0.61581920904
Best Score:  0.61797752809


## Decision Trees

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dtc = DecisionTreeClassifier(max_depth=5)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.798882681564, 0.793296089385, 0.808988764045, 0.792134831461, 0.813559322034
Best Score:  0.813559322034


In [14]:
dtc = DecisionTreeClassifier(max_depth=10)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.787709497207, 0.759776536313, 0.870786516854, 0.814606741573, 0.830508474576
Best Score:  0.870786516854


In [15]:
dtc = DecisionTreeClassifier(max_depth=100)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.765363128492, 0.776536312849, 0.859550561798, 0.803370786517, 0.841807909605
Best Score:  0.859550561798


## Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rfc = RandomForestClassifier(n_estimators=1000)
cv = cross_val_score(rfc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.765363128492, 0.77094972067, 0.85393258427, 0.808988764045, 0.813559322034
Best Score:  0.85393258427


## Naive Bayes Classifier

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
nb = GaussianNB()
cv = cross_val_score(nb, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.642458100559, 0.715083798883, 0.679775280899, 0.741573033708, 0.779661016949
Best Score:  0.779661016949


## Example - Confusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix
lrcvmodel = lrcv.fit(ind_df, dep_df)
confusion_matrix(lrcvmodel.predict(ind_df), dep_df)

array([[453,  92],
       [ 96, 250]])

## Do the comparison in a loop

In [21]:
from collections import OrderedDict

classifiers = OrderedDict([
    ('SVC', SVC(C=0.025)),
    ('LogisticRegression', LogisticRegression()),
    ('KNN3', KNeighborsClassifier(n_neighbors=3)),
    ('KNN5', KNeighborsClassifier(n_neighbors=5)),
    ('KNN7', KNeighborsClassifier(n_neighbors=7)),
    ('KNN10', KNeighborsClassifier(n_neighbors=10)),
    ('DecisionTree5', DecisionTreeClassifier(max_depth=5)),
    ('DecisionTree10', DecisionTreeClassifier(max_depth=10)),
    ('DecisionTree100', DecisionTreeClassifier(max_depth=100)),
    ('RandomForest', RandomForestClassifier(n_estimators=1000)),
    ('NaiveBayes', GaussianNB())])

Comments
* SVC doesn't look like a great model with the F score really low and the CV score not that great
   * Maybe with a different C value SVC may be better
* 

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score

m_df = pd.DataFrame(columns=['Name', 'Best_CV_Accuracy', 'Average_CV_Accuracy', 'F_Score', 'Precision', 'Recall'])
mname = max(map(len, classifiers.keys()))

print("Model{0}Best CV Score\tAverage CV Score\tF Score".format(' '*(mname - 5)))
mname = max(map(len, classifiers.keys()))
for name, model in classifiers.items():
    pipeline = make_pipeline(preprocessing.StandardScaler(), model)
    cv = cross_val_score(pipeline, ind_df, dep_df, cv=5)
    preds = model.fit(ind_df, dep_df).predict(ind_df)
    f1 = f1_score(preds, dep_df)
    precision = precision_score(preds, dep_df)
    recall = recall_score(preds, dep_df)
    m_df = m_df.append({
            'Name': name, 
            'Best_CV_Accuracy': max(cv), 
            'Average_CV_Accuracy': np.mean(cv), 
            'F_Score': f1,
            'Precision': precision,
            'Recall': recall}, ignore_index=True)


Model             Best CV Score	Average CV Score	F Score


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
m_df = m_df.sort_values(['F_Score', 'Average_CV_Accuracy', 'Best_CV_Accuracy', 'Precision', 'Recall'], 
                        ascending=False)
    
from tabulate import tabulate
print(tabulate(m_df, headers='keys', tablefmt='psql'))

+----+--------------------+--------------------+-----------------------+-----------+-------------+----------+
|    | Name               |   Best_CV_Accuracy |   Average_CV_Accuracy |   F_Score |   Precision |   Recall |
|----+--------------------+--------------------+-----------------------+-----------+-------------+----------|
|  9 | RandomForest       |           0.853933 |              0.802571 |  0.88685  |    0.847953 | 0.929487 |
|  8 | DecisionTree100    |           0.859551 |              0.804819 |  0.885449 |    0.836257 | 0.940789 |
|  7 | DecisionTree10     |           0.876404 |              0.816036 |  0.840442 |    0.777778 | 0.914089 |
|  2 | KNN3               |           0.803371 |              0.787914 |  0.814056 |    0.812865 | 0.815249 |
|  3 | KNN5               |           0.819209 |              0.794687 |  0.78869  |    0.774854 | 0.80303  |
|  4 | KNN7               |           0.836158 |              0.801454 |  0.755352 |    0.722222 | 0.791667 |
|  6 | Dec

Best Models
* RandomForest
* Decision Trees
* KNN

Not too suprising as these are typical classification models.

# Deep Dives 

## Random Forest

In [24]:
forest = make_pipeline(preprocessing.StandardScaler(), RandomForestClassifier())
best_forest = GridSearchCV(forest, 
                          param_grid=dict(randomforestclassifier__n_estimators=[10, 100, 1000],
                                          randomforestclassifier__max_features=list(range(1, 13)),
                                          randomforestclassifier__max_depth=[10, 100, 1000, None]),
                          n_jobs=-1)
best_forest.fit(ind_df, dep_df)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'randomforestclassifier__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'randomforestclassifier__max_depth': [10, 100, 1000, None], 'randomforestclassifier__n_estimators': [10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
print('Best Score: ', best_forest.best_score_)
best_forest_preds = best_forest.best_estimator_.fit(ind_df, dep_df).predict(ind_df)
print('F_Score: ', f1_score(best_forest_preds, dep_df))
print('Precision: ', precision_score(best_forest_preds, dep_df))
print('Recall: ', recall_score(best_forest_preds, dep_df))
print('Best Params:\n', '\n'.join(map(str, best_forest.best_params_.items())))

Best Score:  0.794612794613
F_Score:  0.852255054432
Precision:  0.801169590643
Recall:  0.910299003322
Best Params:
 ('randomforestclassifier__max_features', 1)
('randomforestclassifier__max_depth', 10)
('randomforestclassifier__n_estimators', 10)


In [26]:
confusion_matrix(best_forest.predict(ind_df), dep_df)

array([[522,  68],
       [ 27, 274]])

## DecisionTree

In [27]:
tree = make_pipeline(preprocessing.StandardScaler(), DecisionTreeClassifier())
best_tree = GridSearchCV(tree, 
                          param_grid=dict(decisiontreeclassifier__max_depth=[10, 100, 1000, None],
                                          decisiontreeclassifier__max_features=list(range(1, 13)),
                                          decisiontreeclassifier__max_leaf_nodes=[10, 100, 1000, None]),
                          n_jobs=-1)
best_tree.fit(ind_df, dep_df)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurit...     min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'decisiontreeclassifier__max_depth': [10, 100, 1000, None], 'decisiontreeclassifier__max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'decisiontreeclassifier__max_leaf_nodes': [10, 100, 1000, None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [28]:
print('Best Score: ', best_tree.best_score_)
best_tree_preds = best_tree.best_estimator_.fit(ind_df, dep_df).predict(ind_df)
print('F_Score: ', f1_score(best_tree_preds, dep_df))
print('Precision: ', precision_score(best_tree_preds, dep_df))
print('Recall: ', recall_score(best_tree_preds, dep_df))
print('Best Params:\n', '\n'.join(map(str, best_tree.best_params_.items())))

Best Score:  0.810325476992
F_Score:  0.71652173913
Precision:  0.602339181287
Recall:  0.884120171674
Best Params:
 ('decisiontreeclassifier__max_depth', 10)
('decisiontreeclassifier__max_features', 11)
('decisiontreeclassifier__max_leaf_nodes', 10)


In [29]:
confusion_matrix(best_tree.predict(ind_df), dep_df)

array([[522, 136],
       [ 27, 206]])

## KNN

In [30]:
knn = make_pipeline(preprocessing.StandardScaler(), KNeighborsClassifier())
best_knn = GridSearchCV(knn, 
                          param_grid=dict(kneighborsclassifier__n_neighbors=list(range(1, 101))),
                          n_jobs=-1)
best_knn.fit(ind_df, dep_df)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsclassifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'kneighborsclassifier__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
print('Best Score: ', best_knn.best_score_)
best_knn_preds = best_knn.best_estimator_.fit(ind_df, dep_df).predict(ind_df)
print('F_Score: ', f1_score(best_knn_preds, dep_df))
print('Precision: ', precision_score(best_knn_preds, dep_df))
print('Recall: ', recall_score(best_knn_preds, dep_df))
print('Best Params:\n', '\n'.join(map(str, best_knn.best_params_.items())))

Best Score:  0.79797979798
F_Score:  0.747663551402
Precision:  0.701754385965
Recall:  0.8
Best Params:
 ('kneighborsclassifier__n_neighbors', 13)


In [32]:
confusion_matrix(best_knn.predict(ind_df), dep_df)

array([[489, 102],
       [ 60, 240]])

## Comparison of Gridsearched Random Forest, Decision Tree and KNN

In [33]:
for name, gs_model in dict(forest=best_forest, tree=best_tree, knn=best_knn).items():
    print('***{}***'.format(name))
    print('Best Score: ', gs_model.best_score_)
    preds = gs_model.best_estimator_.fit(ind_df, dep_df).predict(ind_df)
    print('F_Score: ', f1_score(preds, dep_df))
    print('Precision: ', precision_score(preds, dep_df))
    print('Recall: ', recall_score(preds, dep_df))

***knn***
Best Score:  0.79797979798
F_Score:  0.747663551402
Precision:  0.701754385965
Recall:  0.8
***tree***
Best Score:  0.810325476992
F_Score:  0.718584070796
Precision:  0.593567251462
Recall:  0.910313901345
***forest***
Best Score:  0.794612794613
F_Score:  0.853582554517
Precision:  0.801169590643
Recall:  0.913333333333


# Conculsion
Random Forest with optimized paramaters gives the best model
1. Not the best Score as the Decision tree gives the best score 0.8035 for forest vs 0.8148 for the tree.  Not a large difference.
2. Best F-Score
3. Best Precision
4. Best Recall

In [38]:
# Output best model to file
# Use file extension 'pkl' as per https://docs.python.org/2/library/pickle.html#example
import pickle
output_path = os.path.join('../models', 'random_forest_v1.pkl')

# Pickle dictionary
pickle.dump(best_forest.best_estimator_, open(output_path, 'wb'))

# Verify output model
loaded_model = pickle.load(open(output_path, 'rb'))
preds = loaded_model.predict(ind_df)
match = all(loaded_model.predict(ind_df) == best_forest.best_estimator_.predict(ind_df))
print('All predictions match expected: ', match)

All predictions match expected:  True


In [39]:
from sklearn.externals import joblib
joblib.dump(best_forest.best_estimator_, output_path) 

# Verify output model
loaded_model = joblib.load(output_path)
preds = loaded_model.predict(ind_df)
match = all(loaded_model.predict(ind_df) == best_forest.best_estimator_.predict(ind_df))
print('All predictions match expected: ', match)

All predictions match expected:  True
