In [1]:
import numpy as np
import pandas as pd
import re
import os
from pandas.plotting import scatter_matrix

get_ipython().magic(u'env OMP_NUM_THREADS=2')

from IPython.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt

#get_ipython().magic(u'matplotlib')
#get_ipython().magic(u'matplotlib inline')

# Set the ransom seed used for the whole program to allow reprocibility
np.random.seed(3214412)

from sklearn.model_selection import cross_val_score

env: OMP_NUM_THREADS=2


In [2]:
data_dir = '../data/processed'
feature_filename = os.path.join(data_dir, 'feature_set.csv')

In [3]:
feature_df = pd.read_csv(feature_filename)
feature_df.head()

Unnamed: 0,Survived,Sex,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Cabin_nan,Cabin_B,Cabin_C,Cabin_D,Cabin_E
0,0,1,7.25,0,0,1,0,0,1,1,0,0,0,0
1,1,0,71.2833,1,0,0,1,0,0,0,0,1,0,0
2,1,0,7.925,0,0,1,0,0,1,1,0,0,0,0
3,1,0,53.1,1,0,0,0,0,1,0,0,1,0,0
4,0,1,8.05,0,0,1,0,0,1,1,0,0,0,0


In [4]:
# Remove the dependent variable, and remove the Pclass_2 as it's highly correlated with other
# varialbles as and not a very useful predictor per feature_analysis
ind_df = feature_df.drop(labels=['Survived', 'Pclass_2'], axis=1)
dep_df = feature_df.Survived

# Model Exploration

## Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
lrcv = LogisticRegression()
cv = cross_val_score(lrcv, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.759776536313, 0.787709497207, 0.775280898876, 0.769662921348, 0.790960451977
Best Score:  0.790960451977


## KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
knn = KNeighborsClassifier(n_neighbors=3)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.77094972067, 0.720670391061, 0.808988764045, 0.775280898876, 0.774011299435
Best Score:  0.808988764045


In [8]:
knn = KNeighborsClassifier(n_neighbors=5)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.715083798883, 0.703910614525, 0.808988764045, 0.769662921348, 0.774011299435
Best Score:  0.808988764045


In [9]:
knn = KNeighborsClassifier(n_neighbors=25)
cv = cross_val_score(knn, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.553072625698, 0.754189944134, 0.691011235955, 0.707865168539, 0.734463276836
Best Score:  0.754189944134


In [10]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(knn, {'n_neighbors':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 25, 50, 100]})
gs.fit(ind_df, dep_df)
print(gs.best_estimator_.n_neighbors)
print(gs.best_score_)

7
0.749719416386


## Support Vector Machine
TODO not sure what the value of C should be

In [11]:
from sklearn.svm import SVC
svc = SVC(C=0.025)
cv = cross_val_score(svc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.614525139665, 0.614525139665, 0.61797752809, 0.61797752809, 0.61581920904
Best Score:  0.61797752809


## Decision Trees

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dtc = DecisionTreeClassifier(max_depth=5)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.798882681564, 0.793296089385, 0.808988764045, 0.792134831461, 0.813559322034
Best Score:  0.813559322034


In [14]:
dtc = DecisionTreeClassifier(max_depth=10)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.787709497207, 0.759776536313, 0.870786516854, 0.814606741573, 0.830508474576
Best Score:  0.870786516854


In [15]:
dtc = DecisionTreeClassifier(max_depth=100)
cv = cross_val_score(dtc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.765363128492, 0.776536312849, 0.859550561798, 0.803370786517, 0.841807909605
Best Score:  0.859550561798


## Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
rfc = RandomForestClassifier(n_estimators=1000)
cv = cross_val_score(rfc, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.765363128492, 0.77094972067, 0.85393258427, 0.808988764045, 0.813559322034
Best Score:  0.85393258427


## Naive Bayes Classifier

In [18]:
from sklearn.naive_bayes import GaussianNB

In [19]:
nb = GaussianNB()
cv = cross_val_score(nb, ind_df, dep_df, cv=5)
print(', '.join(map(str, cv)))
print('Best Score: ', max(cv))

0.642458100559, 0.715083798883, 0.679775280899, 0.741573033708, 0.779661016949
Best Score:  0.779661016949


# Vary the Features