## Predicting Survival Rate
***

In [1]:
import numpy as np 
import pandas as pd

In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the Titanic CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='data/titanic_clean.csv')

In [3]:
# Preview the loaded frame (the rendered output elides the middle rows).
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Sir
0,0,3,1,22.0,1,0,7.2500,0,1,0,1,0,0,0
1,1,1,0,38.0,1,0,71.2833,0,0,0,0,1,0,0
2,1,3,0,26.0,0,0,7.9250,0,1,0,0,0,1,0
3,1,1,0,35.0,1,0,53.1000,0,1,0,0,1,0,0
4,0,3,1,35.0,0,0,8.0500,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0,1,0,1,0,0,0
887,1,1,0,19.0,0,0,30.0000,0,1,0,0,0,1,0
888,0,3,0,28.0,1,2,23.4500,0,1,0,0,0,1,0
889,1,1,1,26.0,0,0,30.0000,0,0,0,1,0,0,0


In [4]:
# Target: the binary 'Survived' column.
y = df['Survived']

# Features: everything except the target and any saved-index column.
# read_csv is called without index_col, so a row index that was written into the
# CSV comes back as an 'Unnamed: N' column (the preview lists 'Unnamed: 0').
# That is a row counter rather than a passenger attribute, so it must not be fed
# to the models. The filter is a no-op when no such column exists.
index_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
X = df.drop(columns=['Survived', *index_cols])

In [22]:
# Hold out a test partition; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=33,
)

In [38]:
# Baseline logistic regression: raised iteration cap, all CPU cores requested.
logres_model = LogisticRegression(
    n_jobs=-1,
    max_iter=300,
)

In [39]:
# Fit on the training partition; the cell displays the fitted estimator.
logres_model.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
# Held-out score of the baseline model.
# First run with (max_iter=300, n_jobs=-1) gave 0.8699551569506726.
logres_model.score(X=X_test, y=y_test)

0.8699551569506726

In [60]:
# Candidate classifiers to run through GridSearchCV, keyed by class name
# (the same keys index the `parameters` grids).
# The ensemble and neural-network estimators draw random numbers while fitting,
# so each gets a fixed random_state; without it their scores change on every
# run. 33 mirrors the seed already used for train_test_split.
model_list = {'LogisticRegression': LogisticRegression(),
              'RandomForestClassifier': RandomForestClassifier(random_state=33),
              'AdaBoostClassifier': AdaBoostClassifier(random_state=33),
              'GradientBoostingClassifier': GradientBoostingClassifier(random_state=33),
              'MLPClassifier': MLPClassifier(random_state=33)}

In [None]:
# Search grids handed to GridSearchCV, keyed by the same names as `model_list`.
# Only n_jobs is listed (for two of the models); an empty grid leaves that
# model on its constructor settings.
parameters = dict(
    LogisticRegression=dict(n_jobs=[-1]),
    RandomForestClassifier=dict(n_jobs=[-1]),
    AdaBoostClassifier=dict(),
    GradientBoostingClassifier=dict(),
    MLPClassifier=dict(),
)

In [76]:
# Baseline benchmark: run every candidate through GridSearchCV and store its
# held-out score together with the hyperparameters of the model that was
# actually selected, keyed by model name.
base_score = {}

for model_name, model in model_list.items():
    clf = GridSearchCV(model, parameters[model_name])
    clf.fit(X_train, y_train)
    base_score[model_name] = {
        'Score': clf.score(X_test, y_test),
        # clf.get_params() would describe the GridSearchCV wrapper itself
        # (estimator template, param_grid, cv, ...), not the fitted winner, so
        # read the hyperparameters off the refit best estimator instead.
        'Parameters': clf.best_estimator_.get_params(),
    }



In [79]:
# Baseline benchmark in column layout: three parallel lists (model name,
# held-out score, selected hyperparameters), one entry per candidate model.
base_score = {'Models': [], 'Scores': [], 'Parameters': []}

for model_name, model in model_list.items():
    clf = GridSearchCV(model, parameters[model_name])
    clf.fit(X_train, y_train)
    base_score['Models'].append(model_name)
    base_score['Scores'].append(clf.score(X_test, y_test))
    # clf.get_params() would describe the GridSearchCV wrapper itself
    # (estimator template, param_grid, cv, ...), not the fitted winner, so
    # read the hyperparameters off the refit best estimator instead.
    base_score['Parameters'].append(clf.best_estimator_.get_params())



In [83]:
# Print one (model name, test score) tuple per line.
for name, score in zip(base_score['Models'], base_score['Scores']):
    print((name, score))

('LogisticRegression', 0.8699551569506726)
('RandomForestClassifier', 0.8654708520179372)
('AdaBoostClassifier', 0.8609865470852018)
('GradientBoostingClassifier', 0.874439461883408)
('MLPClassifier', 0.852017937219731)
