## Predicting Survival Rate
***

In [1]:
import numpy as np 
import pandas as pd

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [32]:
df = pd.read_csv('data/titanic_clean.csv')

In [33]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S,Title_Master,Title_Mr,Title_Mrs,Title_Ms,Title_Sir
0,0,3,1,22.0,1,0,7.2500,0,1,0,1,0,0,0
1,1,1,0,38.0,1,0,71.2833,0,0,0,0,1,0,0
2,1,3,0,26.0,0,0,7.9250,0,1,0,0,0,1,0
3,1,1,0,35.0,1,0,53.1000,0,1,0,0,1,0,0
4,0,3,1,35.0,0,0,8.0500,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,1,27.0,0,0,13.0000,0,1,0,1,0,0,0
887,1,1,0,19.0,0,0,30.0000,0,1,0,0,0,1,0
888,0,3,0,28.0,1,2,23.4500,0,1,0,0,0,1,0
889,1,1,1,26.0,0,0,30.0000,0,0,0,1,0,0,0


In [53]:
scaler = MinMaxScaler()

In [54]:
X = df.drop(columns='Survived')
y = df['Survived']

In [55]:
X_scaled = scaler.fit_transform(X)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33)

In [47]:
# Use GridSearch to iterate through a couple of models and their hyperparameters
model_list = {'LogisticRegression': LogisticRegression(), 
              'RandomForestClassifier': RandomForestClassifier(), 
              'GradientBoostingClassifier': GradientBoostingClassifier()}

In [48]:
parameters = {'LogisticRegression': {'n_jobs':[-1], 
                                     'max_iter':[300,500,700], 
                                     'warm_start':[True,False], 
                                     'solver':['newton-cg','lbfgs'],
                                     'class_weight':[None, 'balanced']}, 
              'RandomForestClassifier': {'n_jobs':[-1],
                                         'n_estimators':[100,300,600],
                                         'criterion':['gini','entropy'],
                                         'max_depth':[None, 30,60],
                                         'min_samples_split':[2,3],
                                         'bootstrap':[True],
                                         'oob_score':[False,True],
                                         'class_weight':[None,'balanced','balanced_subsample']}, 
              'GradientBoostingClassifier': {'n_estimators':[100,300,600],
                                             'learning_rate':[1,.1,.05],
                                             'criterion':['friedman_mse','mse','mae'],
                                             'min_samples_split':[2,3]}}

In [49]:
def gridsearch_models(models, parameters):
    result = {'Models':[], 'Scores':[], 'Parameters':[]}

    for model_name, model in models.items():
        clf = GridSearchCV(model, parameters[model_name])
        clf.fit(X_train, y_train)
        result['Models'].append(model_name)
        result['Scores'].append(clf.score(X_test, y_test))
        result['Parameters'].append(clf.get_params())

    return result

In [57]:
first_model_test = gridsearch_models(model_list, parameters)

In [58]:
first_model_test # tried no scaling, standardscaler, and minmaxscaler - minmax worked best

{'Models': ['LogisticRegression',
  'RandomForestClassifier',
  'GradientBoostingClassifier'],
 'Scores': [0.8699551569506726, 0.8609865470852018, 0.8609865470852018],
 'Parameters': [{'cv': None,
   'error_score': nan,
   'estimator__C': 1.0,
   'estimator__class_weight': None,
   'estimator__dual': False,
   'estimator__fit_intercept': True,
   'estimator__intercept_scaling': 1,
   'estimator__l1_ratio': None,
   'estimator__max_iter': 100,
   'estimator__multi_class': 'auto',
   'estimator__n_jobs': None,
   'estimator__penalty': 'l2',
   'estimator__random_state': None,
   'estimator__solver': 'lbfgs',
   'estimator__tol': 0.0001,
   'estimator__verbose': 0,
   'estimator__warm_start': False,
   'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                      intercept_scaling=1, l1_ratio=None, max_iter=100,
                      multi_class='auto', n_jobs=None, penalty='l2',
                      random_state=None, solver='lbfgs', tol