# Kaggle Competition: Titanic
Predicting whether an idividual survived or not

In [60]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',100)
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.model_selection import train_test_split

# Import Logistic Regression
from sklearn.linear_model import LogisticRegression
# Import RandomForestClassifier and GradientBoostingClassifer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Function for splitting training and test set
from sklearn.model_selection import train_test_split
# Function for creating model pipelines
from sklearn.pipeline import make_pipeline
# For standardization
from sklearn.preprocessing import StandardScaler
# Helper for cross-validation
from sklearn.model_selection import GridSearchCV
# Classification metrics (added later)
from sklearn.metrics import roc_curve, auc, confusion_matrix

Steps to follow: 
1. [Split dataset into train and test](#data_split)
2. [Build the model pipelines](#pipeline)
3. [Declare hyperparameters to tune for our task](#hyperparameters)
4. [Fit and tune models using cross-validation](#tune)
5. [Evaluate model](#evaluate)
6. [Area under ROC curve](#auroc)

In [13]:
df = pd.read_csv('titanic_base_table.csv')

In [14]:
df.shape

(891, 26)

In [15]:
df.tail(3)

Unnamed: 0,survived,pclass,isFemale,age,sibsp,parch,fare,class,adult_male,alone,age_missing,group_size,embarked_C,embarked_Q,embarked_S,who_child,who_man,who_woman,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_Missing
888,0,3,1,0.0,1,2,23.45,3,0,0,1,4,0,0,1,0,0,1,0,0,0,0,0,0,0,1
889,1,1,0,26.0,0,0,30.0,1,1,1,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0
890,0,3,0,32.0,0,0,7.75,3,1,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1


_________________

<span id='data_split'> </span>
# Split Data into Test/Train Split

In [16]:
y = df.survived

X = df.drop('survived', axis=1)

print(y.shape)
print(X.shape)

(891,)
(891, 25)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42,
                                                   stratify=df.survived)

print(len(X_train),len(X_test), len(y_train), len(y_test))

712 179 712 179


_________________

<span id='pipeline'></span>
# Build Pipelines

Models tested: 
* Logistic Regression w/ L1 Penalty
* Logistic Regression w/ L2 Penalty
* Random Forest
* Gradient Boosted Tree

In [21]:
pipelines = {
    'l1':make_pipeline(StandardScaler(), 
                       LogisticRegression(penalty='l1', random_state=42,n_jobs=-1)),
    'l2':make_pipeline(StandardScaler(), 
                       LogisticRegression(penalty='l2', random_state=42,n_jobs=-1)),
    'rf':make_pipeline(StandardScaler(),
                       RandomForestClassifier(random_state=42,n_jobs=-1)), 
    'gb':make_pipeline(StandardScaler(), GradientBoostingClassifier(random_state=42))
}

_________________

<span id='hyperparameters'></span>
# Declare Hyperparameters to Tune

In [19]:
pipelines['l1'].get_params()

{'logisticregression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l1', random_state=42, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'logisticregression__C': 1.0,
 'logisticregression__class_weight': None,
 'logisticregression__dual': False,
 'logisticregression__fit_intercept': True,
 'logisticregression__intercept_scaling': 1,
 'logisticregression__max_iter': 100,
 'logisticregression__multi_class': 'ovr',
 'logisticregression__n_jobs': 1,
 'logisticregression__penalty': 'l1',
 'logisticregression__random_state': 42,
 'logisticregression__solver': 'liblinear',
 'logisticregression__tol': 0.0001,
 'logisticregression__verbose': 0,
 'logisticregression__warm_start': False,
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler

In [24]:
l1_hyperparameters = {'logisticregression__C': np.linspace(1e-3, 1e3, 10)}
l2_hyperparameters = {'logisticregression__C': np.linspace(1e-3, 1e3, 10)}

In [25]:
pipelines['rf'].get_params()

{'randomforestclassifier': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=10, n_jobs=-1, oob_score=False, random_state=42,
             verbose=0, warm_start=False),
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__class_weight': None,
 'randomforestclassifier__criterion': 'gini',
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__max_leaf_nodes': None,
 'randomforestclassifier__min_impurity_split': 1e-07,
 'randomforestclassifier__min_samples_leaf': 1,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__min_weight_fraction_leaf': 0.0,
 'randomforestclassifier__n_estimators': 10,
 'randomforestclassifier__n_jobs': -1,
 'randomfores

In [36]:
rf_hyperparameters = {'randomforestclassifier__n_estimators': [50, 100, 200],
                     'randomforestclassifier__max_features': ['auto', 'sqrt', .33],
                     'randomforestclassifier__max_depth': [3, 5, 8]}

In [37]:
pipelines['gb'].get_params()

{'gradientboostingclassifier': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes=None,
               min_impurity_split=1e-07, min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0,
               n_estimators=100, presort='auto', random_state=42,
               subsample=1.0, verbose=0, warm_start=False),
 'gradientboostingclassifier__criterion': 'friedman_mse',
 'gradientboostingclassifier__init': None,
 'gradientboostingclassifier__learning_rate': 0.1,
 'gradientboostingclassifier__loss': 'deviance',
 'gradientboostingclassifier__max_depth': 3,
 'gradientboostingclassifier__max_features': None,
 'gradientboostingclassifier__max_leaf_nodes': None,
 'gradientboostingclassifier__min_impurity_split': 1e-07,
 'gradientboostingclassifier__min_samples_leaf': 1,
 'gradientboostingclassifier__min_samples_split': 2,
 'gradientboostingcl

In [38]:
gb_hyperparameters = {'gradientboostingclassifier__n_estimators': [50, 100, 200],
                     'gradientboostingclassifier__learning_rate': [.05, .1, .2],
                     'gradientboostingclassifier__max_depth': [3,5,8],
                     'gradientboostingclassifier__max_features': ['auto','sqrt',.33]}

In [39]:
hyperparameters = {
    'l1': l1_hyperparameters,
    'l2': l2_hyperparameters,
    'rf': rf_hyperparameters, 
    'gb': gb_hyperparameters
}

_________________

<span id='tune'></span>
# Fit and Tune Models Using Cross Validation

In [40]:
%%time

fitted_models = {}

for name, pipeline in pipelines.items():
    #print('{}:: {}\n\n'.format(name, pipeline))
    
    # create cross validation object
    model = GridSearchCV(pipeline, hyperparameters[name], cv=10, n_jobs=-1)
    
    # fit the model
    model.fit(X_train, y_train)
    
    # store the model in the dictionary
    fitted_models[name] = model
    
    print('{} has been fitted.'.format(name))

l1 has been fitted.
l2 has been fitted.
rf has been fitted.
gb has been fitted.
CPU times: user 3.82 s, sys: 350 ms, total: 4.17 s
Wall time: 2min 13s


_________________

<span id='evaluate'></span>
# Evaluate the Models

In [41]:
for k, v in fitted_models.items():
    print(k, v.best_score_)

l1 0.817415730337
l2 0.817415730337
rf 0.828651685393
gb 0.824438202247


In [43]:
fitted_models['rf']

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurit...imators=10, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'randomforestclassifier__n_estimators': [50, 100, 200], 'randomforestclassifier__max_features': ['auto', 'sqrt', 0.33], 'randomforestclassifier__max_depth': [3, 5, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

_________________

<span id='auroc'></span>
# Area Under the ROC Curve

In [57]:
for name, model in fitted_models.items():
    pred = model.predict_proba(X_test)
    # get the 'survived' (1) class
    pred = [p[1] for p in pred]
    
    fpr, tpr, thresholds = roc_curve(y_test, pred)
    
    print('{} has an auc score of {}'.format(name, auc(fpr, tpr)))

l1 has an auc score of 0.8600790513833992
l2 has an auc score of 0.8598155467720685
rf has an auc score of 0.8488801054018446
gb has an auc score of 0.853227931488801


## Logistic Regression w/ L1 penalty gave us the best AUC score
Save the L1 model as a pickle file

In [61]:
with open('titanic_final_model.pkl','wb') as f:
    pickle.dump(fitted_models['l1'].best_estimator_,f)    

_________________