In [54]:
import pandas as pd
import numpy as np


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier


In [57]:
# Load the preprocessed Titanic training data and preview its first rows.
csv_path = './data/titanic_train_preprocessed.csv'
titanic_df = pd.read_csv(csv_path)
titanic_df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,24.0,0,0,7.4958,0,0,1
1,0,3,1,37.0,2,0,7.925,0,0,1
2,0,3,1,40.0,0,0,7.8958,0,0,1
3,1,1,1,42.0,0,0,26.2875,0,0,1
4,1,2,0,25.0,1,1,30.0,0,0,1


In [58]:
# Features are every column except the target. 'Unnamed: 0' appears to be the
# row index that was saved into the CSV (it counts 0, 1, 2, ... in the preview),
# so it carries no passenger information and is excluded as well.
# errors='ignore' keeps the cell working if that column is absent.
X = titanic_df.drop(columns=['Survived', 'Unnamed: 0'], errors='ignore')
Y = titanic_df['Survived']

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)


In [59]:
def summerize_classification(y_test, y_pred):
    """Print a short evaluation report for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The report is written to stdout: sample count, accuracy as a
        fraction, number of correct predictions, precision and recall
        (both computed with scikit-learn's default settings), followed by
        a blank line.
    """
    print('Test Data Count: {}'.format(len(y_test)))
    print('Accuracy: {:.2f}'.format(
        accuracy_score(y_test, y_pred, normalize=True)))
    # With normalize=False, accuracy_score returns the NUMBER of correctly
    # classified samples, not a score, so label it as a count and print it
    # without decimals.
    print('Correct Predictions: {:.0f}'.format(
        accuracy_score(y_test, y_pred, normalize=False)))
    print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
    print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
    print()


In [60]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to compare with 3-fold cross-validation.
params = { 'max_depth': [2, 4, 5, 7, 9, 10] }

# Fix random_state on the estimator: scikit-learn's tree builder permutes
# features at each split, so without a seed the CV scores (and therefore the
# selected depth) can change from one run to the next.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42), params, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

# Last expression of the cell: displays the winning parameter combination.
grid_search.best_params_

{'max_depth': 4}

In [61]:
# Report every evaluated candidate. Iterating over cv_results_ directly
# (instead of a hard-coded count) keeps this in sync with the parameter grid.
cv_results = grid_search.cv_results_
for candidate, mean_score, rank in zip(cv_results['params'],
                                       cv_results['mean_test_score'],
                                       cv_results['rank_test_score']):
    print('Params:', candidate)
    print('Mean Score:', mean_score)
    print('Rank:', rank)

Params: {'max_depth': 2}
Mean Score: 0.8049289891395155
Rank: 3
Params: {'max_depth': 4}
Mean Score: 0.8154645873944119
Rank: 1
Params: {'max_depth': 5}
Mean Score: 0.8119650979300101
Rank: 2
Params: {'max_depth': 7}
Mean Score: 0.7890931031281908
Rank: 4
Params: {'max_depth': 9}
Mean Score: 0.7820848417339645
Rank: 5
Params: {'max_depth': 10}
Mean Score: 0.7662768031189083
Rank: 6


In [62]:
# Train a tree on the full training split using the depth picked by the grid search.
best_depth = grid_search.best_params_['max_depth']
tree = DecisionTreeClassifier(max_depth=best_depth)
decision_tree_model = tree.fit(x_train, y_train)  # fit() returns the estimator itself

In [63]:
# Predict labels for the held-out test features with the fitted decision tree.
y_pred = decision_tree_model.predict(x_test)

In [64]:
# Print the evaluation report for the decision tree's test-set predictions.
summerize_classification(y_test, y_pred)

Test Data Count: 143
Accuracy: 0.80
Accuracy Score: 115.00
Precision: 0.81
Recall: 0.70



In [69]:
# Logistic regression with the liblinear solver, tuned over the regularisation
# strength C and the penalty type using 3-fold cross-validation.
logistic_model = LogisticRegression(solver='liblinear')

params = {'C': [0.1, 0.4, 0.8, 1, 2, 5], 'penalty': ['l1', 'l2']}

grid_search = GridSearchCV(
    estimator=logistic_model,
    param_grid=params,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Last expression of the cell: displays the winning parameter combination.
grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [70]:
# Report every evaluated candidate. Iterating over cv_results_ directly
# (instead of a hard-coded count) keeps this in sync with the parameter grid.
cv_results = grid_search.cv_results_
for candidate, mean_score, rank in zip(cv_results['params'],
                                       cv_results['mean_test_score'],
                                       cv_results['rank_test_score']):
    print('Params:', candidate)
    print('Mean Score:', mean_score)
    print('Rank:', rank)

Params: {'C': 0.1, 'penalty': 'l1'}
Mean Score: 0.7662489557226398
Rank: 12
Params: {'C': 0.1, 'penalty': 'l2'}
Mean Score: 0.7662582381880627
Rank: 11
Params: {'C': 0.4, 'penalty': 'l1'}
Mean Score: 0.7785667873387171
Rank: 2
Params: {'C': 0.4, 'penalty': 'l2'}
Mean Score: 0.7750394504780469
Rank: 7
Params: {'C': 0.8, 'penalty': 'l1'}
Mean Score: 0.7820755592685417
Rank: 1
Params: {'C': 0.8, 'penalty': 'l2'}
Mean Score: 0.7732943469785575
Rank: 9
Params: {'C': 1, 'penalty': 'l1'}
Mean Score: 0.7785667873387171
Rank: 2
Params: {'C': 1, 'penalty': 'l2'}
Mean Score: 0.776803118908382
Rank: 5
Params: {'C': 2, 'penalty': 'l1'}
Mean Score: 0.7733036294439802
Rank: 8
Params: {'C': 2, 'penalty': 'l2'}
Mean Score: 0.7750580154088927
Rank: 6
Params: {'C': 5, 'penalty': 'l1'}
Mean Score: 0.7768216838392278
Rank: 4
Params: {'C': 5, 'penalty': 'l2'}
Mean Score: 0.771549243479068
Rank: 10
