In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the pre-processed Titanic data; the path is relative to the notebook's
# working directory.
titanic_df = pd.read_csv('datasets/titanic_processed')

# Preview the first rows to sanity-check the columns before modelling.
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,47.0,1,1,52.5542,0,0,1
1,0,3,1,17.0,1,1,7.2292,1,0,0
2,1,1,0,33.0,1,0,53.1,0,0,1
3,0,1,1,54.0,0,0,51.8625,0,0,1
4,0,3,1,20.0,0,0,8.6625,0,0,1


In [4]:
# Target vector. Selected first so that a missing 'Survived' column fails
# loudly here (KeyError) instead of being masked by errors='ignore' below.
y = titanic_df['Survived']

# Feature matrix: every column except the target.
# NOTE(review): the rendered head() shows an 'Unnamed: 0' header. If that is a
# real column (e.g. a row index written out by an earlier to_csv) rather than
# just the displayed index, it is a meaningless row id and must not be used as
# a feature. It is dropped defensively; errors='ignore' makes this a no-op when
# the column does not exist. Confirm against the CSV.
x = titanic_df.drop(columns=['Survived', 'Unnamed: 0'], errors='ignore')

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every score reported below, reproducible across
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
def summarize_classification(y_test, y_pred):
    """Print a short classification report for one set of predictions.

    Reports the number of test samples, the accuracy as a fraction and as a
    count of correct predictions, and precision and recall computed with
    scikit-learn's default settings. A blank line is printed at the end.

    Parameters
    ----------
    y_test : true labels of the evaluation samples.
    y_pred : predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None. The report is written to standard output.
    """
    # Compute every metric before printing anything, so a failure in any
    # metric produces no partial report.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)
    precision = precision_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)

    report_rows = (
        ('Test data count: ', len(y_test)),
        ('accuracy score: ', fraction_correct),
        ('accuracy count: ', n_correct),
        ('precision: ', precision),
        ('recall: ', sensitivity),
    )
    for label, value in report_rows:
        print(label, value)
    print()

In [7]:
# Candidate maximum depths for the decision tree.
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

# Search over the depths with 3-fold cross-validation on the training split
# only. The estimator is seeded so the search gives the same ranking on every
# run. GridSearchCV is imported with the other imports at the top of the
# notebook.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Display the best-scoring parameter combination.
grid_search.best_params_

{'max_depth': 4}

In [8]:
# Report every candidate evaluated by the most recent grid search.
# Iterating over cv_results_ directly (instead of a hard-coded range) keeps
# this cell correct when the parameter grid grows or shrinks.
cv_results = grid_search.cv_results_

for params, mean_test_score, rank in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['rank_test_score']):
    print('Parameters: ', params)

    print('Mean Test Score: ', mean_test_score)

    print('Rank: ', rank)

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7723799026361716
Rank:  3
Parameters:  {'max_depth': 4}
Mean Test Score:  0.781078350326077
Rank:  1
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7775787636630844
Rank:  2
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7723431615688435
Rank:  4
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7530724717553046
Rank:  6
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7548176724533847
Rank:  5


In [9]:
# Train the final decision tree on the whole training split with the depth
# selected by the grid search. random_state is fixed so the fitted tree, and
# the test metrics derived from it, are identical on every run.
decision_tree_model = DecisionTreeClassifier(
    max_depth=grid_search.best_params_['max_depth'],
    random_state=42,
).fit(x_train, y_train)

In [10]:
# Predict labels for the held-out test features with the fitted decision tree.
y_pred = decision_tree_model.predict(x_test)

In [11]:
# Print accuracy, precision and recall of the current y_pred on the test split.
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy score:  0.8251748251748252
accuracy count:  118
precision:  0.8787878787878788
recall:  0.58



In [12]:
# Search space for logistic regression: regularisation type and strength C.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.4, 0.8, 1, 2, 5],
}

# 3-fold cross-validated grid search on the training split. Note that this
# rebinds `grid_search`, replacing the decision-tree search object.
grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

# Display the best-scoring parameter combination.
grid_search.best_params_

{'C': 5, 'penalty': 'l2'}

In [13]:
# Report every candidate evaluated by the most recent grid search.
# Iterating over cv_results_ directly (instead of a hard-coded range) keeps
# this cell correct when the parameter grid grows or shrinks.
cv_results = grid_search.cv_results_

for params, mean_test_score, rank in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['rank_test_score']):
    print('Parameters: ', params)

    print('Mean Test Score: ', mean_test_score)

    print('Rank: ', rank)

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7495269587581519
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7722972352346836
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7792229264260128
Rank:  7
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7757600808303481
Rank:  9
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7809956829245889
Rank:  6
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7757692660971801
Rank:  8
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7810048681914209
Rank:  5
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7740240653990997
Rank:  10
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7862496555524938
Rank:  2
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.781032423991917
Rank:  4
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7845136401212455
Rank:  3
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Sc

In [15]:
# Build the final logistic-regression model from the winning grid-search
# parameters ('penalty' and 'C'), then train it on the whole training split.
logistic_model = LogisticRegression(solver='liblinear', **grid_search.best_params_)
logistic_model = logistic_model.fit(x_train, y_train)

In [16]:
# Predict labels for the held-out test features with the fitted logistic model.
# This overwrites the decision tree's predictions stored in y_pred.
y_pred = logistic_model.predict(x_test)

In [17]:
# Print accuracy, precision and recall of the current y_pred on the test split.
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy score:  0.8181818181818182
accuracy count:  117
precision:  0.74
recall:  0.74

