# Cross validation 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_df = pd.read_csv('datasets/titanic_processed_3.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,47.0,1,1,52.5542,0,0,1
1,0,3,1,16.0,0,0,9.2167,0,0,1
2,0,3,1,17.0,0,0,8.6625,0,0,1
3,1,2,0,36.0,0,0,13.0,0,0,1
4,0,3,0,39.0,0,5,29.125,0,1,0


In [3]:
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2)

In [4]:
def summarize_classification(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred, normalize=True)
    accuracy_count = accuracy_score(y_test, y_pred, normalize=False)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print('test data count:', len(y_test))
    print('accuracy:', accuracy)
    print('precision:', precision)
    print('recall:', recall)
    print('accuracy_count:', accuracy_count)
    print()

# DecisionTree Tuning

In [5]:
from sklearn.model_selection import GridSearchCV

# tuningujemy tylko jeden z możliwych hyperparameters - do tablicy wrzucamy wartości hyperparametrów które będziemy sprawdzać
parameters = {'max_depth': [2,4,5,7,9,10]}

# cv =3 - na ile części podzielić nasz train dataset w celu walidacji. 2z3 zestawów będą wkorzystane do trenowania, 1z3 do oceny
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train,y_train)

grid_search.best_params_

{'max_depth': 4}

In [6]:
for i in range(6):
    print('Parameters:', grid_search.cv_results_['params'][i])
    print('Mean Test Score:', grid_search.cv_results_['mean_test_score'][i])
    print('Rank:', grid_search.cv_results_['rank_test_score'][i])

Parameters: {'max_depth': 2}
Mean Test Score: 0.7925740276617469
Rank: 3
Parameters: {'max_depth': 4}
Mean Test Score: 0.8119650979300101
Rank: 1
Parameters: {'max_depth': 5}
Mean Test Score: 0.7996751137102014
Rank: 2
Parameters: {'max_depth': 7}
Mean Test Score: 0.7785946347349856
Rank: 4
Parameters: {'max_depth': 9}
Mean Test Score: 0.7715956558061822
Rank: 5
Parameters: {'max_depth': 10}
Mean Test Score: 0.7680497540146662
Rank: 6


In [7]:
best_score = grid_search.cv_results_['params'][5]
max_depth = best_score['max_depth']

decision_tree_model = DecisionTreeClassifier(max_depth = max_depth).fit(x_train,y_train)

In [8]:
y_pred = decision_tree_model.predict(x_test)
summarize_classification(y_test, y_pred)

test data count: 143
accuracy: 0.7272727272727273
precision: 0.6170212765957447
recall: 0.58
accuracy_count: 104



# LogisticRegression tuning

In [9]:
# tuningujemy tylko jeden z możliwych hyperparameters - do tablicy wrzucamy wartości hyperparametrów które będziemy sprawdzać
parameters = {'penalty': ['l1','l2'],
             'C': [0.1, 0.4, 0.8, 1, 2, 5]}

# cv = 3 - na ile części podzielić nasz train dataset w celu walidacji. 2z3 zestawów będą wkorzystane do trenowania, 1z3 do oceny
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train,y_train)

grid_search.best_params_

{'C': 0.8, 'penalty': 'l1'}

In [10]:
penalty = grid_search.best_params_['penalty']
C = grid_search.best_params_['C']

logistic_regression_model = LogisticRegression(solver='liblinear', penalty = penalty, C = C).fit(x_train,y_train)

In [11]:
y_pred = logistic_regression_model.predict(x_test)
summarize_classification(y_test,y_pred)

test data count: 143
accuracy: 0.7552447552447552
precision: 0.6666666666666666
recall: 0.6
accuracy_count: 108

