In [81]:
import pandas as pd
import numpy as np

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [83]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [84]:
# Load the preprocessed Titanic training data from the local data directory
# and preview its first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='./data/titanic_train_preprocessed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,24.0,0,0,7.4958,0,0,1
1,0,3,1,37.0,2,0,7.925,0,0,1
2,0,3,1,40.0,0,0,7.8958,0,0,1
3,1,1,1,42.0,0,0,26.2875,0,0,1
4,1,2,0,25.0,1,1,30.0,0,0,1


In [85]:
# Model inputs: every column except the first one. The preview above shows the
# first column to be `Survived`, which later cells use as the target.
features = titanic_df.columns[1:].tolist()
features

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [86]:
# Registry of model results; later cells store one entry per model label.
result_dict = dict()

In [87]:
def summerize_classification(y_test, y_pred):
    """Score a set of predicted labels against the true labels.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, passed to the scikit-learn metric functions
        together with ``y_test``.

    Returns
    -------
    dict
        ``'accuracy'`` (normalized accuracy score), ``'precision'``,
        ``'recall'`` and ``'accuracy_count'`` (the non-normalized accuracy
        score, i.e. the number of correct predictions).
    """
    summary = {}
    summary['accuracy'] = accuracy_score(y_test, y_pred, normalize=True)
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    # normalize=False turns the accuracy fraction into a raw count.
    summary['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)
    return summary

In [88]:
def build_model(classifier_func, y_col, x_cols, df, test_size=0.2,
                random_state=None, **classifier_kwargs):
    """Split the data, fit a classifier and score it on both splits.

    Parameters
    ----------
    classifier_func : callable
        Called as ``classifier_func(X_train, y_train, **classifier_kwargs)``;
        must return a fitted model exposing ``predict``.
    y_col : str
        Name of the target column in ``df``.
    x_cols : list of str
        Names of the feature columns in ``df``.
    df : pandas.DataFrame
        Data containing both the feature and the target columns.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. The default keeps the previous
        behavior (a different random split on every call); pass an integer
        to get a reproducible split, and the same integer for every model
        so they are all evaluated on the same rows.
    **classifier_kwargs
        Extra keyword arguments forwarded to ``classifier_func`` so that its
        optional tuning parameters can be set from the call site.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the score dictionaries produced by
        ``summerize_classification``; ``'confusion_matrix'`` holds a crosstab
        of actual versus predicted labels on the test split.
    """
    X = df[x_cols]
    y = df[y_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = classifier_func(X_train, y_train, **classifier_kwargs)

    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Scoring the training split as well makes over-fitting visible when the
    # two summaries are printed side by side.
    train_summary = summerize_classification(y_train, y_pred_train)
    test_summary = summerize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({
        'y_test': y_test,
        'y_pred': y_pred
    })

    model_crosstab = pd.crosstab(
        pred_results.y_test,
        pred_results.y_pred,
        rownames=['Actual'],
        colnames=['Predicted'],
    )

    return {
        'training': train_summary,
        'test': test_summary,
        'confusion_matrix': model_crosstab,
    }

In [89]:
def compare_results():
    """Print the training and test scores of every entry in ``result_dict``.

    Each entry is printed as a ``Classification:`` header followed by the
    ``Training:`` and ``Test:`` score listings, then a blank separator.
    """
    for key, result in result_dict.items():
        print('Classification: {}'.format(key))
        for heading, split in (('Training:', 'training'), ('Test:', 'test')):
            print(heading)
            for score, value in result[split].items():
                print('{}: {}'.format(score, value))
        print('\n')
        

In [90]:
def compare_single_results(key):
    """Print the training and test scores stored in ``result_dict`` under ``key``.

    Parameters
    ----------
    key : hashable
        Label of the entry in ``result_dict`` to print; a missing label
        raises ``KeyError`` from the dictionary lookup.
    """
    print('Classification: {}'.format(key))
    result = result_dict[key]
    for heading, split in (('Training:', 'training'), ('Test:', 'test')):
        print(heading)
        for score, value in result[split].items():
            print('{}: {}'.format(score, value))


In [91]:
def logistic_regression_function(x_train, y_train):
    """Fit and return a ``LogisticRegression`` model using the ``liblinear`` solver.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    """
    classifier = LogisticRegression(solver='liblinear')
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [92]:
# Fit logistic regression on the full feature list and print its scores.
result_dict['Survived - Logistic Regression'] = build_model(
    classifier_func=logistic_regression_function,
    y_col='Survived',
    x_cols=features,
    df=titanic_df,
)
compare_single_results('Survived - Logistic Regression')

Classification: Survived - Logistic Regression
Training:
accuracy: 0.8172231985940246
precision: 0.7989130434782609
recall: 0.6869158878504673
accuracy_count: 465
Test:
accuracy: 0.7692307692307693
precision: 0.8867924528301887
recall: 0.6351351351351351
accuracy_count: 110


In [100]:
def linear_discriminant_analysis_function(x_train, y_train, solver='svd'):
    """Fit and return a ``LinearDiscriminantAnalysis`` model.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    solver : str, default 'svd'
        Forwarded to ``LinearDiscriminantAnalysis`` as ``solver``.
    """
    classifier = LinearDiscriminantAnalysis(solver=solver)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [101]:
# Fit LDA on every feature except the last entry of `features`
# (presumably to drop one redundant one-hot column — confirm).
result_dict['Survived - Linear Discriminant Analysis'] = build_model(
    classifier_func=linear_discriminant_analysis_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Linear Discriminant Analysis')

Classification: Survived - Linear Discriminant Analysis
Training:
accuracy: 0.804920913884007
precision: 0.7894736842105263
recall: 0.7112068965517241
accuracy_count: 458
Test:
accuracy: 0.7482517482517482
precision: 0.6851851851851852
recall: 0.6607142857142857
accuracy_count: 107


In [102]:
def quadratic_discriminant_analysis_function(x_train, y_train):
    """Fit and return a ``QuadraticDiscriminantAnalysis`` model with default settings.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    """
    classifier = QuadraticDiscriminantAnalysis()
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [103]:
# Fit QDA on every feature except the last entry of `features`
# (presumably to drop one redundant one-hot column — confirm).
result_dict['Survived - Quadratic Discriminant Analysis'] = build_model(
    classifier_func=quadratic_discriminant_analysis_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Quadratic Discriminant Analysis')

Classification: Survived - Quadratic Discriminant Analysis
Training:
accuracy: 0.8031634446397188
precision: 0.7652582159624414
recall: 0.7244444444444444
accuracy_count: 457
Test:
accuracy: 0.7692307692307693
precision: 0.75
recall: 0.7142857142857143
accuracy_count: 110


In [106]:
def stocatic_gradient_descent_function(x_train, y_train, max_iterations=1000, tol=1e-3):
    """Fit and return an ``SGDClassifier`` model.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    max_iterations : int, default 1000
        Forwarded to ``SGDClassifier`` as ``max_iter``.
    tol : float, default 1e-3
        Forwarded to ``SGDClassifier`` as ``tol``.
    """
    settings = {'max_iter': max_iterations, 'tol': tol}
    classifier = SGDClassifier(**settings)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [107]:
# Fit the SGD classifier on every feature except the last entry of `features`.
result_dict['Survived - SGDClassifier'] = build_model(
    classifier_func=stocatic_gradient_descent_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - SGDClassifier')

Classification: Survived - SGDClassifier
Training:
accuracy: 0.7135325131810193
precision: 0.617363344051447
recall: 0.8135593220338984
accuracy_count: 406
Test:
accuracy: 0.7272727272727273
precision: 0.5802469135802469
recall: 0.9038461538461539
accuracy_count: 104


In [108]:
def linear_svc_function(x_train, y_train, C=1.0, class_weight=None, max_iterations=1000, tol=1e-3):
    """Fit and return a ``LinearSVC`` model (always built with ``dual=False``).

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    C : float, default 1.0
        Forwarded to ``LinearSVC`` as ``C``.
    class_weight : default None
        Forwarded to ``LinearSVC`` as ``class_weight``.
    max_iterations : int, default 1000
        Forwarded to ``LinearSVC`` as ``max_iter``.
    tol : float, default 1e-3
        Forwarded to ``LinearSVC`` as ``tol``.
    """
    settings = {
        'C': C,
        'class_weight': class_weight,
        'max_iter': max_iterations,
        'tol': tol,
        'dual': False,
    }
    classifier = LinearSVC(**settings)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [109]:
# Fit the linear SVC on every feature except the last entry of `features`.
result_dict['Survived - Linear SVC'] = build_model(
    classifier_func=linear_svc_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Linear SVC')

Classification: Survived - Linear SVC
Training:
accuracy: 0.8084358523725835
precision: 0.7904761904761904
recall: 0.7186147186147186
accuracy_count: 460
Test:
accuracy: 0.7552447552447552
precision: 0.7037037037037037
recall: 0.6666666666666666
accuracy_count: 108


In [112]:
def nearest_neighbors_function(x_train, y_train, radius=40):
    """Fit and return a ``RadiusNeighborsClassifier`` model.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    radius : number, default 40
        Forwarded to ``RadiusNeighborsClassifier`` as ``radius``.
    """
    classifier = RadiusNeighborsClassifier(radius=radius)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [113]:
# Fit the radius-neighbors classifier on every feature except the last entry
# of `features`.
result_dict['Survived - Radius Neighbors'] = build_model(
    classifier_func=nearest_neighbors_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Radius Neighbors')

Classification: Survived - Radius Neighbors
Training:
accuracy: 0.664323374340949
precision: 0.696969696969697
recall: 0.3
accuracy_count: 378
Test:
accuracy: 0.6923076923076923
precision: 0.8181818181818182
recall: 0.3103448275862069
accuracy_count: 99


In [114]:
def desicion_tree_function(x_train, y_train, max_depth=None, max_features=None):
    """Fit and return a ``DecisionTreeClassifier`` model.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    max_depth : default None
        Forwarded to ``DecisionTreeClassifier`` as ``max_depth``.
    max_features : default None
        Forwarded to ``DecisionTreeClassifier`` as ``max_features``.
    """
    settings = {'max_depth': max_depth, 'max_features': max_features}
    classifier = DecisionTreeClassifier(**settings)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [115]:
# Fit the decision tree on every feature except the last entry of `features`.
result_dict['Survived - Decision Tree'] = build_model(
    classifier_func=desicion_tree_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Decision Tree')

Classification: Survived - Decision Tree
Training:
accuracy: 0.984182776801406
precision: 1.0
recall: 0.9615384615384616
accuracy_count: 560
Test:
accuracy: 0.7412587412587412
precision: 0.6268656716417911
recall: 0.7777777777777778
accuracy_count: 106


In [118]:
def naive_bayes_function(x_train, y_train, priors=None):
    """Fit and return a ``GaussianNB`` model.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.
    priors : default None
        Forwarded to ``GaussianNB`` as ``priors``.
    """
    classifier = GaussianNB(priors=priors)
    # scikit-learn's fit() returns the estimator itself.
    return classifier.fit(x_train, y_train)

In [119]:
# Fit Gaussian naive Bayes on every feature except the last entry of `features`.
result_dict['Survived - Naive Bayes'] = build_model(
    classifier_func=naive_bayes_function,
    y_col='Survived',
    x_cols=features[:-1],
    df=titanic_df,
)
compare_single_results('Survived - Naive Bayes')

Classification: Survived - Naive Bayes
Training:
accuracy: 0.7768014059753954
precision: 0.7466666666666667
recall: 0.7058823529411765
accuracy_count: 442
Test:
accuracy: 0.7762237762237763
precision: 0.6607142857142857
recall: 0.74
accuracy_count: 111
