In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('datasets/titanic_processed_3.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,47.0,1,1,52.5542,0,0,1
1,0,3,1,16.0,0,0,9.2167,0,0,1
2,0,3,1,17.0,0,0,8.6625,0,0,1
3,1,2,0,36.0,0,0,13.0,0,0,1
4,0,3,0,39.0,0,5,29.125,0,1,0


In [4]:
FEATURES = list(titanic_df.columns[1:])
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [18]:
result_dict= {}

In [19]:
def summarize_classification(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred, normalize=True)
    accuracy_count = accuracy_score(y_test, y_pred, normalize=False)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {
        'accuracy':accuracy,
        'precision':precision,
        'recall':recall,
        'accuracy_count':accuracy_count
    }

In [20]:
def build_model(classifier_fn, name_of_y_col, names_of_x_cols, dataset, test_frac = 0.2):
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]
    
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=test_frac)
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test':y_test, 'y_pred':y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {
        'training':train_summary,
        'test':test_summary,
        'confusion_matrix':model_crosstab
    }

In [21]:
def compare_results():
    for key in result_dict:
        print("Classification: ", key)
        
        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
            
        print()
        

In [22]:
def logistic_fn(x_train, y_train):
    
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [24]:
def linear_discriminant_fn(x_train, y_train, solver = 'svd'): # svd - singlular value decomposition
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train,y_train)
    
    return model

In [27]:
def quadratic_discriminant_fn(x_train, y_train):
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train,y_train)
    
    return model

In [33]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    
    # tol - tolerancja, miejsce w który model ma przerwać trenowanie.
    # gdy obliczymy stratę dla dwóch kolejnych iteracji i jest ona mniejsza niż wartość toleracji to przerywamy
    
    model = SGDClassifier(max_iter = max_iter, tol = tol)
    model.fit(x_train, y_train)
    
    return model

In [36]:
def linear_svc_fn(x_train, y_train, C = 1.0, max_iter = 1000, tol = 1e-3):
    
    model = LinearSVC(C = C, max_iter = max_iter, tol = tol, dual = False)
    # dual  = dotyczy optymalizacji. gdy ilość próbek > ilość parametrów, stosuj False
    
    model.fit(x_train, y_train)
    
    return model

In [42]:
def radius_neighbor_fn(x_train, y_train, radius = 40.0):
    
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(x_train, y_train)
    
    return model

In [53]:
def desicion_tree_fn(x_train, y_train, max_depth =None, max_features = None):
    
    model = DecisionTreeClassifier(max_depth = max_depth, max_features = max_features)
    model.fit(x_train, y_train)
    
    return model

In [54]:
def naive_bayes_fn(x_train, y_train, priors=None):
    model = GaussianNB(priors=priors)
    model.fit(x_train, y_train)
    
    return model

In [56]:
result_dict['survived - logistic'] = build_model(logistic_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - linear_discriminant_analysis'] = build_model(linear_discriminant_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - quadratic_discriminant_analysis'] = build_model(quadratic_discriminant_fn, 'Survived', FEATURES[0:-1], titanic_df)
result_dict['survived - stochastic_gradient_descent'] = build_model(sgd_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - svc'] = build_model(linear_svc_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - radius_neighbors'] = build_model(radius_neighbor_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - decision_tree'] = build_model(desicion_tree_fn, 'Survived', FEATURES, titanic_df)
result_dict['survived - naive_bayes'] = build_model(naive_bayes_fn, 'Survived', FEATURES, titanic_df)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.8031634446397188
precision 0.785
recall 0.6946902654867256
accuracy_count 457

Test data
accuracy 0.7762237762237763
precision 0.7678571428571429
recall 0.6935483870967742
accuracy_count 111

Classification:  survived - linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7598039215686274
recall 0.7013574660633484
accuracy_count 454

Test data
accuracy 0.7692307692307693
precision 0.7833333333333333
recall 0.7014925373134329
accuracy_count 110

Classification:  survived - quadratic_discriminant_analysis

Training data
accuracy 0.804920913884007
precision 0.7685185185185185
recall 0.7312775330396476
accuracy_count 458

Test data
accuracy 0.7832167832167832
precision 0.7777777777777778
recall 0.6885245901639344
accuracy_count 112

Classification:  survived - stochastic_gradient_descent

Training data
accuracy 0.7240773286467487
precision 0.7532467532467533
recall 0.49361702127659574
accuracy_cou