In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the Titanic training data from the notebook-relative path.
titanic_df = pd.read_csv('dataset/titanic/train.csv')

# Remove columns that are not carried forward into the feature set.
# `columns=` is passed by keyword: DataFrame.drop no longer accepts the axis
# as a second positional argument in pandas 2.x, and returning a new frame
# (instead of inplace=True) keeps this cell safe to re-run.
titanic_df = titanic_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Keep only the rows that have no missing values in the remaining columns.
titanic_df = titanic_df.dropna()

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [20]:
# Shuffle all rows: frac=1 draws the whole frame in random order.
# NOTE(review): the output recorded under this cell shows a numeric `Sex`
# column and one-hot `Embarked_*` columns, but no visible cell performs that
# encoding -- confirm the preprocessing step exists before a fresh run.
titanic_df = titanic_df.sample(frac=1)

# Renumber the index from 0 and discard the pre-shuffle row labels.
titanic_df = titanic_df.reset_index(drop=True)

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,2,1,60.0,1,1,39.0,0,0,1
1,0,2,1,29.0,1,0,27.7208,1,0,0
2,0,3,1,22.0,0,0,7.7958,0,0,1
3,0,1,0,2.0,1,2,151.55,0,0,1
4,0,3,1,7.0,4,1,39.6875,0,0,1


In [8]:
# Every column after the first is used as a model input; the first column
# (`Survived` in the recorded output) is left out.
FEATURES = [column for column in titanic_df.columns[1:]]
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [9]:
# Registry of evaluation results, keyed by a descriptive model label; each
# value stored later in the notebook is the dict returned by build_model.
result_dict = dict()

In [11]:
def summarize_classification(y_test, y_pred):
    """Compute accuracy, precision and recall for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        Keys, in order: ``'accuracy'`` (fraction of correct predictions),
        ``'precision'``, ``'recall'`` and ``'accuracy_count'`` (number of
        correct predictions).
    """
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
    }

    # normalize=False makes accuracy_score return the count of correct
    # predictions instead of the fraction.
    metrics['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)

    return metrics

In [16]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2):
    """Split the data, fit a classifier and summarize its performance.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model that exposes ``predict``.
    name_of_y_col : str
        Label of the target column in ``dataset``.
    names_of_x_cols : list
        Labels of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame containing both the feature columns and the target column.
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``, i.e. the share of
        rows held out for testing.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarize_classification``; ``'confusion_matrix'`` is a crosstab
        with predicted labels as rows and actual labels as columns.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    # Honor the caller's hold-out fraction; this was previously hard-coded
    # to 0.2, so the test_frac argument had no effect.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)

    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring the training split as well lets callers compare it with the
    # held-out score.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})

    # Rows are predicted labels, columns are actual labels.
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training': train_summary,
        'test': test_summary,
        'confusion_matrix': model_crosstab
    }

In [14]:
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``.

    For each stored model, prints its label, then the metric name/value
    pairs from the ``'training'`` summary followed by those from the
    ``'test'`` summary, each under its own heading. Returns ``None``.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for model_label, summaries in result_dict.items():
        print('Classification: ', model_label)

        for heading, section_key in sections:
            print()
            print(heading)

            for metric_name, metric_value in summaries[section_key].items():
                print(metric_name, metric_value)

        print()

In [15]:
def logistic_fn(x_train, y_train):
    """Fit a logistic regression classifier and return it.

    Parameters
    ----------
    x_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    LogisticRegression
        The estimator, configured with the ``'liblinear'`` solver and fitted
        on the given data.
    """
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly from the chained call.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [21]:
# Train and evaluate logistic regression on the `Survived` target, store the
# result under its label, then print every stored result.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7926186291739895
precision 0.7817258883248731
recall 0.6724890829694323
accuracy_count 451

Test data
accuracy 0.8251748251748252
precision 0.8863636363636364
recall 0.6610169491525424
accuracy_count 118



In [24]:
# Names of the metrics recorded in the training summary.
list(result_dict['survived - logistic']['training'].keys())

['accuracy', 'precision', 'recall', 'accuracy_count']