In [1]:
%reset -f
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

## import sklearn metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

## import sklearn models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# load the already-cleaned Titanic dataset and preview its first five rows
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic_processed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,14.0,0,0,7.8542,0,0,1
1,1,1,1,28.0,0,0,26.55,0,0,1
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,3,1,17.0,1,0,7.0542,0,0,1
4,0,3,1,4.0,4,2,31.275,0,0,1


In [3]:
## every column except the 'Survived' label is a feature; select by name rather
## than by position so the list stays correct if the column order ever changes
FEATURES = [col for col in titanic_df.columns if col != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
## registry of per-model metric summaries, keyed by a descriptive model name
result_dict = dict()

In [5]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Returns a dict holding the accuracy as a fraction ('accuracy'), the
    precision, the recall, and the raw number of correct predictions
    ('accuracy_count').
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False yields the count of correct predictions instead of a ratio
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }


In [6]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, train a classifier and summarise how well it does.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    name_of_y_col : str
        Name of the label column in ``dataset``.
    names_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame containing both the features and the label.
    test_frac : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same
        train/test split (and therefore comparable metrics) on every run;
        None keeps the previous behaviour of a different split each call.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the metric dicts produced by
        ``summarize_classification``; ``'confusion_matrix'`` is a crosstab of
        predicted vs. actual labels on the test split.
    """
    # get features and labels
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    # hold out a test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    # train model
    model = classifier_fn(x_train, y_train)

    # predict on both splits so train vs. test scores can be compared
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    ## objects for displaying results
    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [7]:
def compare_results(results=None):
    """Print the training and test metrics of every stored model.

    Parameters
    ----------
    results : dict or None, default None
        Mapping of model name -> dict with ``'training'`` and ``'test'``
        metric dicts. When None, the notebook-level ``result_dict`` is used,
        so existing ``compare_results()`` calls behave exactly as before.
    """
    if results is None:
        results = result_dict

    for name, summaries in results.items():
        print('Classification: ', name)
        print()
        # same layout for both splits: heading, one line per metric, blank line
        for heading, split in (('Training data', 'training'), ('Test data', 'test')):
            print(heading)
            for score, value in summaries[split].items():
                print(score, value)
            print()

In [8]:
def logistic_fn(x_train, y_train):
    """Fit and return a logistic regression classifier (liblinear solver)."""
    # fit() returns the estimator itself, so it can be returned directly
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [23]:
# train/evaluate logistic regression on all features and record its metrics
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
#compare_results()

##### LDA, QDA

In [24]:
## LinearDiscriminantAnalysis tries to reduce dimensionality by projecting onto the axes that best separate the data into different classes
## the svd solver tries to find these without calculating the covariance matrix of the features (useful when you have many features or rows of data)
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit and return a LinearDiscriminantAnalysis classifier using `solver`."""
    lda = LinearDiscriminantAnalysis(solver=solver)
    return lda.fit(x_train, y_train)

# train/evaluate LDA on every feature column and record its metrics
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
# compare_results()



In [25]:
## when we one-hot encoded the data we introduced collinear variables; this is called the dummy variable trap.
## Logistic regression coped with this, but not all estimators do
## we can drop one of the dummy columns to remove the collinearity
## instead of one-hot encoding we should use dummy encoding (one column fewer per category) next time
## linear_discriminant_fn is already defined in the previous LDA cell; reuse it
## here instead of redefining an identical copy that silently shadows the first.
## NOTE: this run is stored under the same key as the all-features LDA run above,
## so it replaces that earlier entry in result_dict.
result_dict['survived - linear_discriminant_analysis'] = build_model(linear_discriminant_fn,
                                                 'Survived',
                                                 FEATURES[0:-1], ## this is where we drop one of the dummy columns
                                                 titanic_df)
# compare_results()


In [27]:
## use QDA when the x variables corresponding to different labels have different covariances,
## i.e. the covariance of X is different for each value of Y
def quadratic_discriminant_fn(x_train, y_train):
    """Fit and return a QuadraticDiscriminantAnalysis classifier with default settings."""
    qda = QuadraticDiscriminantAnalysis()
    return qda.fit(x_train, y_train)

# the last dummy column is left out of the feature list for this run
result_dict['survived - quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
# compare_results()

In [28]:
## max_iter is the maximum number of iterations the model will train for
## tol is the stopping criterion for training: once the loss curve flattens out and
# the model isn't changing much between iterations, training stops
## sometimes a higher max_iter will yield a better model

def sgd_fn(x_train, y_train, max_iter=1000, tol=1e-3, random_state=None):
    """Fit and return a linear classifier trained with stochastic gradient descent.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    max_iter : int, default 1000
        Upper bound on the number of passes over the training data.
    tol : float, default 1e-3
        Stopping tolerance passed to SGDClassifier.
    random_state : int or None, default None
        Seed forwarded to SGDClassifier. SGD shuffles the training data, so
        pass an int to get the same fitted model on every run; None keeps
        the previous unseeded behaviour.
    """
    model = SGDClassifier(max_iter=max_iter, tol=tol, random_state=random_state)
    model.fit(x_train, y_train)
    return model
# train/evaluate the SGD classifier on all features and record its metrics
result_dict['survived - SGD'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
#compare_results()

In [29]:

def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit and return a linear support vector classifier.

    SVC(kernel="linear") is an alternative way to get a linear SVM.
    dual=False is used because it is the recommended setting when the
    number of samples exceeds the number of features.
    """
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    return svc.fit(x_train, y_train)

# train/evaluate the linear SVC on all features and record its metrics
result_dict['survived - LinearSVC'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
# compare_results()

In [30]:
## by default this function considers all entities within 40 units to be a neighbor

def radius_neighbor_fn(x_train, y_train, radius=40.0, outlier_label=None):
    """Fit and return a RadiusNeighborsClassifier.

    Parameters
    ----------
    x_train, y_train
        Training features and labels.
    radius : float, default 40.0
        Every training point within this distance of a sample counts as a
        neighbor.
    outlier_label : optional, default None
        Forwarded to RadiusNeighborsClassifier. With the default None,
        predicting a sample that has no training neighbor inside `radius`
        raises an error; supply a label here to assign such samples a class
        instead of failing.
    """
    model = RadiusNeighborsClassifier(radius=radius, outlier_label=outlier_label)
    model.fit(x_train, y_train)
    return model

# train/evaluate the radius-neighbors classifier on all features and record its metrics
result_dict['survived - radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
# compare_results()

In [31]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit and return a DecisionTreeClassifier with the given depth/feature limits."""
    tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    return tree.fit(x_train, y_train)

# train/evaluate the decision tree on all features and record its metrics
result_dict['survived - decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
# compare_results()

In [32]:
## use bayes theorem to find which label is most likely, given the attributes -
## observed in the feature vector and given how often the different -
## labels occur in the data
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit and return a Gaussian naive Bayes classifier, optionally with fixed class priors."""
    nb = GaussianNB(priors=priors)
    return nb.fit(x_train, y_train)


# train/evaluate Gaussian naive Bayes on all features, then print every stored result
result_dict['survived - neive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7873462214411248
precision 0.7623762376237624
recall 0.6784140969162996
accuracy_count 448

Test data
accuracy 0.8251748251748252
precision 0.8
recall 0.7868852459016393
accuracy_count 118

Classification:  survived - linear_discriminant_analysis

Training data
accuracy 0.7996485061511424
precision 0.7824074074074074
recall 0.7161016949152542
accuracy_count 455

Test data
accuracy 0.7762237762237763
precision 0.7083333333333334
recall 0.6538461538461539
accuracy_count 111

Classification:  survived - quadratic_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7402597402597403
recall 0.7633928571428571
accuracy_count 456

Test data
accuracy 0.7342657342657343
precision 0.7321428571428571
recall 0.640625
accuracy_count 105

Classification:  survived - SGD

Training data
accuracy 0.7117750439367311
precision 0.753731343283582
recall 0.4353448275862069
accuracy_count 405

Test data
accuracy 0.72727272