In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('datasets/titanic.csv')

In [4]:
titanic_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
titanic_df = titanic_df.drop(columns = ['PassengerId'])

titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
titanic_df = titanic_df.drop(columns = ['Name'])

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [9]:
titanic_df = titanic_df.dropna(subset=['Age'], axis=0)

In [10]:
titanic_df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       529
Embarked      2
dtype: int64

In [11]:
titanic_df.shape

(714, 10)

In [12]:
titanic_df = titanic_df.drop(columns = ['Cabin'])

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [19]:
titanic_df = titanic_df.drop(columns = ['Ticket'])

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,25.0,0,0,7.7417,0,1,0
1,0,3,0,11.0,4,2,31.275,0,0,1
2,0,2,1,19.0,1,1,36.75,0,0,1
3,0,3,1,24.0,0,0,9.5,0,0,1
4,0,3,1,37.0,2,0,7.925,0,0,1


In [20]:
from sklearn import preprocessing

label_encoding = preprocessing.LabelEncoder()
titanic_df['Sex'] = label_encoding.fit_transform(titanic_df['Sex'].astype(str))

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,25.0,0,0,7.7417,0,1,0
1,0,3,0,11.0,4,2,31.275,0,0,1
2,0,2,1,19.0,1,1,36.75,0,0,1
3,0,3,1,24.0,0,0,9.5,0,0,1
4,0,3,1,37.0,2,0,7.925,0,0,1


In [21]:
titanic_df = pd.get_dummies(titanic_df, columns=['Embarked'])

KeyError: "None of [Index(['Embarked'], dtype='object')] are in the [columns]"

In [22]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,25.0,0,0,7.7417,0,1,0
1,0,3,0,11.0,4,2,31.275,0,0,1
2,0,2,1,19.0,1,1,36.75,0,0,1
3,0,3,1,24.0,0,0,9.5,0,0,1
4,0,3,1,37.0,2,0,7.925,0,0,1


In [23]:
titanic_df = titanic_df.sample(frac=1).reset_index(drop=True)

In [24]:
titanic_df.to_csv('datasets/titanic_processed', index=False)

In [25]:
FEATURES = list(titanic_df.columns[1:])

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [26]:
result_dict={}

In [28]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {
        'accuracy': acc,
        'precision':prec,
        'recall': recall,
        'accuracy_count': num_acc
    }

In [30]:
def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac=0.2):
    x = dataset[name_of_x_cols]
    y = dataset[name_of_y_col]
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=test_frac)
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test': y_test,
                                'y_pred': y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {
        'training': train_summary,
        'test': test_summary,
        'confusion_matrix': model_crosstab
    }

In [31]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)
        
        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
            
        print()

In [32]:
def logistic_fn(x_train, y_train):
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [33]:
result_dict['survived - logistic'] = build_model(logistic_fn, 'Survived', FEATURES, titanic_df)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7845884413309983
precision 0.7727272727272727
recall 0.6623376623376623
accuracy_count 448

Test data
accuracy 0.8391608391608392
precision 0.8214285714285714
recall 0.7796610169491526
accuracy_count 120



# Linear Discriminant Analysis

In [39]:
#this finds the best line to separate our data

def linear_discriminant_fn(x_train, y_train, solver='svd'):#singular value decomposition solver
    model = LinearDiscriminantAnalysis(solver = solver)
    model.fit(x_train, y_train)
    
    return model

In [42]:
result_dict['survived - linear_discriminant_analysis']=build_model(linear_discriminant_fn,'Survived',FEATURES,titanic_df)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7845884413309983
precision 0.7727272727272727
recall 0.6623376623376623
accuracy_count 448

Test data
accuracy 0.8391608391608392
precision 0.8214285714285714
recall 0.7796610169491526
accuracy_count 120

Classification:  survived - linear_discriminant_analysis

Training data
accuracy 0.7933450087565674
precision 0.7641509433962265
recall 0.7043478260869566
accuracy_count 453

Test data
accuracy 0.7692307692307693
precision 0.7454545454545455
recall 0.6833333333333333
accuracy_count 110



In [45]:
result_dict['survived - linear_discriminant_analysis']=build_model(linear_discriminant_fn,'Survived',FEATURES[0:-1],titanic_df)
compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7845884413309983
precision 0.7727272727272727
recall 0.6623376623376623
accuracy_count 448

Test data
accuracy 0.8391608391608392
precision 0.8214285714285714
recall 0.7796610169491526
accuracy_count 120

Classification:  survived - linear_discriminant_analysis

Training data
accuracy 0.7845884413309983
precision 0.7601809954751131
recall 0.7058823529411765
accuracy_count 448

Test data
accuracy 0.8461538461538461
precision 0.8260869565217391
recall 0.7307692307692307
accuracy_count 121

