In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('titanic_prosecced.csv')
titanic_df.head(10)

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,sex,Embarked_C,Embarked_Q,Embarked_S
0,0,0,2,1,23.0,0,0,15.0458,1,1,0,0
1,1,1,2,0,24.0,2,1,27.0,0,0,0,1
2,2,0,3,0,48.0,1,3,34.375,0,0,0,1
3,3,1,1,0,52.0,1,1,93.5,0,0,0,1
4,4,1,2,0,4.0,1,1,23.0,0,0,0,1
5,5,0,3,1,19.0,0,0,8.05,1,0,0,1
6,6,1,1,0,51.0,1,0,77.9583,0,0,0,1
7,7,0,3,0,45.0,0,1,14.4542,0,1,0,0
8,8,0,2,1,42.0,0,0,13.0,1,0,0,1
9,9,1,3,1,32.0,0,0,7.925,1,0,0,1


In [4]:
FEATURES = list(titanic_df.columns[1:])
FEATURES

['Survived',
 'Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'sex',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
result_dict = {}

In [6]:
def build_model(classifier_fn,
           name_of_y_col,
           name_of_x_cols,
           dataset,
           test_frac=0.2):
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]
    
    x_train ,x_test, y_train, y_test = train_test_split(X, Y, test_size= test_frac)
    model = classifier_fn(x_train, y_train)
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary= summarize_classification(y_test, y_pred)
    pred_results = pd.DataFrame({'y_test': y_test,
                                'y_pred': y_pred})
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    return{'training': train_summary,
          'test': test_summary,
          'confusion_matrix':model_crosstab}

In [7]:
def summarize_classification(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return {
        'accuracy':acc,
        'precision':prec,
        'recall':recall,
        'accuracy_count':num_acc
    }

In [8]:
def compare_results():
    for key in result_dict:
        print('Classification:', key)
        print()
        
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])                        
        print()
                                  
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
        print()

In [9]:
def logistic_fn(x_train, y_train):
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [10]:
result_dict['Survived - logistic'] = build_model(logistic_fn,'Survived',
                                                               FEATURES, 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143



In [11]:
def linear_discriminant_fn(X_train, Y_train, solver='svd'):
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(X_train, Y_train)
    return model

In [12]:
result_dict['Survived - linear_discriminan_fn'] = build_model(linear_discriminant_fn,'Survived',
                                                               FEATURES[0:-1], 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121





In [13]:
def quadratic_Discriminant_fn(X_train, Y_train, solver='svd'):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(X_train, Y_train)
    return model

In [14]:
result_dict['Survived - quadratic_Discriminant_analysis'] = build_model(quadratic_Discriminant_fn,'Survived',
                                                               FEATURES[0:-1], 
                                                               titanic_df)
                                                        
compare_results()

  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))


Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121

Classification: Survived - quadratic_Discriminant_analysis

Training data
accuracy 0.5905096660808435
precision 0.0
recall 0.0
accuracy_count 336

Test data
accuracy 0.6153846153846154
precision 0.0
recall 0.0
accuracy_count 88



  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  'precision', 'predicted', average, warn_for)


In [15]:
def sgd_fn(X_train, Y_train, max_iter=1000, tol=1e-3):
    
    model = SGDClassifier(max_iter=max_iter, tol =tol)
    model.fit(X_train, Y_train)
    return model

In [17]:
result_dict['Survived - sgd_fn'] = build_model(sgd_fn,'Survived',
                                                               FEATURES, 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121

Classification: Survived - quadratic_Discriminant_analysis

Training data
accuracy 0.5905096660808435
precision 0.0
recall 0.0
accuracy_count 336

Test data
accuracy 0.6153846153846154
precision 0.0
recall 0.0
accuracy_count 88

Classification: Survived - sgd_fn

Training data
accuracy 0.8769771528998243
precision 0.7902097902097902
recall 0.9576271186440678
accuracy_count 499

Test data
accuracy 0.8391608391608392
precision 0.7230769230769231
recall 0.9038461538461539
accuracy_count 120



In [24]:
def linear_svc_fn(X_train, Y_train, C=1.0, max_iter=1000, tol=1e-3):
    
    model = LinearSVC(C=C, max_iter=max_iter, tol =tol, dual=False)
    model.fit(X_train, Y_train)
    return model

In [25]:
result_dict['Survived - linear_svc_fn'] = build_model(linear_svc_fn,'Survived',
                                                               FEATURES, 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121

Classification: Survived - quadratic_Discriminant_analysis

Training data
accuracy 0.5905096660808435
precision 0.0
recall 0.0
accuracy_count 336

Test data
accuracy 0.6153846153846154
precision 0.0
recall 0.0
accuracy_count 88

Classification: Survived - sgd_fn

Training data
accuracy 0.8769771528998243
precision 0.7902097902097902
recall 0.9576271186440678
accuracy_count 499

Test data
accuracy 0.8391608391608392
precision 0.7230769230769231
recall 0.9038461538461539
accuracy_count 120

Classification: Survived - linear_svc_fn

Trainin

In [28]:
def radius_neighbor_fn(X_train, Y_train, radius=40.0):
    
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(X_train, Y_train)
    return model

In [29]:
result_dict['Survived - radius_neighbor_fn'] = build_model(radius_neighbor_fn,'Survived',
                                                               FEATURES, 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121

Classification: Survived - quadratic_Discriminant_analysis

Training data
accuracy 0.5905096660808435
precision 0.0
recall 0.0
accuracy_count 336

Test data
accuracy 0.6153846153846154
precision 0.0
recall 0.0
accuracy_count 88

Classification: Survived - sgd_fn

Training data
accuracy 0.8769771528998243
precision 0.7902097902097902
recall 0.9576271186440678
accuracy_count 499

Test data
accuracy 0.8391608391608392
precision 0.7230769230769231
recall 0.9038461538461539
accuracy_count 120

Classification: Survived - linear_svc_fn

Trainin

In [32]:
def decision_tree_classifier_fn(X_train, Y_train, max_depth = None, max_features = None ):
    
    model = DecisionTreeClassifier( max_depth=max_depth, max_features=max_features)
    model.fit(X_train, Y_train)
    return model

In [33]:
result_dict['Survived - decision_tree_classifier_fn'] = build_model(decision_tree_classifier_fn,'Survived',
                                                               FEATURES, 
                                                               titanic_df)
                                                        
compare_results()

Classification: Survived - logistic

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 569

Test data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 143

Classification: Survived - linear_discriminan_fn

Training data
accuracy 0.7961335676625659
precision 0.7733990147783252
recall 0.6916299559471366
accuracy_count 453

Test data
accuracy 0.8461538461538461
precision 0.8305084745762712
recall 0.8032786885245902
accuracy_count 121

Classification: Survived - quadratic_Discriminant_analysis

Training data
accuracy 0.5905096660808435
precision 0.0
recall 0.0
accuracy_count 336

Test data
accuracy 0.6153846153846154
precision 0.0
recall 0.0
accuracy_count 88

Classification: Survived - sgd_fn

Training data
accuracy 0.8769771528998243
precision 0.7902097902097902
recall 0.9576271186440678
accuracy_count 499

Test data
accuracy 0.8391608391608392
precision 0.7230769230769231
recall 0.9038461538461539
accuracy_count 120

Classification: Survived - linear_svc_fn

Trainin

In [None]:
def naive_bayes_fn(X_train, Y_train, radius=40.0):
    
    model = GaussianNB(radius=radius)
    model.fit(X_train, Y_train)
    return model