In [33]:
from datasets import titanic_data

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from pygam import LogisticGAM, s, f
from sklearn.decomposition import PCA
from sklearn.svm import SVC

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler
from tools import polynomial_features

# Data preparation

In [34]:
original_X, original_y, train_X, train_y, test_X, test_y = titanic_data()

## Polynomial features

In [35]:
original_X_2 = polynomial_features(original_X, 2)
train_X_2 = polynomial_features(train_X, 2)
test_X_2 = polynomial_features(test_X, 2)

scaler = StandardScaler().fit(original_X_2)

original_X_2 = scaler.transform(original_X_2)
train_X_2 = scaler.transform(train_X_2)
test_X_2 = scaler.transform(test_X_2)

## Colinear features

In [36]:
original_X_coli = original_X.drop(['embarked_C',
       'embarked_Q', 'embarked_S'], axis=1)
train_X_coli = train_X.drop(['embarked_C',
       'embarked_Q', 'embarked_S'], axis=1)
test_X_coli = test_X.drop(['embarked_C',
       'embarked_Q', 'embarked_S'], axis=1)

## Feature subset

In [37]:
feature_subset = ['pclass', 'sex', 'age', 'sibsp', 'embarked_S']

## PCA

In [38]:
pca = PCA(n_components=5)
pca = pca.fit(original_X)

transformed_train_X = pca.transform(train_X)
transformed_test_X = pca.transform(test_X)

In [39]:
experiments = {
    "LogisticRegression": {
        "model": LogisticRegression(penalty='none'),
        "train_X": train_X,
        "test_X": test_X,
    },
    "PolynomialLogisticRegression": {
        "model": LogisticRegression(penalty='none'),
        "train_X": train_X_2,
        "test_X": test_X_2,
    },
    "LinearDiscriminantAnalysis": {
        "model": LinearDiscriminantAnalysis(),
        "train_X": train_X,
        "test_X": test_X,
    },
    "QuadraticDiscriminantAnalysis": {
        "model": QuadraticDiscriminantAnalysis(),
        "train_X": train_X_coli,
        "test_X": test_X_coli,
    },
    "KNNClassification": {
        "model": KNeighborsClassifier(n_neighbors=12),
        "train_X": train_X,
        "test_X": test_X,
    },
    "SubsetLogisticRegression": {
        "model": LogisticRegression(penalty='none'),
        "train_X": train_X[feature_subset],
        "test_X": test_X[feature_subset],
    },
    "GeneralAdditiveModel": {
        "model": LogisticGAM(s(0) + s(1) + s(2) + s(3) + s(4)),
        "train_X": train_X[feature_subset],
        "test_X": test_X[feature_subset],
    },
    "PCAClassification": {
        "model": LogisticRegression(penalty='none'),
        "train_X": transformed_train_X,
        "test_X": transformed_test_X,
    },
    "SupportVectorMachinesPoly": {
        "model": SVC(kernel='poly', shrinking=False, degree=3, probability=True),
        "train_X": train_X,
        "test_X": test_X,
    },
    "SupportVectorMachinesLinear": {
        "model": SVC(kernel='poly', shrinking=False, degree=3, probability=True),
        "train_X": train_X,
        "test_X": test_X,
    },
    "SupportVectorMachinesRBF": {
        "model": SVC(kernel='rbf', shrinking=False, degree=3, probability=True),
        "train_X": train_X,
        "test_X": test_X,
    },
}

In [40]:
columns = ['train_acc', 'test_acc', 'cross_val_acc']
model_names = []
results = []

for model_name, experiment in experiments.items():
    train_X = experiment['train_X']
    test_X = experiment['test_X']
    
    model = experiment['model'].fit(train_X, train_y)

    train_acc = accuracy_score(train_y, model.predict(train_X))
    test_acc = accuracy_score(test_y, model.predict(test_X))

    try:
        cross_val = cross_val_score(model, train_X, train_y).mean()
    except:
        cross_val = np.NaN

    model_names.append(model_name)
    results.append([train_acc, test_acc, cross_val])

result_df = pd.DataFrame(results, columns = columns, index=model_names)

In [43]:
result_df.sort_values('test_acc', ascending=False)

Unnamed: 0,train_acc,test_acc,cross_val_acc
SupportVectorMachinesPoly,0.82183,0.839552,0.800852
SupportVectorMachinesLinear,0.82183,0.839552,0.800852
PolynomialLogisticRegression,0.817014,0.820896,0.817006
QuadraticDiscriminantAnalysis,0.799358,0.820896,0.799394
SupportVectorMachinesRBF,0.833066,0.809701,0.813768
KNNClassification,0.829856,0.80597,0.813806
SubsetLogisticRegression,0.791332,0.802239,0.794516
GeneralAdditiveModel,0.82183,0.802239,
LogisticRegression,0.797753,0.798507,0.796116
LinearDiscriminantAnalysis,0.796148,0.798507,0.788116
