In [None]:
import sys
import random
import math
import itertools
from math import sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Seed Python's `random` module so the fold shuffle further down is reproducible.
random.seed(0)

In [None]:
# Load the processed Titanic training and test tables from the datasets directory.
df_training = pd.read_csv('../../datasets/titanic_training_processed.csv')
df_test = pd.read_csv('../../datasets/titanic_test_processed.csv')

This is just an experiment to practice with the concept of statistical distances. The idea is to visualise a dimensionality-reduced representation of the two classes (survived / non-survived) to get some awareness of their separability, and then use a simple Mahalanobis distance between each test observation and the two classes to perform classification.

Originally I wanted to apply t-SNE for dimensionality reduction, but the t-SNE algorithm in scikit-learn cannot be applied to new data. Therefore, I am sticking with PCA instead. 

In [None]:
# Feature matrix: every training column from position 2 onwards
# (presumably columns 0 and 1 are PassengerId and Survived — confirm against the CSV).
X = df_training.values[:, 2:]
# Fit a 2-component PCA on the training features; the fitted `pca` object is
# kept so the same projection can be applied to the test set later.
pca = PCA(n_components = 2)
X_pca = pca.fit_transform(X)

In [None]:
# Boolean masks, one entry per training row, selecting each class.
survived = df_training['Survived']
index_1 = survived.eq(1)
index_0 = survived.eq(0)

In [None]:
# Scatter both classes in the PCA plane: survivors in blue, non-survivors in red.
fig, ax = plt.subplots()
for mask, colour in ((index_1, 'blue'), (index_0, 'red')):
    ax.scatter(X_pca[mask, 0], X_pca[mask, 1], c = colour, alpha = 0.5)

There seems to be a large overlap between the two distributions. What is the Bhattacharyya distance value?

In [None]:
def number_observations(array, x, y, width = 0.1):
    """Count the rows of *array* that fall inside one square histogram cell.

    The cell is half-open on both axes: ``[x - width, x)`` along column 0
    and ``[y - width, y)`` along column 1.

    Parameters
    ----------
    array : 2-D NumPy array; only columns 0 and 1 are read.
    x, y : upper (exclusive) corner of the cell.
    width : side length of the cell. Defaults to 0.1, the value that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    The number of rows inside the cell, as a NumPy integer.
    """
    first = array[:, 0]
    second = array[:, 1]
    inside = (
        (first >= x - width) & (first < x)
        & (second >= y - width) & (second < y)
    )
    return np.sum(inside)

In [None]:
# Estimate the Bhattacharyya distance between the two classes from a pair of
# 2-D histograms of the PCA plane that share the same cell grid.
bin_width = 0.1
n_1 = np.sum(index_1)
n_0 = np.sum(index_0)

# Derive the grid from the data instead of a fixed range, so that no
# observation falls outside it, and build the edges as lower + k * width to
# avoid the accumulated rounding of np.arange with a float step.
lower = X_pca.min(axis = 0)
n_bins = np.floor((X_pca.max(axis = 0) - lower) / bin_width).astype(int) + 1
edges = [lower[axis] + bin_width * np.arange(n_bins[axis] + 1) for axis in range(2)]

# One pass over each class instead of one full scan per cell.
counts_1 = np.histogram2d(X_pca[index_1, 0], X_pca[index_1, 1], bins = edges)[0]
counts_0 = np.histogram2d(X_pca[index_0, 0], X_pca[index_0, 1], bins = edges)[0]

# Bhattacharyya coefficient: sum over cells of sqrt(p_1 * p_0).
bc = float(np.sum(np.sqrt((counts_1 / n_1) * (counts_0 / n_0))))

# A coefficient of 0 means the classes share no cell; the distance is then
# infinite and math.log(0) would raise.
distance = -math.log(bc) if bc > 0 else math.inf
print('Bhattacharyya distance: ' + str(distance))

We are now using the Mahalanobis distance to assign a class to each test observation. In order to do that, we first need to apply the same transformation that we applied to the training data:

In [None]:
# Project the test features with the PCA fitted on the training data.
# The test frame is sliced from column 1 (the training frame from column 2);
# presumably it has no Survived column — confirm against the CSV.
X_test_pca = pca.transform(df_test.values[:, 1:])

In [None]:
# Same class scatter as before, with the projected test observations in gray on top.
fig, ax = plt.subplots()
layers = (
    (X_pca[index_1, :], 'blue'),
    (X_pca[index_0, :], 'red'),
    (X_test_pca, 'gray'),
)
for points, colour in layers:
    ax.scatter(points[:, 0], points[:, 1], c = colour, alpha = 0.5)

And additionally we need to estimate the parameters of the two classes:

In [None]:
# Per-class mean vector and covariance matrix in the PCA plane.
class_0 = X_pca[index_0, :]
class_1 = X_pca[index_1, :]
mean_0, mean_1 = np.mean(class_0, axis = 0), np.mean(class_1, axis = 0)
# np.cov expects one variable per row, hence the transpose.
cov_0, cov_1 = np.cov(class_0.T), np.cov(class_1.T)

In [None]:
def mahalanobis(p, mean, cov):
    """Return the Mahalanobis distance between point *p* and a distribution.

    The distribution is described by its mean vector *mean* and its
    covariance matrix *cov*, which is inverted with ``np.linalg.inv``.
    """
    offset = p - mean
    precision = np.linalg.inv(cov)
    # Quadratic form offset^T * cov^-1 * offset.
    squared_distance = offset.T.dot(precision).dot(offset)
    return math.sqrt(squared_distance)

In [None]:
# Label each projected test row 1 when it is strictly closer (in Mahalanobis
# distance) to the class-1 distribution than to the class-0 one, else 0.
y_test = [
    int(mahalanobis(row, mean_1, cov_1) < mahalanobis(row, mean_0, cov_0))
    for row in X_test_pca
]

In [None]:
# Submission table: the passenger id next to its predicted label.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
# Show the first rows of the submission as a quick sanity check.
submission.head()

In [None]:
# Write the submission as CSV, leaving out the DataFrame index.
submission.to_csv('./submissions/03_mahalanobis.csv', index = False)

This submission achieved a 64.59% accuracy (much lower than the accuracy I got with logistic regression.) This method is not valid due to two factors: there is a large overlap between the two classes, and the distributions do not look like normal distributions. A similar but simpler approach, KNN, probably would have obtained a higher accuracy. 

## Feature selection

It is unlikely that we will improve the results much. However, just for the sake of the experiment, we are trying a feature selection approach (both forward selection and backward elimination). 

In [None]:
# Build the 10 cross-validation folds: shuffle the row positions, then deal
# them out round-robin so that fold k receives positions k, k + 10, k + 20, ...
# of the shuffled list.
indexes = list(range(len(df_training)))
random.shuffle(indexes)
folds = [indexes[start::10] for start in range(10)]

In [None]:
def produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes):
    """Split *df_training* into feature and label arrays for one fold.

    Rows are picked by position (*train_indexes* / *test_indexes*) and the
    feature columns by position (*column_indexes*); the labels always come
    from the 'Survived' column.

    Returns a dict with the keys 'X_train', 'X_test', 'y_train' and 'y_test',
    each holding a NumPy array.
    """
    feature_names = df_training.columns[column_indexes]
    train_rows = df_training.iloc[train_indexes]
    test_rows = df_training.iloc[test_indexes]

    return {
        'X_train': train_rows[feature_names].values,
        'X_test': test_rows[feature_names].values,
        'y_train': train_rows['Survived'].values,
        'y_test': test_rows['Survived'].values,
    }

In [None]:
def _squared_mahalanobis(points, class_sample):
    """Squared Mahalanobis distance from each row of *points* to the class sampled by *class_sample*."""
    mean = np.mean(class_sample, axis = 0)
    # np.cov collapses to a 0-d array for a single feature; force a matrix so
    # that one code path serves any number of features.
    cov = np.atleast_2d(np.cov(class_sample.T))
    offsets = points - mean
    # pinv equals the inverse for a regular covariance and does not raise when
    # a feature is constant inside the class (singular covariance).
    precision = np.linalg.pinv(cov)
    # Row-wise quadratic form offsets_i^T * precision * offsets_i.
    return np.einsum('ij,jk,ik->i', offsets, precision, offsets)


def evaluate(datasets):
    """Score the Mahalanobis classifier on one train/test split.

    Parameters
    ----------
    datasets : dict with the keys 'X_train', 'X_test', 'y_train', 'y_test'
        (the structure returned by produce_training_test_set). The class
        statistics are estimated from the rows whose 'y_train' is 0 or 1.

    Returns
    -------
    float : the Euclidean norm of (predictions - 'y_test'). With 0/1 labels
    this is the square root of the number of misclassified test rows.

    With more than two features, both sets are first projected onto the two
    principal components fitted on the training rows. Each test row is then
    assigned class 1 when it is strictly closer to the class-1 distribution
    than to the class-0 one. Squared distances are compared, which gives the
    same ordering as the distances themselves.

    The single-feature case goes through the same distance computation as the
    others: comparing the signed values (x - mean) / variance, as was done
    before, misclassifies points lying below a class mean.
    """
    X_train = np.asarray(datasets['X_train'], dtype = float)
    X_test = np.asarray(datasets['X_test'], dtype = float)
    y_train = np.asarray(datasets['y_train'])
    y_test = np.asarray(datasets['y_test'])

    if X_train.shape[1] > 2:
        pca = PCA(n_components = 2)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

    dist_0 = _squared_mahalanobis(X_test, X_train[y_train == 0, :])
    dist_1 = _squared_mahalanobis(X_test, X_train[y_train == 1, :])
    y_pred = (dist_1 < dist_0).astype(int)

    return sqrt(np.sum(np.power(y_pred - y_test, 2)))

In [None]:
def k_fold_cross_validation(df_training, folds, column_indexes):
    """Return the mean evaluate() score over all cross-validation folds.

    Parameters
    ----------
    df_training : DataFrame holding the features and the 'Survived' labels.
    folds : list of lists of row positions. Each fold is used once as the
        test set while the remaining folds together form the training set.
        The number of folds is taken from ``len(folds)`` rather than being
        fixed at 10.
    column_indexes : positions of the feature columns to use.

    Raises
    ------
    ValueError : if *folds* is empty.
    """
    n_folds = len(folds)
    if n_folds == 0:
        raise ValueError('folds must contain at least one fold')

    total_error = 0
    for k, test_indexes in enumerate(folds):
        # Every row that is not in the held-out fold, in fold order.
        train_indexes = [
            row
            for j, fold in enumerate(folds) if j != k
            for row in fold
        ]
        datasets = produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes)
        total_error += evaluate(datasets)

    return total_error / n_folds

In [None]:
# Baseline: cross-validated score using every column at positions 2..61.
column_indexes = list(range(2, 62))
k_fold_cross_validation(df_training, folds, column_indexes)

In [None]:
# Forward selection: starting from an empty model, repeatedly add the single
# pending column that gives the lowest cross-validated error, and stop as
# soon as the best extension no longer improves on the previous round.
pending = list(range(2, 62))
model = []
min_error = sys.float_info.max
while pending:

    prev_error, min_error = min_error, sys.float_info.max

    # Score every one-column extension of the current model; on ties the
    # first candidate encountered is kept.
    for candidate in pending:
        trial_model = model + [candidate]
        trial_error = k_fold_cross_validation(df_training, folds, trial_model)

        if trial_error < min_error:
            min_error, best_model, feature = trial_error, trial_model, candidate

    if not min_error < prev_error:
        print('END')
        break

    print('Selecting feature ' + df_training.columns[feature] + ' - error decreased to ' + str(min_error))
    model = best_model
    pending.remove(feature)

In [None]:
# Refit on the whole training set with the selected features and classify the
# test set.
#
# `model` holds column POSITIONS in df_training. The test frame is laid out
# differently (its features start at column 1, not 2), so reusing those
# positions on df_test would pick the wrong columns. Select by column name
# instead; this assumes the feature columns carry the same names in both
# frames, and fails loudly with a KeyError if they do not.
selected_columns = df_training.columns[model]
X_training_pca = df_training[selected_columns].values.astype(float)
X_test_pca = df_test[selected_columns].values.astype(float)

# Same rule as in evaluate(): only project when more than two features were
# selected (a 2-component PCA cannot be fitted on a single feature).
if len(model) > 2:
    pca = PCA(n_components = 2)
    X_training_pca = pca.fit_transform(X_training_pca)
    X_test_pca = pca.transform(X_test_pca)

mean_0 = np.mean(X_training_pca[index_0, :], axis = 0)
mean_1 = np.mean(X_training_pca[index_1, :], axis = 0)
# atleast_2d keeps the covariance a matrix even when one feature was selected.
cov_0 = np.atleast_2d(np.cov(X_training_pca[index_0, :].T))
cov_1 = np.atleast_2d(np.cov(X_training_pca[index_1, :].T))

# Class 1 when the row is strictly closer to the class-1 distribution.
y_pred = [
    int(mahalanobis(row, mean_1, cov_1) < mahalanobis(row, mean_0, cov_0))
    for row in X_test_pca
]

In [None]:
# Submission table: the passenger id next to the label predicted with the selected features.
submission = df_test[['PassengerId']].assign(Survived = y_pred)

In [None]:
# Show the first rows of the submission as a quick sanity check.
submission.head()

In [None]:
# Write the forward-selection submission as CSV, leaving out the DataFrame index.
submission.to_csv('./submissions/03_mahalanobis_forward_selection.csv', index = False)

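Surprisingly enough, feature selection actually produced a much worse result: 40.69% prediction accuracy. Note that an accuracy below 50% on a binary problem is worse than always predicting a single class, which suggests a bug in the pipeline that produced this submission (for example, a mismatch between the training and test columns) rather than a genuine effect of feature selection.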