In [None]:
import random
import math
import itertools
from math import sqrt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [None]:
# Fix the seed of Python's built-in `random` module so that any draws made
# through it are repeatable between runs.
# NOTE(review): this does not seed NumPy's generator; confirm whether
# `np.random.seed` is also wanted for the NumPy/scikit-learn steps.
random.seed(0)

In [None]:
# Locations of the two pre-processed Titanic CSV files, relative to the
# notebook's working directory.
TRAINING_CSV = '../../datasets/titanic_training_processed.csv'
TEST_CSV = '../../datasets/titanic_test_processed.csv'

# Load both splits into DataFrames (training first, then test).
df_training = pd.read_csv(TRAINING_CSV)
df_test = pd.read_csv(TEST_CSV)

This is just an experiment to practice with the concept of statistical distances. The idea is to visualise a dimensionality-reduced representation of the two classes (survived / non-survived) to get some awareness of their separability, and then use a simple Mahalanobis distance between each test observation and the two classes to perform classification.

Originally I wanted to apply t-SNE for dimensionality reduction, but the t-SNE algorithm in scikit-learn cannot be applied to new data. Therefore, I am sticking with PCA instead. 

In [None]:
# Feature matrix: every training column from position 2 onward.
# NOTE(review): presumably columns 0 and 1 are PassengerId and Survived —
# confirm against the CSV, since the slice is positional.
X = df_training.values[:, 2:]
# Fit a 2-component PCA on the training features and keep both the fitted
# model (`pca`) and the 2-D projection of the training rows (`X_pca`).
pca = PCA(n_components = 2)
X_pca = pca.fit_transform(X)

In [None]:
# Boolean row masks over the training frame, one per value of `Survived`:
# index_1 selects rows where it equals 1, index_0 rows where it equals 0.
survived_flag = df_training['Survived']
index_1 = survived_flag == 1
index_0 = survived_flag == 0

In [None]:
# Scatter the training rows in the 2-D PCA plane, one colour per class mask:
# blue for index_1, red for index_0 (drawn in that order, so red is on top).
fig, ax = plt.subplots()
for mask, colour in ((index_1, 'blue'), (index_0, 'red')):
    # Assigning to `_` keeps the notebook from echoing the scatter artist.
    _ = ax.scatter(X_pca[mask, 0], X_pca[mask, 1], c = colour, alpha = 0.5)

There seems to be a large overlap between the two distributions. What is the Bhattacharyya distance value?

In [None]:
def number_observations(array, x, y, width=0.1):
    """Count the rows of ``array`` that fall inside one square grid cell.

    The cell is half-open on both axes: a row is counted when its first
    coordinate lies in ``[x - width, x)`` and its second coordinate lies in
    ``[y - width, y)``.

    Parameters
    ----------
    array : 2-D array
        Points to test; column 0 and column 1 hold the two coordinates.
    x, y : float
        Upper (exclusive) corner of the cell.
    width : float, optional
        Side length of the cell. Defaults to 0.1, the value that used to be
        hard-coded.

    Returns
    -------
    NumPy integer
        Number of rows inside the cell.
    """
    first, second = array[:, 0], array[:, 1]
    inside = (
        (first >= x - width) & (first < x)
        & (second >= y - width) & (second < y)
    )
    return np.sum(inside)

In [None]:
# Estimate the Bhattacharyya coefficient BC = sum over grid cells of
# sqrt(p_1 * p_0), where p_1 / p_0 are the fractions of each class's training
# points that fall in a cell of a 0.1 x 0.1 grid over the PCA plane, and then
# report the Bhattacharyya distance -ln(BC).
discrete = np.arange(-4, 10, 0.1)
n_1 = np.sum(index_1)
n_0 = np.sum(index_0)

# The cells are [x - 0.1, x) for each x in `discrete`, so the bin edges are
# `discrete` preceded by one extra lower edge. Sharing a single edge array
# between both classes makes the cells tile the range exactly, with no gaps
# or overlaps caused by floating-point rounding of `x - 0.1`.
edges = np.concatenate(([discrete[0] - 0.1], discrete))

# One vectorised histogram per class replaces scanning every point once per
# cell. Points outside the grid are not counted (np.histogram2d ignores
# them); only the very last bin also includes its right edge.
counts_1 = np.histogram2d(X_pca[index_1, 0], X_pca[index_1, 1],
                          bins=[edges, edges])[0]
counts_0 = np.histogram2d(X_pca[index_0, 0], X_pca[index_0, 1],
                          bins=[edges, edges])[0]

bc = float(np.sum(np.sqrt((counts_1 / n_1) * (counts_0 / n_0))))

# bc == 0 means the two classes share no cell: the distance is infinite, and
# math.log(0) would raise ValueError.
distance = -math.log(bc) if bc > 0 else math.inf
print('Bhattacharyya distance: ' + str(distance))

We now use the Mahalanobis distance to assign a class to each test observation. To do that, we first need to apply to the test data the same transformation that we applied to the training data:

In [None]:
# Project the test rows with the already-fitted `pca` model.
# The slice drops column 0 of df_test — presumably PassengerId; TODO confirm
# that the remaining columns match the training features in number and order.
test_features = df_test.values[:, 1:]
X_test_pca = pca.transform(test_features)

In [None]:
# Same scatter as for the training data, with the projected test rows drawn
# last in gray on top of the two training classes.
fig, ax = plt.subplots()
layers = (
    (X_pca[index_1, :], 'blue'),
    (X_pca[index_0, :], 'red'),
    (X_test_pca, 'gray'),
)
for points, colour in layers:
    # Assigning to `_` keeps the notebook from echoing the scatter artist.
    _ = ax.scatter(points[:, 0], points[:, 1], c = colour, alpha = 0.5)

And additionally we need to estimate the parameters of the two classes:

In [None]:
# Per-class subsets of the projected training data.
points_0 = X_pca[index_0, :]
points_1 = X_pca[index_1, :]

# Sample mean (one value per PCA component) of each class.
mean_0 = points_0.mean(axis = 0)
mean_1 = points_1.mean(axis = 0)

# Sample covariance of each class; np.cov expects variables in rows, hence
# the transpose.
cov_0 = np.cov(points_0.T)
cov_1 = np.cov(points_1.T)

In [None]:
def mahalanobis(p, mean, cov):
    """Return the Mahalanobis distance of point ``p`` from a distribution.

    Computes ``sqrt((p - mean)^T cov^{-1} (p - mean))``.

    Parameters
    ----------
    p : array
        The point to measure.
    mean : array
        Mean of the distribution, same shape as ``p``.
    cov : 2-D array
        Covariance matrix of the distribution.

    Returns
    -------
    float
        The distance.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``cov`` is singular.
    """
    dif = p - mean
    # Solve cov @ z = dif instead of forming the explicit inverse: it gives
    # the same product cov^{-1} @ dif with less work and better numerical
    # behaviour.
    return math.sqrt(np.dot(dif.T, np.linalg.solve(cov, dif)))

In [None]:
# Label every projected test row by comparing its Mahalanobis distance to the
# two class distributions: 1 when it is strictly closer to class 1, otherwise
# 0 (so ties go to class 0).
y_test = [
    int(mahalanobis(point, mean_1, cov_1) < mahalanobis(point, mean_0, cov_0))
    for point in X_test_pca
]

In [None]:
# Build the submission frame: attach the predicted labels to a copy of the
# test data and keep only the identifier and prediction columns.
submission = df_test.assign(Survived = y_test)[['PassengerId', 'Survived']]

In [None]:
# Preview the first rows of the submission frame as a sanity check.
submission.head()

In [None]:
# Write the submission to CSV without the DataFrame index column.
# NOTE(review): assumes the ./submissions directory already exists relative
# to the working directory — confirm before running.
submission.to_csv('./submissions/03_mahalanobis.csv', index = False)

This submission achieved a 64.59% accuracy (much lower than the accuracy I got with logistic regression). This method is not valid due to two factors: there is a large overlap between the two classes, and the distributions do not look like normal distributions. A similar but simpler approach, KNN, would probably have obtained a higher accuracy.

## Feature selection

It is unlikely that we will improve the results much. However, just for the sake of the experiment, we are trying a feature selection approach (both forward selection and backward elimination). 