In [None]:
import os
import random
import sys
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [None]:
# Seed Python's `random` module so that the `random.shuffle` calls used to
# build the cross-validation folds give the same folds on every run.
random.seed(0)

In [None]:
# Load the pre-processed Titanic training and test sets (relative paths).
df_training = pd.read_csv('../../datasets/titanic_training_processed.csv')
df_test = pd.read_csv('../../datasets/titanic_test_processed.csv')

In [None]:
df_training.head()

In [None]:
df_test.head()

In [None]:
# Feature columns are everything after the first two columns of the training
# frame; the same column names are then selected from the test frame.
columns = df_training.columns[2:]
y_train = df_training['Survived'].values
X_train = df_training.loc[:, columns].values
X_test = df_test.loc[:, columns].values

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

## No feature selection

We still have to apply 10-fold cross-validation to select the weighting scheme (distance-weighted or unweighted) and the value of K. 

In [None]:
# Generate the index sets for 10-fold cross validation: shuffle the row
# positions, then deal them round-robin into 10 folds (fold r receives the
# shuffled positions r, r + 10, r + 20, ...).
indexes = list(range(len(df_training)))
random.shuffle(indexes)
folds = [indexes[offset::10] for offset in range(10)]

In [None]:
def produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes):
    """Split `df_training` into train/validation feature and label arrays.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Labelled data containing a 'Survived' column.
    train_indexes, test_indexes : list of int
        Positional row indexes of the training and held-out rows.
    column_indexes : list of int
        Positional indexes of the feature columns.

    Returns
    -------
    dict
        Keys 'X_train', 'X_test', 'y_train', 'y_test', each a numpy array.
    """
    feature_names = df_training.columns[column_indexes]
    train_rows = df_training.iloc[train_indexes]
    test_rows = df_training.iloc[test_indexes]

    return {
        'X_train': train_rows[feature_names].values,
        'X_test': test_rows[feature_names].values,
        'y_train': train_rows['Survived'].values,
        'y_test': test_rows['Survived'].values,
    }

In [None]:
def evaluate(datasets, neigs, weights):
    """Fit a KNN classifier on one split and score it on the held-out part.

    Parameters
    ----------
    datasets : dict
        Must provide 'X_train', 'y_train', 'X_test' and 'y_test'.
    neigs : int
        Passed as `n_neighbors` to `KNeighborsClassifier`.
    weights : str
        Passed as `weights` to `KNeighborsClassifier`.

    Returns
    -------
    float
        Square root of the summed squared differences between predicted and
        true labels. Note this is not divided by the number of samples.
    """
    knn = KNeighborsClassifier(n_neighbors = neigs, weights = weights)
    knn.fit(datasets['X_train'], datasets['y_train'])
    predictions = knn.predict(datasets['X_test'])
    residuals = np.array(predictions) - np.array(datasets['y_test'])
    return sqrt(np.sum(residuals ** 2))

In [None]:
def k_fold_cross_validation(df_training, folds, column_indexes, neigs, weights):
    """Return the mean KNN validation error over all folds.

    Each fold is held out in turn while the remaining folds are concatenated,
    in order, into the training set. The number of folds is taken from
    `folds` itself rather than being fixed at 10.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Labelled data, forwarded to `produce_training_test_set`.
    folds : list of list of int
        Positional row indexes belonging to each fold.
    column_indexes : list of int
        Positional indexes of the feature columns to use.
    neigs : int
        Number of neighbours, forwarded to `evaluate`.
    weights : str
        Neighbour weighting scheme, forwarded to `evaluate`.

    Returns
    -------
    float
        Average of the per-fold errors returned by `evaluate`.

    Raises
    ------
    ValueError
        If fewer than two folds are given (no training data would remain).
    """
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError('k-fold cross validation needs at least two folds')

    error = 0
    for k in range(n_folds):
        test_indexes = folds[k]
        # Flatten every other fold, preserving fold order.
        train_indexes = [idx for j, fold in enumerate(folds) if j != k for idx in fold]

        datasets = produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes)

        error = error + evaluate(datasets, neigs, weights)

    return error / float(n_folds)

In [None]:
# Grid search over the number of neighbours and the weighting scheme, keeping
# the combination with the lowest mean cross-validation error (the first one
# encountered wins ties because the comparison is strict).
K = range(1, 200)
W = ['uniform', 'distance']
# All columns after the first two, derived from the frame width instead of a
# hard-coded upper bound so it always matches `df_training.columns[2:]`.
column_indexes = list(range(2, len(df_training.columns)))
minimum = sys.float_info.max

errors = dict()
for w in W:
    errors[w] = list()
    for k in tqdm(K):
        error = k_fold_cross_validation(df_training, folds, column_indexes, k, w)
        errors[w].append(error)
        if error < minimum:
            minimum = error
            min_k = k
            min_w = w
            
print('Minimum for w = ' + min_w + ' and k = '+ str(min_k))

In [None]:
# One curve per weighting scheme: mean cross-validation error against k.
fig, ax = plt.subplots()
for w in W:
    ax.plot(K, errors[w], label = w)
ax.set(xlabel = 'k', ylabel = 'error')
ax.legend()
fig.set_figwidth(16)

In [None]:
# Final classifier, configured with the best k and weighting found by the grid search.
clf = KNeighborsClassifier(n_neighbors = min_k, weights = min_w)

In [None]:
# Fit on the whole training set (no held-out fold this time).
clf.fit(X_train, y_train)

In [None]:
# NOTE: despite its name, `y_test` holds the *predicted* labels for the test set.
y_test = clf.predict(X_test)

In [None]:
# Submission frame: passenger ids from the test set plus the predicted labels.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
submission.head()

In [None]:
# Create the output directory if it is missing; `exist_ok` replaces the
# separate existence check and so avoids the check-then-create race.
os.makedirs('./submissions/', exist_ok = True)

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submissions/04_knn.csv', index = False)

My submission to Kaggle produced a 70.81% test prediction accuracy. 

## Feature selection - forward selection

In [None]:
# Generate the index sets for 10-fold cross validation. `random.shuffle` is
# called again here, so these folds replace the ones built earlier in the
# notebook.
indexes = list(range(len(df_training)))
random.shuffle(indexes)
folds = [[] for _ in range(10)]
for i, row in enumerate(indexes):
    folds[i % 10].append(row)

In [None]:
def produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes):
    """Build the train/validation arrays for one cross-validation split.

    This re-defines (and shadows) the function of the same name defined
    earlier in the notebook; the behaviour is the same.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Labelled data containing a 'Survived' column.
    train_indexes, test_indexes : list of int
        Positional row indexes of the training and held-out rows.
    column_indexes : list of int
        Positional indexes of the feature columns.

    Returns
    -------
    dict
        Keys 'X_train', 'X_test', 'y_train', 'y_test', each a numpy array.
    """
    features = df_training[df_training.columns[column_indexes]]
    target = df_training['Survived']

    datasets = dict()
    datasets['X_train'] = features.iloc[train_indexes].values
    datasets['X_test'] = features.iloc[test_indexes].values
    datasets['y_train'] = target.iloc[train_indexes].values
    datasets['y_test'] = target.iloc[test_indexes].values
    return datasets

In [None]:
def evaluate(datasets, C = None):
    """Fit a logistic regression on one split and score it on the held-out part.

    This re-defines `evaluate` with a different signature from the KNN
    version defined earlier in the notebook.

    Parameters
    ----------
    datasets : dict
        Must provide 'X_train', 'y_train', 'X_test' and 'y_test'.
    C : float, optional
        Passed as `C` to `LogisticRegression`; 1 is used when omitted.

    Returns
    -------
    float
        Square root of the summed squared differences between predicted and
        true labels. Note this is not divided by the number of samples.
    """
    strength = 1 if C is None else C
    classifier = LogisticRegression(C = strength)
    classifier.fit(datasets['X_train'], datasets['y_train'])
    predictions = classifier.predict(datasets['X_test'])
    residuals = np.array(predictions) - np.array(datasets['y_test'])
    return sqrt(np.sum(residuals ** 2))

In [None]:
def k_fold_cross_validation(df_training, folds, column_indexes, C = None):
    """Return the mean logistic-regression validation error over all folds.

    Each fold is held out in turn while the remaining folds are concatenated,
    in order, into the training set. The number of folds is taken from
    `folds` itself rather than being fixed at 10. This re-defines the KNN
    version of the same name defined earlier in the notebook.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Labelled data, forwarded to `produce_training_test_set`.
    folds : list of list of int
        Positional row indexes belonging to each fold.
    column_indexes : list of int
        Positional indexes of the feature columns to use.
    C : float, optional
        Regularisation parameter, forwarded unchanged to `evaluate`.

    Returns
    -------
    float
        Average of the per-fold errors returned by `evaluate`.

    Raises
    ------
    ValueError
        If fewer than two folds are given (no training data would remain).
    """
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError('k-fold cross validation needs at least two folds')

    error = 0
    for k in range(n_folds):
        test_indexes = folds[k]
        # Flatten every other fold, preserving fold order.
        train_indexes = [idx for j, fold in enumerate(folds) if j != k for idx in fold]

        datasets = produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes)

        error = error + evaluate(datasets, C)

    return error / float(n_folds)

In [None]:
# Baseline: cross-validation error using every column after the first two.
# The upper bound comes from the frame instead of a hard-coded column count.
column_indexes = list(range(2, len(df_training.columns)))
k_fold_cross_validation(df_training, folds, column_indexes)

In [None]:
# Forward selection: starting from an empty model, each round adds the single
# pending feature that gives the lowest cross-validation error, and stops as
# soon as the best candidate no longer improves on the previous round.
# Candidate columns are derived from the frame width rather than hard-coded.
pending = list(range(2, len(df_training.columns)))
model = []
min_error = sys.float_info.max
while len(pending) > 0:
    
    prev_error = min_error
    min_error = sys.float_info.max
    
    for i in pending:
        new_model = model + [i]
        error = k_fold_cross_validation(df_training, folds, new_model)
        
        if error < min_error:
            min_error = error
            best_model = new_model
            feature = i
            
    if min_error < prev_error:
        print('Selecting feature ' + df_training.columns[feature] + ' - error decreased to ' + str(min_error))
        model = best_model
        pending.remove(feature)
    else:
        print('END')
        break

In [None]:
# Keep the forward-selected column indexes and rebuild the train/test arrays
# restricted to those columns.
model_forward = model
columns = df_training.columns[model_forward]
y_train = df_training['Survived'].values
X_train = df_training.loc[:, columns].values
X_test = df_test.loc[:, columns].values

In [None]:
# Logistic regression with default hyper-parameters.
logreg = LogisticRegression()

In [None]:
# Fit on all training rows, using only the forward-selected columns.
logreg.fit(X_train, y_train)

In [None]:
# NOTE: despite its name, `y_test` holds the *predicted* labels for the test set.
y_test = logreg.predict(X_test)

In [None]:
# Submission frame: passenger ids from the test set plus the predicted labels.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submissions/02_logistic_regression_forward_selection.csv', index = False)

This submission produced a 75.119% test prediction accuracy

## Feature selection - backward elimination

In [None]:
# Backward elimination: starting from every feature column, each round drops
# the single feature whose removal gives the lowest cross-validation error,
# and stops as soon as the best removal no longer improves on the previous
# round.
model = list(range(2, len(df_training.columns)))
# The baseline error is computed on the starting model itself, not on a
# global that an earlier cell may have set to something else.
min_error = k_fold_cross_validation(df_training, folds, model)
# Stop at one remaining feature: removing it would leave an empty feature
# matrix, which the classifier cannot be fitted on.
while len(model) > 1:
    
    prev_error = min_error
    min_error = sys.float_info.max
    
    for i in model:
        new_model = model[:]
        new_model.remove(i)
        error = k_fold_cross_validation(df_training, folds, new_model)
        
        if error < min_error:
            min_error = error
            best_model = new_model
            feature = i
            
    if min_error < prev_error:
        print('Removing feature ' + df_training.columns[feature] + ' - error decreased to ' + str(min_error))
        model = best_model
    else:
        print('END')
        break

Backward elimination seems to reduce the prediction error for the training set even more. Let's make a submission.

In [None]:
# Keep the column indexes that survived backward elimination and rebuild the
# train/test arrays restricted to those columns.
model_backward = model
columns = df_training.columns[model_backward]
y_train = df_training['Survived'].values
X_train = df_training.loc[:, columns].values
X_test = df_test.loc[:, columns].values

In [None]:
# Logistic regression with default hyper-parameters.
logreg = LogisticRegression()

In [None]:
# Fit on all training rows, using only the columns kept by backward elimination.
logreg.fit(X_train, y_train)

In [None]:
# NOTE: despite its name, `y_test` holds the *predicted* labels for the test set.
y_test = logreg.predict(X_test)

In [None]:
# Submission frame: passenger ids from the test set plus the predicted labels.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submissions/02_logistic_regression_backward_elimination.csv', index = False)

This submission produced 73.205% prediction accuracy for the test set. Therefore, we should keep forward selection. 

## Model selection - regularisation

The only parameter worth playing with is the regularisation factor C.

In [None]:
# Point `column_indexes` at the forward-selected feature columns.
column_indexes = model_forward

In [None]:
# Sweep the regularisation parameter C and record the mean cross-validation
# error for each value. The features are the forward-selected columns
# (`model_forward`), not `new_model`, which is only a leftover candidate from
# the last feature-selection loop that ran.
C = np.arange(0.01, 1.5, 0.01)
rmses = []
for c in C:
    rmses.append(k_fold_cross_validation(df_training, folds, model_forward, c))

In [None]:
# Mean cross-validation error as a function of the regularisation parameter C.
fig, ax = plt.subplots()
ax.plot(C, rmses)
ax.set(xlabel = 'C', ylabel = 'RMSE')
ax.set_title('Regularisation factor - model selection')

Minimum seems to be at 0.7

In [None]:
# Rebuild the train/test arrays restricted to the forward-selected columns.
columns = df_training.columns[model_forward]
y_train = df_training['Survived'].values
X_train = df_training.loc[:, columns].values
X_test = df_test.loc[:, columns].values

In [None]:
# C = 0.7 is the value read off the plot above as the apparent minimum.
logreg = LogisticRegression(C = 0.7)

In [None]:
# Fit on all training rows, using only the forward-selected columns.
logreg.fit(X_train, y_train)

In [None]:
# NOTE: despite its name, `y_test` holds the *predicted* labels for the test set.
y_test = logreg.predict(X_test)

In [None]:
# Submission frame: passenger ids from the test set plus the predicted labels.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('./submissions/02_logistic_regression_regularisation.csv', index = False)

After this submission I obtained exactly the same test accuracy as without tuning the regularisation factor (i.e. with the default C = 1).