In [None]:
import os
import sys
import random
import itertools
import multiprocessing
from math import sqrt
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.svm import SVC

In [None]:
# Seed Python's `random` module; the cross-validation folds built further down
# are produced with random.shuffle, so this makes the fold assignment repeatable.
random.seed(0)

In [None]:
# Load the pre-processed training and test sets (relative paths).
df_training = pd.read_csv('../../datasets/titanic_training_processed.csv')
df_test = pd.read_csv('../../datasets/titanic_test_processed.csv')

In [None]:
# Preview the first rows of the training frame.
df_training.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# Every column from position 2 onwards is treated as a feature; the same column
# labels are then selected from both frames so train and test matrices line up.
columns = df_training.columns[2:]
y_train = df_training['Survived'].values
X_train, X_test = (frame[columns].values for frame in (df_training, df_test))

In [None]:
# Sanity check: (rows, feature columns) of the training matrix.
X_train.shape

In [None]:
# Sanity check: (rows, feature columns) of the test matrix.
X_test.shape

In [None]:
# Sanity check: number of training labels.
y_train.shape

## No feature selection

We still have to apply 10-fold cross-validation to select the kernel and the value of C. For the polynomial and RBF kernels we also need to determine the values of their respective parameters.

In [None]:
# Build the sets for 10-fold cross validation: shuffle the row positions of the
# training frame, then deal them out round-robin.
indexes = list(range(len(df_training)))
random.shuffle(indexes)
# folds[k] receives the shuffled positions k, k + 10, k + 20, ...
folds = [indexes[k::10] for k in range(10)]

In [None]:
def produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes):
    """Slice one cross-validation split out of `df_training`.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Frame holding the features and a 'Survived' column.
    train_indexes, test_indexes : list of int
        Row positions (used with ``.iloc``) of the training and held-out rows.
    column_indexes : list of int
        Positions of the feature columns within ``df_training.columns``.

    Returns
    -------
    dict
        Arrays under the keys 'X_train', 'X_test', 'y_train' and 'y_test'.
    """
    columns = df_training.columns[column_indexes]
    train_rows = df_training.iloc[train_indexes]
    test_rows = df_training.iloc[test_indexes]

    return {
        'X_train': train_rows[columns].values,
        'X_test': test_rows[columns].values,
        'y_train': train_rows['Survived'].values,
        'y_test': test_rows['Survived'].values,
    }

In [None]:
def evaluate(datasets, C, kernel, kernel_params):
    """Fit an SVC on one split and score it on the held-out part.

    Parameters
    ----------
    datasets : dict
        Must provide 'X_train', 'y_train', 'X_test' and 'y_test'.
    C : float
        Regularization parameter forwarded to ``SVC``.
    kernel : str
        Kernel name forwarded to ``SVC``.
    kernel_params : dict
        Read only when ``kernel == 'poly'``; must then provide 'degree' and
        'coef0'. For other kernels its content is ignored.

    Returns
    -------
    float
        Square root of the sum of squared differences between predictions and
        'y_test'. The sum is not divided by the number of held-out rows.
    """
    kwargs = {}
    if kernel == 'poly':
        # The degree may arrive as a float (e.g. when read back from a DataFrame
        # column that also holds missing values for the other kernels), while
        # SVC expects an integer degree.
        kwargs['degree'] = int(kernel_params['degree'])
        kwargs['coef0'] = kernel_params['coef0']

    clf = SVC(C = C, gamma = 'auto', kernel = kernel, **kwargs)
    clf.fit(datasets['X_train'], datasets['y_train'])
    y_pred = clf.predict(datasets['X_test'])

    residuals = np.asarray(y_pred) - np.asarray(datasets['y_test'])
    return sqrt(np.sum(np.power(residuals, 2)))

In [None]:
def k_fold_cross_validation(df_training, folds, column_indexes, C, kernel, kernel_params):
    """Average the `evaluate` score over every fold of `folds`.

    Each fold is used once as the held-out set while the remaining folds are
    concatenated into the training set.

    Parameters
    ----------
    df_training : pandas.DataFrame
        Frame passed through to `produce_training_test_set`.
    folds : list of list of int
        Row positions of each fold; the number of folds is ``len(folds)``.
    column_indexes : list of int
        Positions of the feature columns to use.
    C, kernel, kernel_params
        Hyper-parameters passed through to `evaluate`.

    Returns
    -------
    float
        Mean of the per-fold scores.

    Raises
    ------
    ValueError
        If fewer than two folds are supplied (no training data would remain).
    """
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError('k-fold cross validation needs at least 2 folds')

    error = 0.0

    for k, test_indexes in enumerate(folds):
        # Everything outside the held-out fold, in fold order.
        train_indexes = [index
                         for j, fold in enumerate(folds) if j != k
                         for index in fold]

        datasets = produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes)

        error += evaluate(datasets, C, kernel, kernel_params)

    return error / n_folds

In [None]:
# Hyper-parameter grid for the model using all features.
# Results were very similar for gamma = scale and gamma = auto
# No good results with degree = 1
C = np.arange(0.2, 5.2, 0.2).tolist()
kernel = ['linear', 'poly', 'rbf']
degree = [2, 3]
coef0 = np.arange(0, 3.2, 0.2).tolist()

# Every entry of `comb` is a (C, kernel, degree, coef0) tuple; degree and coef0
# are None for the kernels that do not use them.
comb = list(itertools.product(C, ['linear'], [None], [None]))
comb.extend(itertools.product(C, ['rbf'], [None], [None]))
comb.extend(itertools.product(C, ['poly'], degree, coef0))

# All feature columns (everything from position 2 onwards), derived from the
# frame instead of a hard-coded column count.
column_indexes = list(range(2, len(df_training.columns)))

# Evaluate straight from `comb` rather than reading the values back from the
# DataFrame: once stored in a column next to None, the integer degrees come
# back as floats/NaN.
cv_errors = [k_fold_cross_validation(df_training,
                                     folds,
                                     column_indexes,
                                     c_value,
                                     kernel_name,
                                     {'degree': degree_value,
                                      'coef0': coef0_value})
             for c_value, kernel_name, degree_value, coef0_value in tqdm(comb)]

errors = pd.DataFrame(data = comb, columns = ['C', 'kernel', 'degree', 'coef0'])
errors['error'] = cv_errors

In [None]:
# Order the grid by ascending cross-validation error so the best combinations
# come first; later cells read the winner positionally from the first row.
errors = errors.sort_values(by = 'error')
errors.head(5)

In [None]:
# Cross-validation error of the linear kernel as a function of C.
errors_linear = errors[errors['kernel'] == 'linear'].sort_values(by = 'C')

fig, ax = plt.subplots()
ax.plot(errors_linear['C'], errors_linear['error'])
ax.set(xlabel = 'C', ylabel = 'RMSE', title = 'Linear model')
ax.grid(True)

In [None]:
# Cross-validation error of the RBF kernel as a function of C.
errors_rbf = errors[errors['kernel'] == 'rbf'].sort_values(by = 'C')

fig, ax = plt.subplots()
ax.plot(errors_rbf['C'], errors_rbf['error'])
ax.set(xlabel = 'C', ylabel = 'RMSE', title = 'RBF kernel')
ax.grid(True)

In [None]:
# Heat maps of the cross-validation error over (C, coef0), one panel per
# polynomial degree.
errors_poly = errors[errors.kernel == 'poly']

fig, ax = plt.subplots(1, len(degree), figsize = (12, 6))
# plt.subplots returns a bare Axes when there is a single panel.
panels = np.atleast_1d(ax)

for panel, d in zip(panels, degree):
    # imshow draws rows along the vertical axis and columns along the
    # horizontal one, so coef0 must index the rows and C the columns to match
    # the extent and the axis labels below.
    errors_d = errors_poly[errors_poly.degree == d].pivot(index = 'coef0',
                                                          columns = 'C',
                                                          values = 'error')
    # origin='lower' puts the first row (smallest coef0) at the bottom, in
    # agreement with the extent.
    im = panel.imshow(errors_d,
                      cmap = 'viridis',
                      origin = 'lower',
                      extent = [errors_poly.C.min(),
                                errors_poly.C.max(),
                                errors_poly.coef0.min(),
                                errors_poly.coef0.max()])
    fig.colorbar(im, ax = panel)
    panel.set_xlabel('C')
    panel.set_ylabel('coef0')
    panel.set_title('poly kernel - degree = ' + str(d))

In [None]:
# `errors` was sorted by ascending cross-validation error in an earlier cell,
# so its first row holds the selected hyper-parameters.
best_row = errors.iloc[0]

# degree and coef0 only apply to the polynomial kernel. For the other kernels
# the grid stores missing values in those columns, which must not be forwarded
# to SVC; for 'poly' the selected degree has to be passed explicitly, otherwise
# SVC silently falls back to its default degree.
svc_kwargs = {}
if best_row['kernel'] == 'poly':
    svc_kwargs['degree'] = int(best_row['degree'])
    svc_kwargs['coef0'] = float(best_row['coef0'])

clf = SVC(C = float(best_row['C']),
          gamma = 'auto',
          kernel = best_row['kernel'],
          **svc_kwargs)

In [None]:
# Fit the classifier configured in the previous cell on the training data.
clf.fit(X_train, y_train)

In [None]:
# Despite the name, `y_test` holds the model's predictions for the test rows.
y_test = clf.predict(X_test)

In [None]:
# Two-column submission frame: the passenger id next to the predicted label.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
# Preview the submission frame before writing it out.
submission.head()

In [None]:
# Create the output directory if needed; exist_ok avoids the check-then-create
# race of testing os.path.exists first.
os.makedirs('./submissions/', exist_ok = True)

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv('./submissions/05_svm.csv', index = False)

My submission to Kaggle produced a 78.95% test prediction accuracy. This is almost as high as my best result *with feature selection* to date.

## Feature selection - forward selection

For this classifier, and due to the longer feature selection process (for each combination of features considered during feature selection we have to select the value of C, the kernel and, for the polynomial kernel, its degree and coef0), we are applying multiprocessing to accelerate things.

In [None]:
def k_fold_cross_validation_unpack(args):
    """Adapter that lets `Pool.map` call `k_fold_cross_validation` with one argument.

    `args` is indexed as (df_training, folds, column_indexes, hyper_parameters),
    where `hyper_parameters` is itself indexed as (C, kernel, degree, coef0).
    Returns whatever `k_fold_cross_validation` returns.
    """
    hyper_parameters = args[3]
    kernel_params = {'degree': hyper_parameters[2],
                     'coef0': hyper_parameters[3]}

    return k_fold_cross_validation(args[0],
                                   args[1],
                                   args[2],
                                   hyper_parameters[0],
                                   hyper_parameters[1],
                                   kernel_params)

In [None]:
# Disabled debugging aid: calls the unpack wrapper serially (no pool) over the
# hyper-parameter grid for a model made of column 0 only.
#parameters = itertools.product([df_training], [folds], [[0]], comb)
#for p in parameters:
#    k_fold_cross_validation_unpack(p)
#    dffd  # NOTE(review): undefined name - presumably a deliberate NameError to stop after the first call

In [None]:
# TODO: try without multiprocessing

# Forward selection: starting from an empty model, every round tries to add each
# pending feature, tunes the SVM hyper-parameters of each candidate model over
# the whole `comb` grid, and keeps the feature with the lowest cross-validation
# error. The search stops as soon as a round fails to lower the error.
# NOTE(review): the pool workers must be able to resolve
# k_fold_cross_validation_unpack; with the 'spawn' start method, functions
# defined in a notebook cell may not be importable by the workers - confirm on
# the target platform.
pending = list(range(2, len(df_training.columns)))
model = []
best_comb = None  # (C, kernel, degree, coef0) of the last accepted model
min_error = sys.float_info.max
# Leave one core free, but never request an empty pool on a single-core machine.
num_processes = max(1, multiprocessing.cpu_count() - 1)

# The context manager tears the pool down even if a round raises.
with multiprocessing.Pool(processes = num_processes) as pool:
    while len(pending) > 0:
        prev_error = min_error
        min_error = sys.float_info.max

        for i in pending:
            new_model = model + [i]
            parameters = itertools.product([df_training], [folds], [new_model], comb)

            # One error per entry of `comb`, in the same order. Stored under its
            # own name so the `errors` DataFrame of the grid search survives.
            candidate_errors = pool.map(k_fold_cross_validation_unpack, parameters)

            best_index = int(np.argmin(candidate_errors))
            minimum = candidate_errors[best_index]

            if minimum < min_error:
                min_error = minimum
                best_model = new_model
                feature = i
                round_comb = comb[best_index]

        if min_error < prev_error:
            # Only an accepted round may update the hyper-parameters used for
            # the final classifier.
            best_comb = round_comb
            print('Selecting feature ' +
                  df_training.columns[feature] +
                  '(C = ' +
                  str(best_comb[0]) +
                  ', kernel = ' +
                  best_comb[1] +
                  ', degree = ' +
                  str(best_comb[2]) +
                  ', coef0 = ' +
                  str(best_comb[3]) +
                  ') - error decreased to ' +
                  str(min_error))
            model = best_model
            pending.remove(feature)
        else:
            # Keep min_error describing the accepted model, not the rejected round.
            min_error = prev_error
            print('END')
            break

In [None]:
# Rebuild the train/test matrices restricted to the features kept by forward
# selection (`model` holds their column positions).
model_forward = model
columns = df_training.columns[model_forward]
y_train = df_training['Survived'].values
X_train, X_test = (frame[columns].values for frame in (df_training, df_test))

In [None]:
# Build the SVM with the hyper-parameters selected together with the
# forward-selection model; `best_comb` is (C, kernel, degree, coef0).
best_C, best_kernel, best_degree, best_coef0 = best_comb

# degree and coef0 are only defined (and only meaningful) for the 'poly' kernel.
svc_kwargs = {}
if best_kernel == 'poly':
    svc_kwargs['degree'] = int(best_degree)
    svc_kwargs['coef0'] = best_coef0

clf = SVC(C = best_C, gamma = 'auto', kernel = best_kernel, **svc_kwargs)

In [None]:
# Fit the classifier configured in the previous cell on the selected features.
clf.fit(X_train, y_train)

In [None]:
# Despite the name, `y_test` holds the model's predictions for the test rows.
y_test = clf.predict(X_test)

In [None]:
# Two-column submission frame: the passenger id next to the predicted label.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
# Preview the submission frame before writing it out.
submission.head()

In [None]:
# Use this notebook's own prefix (05_svm, as for the first submission) so the
# file does not overwrite a submission written under the 04_knn name.
submission.to_csv('./submissions/05_svm_forward_selection.csv', index = False)

This submission produced a 79.90% test prediction accuracy. This is the best prediction accuracy I have obtained so far. This suggests that the classes are not linearly separable.

## Feature selection - backward elimination

In [None]:
# Backward elimination: starting from the model with every feature, each round
# tries to drop each remaining feature, tunes the SVM hyper-parameters of each
# candidate model over the whole `comb` grid, and removes the feature whose
# absence gives the lowest cross-validation error. The search stops as soon as
# a round fails to lower the error.
model = list(range(2, len(df_training.columns)))
best_comb = None  # (C, kernel, degree, coef0) of the last accepted model
# Leave one core free, but never request an empty pool on a single-core machine.
num_processes = max(1, multiprocessing.cpu_count() - 1)

# The context manager tears the pool down even if a round raises.
with multiprocessing.Pool(processes = num_processes) as pool:
    # Baseline: best error and hyper-parameters of the full model, recomputed
    # here so this cell does not depend on state left behind by other sections.
    parameters = itertools.product([df_training], [folds], [model], comb)
    baseline_errors = pool.map(k_fold_cross_validation_unpack, parameters)
    best_index = int(np.argmin(baseline_errors))
    min_error = baseline_errors[best_index]
    best_comb = comb[best_index]

    # Stop before the model becomes empty: an SVM cannot be fitted on zero features.
    while len(model) > 1:
        prev_error = min_error
        min_error = sys.float_info.max

        for i in model:
            new_model = model[:]
            new_model.remove(i)
            parameters = itertools.product([df_training], [folds], [new_model], comb)

            # One error per entry of `comb`, in the same order.
            candidate_errors = pool.map(k_fold_cross_validation_unpack, parameters)

            best_index = int(np.argmin(candidate_errors))
            minimum = candidate_errors[best_index]

            if minimum < min_error:
                min_error = minimum
                best_model = new_model
                feature = i
                round_comb = comb[best_index]

        if min_error < prev_error:
            # Only an accepted round may update the hyper-parameters used for
            # the final classifier.
            best_comb = round_comb
            print('Removing feature ' +
                  df_training.columns[feature] +
                  '(C = ' +
                  str(best_comb[0]) +
                  ', kernel = ' +
                  best_comb[1] +
                  ', degree = ' +
                  str(best_comb[2]) +
                  ', coef0 = ' +
                  str(best_comb[3]) +
                  ') - error decreased to ' +
                  str(min_error))
            model = best_model
        else:
            # Keep min_error describing the accepted model, not the rejected round.
            min_error = prev_error
            print('END')
            break

In [None]:
# Rebuild the train/test matrices restricted to the features that survived
# backward elimination (`model` holds their column positions).
model_backward = model
columns = df_training.columns[model_backward]
y_train = df_training['Survived'].values
X_train, X_test = (frame[columns].values for frame in (df_training, df_test))

In [None]:
# Build the SVM with the hyper-parameters selected together with the
# backward-elimination model; `best_comb` is (C, kernel, degree, coef0).
best_C, best_kernel, best_degree, best_coef0 = best_comb

# degree and coef0 are only defined (and only meaningful) for the 'poly' kernel.
svc_kwargs = {}
if best_kernel == 'poly':
    svc_kwargs['degree'] = int(best_degree)
    svc_kwargs['coef0'] = best_coef0

clf = SVC(C = best_C, gamma = 'auto', kernel = best_kernel, **svc_kwargs)

In [None]:
# Fit the classifier configured in the previous cell on the selected features.
clf.fit(X_train, y_train)

In [None]:
# Despite the name, `y_test` holds the model's predictions for the test rows.
y_test = clf.predict(X_test)

In [None]:
# Two-column submission frame: the passenger id next to the predicted label.
submission = df_test[['PassengerId']].assign(Survived = y_test)

In [None]:
# Preview the submission frame before writing it out.
submission.head()

In [None]:
# Use this notebook's own prefix (05_svm, as for the first submission) so the
# file does not overwrite a submission written under the 04_knn name.
submission.to_csv('./submissions/05_svm_backward_elimination.csv', index = False)

This submission produced 75.12% prediction accuracy for the test set. Therefore, we should keep forward selection. 