In [1]:
import os
import sys
import random
import itertools
import multiprocessing
from math import sqrt
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

In [2]:
# Seed Python's built-in RNG so the fold shuffle further down is repeatable.
random.seed(0)

In [3]:
# Load the training and test frames; the paths are resolved relative to the
# current working directory.
df_training = pd.read_csv('../../datasets/titanic_training_processed.csv')
df_test = pd.read_csv('../../datasets/titanic_test_processed.csv')

In [4]:
# Preview the first rows of the training frame (includes the Survived label).
df_training.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_3,Pclass_1,Sex_male,TicketPrefix_A/5,...,CabinClass_C,CabinClass_E,CabinClass_G,CabinClass_D,CabinClass_A,CabinClass_B,CabinClass_F,Embarked_S,Embarked_C,Embarked_Q
0,1,0,-0.565419,0.43255,-0.473408,-0.502163,0.902081,-0.565368,0.737281,9.380891,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739
1,2,1,0.663488,0.43255,-0.473408,0.786404,-1.107304,1.766775,-1.354813,-0.10648,...,3.753114,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,-1.613803,2.073341,-0.30739
2,3,1,-0.258192,-0.474279,-0.473408,-0.48858,0.902081,-0.565368,-1.354813,-0.10648,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739
3,4,1,0.433068,0.43255,-0.473408,0.420494,-1.107304,1.766775,-1.354813,-0.10648,...,3.753114,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739
4,5,0,0.433068,-0.474279,-0.473408,-0.486064,0.902081,-0.565368,0.737281,-0.10648,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739


In [5]:
# Preview the first rows of the test frame (no Survived column here).
df_test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_3,Pclass_1,Sex_male,TicketPrefix_A/5,TicketPrefix_PC,...,CabinClass_C,CabinClass_E,CabinClass_G,CabinClass_D,CabinClass_A,CabinClass_B,CabinClass_F,Embarked_S,Embarked_C,Embarked_Q
0,892,0.394665,-0.474279,-0.473408,-0.490508,0.902081,-0.565368,0.737281,-0.10648,-0.268554,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,-1.613803,-0.481772,3.249548
1,893,1.354749,0.43255,-0.473408,-0.507194,0.902081,-0.565368,-1.354813,-0.10648,-0.268554,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739
2,894,2.506849,-0.474279,-0.473408,-0.453112,-1.107304,-0.565368,0.737281,-0.10648,-0.268554,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,-1.613803,-0.481772,3.249548
3,895,-0.181385,-0.474279,-0.473408,-0.473739,0.902081,-0.565368,0.737281,-0.10648,-0.268554,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739
4,896,-0.565419,0.43255,0.767199,-0.400792,0.902081,-0.565368,-1.354813,-0.10648,-0.268554,...,-0.266146,-0.192901,-0.067116,-0.196006,-0.130783,-0.235849,-0.121613,0.618959,-0.481772,-0.30739


In [6]:
# Every column from position 2 onwards is a feature (positions 0 and 1 are
# PassengerId and Survived). The same column *names* are then looked up in the
# test frame, which has no Survived column.
columns = df_training.columns[2:]
X_train, X_test = df_training[columns].values, df_test[columns].values
y_train = df_training['Survived'].values

In [7]:
# Sanity check: (rows, feature columns) of the training matrix.
X_train.shape

(891, 60)

In [8]:
# Sanity check: the test matrix must have the same number of feature columns.
X_test.shape

(418, 60)

In [9]:
# Sanity check: one label per training row.
y_train.shape

(891,)

## No feature selection

In [10]:
# Build the 10 cross-validation folds: shuffle the row positions once, then
# deal them out round-robin, so fold k receives every 10th shuffled position
# starting at offset k.
indexes = list(range(len(df_training)))
random.shuffle(indexes)
folds = [indexes[offset::10] for offset in range(10)]

In [11]:
def produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes):
    """Split one frame into train/test feature matrices and label vectors.

    Parameters
    ----------
    df_training : DataFrame that contains a 'Survived' column.
    train_indexes, test_indexes : row positions (used with ``iloc``) for the
        training and held-out parts.
    column_indexes : positions into ``df_training.columns`` selecting the
        feature columns.

    Returns
    -------
    dict with keys 'X_train', 'X_test', 'y_train', 'y_test', each holding the
    ``.values`` array of the corresponding selection.
    """
    feature_names = df_training.columns[column_indexes]
    train_rows = df_training.iloc[train_indexes]
    test_rows = df_training.iloc[test_indexes]

    return {
        'X_train': train_rows[feature_names].values,
        'X_test': test_rows[feature_names].values,
        'y_train': train_rows['Survived'].values,
        'y_test': test_rows['Survived'].values,
    }

In [12]:
def evaluate(datasets):
    """Fit a Gaussian Naive Bayes model and score it on the held-out split.

    ``datasets`` is a dict with 'X_train', 'y_train', 'X_test' and 'y_test'.
    The return value is the square root of the summed squared differences
    between predictions and ``y_test``. It is not divided by the number of
    test rows, so it is an L2 norm of the error vector rather than an RMSE.
    """
    classifier = GaussianNB()
    classifier.fit(datasets['X_train'], datasets['y_train'])

    predicted = np.array(classifier.predict(datasets['X_test']))
    residuals = predicted - np.array(datasets['y_test'])
    return sqrt(np.sum(residuals ** 2))

In [13]:
def k_fold_cross_validation(df_training, folds, column_indexes):
    """Return the mean held-out error over all folds for one feature subset.

    Each fold is used once as the test part while the remaining folds are
    concatenated (in fold order) into the training part.

    Parameters
    ----------
    df_training : frame handed to ``produce_training_test_set``.
    folds : sequence of lists of row positions; its length is the number of
        folds (it is no longer assumed to be exactly 10).
    column_indexes : positions of the feature columns to evaluate.

    Returns
    -------
    float : sum of ``evaluate`` over the folds divided by the number of folds.

    Raises
    ------
    ValueError : if fewer than two folds are supplied, since there would be
        nothing left to train on.
    """
    n_folds = len(folds)
    if n_folds < 2:
        raise ValueError('k-fold cross validation needs at least two folds')

    total_error = 0

    for held_out, test_indexes in enumerate(folds):
        # Training rows = every fold except the held-out one, in fold order.
        train_indexes = [
            row
            for position, fold in enumerate(folds)
            if position != held_out
            for row in fold
        ]

        datasets = produce_training_test_set(df_training, train_indexes, test_indexes, column_indexes)

        total_error = total_error + evaluate(datasets)

    return total_error / float(n_folds)

In [14]:
# Baseline: cross-validated error using every feature column. Features start
# at position 2 (after PassengerId and Survived); the upper bound is taken from
# the frame itself instead of a hard-coded column count.
column_indexes = list(range(2, len(df_training.columns))) # All columns
error = k_fold_cross_validation(df_training, folds, column_indexes)
print(error)

7.207765926094888


In [15]:
# Fresh Gaussian Naive Bayes model with default settings, for the full feature set.
clf = GaussianNB()

In [16]:
# Fit on the whole training set; the cell output is the fitted estimator's repr.
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [17]:
# Despite its name, y_test holds the model's *predictions* for the test rows;
# the test frame carries no true Survived labels.
y_test = clf.predict(X_test)

In [18]:
# Submission frame: PassengerId next to the predicted Survived label.
# assign() works on a copy, so df_test itself is left unchanged.
submission = df_test.assign(Survived=y_test)[['PassengerId', 'Survived']]

In [19]:
# Quick look at the first predictions before writing them out.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1


In [20]:
# Create the output directory if it is missing. exist_ok=True replaces the
# check-then-create pair, which could fail if the directory appeared between
# the two calls, and keeps the cell safe to re-run.
os.makedirs('./submissions/', exist_ok=True)

In [21]:
# Write the predictions without the row index.
output_path = './submissions/06_naive_bayes.csv'
submission.to_csv(output_path, index=False)

My submission to Kaggle produced a 39.23% test prediction accuracy. 

## Feature selection - forward selection

Unlike k-NN, Gaussian Naive Bayes has no K or weighting type (uniform, distance) to select for each combination of features, so every step of the feature selection only needs one 10-fold cross-validation per candidate feature. The search is therefore run sequentially here, without multiprocessing.

In [22]:
# Forward selection: start from an empty model and, on every round, add the
# single pending column that yields the lowest cross-validated error. Stop as
# soon as the best candidate no longer beats the previous round's error.
pending = list(range(2, 62))
model = []
min_error = sys.float_info.max

while pending:
    prev_error, min_error = min_error, sys.float_info.max

    # Score each remaining column when appended to the current model; the
    # first candidate reaching the lowest error wins ties.
    for candidate in pending:
        trial_model = model + [candidate]
        trial_error = k_fold_cross_validation(df_training, folds, trial_model)

        if trial_error < min_error:
            min_error, best_model, feature = trial_error, trial_model, candidate

    # No improvement this round: keep the model from the previous round.
    if min_error >= prev_error:
        print('END')
        break

    print('Selecting feature ' +
          df_training.columns[feature] +
          ' - error decreased to ' +
          str(min_error))
    model = best_model
    pending.remove(feature)

Selecting feature Sex_male - error decreased to 4.328105960328346
Selecting feature SibSp - error decreased to 4.229648993377826
Selecting feature CabinClass_E - error decreased to 4.183230617319234
Selecting feature Age - error decreased to 4.179806635986888
Selecting feature Pclass_3 - error decreased to 4.178897103832353
Selecting feature Embarked_S - error decreased to 4.156529428690836
Selecting feature TicketPrefix_STON/O2. - error decreased to 4.115563339919731
Selecting feature CabinClass_F - error decreased to 4.081894766206114
Selecting feature TicketPrefix_NUMBER - error decreased to 4.056363077241119
END


In [23]:
# Rebuild the matrices using only the columns kept by forward selection.
# Positions are resolved to names on the training frame and then looked up by
# name in the test frame, whose column positions differ.
model_forward = model
columns = df_training.columns[model_forward]
X_train, X_test = df_training[columns].values, df_test[columns].values
y_train = df_training['Survived'].values

In [24]:
# Fresh Gaussian Naive Bayes model for the forward-selected feature subset.
clf = GaussianNB()

In [25]:
# Fit on the whole training set, restricted to the forward-selected columns.
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [26]:
# y_test holds predictions for the test rows (not true labels).
y_test = clf.predict(X_test)

In [27]:
# Submission frame: PassengerId next to the predicted Survived label.
# assign() works on a copy, so df_test itself is left unchanged.
submission = df_test.assign(Survived=y_test)[['PassengerId', 'Survived']]

In [28]:
# Quick look at the first predictions before writing them out.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [29]:
# Write the forward-selection predictions without the row index.
output_path = './submissions/06_naive_bayes_forward_selection.csv'
submission.to_csv(output_path, index=False)

This submission produced a 76.55% accuracy result. This is much better than not using feature selection. 

## Feature selection - backward elimination

In [30]:
# Backward elimination: start from every feature column and, on every round,
# drop the single column whose removal yields the lowest cross-validated
# error. Stop when no removal beats the previous round's error.
model = list(range(2, len(df_training.columns)))
# Baseline error of the full model, computed from `model` itself so the
# starting set and its reference error cannot drift apart.
min_error = k_fold_cross_validation(df_training, folds, model)

# Stop once a single feature is left: removing it would ask the classifier to
# fit a matrix with zero columns, which scikit-learn rejects.
while len(model) > 1:
    prev_error = min_error
    min_error = sys.float_info.max

    for i in model:
        # Candidate model = current model without column i.
        new_model = [column for column in model if column != i]
        error = k_fold_cross_validation(df_training, folds, new_model)

        if error < min_error:
            min_error = error
            best_model = new_model
            feature = i

    if min_error < prev_error:
        print('Removing feature ' +
              df_training.columns[feature] +
              ' - error decreased to ' +
              str(min_error))
        model = best_model
    else:
        print('END')
        break

Removing feature TicketPrefix_S.C./A.4. - error decreased to 7.179626081507726
Removing feature TicketPrefix_S.P. - error decreased to 7.144529163175934
Removing feature TicketPrefix_W.E.P. - error decreased to 7.094707521696274
Removing feature TicketPrefix_S.O.P. - error decreased to 7.04522593114059
Removing feature TicketPrefix_A4. - error decreased to 7.016911684119533
Removing feature TicketPrefix_Fa - error decreased to 6.988740476417481
Removing feature TicketPrefix_S.C./PARIS - error decreased to 6.981245041238625
Removing feature TicketPrefix_WE/P - error decreased to 6.974277629000113
Removing feature TicketPrefix_P/PP - error decreased to 6.9673768951648585
END


In [31]:
# Rebuild the matrices using only the columns that survived backward
# elimination. Positions are resolved to names on the training frame and then
# looked up by name in the test frame, whose column positions differ.
model_backward = model
columns = df_training.columns[model_backward]
X_train, X_test = df_training[columns].values, df_test[columns].values
y_train = df_training['Survived'].values

In [32]:
# Fresh Gaussian Naive Bayes model for the backward-elimination feature subset.
clf = GaussianNB()

In [33]:
# Fit on the whole training set, restricted to the remaining columns.
clf.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
# y_test holds predictions for the test rows (not true labels).
y_test = clf.predict(X_test)

In [35]:
# Submission frame: PassengerId next to the predicted Survived label.
# assign() works on a copy, so df_test itself is left unchanged.
submission = df_test.assign(Survived=y_test)[['PassengerId', 'Survived']]

In [36]:
# Quick look at the first predictions before writing them out.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,1
1,893,1
2,894,1
3,895,1
4,896,1


In [37]:
# Write the backward-elimination predictions without the row index.
output_path = './submissions/06_naive_bayes_backward_elimination.csv'
submission.to_csv(output_path, index=False)

This submission produced 41.14% prediction accuracy.