In [719]:
import pandas as pd
import os


DATA_FILEPATH = "data/titanic.csv"

def fetch_data():
    """Read the CSV at DATA_FILEPATH into a pandas DataFrame.

    DATA_FILEPATH is resolved against the current working directory
    (``os.path.abspath('')``), so the notebook must be run from the folder
    that contains the ``data`` directory.
    """
    working_dir = os.path.abspath('')
    csv_path = os.path.join(working_dir, DATA_FILEPATH)
    return pd.read_csv(csv_path)

# "df" stands for DataFrame. titanicdf is the object that we will manipulate throughout the notebook.
titanicdf = fetch_data()
titanicdf

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


# 1.  Take the titanic dataset and use all attributes to predict the class ‘Survived’

* (a)  Choose Three classifiers and evaluate their performance using all attributes
* (b)  Define a feature selection method and use it on all the classifiers
* (c)  Compare the classifiers and explain the differences observed

First let's do some pre-processing

In [720]:
from sklearn import preprocessing

# The passenger name is not used as a feature: remove the column (in place).
titanicdf.drop(columns=["Name"], inplace=True)

# "Sex" holds string labels, which must be turned into integer codes.
# In the displayed result below: female -> 0, male -> 1.
le = preprocessing.LabelEncoder()
titanicdf["Sex"] = le.fit_transform(titanicdf["Sex"])

# Age, Fare -> classes.
# Fare is bucketed into ranges of width 10 by floor division (the column stays float).
titanicdf["Fare"] = titanicdf["Fare"].floordiv(10)

def group_age(age):
    """Map an age to an ordinal age-group code.

    Integer codes are returned directly so that no label encoder is needed
    afterwards:

        0 -> child        (age < 18)
        1 -> young adult  (18 <= age < 30)
        2 -> adult        (30 <= age < 60)
        3 -> senior       (everything else)

    A value that compares below none of the bounds (e.g. NaN) also ends up
    in group 3.
    """
    # Exclusive upper bounds of groups 0, 1 and 2, in ascending order.
    upper_bounds = (18, 30, 60)
    for code, bound in enumerate(upper_bounds):
        if age < bound:
            return code
    return len(upper_bounds)  # Senior

# Replace every age by its age-group code (see group_age)
titanicdf['Age'] = titanicdf['Age'].map(group_age)

titanicdf

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,1,1,1,0,0.0
1,1,1,0,2,1,0,7.0
2,1,3,0,1,0,0,0.0
3,1,1,0,2,1,0,5.0
4,0,3,1,2,0,0,0.0
...,...,...,...,...,...,...,...
882,0,2,1,1,0,0,1.0
883,1,1,0,1,0,0,3.0
884,0,3,0,0,1,2,2.0
885,1,1,1,1,0,0,3.0


Prepare function to get metrics from a model, compare models, and perform feature selection

In [721]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_acc_metrics(Model, xTest: pd.DataFrame, yTest: pd.Series):
    '''
    Score an already fitted model on a test set.

    Model : fitted estimator exposing ``predict``.
    xTest : features of the test set.
    yTest : true labels of the test set.

    Returns the list [accuracy, precision, recall, f1-score], in that order,
    each computed with the scikit-learn scorer called with its default arguments.
    '''
    predictions = Model.predict(xTest)

    # Order matters: callers index the result positionally.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return [scorer(yTest, predictions) for scorer in scorers]

In [722]:
from sklearn.model_selection import train_test_split
from sklearn.base import clone

def compare_models(model, x1 : pd.DataFrame, y1 : pd.Series, x2: pd.DataFrame, y2 : pd.Series) -> None:
    '''
    Train a fresh clone of `model` on [x1,y1] and another on [x2,y2], then print
    the test metrics of both and the gain of the second over the first.

    Each dataset is split 70/30 (stratified on its labels, fixed seed 42); the
    metrics come from get_acc_metrics on the 30% test part.
    [x1,y1] is reported as "original" and [x2,y2] as "selected".
    Nothing is returned: the results are only printed.
    '''
    print("Performing model analysis")

    metrics = []
    labelledDatasets = zip(("original", "selected"), ((x1, y1), (x2, y2)))
    for passName, (features, target) in labelledDatasets:
        # Unfitted copy, so the two passes never share fitted state
        candidate = clone(model)

        # Stratified hold-out split with a fixed seed
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.3, random_state=42, stratify=target
        )

        candidate.fit(X_train, y_train)

        modelMetrics = get_acc_metrics(candidate, X_test, y_test)
        print(f"Score on {passName} data : \n\tAccuracy = {100*modelMetrics[0]:.2f}%\n\tPrecision = {modelMetrics[1]:.2f}\n\tRecall = {modelMetrics[2]:.2f}\n\tF1 = {modelMetrics[3]:.2f}")

        # Kept for the final comparison
        metrics.append(modelMetrics)

    # Element-wise "second dataset minus first dataset" (accuracy, precision, recall, f1)
    diff = [after - before for before, after in zip(metrics[0], metrics[1])]
    print(f"Gain on model by using the second dataset : \n\tAccuracy = {100*diff[0]:.2f}\n\tPrecision = {diff[1]:.2f}\n\tRecall = {diff[2]:.2f}\n\tF1 = {diff[3]:.2f}")


## PCA comparison

In [723]:
from sklearn.decomposition import PCA

def comparePCA(model, x: pd.DataFrame, y: pd.Series, n: int):
    """Compare `model` on the features `x` against `x` projected on `n` principal components.

    The comparison itself (training, metrics, printing) is delegated to
    compare_models; nothing is returned.

    NOTE(review): the PCA is fitted on every row of `x`, i.e. before
    compare_models performs its train/test split, so the projection has
    already seen the rows that end up in the test set.
    """
    reducer = PCA(n_components=n)
    reducer.fit(x)
    xProjected = reducer.transform(x)

    compare_models(model, x, y, xProjected, y)

## RFE

In [724]:
from sklearn.feature_selection import RFE

def compareRFE(model, x: pd.DataFrame, y: pd.Series, n: int):
    """Compare `model` on all features of `x` against the `n` features kept by RFE.

    RFE is run with a clone of `model` as its estimator. The feature names,
    their ranking and the selected features are printed, then the comparison
    is delegated to compare_models; nothing is returned.

    NOTE(review): RFE is fitted on every row of `x`/`y`, i.e. before
    compare_models performs its train/test split, so the selection has
    already seen the rows that end up in the test set.
    """
    rfeFitted = RFE(clone(model), n_features_to_select=n).fit(x, y)
    print(f"Features: {rfeFitted.feature_names_in_}.\nRanking : {rfeFitted.ranking_}.\nSelected features : {rfeFitted.feature_names_in_[rfeFitted.get_support(True)]}")

    # Keep only the selected columns
    compare_models(model, x, y, rfeFitted.transform(x), y)

## Chi2

In [725]:
from sklearn.feature_selection import SelectKBest, chi2

def compareChi2(model, x: pd.DataFrame, y: pd.Series, k: int):
    """Compare `model` on all features of `x` against the `k` best features according to chi2.

    The feature names and the selected features are printed, then the
    comparison is delegated to compare_models; nothing is returned.

    NOTE(review): the selector is fitted on every row of `x`/`y`, i.e. before
    compare_models performs its train/test split, so the selection has
    already seen the rows that end up in the test set.
    """
    selectorFitted = SelectKBest(score_func=chi2, k=k).fit(x, y)
    print(f"Features: {selectorFitted.feature_names_in_}.\nSelected features : {selectorFitted.feature_names_in_[selectorFitted.get_support(True)]}")

    # Keep only the selected columns
    compare_models(model, x, y, selectorFitted.transform(x), y)

Prepare dataset, parameters in common

In [726]:
# Features: every column except the target (titanicdf itself is left untouched)
X = titanicdf.drop(columns=['Survived'])
# Target: the class to predict
y = titanicdf["Survived"]
# Number of features / components kept by every selection method below
features_to_select = 3

# Decision tree

In [727]:
from sklearn import tree
# Decision tree on all features vs. on `features_to_select` PCA components.
# NOTE(review): no random_state is passed to the tree, so the scores may vary slightly between runs - confirm.
comparePCA(tree.DecisionTreeClassifier(), X, y, features_to_select)

Performing model analysis
Score on original data : 
	Accuracy = 78.65%
	Precision = 0.79
	Recall = 0.61
	F1 = 0.69
Score on selected data : 
	Accuracy = 72.66%
	Precision = 0.70
	Recall = 0.51
	F1 = 0.59
Gain on model by using the second dataset : 
	Accuracy = -5.99
	Precision = -0.09
	Recall = -0.10
	F1 = -0.10


Going from 6 features to 3 PCA components costs ~6% of accuracy overall, which starts to be a significant loss. Every metric loses value (~0.1 each), which suggests that this dataset (and/or algorithm) is not very receptive to PCA.\
It could also very well be that 6->3 is too strong a reduction, or that the features don't combine very well.

In [728]:
# Decision tree on all features vs. on the `features_to_select` features kept by RFE.
compareRFE(tree.DecisionTreeClassifier(), X, y, features_to_select)

Features: ['Pclass' 'Sex' 'Age' 'Siblings/Spouses Aboard' 'Parents/Children Aboard'
 'Fare'].
Ranking : [1 1 2 4 3 1].
Selected features : ['Pclass' 'Sex' 'Fare']
Performing model analysis
Score on original data : 
	Accuracy = 78.65%
	Precision = 0.79
	Recall = 0.61
	F1 = 0.69
Score on selected data : 
	Accuracy = 77.90%
	Precision = 0.82
	Recall = 0.54
	F1 = 0.65
Gain on model by using the second dataset : 
	Accuracy = -0.75
	Precision = 0.04
	Recall = -0.07
	F1 = -0.03


RFE has fairly good results: it loses only ~1% accuracy while successfully reducing the number of features from 6 to 3.\
The recall and F1-score are lowered by 0.07 and 0.03, but the precision is actually increased, which could be a very good result depending on the constraints.

In [729]:
# Decision tree on all features vs. on the `features_to_select` best features according to chi2.
compareChi2(tree.DecisionTreeClassifier(), X, y, features_to_select)

Features: ['Pclass' 'Sex' 'Age' 'Siblings/Spouses Aboard' 'Parents/Children Aboard'
 'Fare'].
Selected features : ['Pclass' 'Sex' 'Fare']
Performing model analysis
Score on original data : 
	Accuracy = 78.65%
	Precision = 0.79
	Recall = 0.61
	F1 = 0.69
Score on selected data : 
	Accuracy = 77.90%
	Precision = 0.82
	Recall = 0.54
	F1 = 0.65
Gain on model by using the second dataset : 
	Accuracy = -0.75
	Precision = 0.04
	Recall = -0.07
	F1 = -0.03


Similarly, the decision tree using chi2 feature selection goes from 6 to 3 features with only -0.75% accuracy; chi2 selects the same three features as RFE (Pclass, Sex, Fare), so the scores are the same. Once again, the precision goes up a little and the recall and F1-score go down by the same amounts. Losing 0.75% accuracy for 3 fewer features indicates that these features were not that important for the algorithm, and this is a very beneficial trade-off (it helps reduce complexity at a very low cost and could greatly reduce the computation time when using k-folds, etc.).

# KNN

In [730]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

# Min-max scale the features before using them with KNN
XNormalizedArray = preprocessing.MinMaxScaler().fit_transform(X)
# Wrap the scaled values back into a DataFrame (same index and column names as X)
# so that it can be fed to the compare* functions
XNormalizedDf = pd.DataFrame(data=XNormalizedArray, columns=X.columns, index=X.index)


# KNN (cosine distance) on all scaled features vs. on `features_to_select` PCA components
comparePCA(model=KNeighborsClassifier(metric="cosine"), x=XNormalizedDf, y=y, n=features_to_select)

Performing model analysis
Score on original data : 
	Accuracy = 79.40%
	Precision = 0.77
	Recall = 0.66
	F1 = 0.71
Score on selected data : 
	Accuracy = 79.78%
	Precision = 0.78
	Recall = 0.66
	F1 = 0.72
Gain on model by using the second dataset : 
	Accuracy = 0.37
	Precision = 0.01
	Recall = 0.00
	F1 = 0.00




The PCA achieves very good results! It gains +0.37% accuracy, with almost the same precision/recall using 3 components as with the 6 original features.\
__The normalization of the features probably helps the PCA when merging them!__\
Normalization is a very important step for KNN if the features have widely different scales, and it could very well explain why the features can be merged so well. Maybe the normalization step should always be considered for better results when using PCA.

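RFE cannot be performed with KNN, because KNN does not expose anything about feature importance (no `coef_` or `feature_importances_`), which RFE needs to rank the features (an SVM with a linear kernel could be used, for example) -> go directly to chi2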

In [731]:
# KNN (cosine distance) on all scaled features vs. on the `features_to_select` best features according to chi2.
compareChi2(KNeighborsClassifier(metric="cosine"), XNormalizedDf, y, features_to_select)

Features: ['Pclass' 'Sex' 'Age' 'Siblings/Spouses Aboard' 'Parents/Children Aboard'
 'Fare'].
Selected features : ['Pclass' 'Sex' 'Fare']
Performing model analysis
Score on original data : 
	Accuracy = 79.40%
	Precision = 0.77
	Recall = 0.66
	F1 = 0.71
Score on selected data : 
	Accuracy = 77.53%
	Precision = 0.79
	Recall = 0.56
	F1 = 0.66
Gain on model by using the second dataset : 
	Accuracy = -1.87
	Precision = 0.02
	Recall = -0.10
	F1 = -0.05




On KNN, chi2 is a bit worse than PCA (~-2% accuracy, marginally higher precision, recall lowered by 0.10). It is still a very good trade-off for a 6->3 feature reduction.

# Naive Bayes

In [732]:
from sklearn.naive_bayes import GaussianNB

# NOTE(review): gnbClf is not referenced in the cells visible here; the calls below each build their own GaussianNB().
gnbClf = GaussianNB()

In [733]:
# Gaussian naive Bayes on all features vs. on `features_to_select` PCA components.
comparePCA(GaussianNB(), X, y, features_to_select)

Performing model analysis
Score on original data : 
	Accuracy = 78.28%
	Precision = 0.73
	Recall = 0.69
	F1 = 0.71
Score on selected data : 
	Accuracy = 67.42%
	Precision = 0.67
	Recall = 0.30
	F1 = 0.42
Gain on model by using the second dataset : 
	Accuracy = -10.86
	Precision = -0.06
	Recall = -0.39
	F1 = -0.29


Naive Bayes with PCA loses ~11% accuracy when going from 6 features to 3 components, which is a lot; the recall is hit the hardest (-0.39).

In [734]:
# Gaussian naive Bayes on all features vs. on the `features_to_select` best features according to chi2.
compareChi2(GaussianNB(), X, y, features_to_select)

Features: ['Pclass' 'Sex' 'Age' 'Siblings/Spouses Aboard' 'Parents/Children Aboard'
 'Fare'].
Selected features : ['Pclass' 'Sex' 'Fare']
Performing model analysis
Score on original data : 
	Accuracy = 78.28%
	Precision = 0.73
	Recall = 0.69
	F1 = 0.71
Score on selected data : 
	Accuracy = 76.40%
	Precision = 0.69
	Recall = 0.70
	F1 = 0.70
Gain on model by using the second dataset : 
	Accuracy = -1.87
	Precision = -0.04
	Recall = 0.01
	F1 = -0.01


The chi2 feature selection allows going from 6 to 3 features with a loss of only ~1.9% accuracy, which is still an acceptable trade-off for the simplicity.\
Curiously, while the decision tree had better precision and worse recall with feature selection, the opposite is true with naive Bayes: the precision goes down while the recall seems to be a bit better.\
\
The number of features has been set to 3. This is actually a hyperparameter, and adjusting it impacts the performance of each selection algorithm a lot, so these results are to be taken with a grain of salt.