In [84]:
import pandas as pd
import os


DATA_FILEPATH = "data/titanic.csv"

def fetch_data():
    """Load the Titanic CSV into a pandas DataFrame.

    The file location is built by joining the absolute path of the current
    working directory with the module-level ``DATA_FILEPATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    working_dir = os.path.abspath('')
    csv_path = os.path.join(working_dir, DATA_FILEPATH)
    return pd.read_csv(csv_path)

# df stands for dataframe. This is the object that we will manipulate throughout the notebook
titanicdf = fetch_data()
titanicdf

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


# Take the Titanic dataset and use all attributes to predict the class ‘Survived’ with:
* Decision tree
* KNN
* Naive Bayes 

# Determine:
* Accuracy of the classifiers with 5-fold CV
* Calculate their precision, recall and F1-score

First let's do some pre-processing

In [85]:
from sklearn import preprocessing

# Start again from the raw CSV so that re-running this cell is safe: the steps
# of this cell (dropping "Name", the // 10 bucketing and the age grouping
# further down) are not idempotent when applied to an already-processed frame.
titanicdf = fetch_data()

# Drop the names (rebinding instead of mutating in place)
titanicdf = titanicdf.drop(columns=["Name"])

# The sex also needs to be encoded
# creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
titanicdf["Sex"] = le.fit_transform(titanicdf["Sex"])

# Age, fare -> class
titanicdf["Fare"] = titanicdf["Fare"] // 10  # Group by range of 10

def group_age(age):
    """Map an age to an integer age-group code.

    Integer codes are returned directly so that no label encoding is needed
    afterwards:
        0 -> child        (age < 18)
        1 -> young adult  (18 <= age < 30)
        2 -> adult        (30 <= age < 60)
        3 -> senior       (everything else)

    Note: a NaN age compares False against every bound and therefore ends up
    in group 3.
    """
    upper_bounds = (18, 30, 60)  # exclusive upper limit of groups 0, 1 and 2
    for code, bound in enumerate(upper_bounds):
        if age < bound:
            return code
    return 3  # Senior

# Replace every age with its group code (0-3) as returned by group_age
titanicdf['Age'] = titanicdf['Age'].apply(group_age)

titanicdf

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,1,1,1,0,0.0
1,1,1,0,2,1,0,7.0
2,1,3,0,1,0,0,0.0
3,1,1,0,2,1,0,5.0
4,0,3,1,2,0,0,0.0
...,...,...,...,...,...,...,...
882,0,2,1,1,0,0,1.0
883,1,1,0,1,0,0,3.0
884,0,3,0,0,1,2,2.0
885,1,1,1,1,0,0,3.0


Prepare a function to get metrics from a model, and a function to perform k-fold cross-validation on a model

In [86]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_acc_metrics(Model, xTest, yTest):
    '''
    Score an already fitted model on a test set.

    The model predicts on xTest and the predictions are compared with yTest
    using scikit-learn's scoring functions with their default arguments.

    Returns:
        - Metrics as [accuracy, precision, recall, f1]
    '''
    predictions = Model.predict(xTest)

    # Order matters: callers rely on [accuracy, precision, recall, f1]
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return [scorer(yTest, predictions) for scorer in scorers]

In [87]:
from sklearn.model_selection import KFold


def k_fold(model, x, y, k):
    '''
    Evaluate the model on the [x, y] dataset with k-fold cross-validation.

    The folds come from a KFold splitter created with n_splits=k. For each
    fold the model is re-fitted on the training rows and scored on the
    held-out rows with get_acc_metrics. x and y are indexed positionally
    with .iloc.

    Returns:
        - Metrics averaged over the k folds, as [accuracy, precision, recall, f1]
    '''
    splitter = KFold(n_splits=k)

    # One [acc, prec, recall, f1] row per fold
    fold_scores = []
    for train_idx, test_idx in splitter.split(x):
        # Fit on the training part of the fold
        model.fit(x.iloc[train_idx, :], y.iloc[train_idx])

        # Score on the held-out part of the fold
        fold_scores.append(
            get_acc_metrics(model, x.iloc[test_idx, :], y.iloc[test_idx])
        )

    # Transpose rows -> one series per metric, then average each over the folds
    return [sum(series) / k for series in zip(*fold_scores)]

Prepare dataset, parameters in common

In [88]:
# Features: every column except the target; target: the 'Survived' column
X = titanicdf.drop(columns=['Survived'])
y = titanicdf["Survived"]
k = 5  # number of folds shared by all classifiers below

# Decision tree

In [89]:
from sklearn import tree

# Seed the tree so the reported scores are reproducible from run to run:
# scikit-learn randomly permutes the candidate features at each split, so an
# unseeded tree can give slightly different metrics every time the cell runs.
treeClf = tree.DecisionTreeClassifier(random_state=0)

print(f"Performing {k}-fold for Decision tree")

# k_fold returns the fold-averaged [accuracy, precision, recall, f1]
metrics = k_fold(treeClf, X, y, k)

acc, prec, recall, f1 = metrics

print(f"Accuracy {100*acc:.2f}%, Precision {100*prec:.2f}%, Recall {100*recall:.2f}%, F1-score {100*f1:.2f}%,")

Performing 5-fold for Decision tree
Accuracy 79.26%, Precision 74.94%, Recall 69.69%, F1-score 72.01%,


# KNN

In [90]:
from sklearn.neighbors import KNeighborsClassifier

# Distance metric handed to the nearest-neighbour classifier
distance = "cosine"
kClf = KNeighborsClassifier(metric=distance)

print(f"Performing {k}-fold for KNN with distance {distance}")

# k_fold returns the fold-averaged [accuracy, precision, recall, f1]
metrics = k_fold(kClf, X, y, k)
acc, prec, recall, f1 = metrics

print(f"Accuracy {100*acc:.2f}%, Precision {100*prec:.2f}%, Recall {100*recall:.2f}%, F1-score {100*f1:.2f}%,")

Performing 5-fold for KNN with distance cosine
Accuracy 78.69%, Precision 72.50%, Recall 72.22%, F1-score 72.26%,




Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3,1,1,1,0,0.0
1,1,0,2,1,0,7.0
2,3,0,1,0,0,0.0
3,1,0,2,1,0,5.0
4,3,1,2,0,0,0.0
...,...,...,...,...,...,...
882,2,1,1,0,0,1.0
883,1,0,1,0,0,3.0
884,3,0,0,1,2,2.0
885,1,1,1,0,0,3.0


# Naive Bayes

In [91]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with its default settings
gnbClf = GaussianNB()

print(f"Performing {k}-fold for Bayesian")

# k_fold returns the fold-averaged [accuracy, precision, recall, f1]
metrics = k_fold(gnbClf, X, y, k)
acc, prec, recall, f1 = metrics

print(f"Accuracy {100*acc:.2f}%, Precision {100*prec:.2f}%, Recall {100*recall:.2f}%, F1-score {100*f1:.2f}%,")

Performing 5-fold for Bayesian
Accuracy 78.13%, Precision 72.60%, Recall 69.60%, F1-score 70.90%,


We achieve an overall accuracy of 79%, 78.7% and 78.1% (decision tree, KNN and naive Bayes respectively) with 5-fold cross-validation over all the data. On the regular (holdout) series, the respective accuracies were 82%, 78% and 77.5%. The 5-fold seems to be a bit more efficient on accuracy overall, but we could run a paired t-test to verify this assumption :). Note that the accuracies reported with the 5-fold are much closer to each other and more regular across the algorithms than the holdout ones. This could indicate a better stability of the testing phase.\
\
Each algorithm differs in terms of its precision/recall split:
* 72.6/69.6 for naive Bayes
* 72.5/72.2 for KNN
* 74.94/69.69 for the decision tree

Lastly, the F1-scores are respectively 72.01%, 72.26% and 70.90% (decision tree, KNN and naive Bayes). KNN seems to be the best one, but all three classifiers are very, very close together. KNN could also be enhanced by selecting the hyper-parameter k with the inner/outer (nested) cross-validation seen in class, which could help improve its accuracy further.