In [67]:
import pandas as pd
import os


DATA_FILEPATH = "data/titanic.csv"

def fetch_data():
    """Import the data from csv to pd dataframe"""
    relaviteFilepath = os.path.join(os.path.abspath(''), DATA_FILEPATH)
    return pd.read_csv(relaviteFilepath)

# df stands for dataframe. This is the object that we will manipulate throughouht the notebook
titanicdf = fetch_data()
titanicdf

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


# Take the titanic dataset and use all attributes to predict the class‘Survived’ with a Decision tree classifier.  

* convert age and fare into classes
* exclude names from the attribute list

## (a)  Find the best tree depth for the model

First let's do some pre-processing

In [68]:
from sklearn import preprocessing

#Drop the names
titanicdf.drop(columns=["Name"], axis=1, inplace=True)

#The sex also needs to be encoded
#creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
titanicdf["Sex"] = le.fit_transform(titanicdf["Sex"])

#Age, fare -> class
titanicdf["Fare"] = titanicdf["Fare"] // 10 #Group by range of 10

def group_age(age):
    """Directly use numbers as we would need to apply label encoder"""
    if age < 18:
        return 0 #Child
    if age < 30:
        return 1 #Yound adult
    if age < 60:
        return 2 #Adult
    return 3 #Senior

#group ages
titanicdf['Age'] = titanicdf['Age'].apply(group_age)

titanicdf

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,1,1,1,0,0.0
1,1,1,0,2,1,0,7.0
2,1,3,0,1,0,0,0.0
3,1,1,0,2,1,0,5.0
4,0,3,1,2,0,0,0.0
...,...,...,...,...,...,...,...
882,0,2,1,1,0,0,1.0
883,1,1,0,1,0,0,3.0
884,0,3,0,0,1,2,2.0
885,1,1,1,1,0,0,3.0


Apply the tree with gridsearch to determine the best depth

In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree

X,y = titanicdf.drop(columns=['Survived'], axis=1, inplace=False), titanicdf["Survived"]

treeClf = tree.DecisionTreeClassifier()

#GridSearch will apply cross-validation and test every possible parameter to determine the best combination. The max depth is our main target.
Depths = range(1,15)
parameters = {
    "criterion": ["gini", "entropy"],
    "max_depth": Depths,
}
gridTreeClf = GridSearchCV(treeClf, parameters, n_jobs=-1, return_train_score=True)
 
gridTreeClf.fit(X,y)

print(f"Best parameter found : {gridTreeClf.best_params_}, score: {round(100*gridTreeClf.best_score_, 1)}%")

print(f"Mean test/train scored :")
for i in Depths:
 print(f"Max depth {i}, test={round(100*gridTreeClf.cv_results_['mean_test_score'][i], 1)}%, train={round(100*gridTreeClf.cv_results_['mean_train_score'][i], 1)}%")

Best parameter found : {'criterion': 'gini', 'max_depth': 6}, score: 82.0%
Mean test/train scored :
Max depth 1, test=77.2%, train=78.9%
Max depth 2, test=80.7%, train=81.3%
Max depth 3, test=80.2%, train=82.5%
Max depth 4, test=80.7%, train=84.0%
Max depth 5, test=82.0%, train=85.3%
Max depth 6, test=81.5%, train=85.8%
Max depth 7, test=79.5%, train=86.9%
Max depth 8, test=79.8%, train=87.5%
Max depth 9, test=79.9%, train=87.6%
Max depth 10, test=79.4%, train=87.9%
Max depth 11, test=79.8%, train=88.0%
Max depth 12, test=79.7%, train=88.0%
Max depth 13, test=79.8%, train=88.1%
Max depth 14, test=78.6%, train=78.6%


The best criterion is the gini with max depth at 6. We see that this particular pre-pruning strategy yields better reslults on a set from 1 to 15. It is to note that too much pre-pruning (with the extrema of 1) is detrimental, but still has a steady accuracy score during test. Doing less and less pre-pruning yields way better training accuracy, but the testing score tends to be reduced. This confirms the fact that although pre-pruning does stop an increase in the training phase, it prevents overfitting and helps during the predictions.