In [209]:
# Imports needed for the script
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont

# Read the train and test splits from the local titanic-dt/ directory
train, test = (
    pd.read_csv('titanic-dt/train.csv'),
    pd.read_csv('titanic-dt/test.csv'),
)

# Keep a handle on the test-set passenger IDs for easy access later
PassengerId = test['PassengerId']

# Preview the first three training rows
train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [210]:
# List the column labels of the training frame
train.columns

Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age',
       u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'],
      dtype='object')

In [211]:
# Snapshot the training frame so a copy of its current state survives the
# in-place edits made to `train` in the cells below.
original_train = train.copy()
# Holds references to the same two frames (not copies).
full_data = [train, test]

In [212]:
# Count missing values per column in the training set.
# Nothing is removed here; the following cells fill the gaps.
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [213]:
# Count missing values per column in the test set
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [214]:
# Fill the missing training Embarked values with "S"
# NOTE(review): presumably "S" is the most frequent port - confirm.
train["Embarked"] = train["Embarked"].fillna("S")

# Re-check the per-column missing counts
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [215]:
# Fill any missing training fares with the median training fare.
# The null count above reports no missing Fare in train, so on this data
# the fill changes nothing.
train["Fare"] = train["Fare"].fillna(train["Fare"].median())

# Re-check the per-column missing counts
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [216]:
# Impute missing training ages with random whole-number ages drawn uniformly
# from [mean - std, mean + std) of the observed ages (upper bound exclusive).
mean = int(train.Age.mean())
std = int(train.Age.std())

# Fixed seed so that a fresh Restart & Run All reproduces the same imputed
# ages (and therefore the same downstream scores).
age_rng = np.random.RandomState(0)

# A single vectorised draw covers every missing row; this replaces a per-row
# iterrows() loop that assigned a one-element array into each scalar cell.
age_missing = train["Age"].isnull()
train.loc[age_missing, "Age"] = age_rng.randint(
    mean - std, mean + std, size=int(age_missing.sum())
)

# Re-check the per-column missing counts
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [217]:
# Impute missing test ages with random whole-number ages drawn uniformly
# from [mean - std, mean + std) of the observed test ages (upper bound
# exclusive). The statistics come from the test frame itself.
mean = int(test.Age.mean())
std = int(test.Age.std())

# Fixed seed so that a fresh Restart & Run All reproduces the same imputed
# ages.
age_rng = np.random.RandomState(0)

# A single vectorised draw covers every missing row; this replaces a per-row
# iterrows() loop that assigned a one-element array into each scalar cell.
age_missing = test["Age"].isnull()
test.loc[age_missing, "Age"] = age_rng.randint(
    mean - std, mean + std, size=int(age_missing.sum())
)

# Re-check the per-column missing counts
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [218]:
# Fill the missing test fare with the median test fare
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Re-check the per-column missing counts
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [219]:
# Encode the categorical columns as integers so the decision tree can use them:
#   Embarked: C -> 1, Q -> 2, S -> 3
#   Sex:      male -> 1, female -> 2
# Values outside a mapping are left unchanged by replace().
embarked_codes = {"C": 1, "Q": 2, "S": 3}
sex_codes = {"male": 1, "female": 2}

# Apply the same mapping to both frames in one place. The earlier version
# repeated each statement per frame and then re-assigned every column to
# itself, which had no effect.
for frame in (train, test):
    frame["Embarked"] = frame["Embarked"].replace(embarked_codes)
    frame["Sex"] = frame["Sex"].replace(sex_codes)

In [220]:
# Choose the tree depth by 10-fold cross-validation on the training set:
# for each candidate max_depth, fit on nine folds, score accuracy on the
# held-out fold, and average the ten scores.
feature_cols = ["Age", "SibSp", "Parch", "Pclass", "Fare", "Embarked", "Sex"]

cv = KFold(n_splits=10)
accuracies = list()
# NOTE(review): this counts every column of `test` (IDs, names, tickets, ...),
# not just the entries of feature_cols, so the sweep goes deeper than the
# number of features actually used. Left as-is to keep the sweep unchanged;
# confirm whether len(feature_cols) was intended.
max_attributes = len(list(test))
depth_range = range(1, max_attributes + 1)
for depth in depth_range:
    fold_accuracy = []
    # Fixed random_state: the tree permutes features when searching splits,
    # so equally good splits would otherwise be broken differently per run.
    tree_model = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    for train_fold, valid_fold in cv.split(train):
        # KFold yields positional row indices, so select with .iloc; .loc
        # would only be right while `train` keeps its default 0..n-1 index.
        f_train = train.iloc[train_fold]
        f_valid = train.iloc[valid_fold]
        model = tree_model.fit(X=f_train[feature_cols], y=f_train["Survived"])
        valid_acc = model.score(X=f_valid[feature_cols], y=f_valid["Survived"])
        fold_accuracy.append(valid_acc)
    avg = sum(fold_accuracy)/len(fold_accuracy)
    accuracies.append(avg)

# Tabulate mean accuracy per depth, with the columns in a fixed order
df = pd.DataFrame(
    {"Max Depth": depth_range, "Average Accuracy": accuracies},
    columns=["Max Depth", "Average Accuracy"],
)
print(df.to_string(index=False))

Max Depth  Average Accuracy
        1          0.786729
        2          0.766654
        3          0.815943
        4          0.796966
        5          0.804794
        6          0.802572
        7          0.807004
        8          0.795830
        9          0.807004
       10          0.800287
       11          0.800287
