In [2]:
import pandas as pd

In [3]:
# Read the training CSV (path is relative to the notebook's working directory).
titanic = pd.read_csv(filepath_or_buffer="../data/train.csv")

In [4]:
# Remove the Ticket and Cabin columns; the rest of the frame is kept as-is.
titanic = titanic.drop(columns=["Ticket", "Cabin"])

In [5]:
# Keep only complete rows: any row with at least one missing value is dropped.
titanic = titanic.dropna(axis=0, how="any")

In [6]:
# Survival counts by passenger class, with "All" margins on both axes.
Pclas_pct = pd.crosstab(
    index=titanic["Pclass"].astype("category"),
    columns=titanic["Survived"].astype("category"),
    margins=True,
)
# Survival rate per row, stored as a fraction: survivors / (non-survivors + survivors).
Pclas_pct["Percent"] = Pclas_pct[1].div(Pclas_pct[0].add(Pclas_pct[1]))

# Same table and survival rate, broken down by sex instead of class.
Sex_pct = pd.crosstab(
    index=titanic["Sex"].astype("category"),
    columns=titanic["Survived"].astype("category"),
    margins=True,
)
Sex_pct["Percent"] = Sex_pct[1].div(Sex_pct[0].add(Sex_pct[1]))

In [7]:
# Display the class-by-survival crosstab, including its "All" margins and the
# "Percent" column (a survival fraction, not a 0-100 percentage).
Pclas_pct

Survived,0,1,All,Percent
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,64,120,184,0.652174
2,90,83,173,0.479769
3,270,85,355,0.239437
All,424,288,712,0.404494


In [8]:
# Display the sex-by-survival crosstab, including its "All" margins and the
# "Percent" column (a survival fraction, not a 0-100 percentage).
Sex_pct

Survived,0,1,All,Percent
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,64,195,259,0.752896
male,360,93,453,0.205298
All,424,288,712,0.404494


In [9]:
# One-hot encode `Sex` into one indicator column per distinct value.
# dtype=int pins the indicators to 0/1 integers; newer pandas releases default
# to boolean dummies, which would render as True/False instead of 0/1.
sex_dummies = pd.get_dummies(titanic["Sex"], dtype=int)

# Drop indicator columns left over from an earlier run of this cell before
# concatenating. Without this, re-running the cell appends duplicate columns
# and a later `titanic[[..., "female"]]` selection returns two `female` columns.
titanic = pd.concat(
    [titanic.drop(columns=sex_dummies.columns, errors="ignore"), sex_dummies],
    axis=1,
)

In [10]:
# Preview the first five rows to confirm the indicator columns were appended.
titanic.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,female,male
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0,1


In [11]:
# Feature matrix: passenger class, age, and the `female` indicator column.
X = titanic.loc[:, ["Pclass", "Age", "female"]]
# Target vector: the `Survived` column.
Y = titanic.Survived

In [12]:
import sklearn.model_selection as ms

In [13]:
# Hold out 30% of the rows for evaluation; the seed keeps the split repeatable.
XTrain, XTest, YTrain, YTest = ms.train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.3,
)

In [14]:
import numpy as np

In [15]:
# Candidate tree depths for the grid search: 2, 3, ..., 10.
depth_val = np.arange(2, 11, 1)
# Candidate minimum leaf sizes: 1, 10, 19, 28 (steps of 9, stopping before 31).
leaf_val = np.arange(1, 31, 9)

In [16]:
from sklearn import tree

In [17]:
# Parameter grid for the search: every (max_depth, min_samples_leaf) pair is tried.
grid_s = [
    dict(max_depth=depth_val, min_samples_leaf=leaf_val),
]

In [18]:
# Base estimator handed to the grid search: a decision tree that splits on
# information gain (entropy).
# random_state is fixed because scikit-learn permutes features at every split
# and breaks ties between equally good splits at random; without a seed the
# cross-validation scores, and so the selected hyper-parameters, can change
# between runs. 42 matches the seed used for the train/test split.
model = tree.DecisionTreeClassifier(criterion="entropy", random_state=42)

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Exhaustive search over `grid_s`, scored with 10 consecutive (unshuffled) folds.
ten_fold = ms.KFold(10)
cv_tree = GridSearchCV(model, grid_s, cv=ten_fold)

In [21]:
# Run the grid search on the training split only; the test split stays unseen.
cv_tree.fit(X=XTrain, y=YTrain)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
             estimator=DecisionTreeClassifier(criterion='entropy'),
             param_grid=[{'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10]),
                          'min_samples_leaf': array([ 1, 10, 19, 28])}])

In [22]:
# Pull the winning hyper-parameter values out of the fitted search.
best_depth, best_min_samples = (
    cv_tree.best_params_[param_name]
    for param_name in ("max_depth", "min_samples_leaf")
)

In [23]:
# Final estimator: an entropy-based tree configured with the hyper-parameters
# chosen by the grid search.
# random_state is fixed because scikit-learn breaks ties between equally good
# splits at random; without a seed the fitted tree, and so the reported
# confusion matrix and accuracy, can differ between runs. 42 matches the seed
# used for the train/test split.
model = tree.DecisionTreeClassifier(
    criterion="entropy",
    max_depth=best_depth,
    min_samples_leaf=best_min_samples,
    random_state=42,
)

In [24]:
# Fit the tuned tree on the training split and keep the fitted estimator.
TitanicTree = model.fit(X=XTrain, y=YTrain)

In [25]:
# Hard class labels for the test split.
survive_pred = TitanicTree.predict(X=XTest)
# Per-class probabilities for the same test rows.
survive_proba = TitanicTree.predict_proba(X=XTest)

In [26]:
from sklearn import metrics

In [27]:
# Rows are the true classes and columns are the predicted classes.
metrics.confusion_matrix(y_true=YTest, y_pred=survive_pred)

array([[113,   9],
       [ 46,  46]])

In [28]:
# Mean accuracy of the fitted tree on the test split.
TitanicTree.score(X=XTest, y=YTest)

0.7429906542056075

In [29]:
# Write the fitted tree to a Graphviz .dot file. Only the top three levels are
# drawn; nodes are labelled with the feature column names and the class names.
tree.export_graphviz(
    decision_tree=TitanicTree,
    out_file="TitanicTree.dot",
    max_depth=3,
    feature_names=X.columns,
    class_names=["Dead", "Survived"],
)

In [30]:
# Load the holdout passengers and drop the Ticket and Cabin columns, as was
# done for the training frame.
titanic_test = pd.read_csv("../data/test.csv")
titanic_test = titanic_test.drop(["Ticket", "Cabin"], axis=1)

# Only require values in the columns that feed the model (Pclass, Age, and Sex,
# which becomes the `female` indicator). A bare dropna() would also discard
# passengers whose only missing value is in a column the model never reads,
# leaving them without a prediction for no reason.
titanic_test = titanic_test.dropna(subset=["Pclass", "Age", "Sex"])

# One-hot encode `Sex` so the frame exposes the `female` column the model expects.
titanic_test = pd.concat([titanic_test, pd.get_dummies(titanic_test["Sex"])], axis=1)

In [32]:
# Select the same three feature columns, in the same order, used for training.
X_holdout = titanic_test.loc[:, ["Pclass", "Age", "female"]]
# Predicted survival label for each remaining holdout passenger.
survive_holdout = TitanicTree.predict(X=X_holdout)