# Topgun titanic and decision trees - random forest

Data available: PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import pandas as pd
import numpy as np

# Seed NumPy's global random generator so that the random train/test split
# drawn later with np.random.uniform is repeatable when the notebook is run
# top to bottom on a fresh kernel.
np.random.seed(0)

In [96]:
# Read the raw training file and keep the untouched frame around under its
# own name so the cleaning step can be re-run without re-reading the CSV.
passengers_raw = pd.read_csv("train.csv")

# dropna() with no arguments removes every row that has a missing value in
# ANY column, not only in the columns that are used as features later on.
# NOTE(review): this is a very aggressive filter for this dataset; consider
# restricting it to the columns the model actually needs.
df = passengers_raw.dropna()

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


Selected: sex, age, fare and ticket class.

Since scikit-learn's decision trees only accept numeric features, we need to map 'female' and 'male' to 0 and 1.

In [97]:

# factorize() returns (codes, uniques); only the integer codes are needed.
# Codes are handed out in order of first appearance in the column.
sex_codes, _ = df['Sex'].factorize()
df['Sex'] = sex_codes

# Columns fed to the classifiers, in this exact order.
features = ['Sex', 'Age', 'Fare', 'Pclass']

df[features].head()


Unnamed: 0,Sex,Age,Fare,Pclass
1,0,38.0,71.2833,1
3,0,35.0,53.1,1
6,1,54.0,51.8625,1
10,0,4.0,16.7,3
11,0,58.0,26.55,1


In [98]:
# Draw one uniform number in [0, 1) per row; rows whose draw is <= 0.75 go to
# the training set (roughly 75 % of the data), the remainder is held out.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# 'is_train' is already boolean, so index with the mask (and its negation)
# directly instead of comparing it against the literals True / False.
train, test = df[df['is_train']], df[~df['is_train']]

# The two subsets must partition the frame: nothing lost, nothing duplicated.
assert len(train) + len(test) == len(df)

len(train), len(test)

(138, 45)

In [111]:
# Train a random forest with default settings on the training rows;
# fit() returns the estimator itself, so construction and fitting are chained.
clf = RandomForestClassifier().fit(train[features], train['Survived'])

# Hard class predictions for the held-out rows (used by the crosstab cell).
preds = clf.predict(test[features])

# Peek at the predicted class probabilities of the first ten test rows.
clf.predict_proba(test[features])[:10]

array([[0.  , 1.  ],
       [0.69, 0.31],
       [0.11, 0.89],
       [0.61, 0.39],
       [0.6 , 0.4 ],
       [0.19, 0.81],
       [0.58, 0.42],
       [0.33, 0.67],
       [0.15, 0.85],
       [0.06, 0.94]])

In [112]:
# Confusion matrix: rows are the true 'Survived' labels of the test set,
# columns are the predictions currently stored in `preds`.
pd.crosstab(
    index=test['Survived'],
    columns=preds,
    rownames=['Reality'],
    colnames=['Predicted'],
)

Predicted,0,1
Reality,Unnamed: 1_level_1,Unnamed: 2_level_1
0,8,6
1,6,25


##### Normal decision tree result comparison

In [102]:
# Same experiment with a single decision tree: identical features, identical
# train/test rows. Note that this rebinds `clf` and `preds`.
clf = DecisionTreeClassifier().fit(train[features], train['Survived'])
preds = clf.predict(test[features])

# Confusion matrix of true labels (rows) against tree predictions (columns).
pd.crosstab(
    index=test['Survived'],
    columns=preds,
    rownames=['Reality'],
    colnames=['Predicted'],
)

Predicted,0,1
Reality,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7,7
1,5,26


Random forest accuracy: 33/45 = 73.33%
Decision tree accuracy: 33/45 = 73.33%
Conclusion: on this 45-passenger test split, the random forest gave no improvement over a single decision tree.