# Preamble

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import sys
# Hack: put the parent directory on the module search path (once) so that
# plot_utils can be imported from there.
if '..' not in sys.path:
    sys.path.append('..')
from plot_utils import plot_results


In [None]:
# Read the raw passenger table from the CSV next to this notebook.
titanic = pd.read_csv(filepath_or_buffer='Titanic.csv')

# A bare name as the last expression renders the frame as the cell output.
titanic

# Ready datasets

First clean up the data a little

In [None]:
# One-hot encode 'Sex' and keep only the resulting 'male' indicator column.
male_column = pd.get_dummies(titanic["Sex"])[['male']]

# Columns removed from the frame:
#   'Sex'      - replaced by the 'male' indicator appended below
#   'Embarked' - categorical feature that is dropped without a replacement
#   the rest   - non-categorical text features, redundant features and the primary key
columns_to_drop = ['Sex', 'Embarked', 'Name', 'Ticket', 'Cabin', 'PassengerId']

# NOTE: this rebinds `titanic`; once 'Sex' is gone the cell cannot be re-run
# without first re-running the cell that loads the CSV.
titanic = (
    pd.concat([titanic, male_column], axis='columns')
    .drop(columns=columns_to_drop)
    .dropna()  # discard every row that still contains a NaN
)

titanic

Train and test sets

In [None]:
# Features are every column except the label; the label is 'Survived'.
X = titanic.drop(columns='Survived')
y = titanic['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=504,
)


# Training a random forest

Let's get ourselves a baseline

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a single depth-limited decision tree.
# fit() returns the estimator itself, so construction and training can be chained.
tree_clf = DecisionTreeClassifier(max_depth=6, random_state=504).fit(X_train, y_train)

# Cell output: (score on the training set, score on the test set)
tree_clf.score(X_train, y_train), tree_clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators is left at its default (100 trees); depth and seed match the baseline tree.
# fit() returns the estimator itself, so construction and training can be chained.
forest_clf = RandomForestClassifier(max_depth=6, random_state=504).fit(X_train, y_train)

# Cell output: (score on the training set, score on the test set)
forest_clf.score(X_train, y_train), forest_clf.score(X_test, y_test)

It appears that the ensemble can be an improvement (albeit small) on even a good estimator, but let's dig a bit deeper.

Let's see how the number of estimators influences the result.

In [None]:
# Forest sizes to compare: 10, 20, ..., 300 trees.
estimators = list(range(10, 301, 10))

# One independent forest per size, each fit from scratch exactly once.
# warm_start is deliberately not set: it only has an effect when the *same*
# estimator is fit again with a larger n_estimators, which never happens here.
# oob_score=True makes every fitted forest expose `oob_score_`.
classifiers = [
    RandomForestClassifier(random_state=504, oob_score=True, max_depth=6, n_estimators=n)
    for n in estimators
]
for clf in classifiers:
    clf.fit(X_train, y_train)

# Score of every forest on the training set and on the held-out test set.
train_scores = [clf.score(X_train, y_train) for clf in classifiers]
test_scores = [clf.score(X_test, y_test) for clf in classifiers]

plot_results(
    train_scores,
    test_scores,
    train_label="Train accuracy",
    test_label="Test accuracy",
    xlabel="estimators",
    ylabel="accuracy",
    xvalues=estimators,
)

Note that the accuracy fluctuates somewhat due to the randomness of the random forest.

Let's look at the out-of-bag (OOB) score

In [None]:
# Out-of-bag score recorded by each forest during fitting (they were built with oob_score=True).
oob_scores = [forest.oob_score_ for forest in classifiers]

# Same plot as for the test scores, with the OOB scores taking the place of the test curve.
plot_results(
    train_scores,
    oob_scores,
    train_label="Train accuracy",
    test_label="OOB accuracy",
    xlabel="estimators",
    ylabel="accuracy",
    xvalues=estimators,
)


After 20 - 40 trees (depending on the hyperparameters) there is not much improvement - just noise.

In [None]:
# Refit with only 30 trees; depth and seed are unchanged from the earlier forest.
# fit() returns the estimator itself, so construction and training can be chained.
forest_clf = RandomForestClassifier(
    n_estimators=30,
    max_depth=6,
    random_state=504,
).fit(X_train, y_train)

# Cell output: (score on the training set, score on the test set)
forest_clf.score(X_train, y_train), forest_clf.score(X_test, y_test)

In [None]:
# Pair every feature name with the importance the fitted forest assigned to it.
importances = zip(X_train.columns, forest_clf.feature_importances_)

# Cell output: (feature, importance) pairs, most important first.
sorted(importances, key=lambda pair: pair[1], reverse=True)

There are improvements to be had by using ensembles on overfitting estimators, but on good estimators there is not much to gain. With max_depth=3, you won't see an improvement. 

A forest is quite robust, but you need *some* constraint(s). With no max_depth the forest is overfitting, too.