In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd

# Importing data

Getting data from kaggle first :

In [3]:
import pkg_resources

In [4]:
pkg_resources.resource_exists('deepforest', 'train.csv')

ModuleNotFoundError: No module named 'deepforest'

In [11]:
raw_data = pd.read_csv("../data/train.csv")

In [12]:
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)

In [13]:
clean_data = pd.get_dummies(clean_data).fillna(-1)

In [14]:
train, test = train_test_split(clean_data)

In [15]:
def split_x_y(dataframe, target):
    return dataframe.drop(target, axis=1), dataframe[target]

In [16]:
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")

# Baseline model

In [17]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [18]:
y_pred = rf.predict_proba(X_test)

In [19]:
auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

In [20]:
auc

0.84225937871329137

# Deep Forest

## By Hand

In [21]:
from sklearn.model_selection import StratifiedKFold

In [22]:
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4)
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10)

In [23]:
rf1.fit(X_train, y_train)
y_pred_train_1 = rf1.predict_proba(X_train)
y_pred_test_1 = rf1.predict_proba(X_test)

y_pred_train_1 = pd.DataFrame(y_pred_train_1, columns=["rf1_0", "rf1_1"], index=X_train.index)
y_pred_test_1 = pd.DataFrame(y_pred_test_1, columns=["rf1_0", "rf1_1"], index=X_test.index)

In [24]:
rf2.fit(X_train, y_train)
y_pred_train_2 = rf2.predict_proba(X_train)
y_pred_test_2 = rf2.predict_proba(X_test)

y_pred_train_2 = pd.DataFrame(y_pred_train_2, columns=["rf2_0", "rf2_1"], index=X_train.index)
y_pred_test_2 = pd.DataFrame(y_pred_test_2, columns=["rf2_0", "rf2_1"], index=X_test.index)

In [25]:
hidden_train_1 = pd.concat([X_train, y_pred_train_1, y_pred_train_2], axis=1)
hidden_test_1 = pd.concat([X_test, y_pred_test_1, y_pred_test_2], axis=1)

In [26]:
hidden_train_1.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,rf1_0,rf1_1,rf2_0,rf2_1
254,3,41.0,0,2,20.2125,1,0,0,0,1,0.45431,0.54569,0.73754,0.26246
126,3,-1.0,0,0,7.75,0,1,0,1,0,0.902804,0.097196,0.859167,0.140833
654,3,18.0,0,0,6.75,1,0,0,1,0,0.322238,0.677762,0.652946,0.347054
561,3,40.0,0,0,7.8958,0,1,0,0,1,0.886007,0.113993,0.962031,0.037969
8,3,27.0,0,2,11.1333,1,0,0,0,1,0.405044,0.594956,0.164837,0.835163


In [27]:
rf3 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
rf3.fit(hidden_train_1, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [28]:
y_pred3 = rf3.predict_proba(hidden_test_1)

In [29]:
roc_auc_score(y_test, y_pred3[:, 1])

0.8276608385673061

This is not very handy, not at all. We already see a lot of code duplication, and one may feel there may be a way to abstract a lot of the logic that is happening here, in a way that is more flexible and powerful that all this boilerplate code.

## With API

In [30]:
from deepforest.layer import Layer, InputLayer

ModuleNotFoundError: No module named 'deepforest'

In [31]:
input_layer = InputLayer(RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10))

NameError: name 'InputLayer' is not defined

In [32]:
hidden_layer = Layer(input_layer,
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=4),
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=10))

NameError: name 'Layer' is not defined

In [33]:
hidden_layer.fit(X_train, y_train)

NameError: name 'hidden_layer' is not defined

In [34]:
hidden_layer.predict(X_test).head()

NameError: name 'hidden_layer' is not defined

# Going Further

In [66]:
def random_forest_generator():
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     min_samples_leaf=5,
                                     max_depth=i)
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     max_features=1,
                                     min_samples_leaf=5,
                                     max_depth=i)

In [78]:
def build_input_layer():
    return InputLayer(*random_forest_generator())

In [79]:
def build_hidden_layer(layer):
    return Layer(layer, *random_forest_generator())

In [80]:
def build_output_layer(layer):
    return Layer(layer,
                 RandomForestClassifier(n_estimators=500,
                                        n_jobs=-1,
                                        min_samples_leaf=5,
                                        max_depth=10))

In [81]:
input_l = build_input_layer()
hidden_1 = build_hidden_layer(input_l)
hidden_2 = build_hidden_layer(hidden_1)
hidden_3 = build_hidden_layer(hidden_2)
hidden_4 = build_hidden_layer(hidden_3)
output_l = build_output_layer(hidden_4)

In [74]:
output_l.fit(X_train, y_train)

<deepforest.layer.Layer at 0x112740e48>

In [75]:
y_pred = output_l.predict(X_test)

In [76]:
roc_auc_score(y_test, y_pred.iloc[:, 1])

0.81224489795918364

Well the result is not that satisfactory yet, but let's not loose hope. There is a lot of room for improvement yet. First item on my todo list: make sure all the intermediary models are trained using cross-validation techniques, to reduce overfitting.