In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd

# Importing data

Getting data from kaggle first :

In [3]:
import pkg_resources

In [4]:
raw_data = pd.read_csv(pkg_resources.resource_stream('deepforest', 'data/train.csv'))

In [5]:
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)

In [6]:
clean_data = pd.get_dummies(clean_data).fillna(-1)

In [7]:
train, test = train_test_split(clean_data)

In [8]:
def split_x_y(dataframe, target):
    return dataframe.drop(target, axis=1), dataframe[target]

In [9]:
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")

# Baseline model

In [10]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [11]:
y_pred = rf.predict_proba(X_test)

In [12]:
auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

In [13]:
auc

0.86788399570354469

# Deep Forest

## By Hand

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4)
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10)

In [16]:
rf1.fit(X_train, y_train)
y_pred_train_1 = rf1.predict_proba(X_train)
y_pred_test_1 = rf1.predict_proba(X_test)

y_pred_train_1 = pd.DataFrame(y_pred_train_1, columns=["rf1_0", "rf1_1"], index=X_train.index)
y_pred_test_1 = pd.DataFrame(y_pred_test_1, columns=["rf1_0", "rf1_1"], index=X_test.index)

In [17]:
rf2.fit(X_train, y_train)
y_pred_train_2 = rf2.predict_proba(X_train)
y_pred_test_2 = rf2.predict_proba(X_test)

y_pred_train_2 = pd.DataFrame(y_pred_train_2, columns=["rf2_0", "rf2_1"], index=X_train.index)
y_pred_test_2 = pd.DataFrame(y_pred_test_2, columns=["rf2_0", "rf2_1"], index=X_test.index)

In [18]:
hidden_train_1 = pd.concat([X_train, y_pred_train_1, y_pred_train_2], axis=1)
hidden_test_1 = pd.concat([X_test, y_pred_test_1, y_pred_test_2], axis=1)

In [19]:
hidden_train_1.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,rf1_0,rf1_1,rf2_0,rf2_1
786,3,18.0,0,0,7.4958,1,0,0,0,1,0.410273,0.589727,0.256744,0.743256
627,1,21.0,0,0,77.9583,1,0,0,0,1,0.070023,0.929977,0.0,1.0
695,2,52.0,0,0,13.5,0,1,0,0,1,0.863169,0.136831,0.938984,0.061016
379,3,19.0,0,0,7.775,0,1,0,0,1,0.890193,0.109807,0.931481,0.068519
708,1,22.0,0,0,151.55,1,0,0,0,1,0.087613,0.912387,0.03,0.97


In [20]:
rf3 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
rf3.fit(hidden_train_1, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [21]:
y_pred3 = rf3.predict_proba(hidden_test_1)

In [22]:
roc_auc_score(y_test, y_pred3[:, 1])

0.86040995345506621

This is not very handy, not at all. We already see a lot of code duplication, and one may feel there may be a way to abstract a lot of the logic that is happening here, in a way that is more flexible and powerful that all this boilerplate code.

## With API

In [23]:
from deepforest.layer import Layer, InputLayer

In [24]:
input_layer = InputLayer(RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4),
                         RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10))

In [25]:
hidden_layer = Layer(input_layer,
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=4),
                     RandomForestClassifier(n_estimators=50, n_jobs=-1, max_depth=10))

In [26]:
hidden_layer.fit(X_train, y_train)

<deepforest.layer.Layer at 0x104522d30>

In [27]:
pd.DataFrame(hidden_layer.predict(X_test), index=X_test.index)

Unnamed: 0,0,1,2,3
81,0.886403,0.113597,0.840710,0.159290
889,0.176103,0.823897,0.020000,0.980000
188,0.929972,0.070028,0.993103,0.006897
742,0.029003,0.970997,0.000000,1.000000
146,0.892927,0.107073,0.921950,0.078050
99,0.902629,0.097371,0.755414,0.244586
197,0.893365,0.106635,0.892192,0.107808
814,0.876441,0.123559,0.781518,0.218482
836,0.890743,0.109257,0.918327,0.081673
13,0.909326,0.090674,0.936667,0.063333


# Going Further

In [28]:
def random_forest_generator():
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     min_samples_leaf=5,
                                     max_depth=i)
    for i in range(2, 15, 2):
        yield RandomForestClassifier(n_estimators=100,
                                     n_jobs=-1,
                                     max_features=1,
                                     min_samples_leaf=5,
                                     max_depth=i)

In [29]:
def paper_like_generator():
    for i in range(2):
        yield RandomForestClassifier(n_estimators=1000,
                                     n_jobs=-1,
                                     min_samples_leaf=10)
    for i in range(2):
        yield RandomForestClassifier(n_estimators=1000,
                                     n_jobs=-1,
                                     max_features=1,
                                     min_samples_leaf=10)

In [30]:
def build_input_layer():
    return InputLayer(*paper_like_generator())

In [31]:
def build_hidden_layer(layer):
    return Layer(layer, *paper_like_generator())

In [32]:
def build_output_layer(layer):
    return Layer(layer,
                 RandomForestClassifier(n_estimators=500,
                                        n_jobs=-1,
                                        min_samples_leaf=5,
                                        max_depth=10))

In [33]:
input_l = build_input_layer()
hidden_1 = build_hidden_layer(input_l)
hidden_2 = build_hidden_layer(hidden_1)
hidden_3 = build_hidden_layer(hidden_2)
hidden_4 = build_hidden_layer(hidden_3)
output_l = build_output_layer(hidden_4)

In [34]:
output_l.fit(X_train, y_train)

<deepforest.layer.Layer at 0x104512978>

In [35]:
y_pred = output_l.predict(X_test)

In [36]:
y_pred

array([[  9.29638699e-01,   7.03613014e-02],
       [  1.72212883e-01,   8.27787117e-01],
       [  1.00000000e+00,   0.00000000e+00],
       [  7.27478992e-03,   9.92725210e-01],
       [  1.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00],
       [  7.52790052e-01,   2.47209948e-01],
       [  9.59129256e-01,   4.08707444e-02],
       [  1.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00],
       [  3.09158979e-01,   6.90841021e-01],
       [  9.97466667e-01,   2.53333333e-03],
       [  8.88917085e-01,   1.11082915e-01],
       [  3.52073705e-01,   6.47926295e-01],
       [  3.72293325e-01,   6.27706675e-01],
       [  2.88891001e-01,   7.11108999e-01],
       [  1.00000000e+00,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00],
       [  1.42849243e-01,   8.57150757e-01],
       [  

In [35]:
roc_auc_score(y_test, y_pred.iloc[:, 1])

0.81846846846846855

Well the result is not that satisfactory yet, but let's not loose hope. There is a lot of room for improvement yet. First item on my todo list: make sure all the intermediary models are trained using cross-validation techniques, to reduce overfitting.