In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
import pandas as pd

# Importing data

In [3]:
raw_data = pd.read_csv("../data/train.csv")

In [4]:
clean_data = raw_data.drop(["Cabin", "Name", "PassengerId", "Ticket"], axis=1)

In [5]:
clean_data = pd.get_dummies(clean_data).fillna(-1)

In [6]:
train, test = train_test_split(clean_data)

In [7]:
def split_x_y(dataframe, target):
    return dataframe.drop(target, axis=1), dataframe[target]

In [8]:
X_train, y_train = split_x_y(train, "Survived")
X_test, y_test = split_x_y(test, "Survived")

# Baseline model

In [9]:
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [10]:
y_pred = rf.predict_proba(X_test)

In [11]:
auc = roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

In [12]:
auc

0.824812030075188

# Deep Forest

## By Hand

In [13]:
from sklearn.model_selection import StratifiedKFold

In [14]:
rf1 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4)
rf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10)

In [15]:
rf1.fit(X_train, y_train)
y_pred_train_1 = rf1.predict_proba(X_train)
y_pred_test_1 = rf1.predict_proba(X_test)

y_pred_train_1 = pd.DataFrame(y_pred_train_1, columns=["rf1_0", "rf1_1"], index=X_train.index)
y_pred_test_1 = pd.DataFrame(y_pred_test_1, columns=["rf1_0", "rf1_1"], index=X_test.index)

In [16]:
rf2.fit(X_train, y_train)
y_pred_train_2 = rf2.predict_proba(X_train)
y_pred_test_2 = rf2.predict_proba(X_test)

y_pred_train_2 = pd.DataFrame(y_pred_train_2, columns=["rf2_0", "rf2_1"], index=X_train.index)
y_pred_test_2 = pd.DataFrame(y_pred_test_2, columns=["rf2_0", "rf2_1"], index=X_test.index)

In [17]:
hidden_train_1 = pd.concat([X_train, y_pred_train_1, y_pred_train_2], axis=1)
hidden_test_1 = pd.concat([X_test, y_pred_test_1, y_pred_test_2], axis=1)

In [18]:
hidden_train_1.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,rf1_0,rf1_1,rf2_0,rf2_1
751,3,6.0,0,1,12.475,0,1,0,0,1,0.632679,0.367321,0.140766,0.859234
515,1,47.0,0,0,34.0208,0,1,0,0,1,0.677046,0.322954,0.848471,0.151529
191,2,19.0,0,0,13.0,0,1,0,0,1,0.849681,0.150319,0.902643,0.097357
590,3,35.0,0,0,7.125,0,1,0,0,1,0.908102,0.091898,0.968545,0.031455
441,3,20.0,0,0,9.5,0,1,0,0,1,0.867338,0.132662,0.928332,0.071668


In [19]:
rf3 = RandomForestClassifier(n_estimators=300, n_jobs=-1)
rf3.fit(hidden_train_1, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [20]:
y_pred3 = rf3.predict_proba(hidden_test_1)

In [21]:
roc_auc_score(y_test, y_pred3[:, 1])

0.81624895572264

## With API

In [22]:
from deepforest.layer import Layer

In [23]:
layer1 = Layer(RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=4), 
               RandomForestClassifier(n_estimators=100, n_jobs=-1, max_depth=10))

In [24]:
layer1.fit(X_train, y_train)

<deepforest.layer.Layer at 0x110af8e80>

In [25]:
layer1.predict(X_test)

array([[  1.19915048e-01,   8.80084952e-01,   4.64682540e-02,
          9.53531746e-01],
       [  8.99451879e-01,   1.00548121e-01,   9.78058477e-01,
          2.19415235e-02],
       [  8.79372092e-01,   1.20627908e-01,   9.80502477e-01,
          1.94975232e-02],
       [  4.90681980e-01,   5.09318020e-01,   6.36271666e-01,
          3.63728334e-01],
       [  4.82562141e-02,   9.51743786e-01,   0.00000000e+00,
          1.00000000e+00],
       [  8.90322006e-01,   1.09677994e-01,   8.90033666e-01,
          1.09966334e-01],
       [  5.96244205e-01,   4.03755795e-01,   3.57722474e-01,
          6.42277526e-01],
       [  7.69554170e-01,   2.30445830e-01,   8.76161197e-01,
          1.23838803e-01],
       [  1.70317800e-01,   8.29682200e-01,   3.37518933e-01,
          6.62481067e-01],
       [  9.69505535e-02,   9.03049447e-01,   1.32500000e-01,
          8.67500000e-01],
       [  4.88576616e-01,   5.11423384e-01,   5.51779603e-01,
          4.48220397e-01],
       [  3.18803034e