In [1]:
import extra_py as ep
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# One-tree forest limited to depth 2.
# NOTE(review): n_jobs=-1 presumably means "use all cores" as in scikit-learn;
# confirm against the extra_py documentation.
model = ep.ExtraForestRegressor(
    n_estimators=1,
    min_samples_split=1,
    max_depth=2,
    n_jobs=-1,
)
model

<ExtraForestRegressor at 0x1649dabb0>

In [2]:
# Load at most N rows, then split them 80/20 by position (no shuffling):
# the leading 80% become the training set, the remainder the test set.
N = 100000
df = pd.read_parquet('data.parquet').head(N)

# Split on the number of rows actually loaded. head(N) returns fewer than N
# rows when the file is shorter, and a split point derived from N would then
# shrink the test set (or leave it empty).
n_train = int(0.8 * len(df))
df_train, df_test = df.iloc[:n_train], df.iloc[n_train:]

# Column 'y' is the regression target; every other column is a feature.
X_train = df_train.drop(columns=['y'])
y_train = df_train['y']
X_test = df_test.drop(columns=['y'])
y_test = df_test['y']

In [3]:
%%time 
model.fit(X_train.values.astype(np.float32), y_train.values.astype(np.float32))

CPU times: user 443 ms, sys: 127 ms, total: 570 ms
Wall time: 576 ms


In [4]:
# Ask the model (fitted in the previous cell) for its inferencer object; the
# bare name on the last line displays the object's repr.
inferencer = model.get_inferencer()
inferencer

<ExtraForestRegressorInferencer at 0x165279070>

In [5]:
%%timeit
inferencer.predict(X_test.values.astype(np.float32))

1.06 ms ± 4.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [6]:
%%timeit
model.predict(X_test.values.astype(np.float32))

774 µs ± 42.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
# Predict the test set through both code paths from one float32 feature array,
# then report the Pearson correlation between the two prediction vectors.
test_features_f32 = X_test.values.astype(np.float32)
y_inf = inferencer.predict(test_features_f32)
y_mod = model.predict(test_features_f32)

# corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the
# correlation between y_inf and y_mod.
correlation_matrix = np.corrcoef(y_inf, y_mod)
correlation_matrix[0, 1]

0.01171048078165576

In [12]:
# Show the first ten predictions from the inferencer and from the model side
# by side for a direct, row-by-row comparison.
y_inf[:10], y_mod[:10]

(array([ 2.171153 , -0.5154649,  2.171153 , -0.5154649,  2.171153 ,
         2.171153 , -0.5154649,  2.171153 ,  2.171153 ,  2.171153 ],
       dtype=float32),
 array([-0.5154649 ,  2.171153  ,  0.33907044,  0.33907044,  0.33907044,
         2.171153  ,  2.171153  ,  2.171153  ,  2.171153  ,  2.171153  ],
       dtype=float32))

In [13]:
# Display columns x9, x43 and x39 for the first three test rows.
df_test[["x9", "x43", "x39"]].head(3)

Unnamed: 0,x9,x43,x39
80000,0.050947,0.158173,0.274849
80001,-0.675796,-0.14729,-0.999288
80002,0.129653,-0.452544,-1.963666


In [14]:
# Show the model's debug descriptions of its trees.
model.get_debug_tree_descriptions()

['Branch(NumericalSplitter(13, 0.39278173), Branch(NumericalSplitter(1, -0.15522432), Leaf(2.171153), Leaf(0.33907044)), Branch(NumericalSplitter(39, -0.18017149), Leaf(-4.3021545), Leaf(-0.5154649)))']

In [15]:
# Show the inferencer's debug descriptions of its trees, for comparison with
# the model's own description.
inferencer.get_debug_tree_descriptions()

['[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [-inf, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0