# Analysis of Rerf-sporf, Py-sporf and Cythonized-sporf Performance

Here, we are interested in looking at the runtime of each implementation of SPORF on a fixed classification task.

Namely, we will utilize the orthant and sparse-parity tasks in the original SPORF paper.

In [1]:
%load_ext lab_black

In [3]:
import sys
from pathlib import Path
import numpy as np
import collections

from sklearn.ensemble import RandomForestClassifier as rfc

sys.path.append("../")

from oblique_forests.sporf import (
    ObliqueForestClassifier,
)  # , PythonObliqueForestClassifier
from rerf.rerfClassifier import rerfClassifier

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def load_data(n, data_path, exp_name):
    """Function to load in data as a function of sample size."""
    ftrain = data_path / f"{exp_name}_train_{n}.npy"
    ftest = data_path / f"{exp_name}_test.npy"

    dftrain = np.load(ftrain)
    dftest = np.load(ftest)

    X_train = dftrain[:, :-1]
    y_train = dftrain[:, -1]

    X_test = dftest[:, :-1]
    y_test = dftest[:, -1]

    return X_train, y_train, X_test, y_test

In [15]:
def test_rf(n, reps, n_estimators, exp_name):
    """Test traditional RF classifier"""

    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = rerfClassifier(
            n_estimators=n_estimators, projection_matrix="Base", n_jobs=8
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output/rf_{exp_name}_preds_{n}.npy", preds)
    return acc


def test_rerf(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test SPORF rerf implemnetation."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = rerfClassifier(
            n_estimators=n_estimators,
            projection_matrix="RerF",
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output/rerf_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc


def test_cython_of(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test SPORF rerf implemnetation."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = ObliqueForestClassifier(
            n_estimators=n_estimators,
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)

        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output/cythonof_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc


def test_python_of(n, reps, n_estimators, feature_combinations, max_features, exp_name):
    """Test PySporf."""
    preds = np.zeros((reps, 10000))
    acc = np.zeros(reps)
    for i in range(reps):

        X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

        clf = PythonObliqueForestClassifier(
            n_estimators=n_estimators,
            feature_combinations=feature_combinations,
            max_features=max_features,
            n_jobs=8,
        )

        clf.fit(X_train, y_train)
        preds[i] = clf.predict(X_test)
        acc[i] = np.sum(preds[i] == y_test) / len(y_test)

    np.save(f"output/of_{exp_name}_preds_" + str(n) + ".npy", preds)
    return acc

In [16]:
data_path = Path("./data/")

In [40]:
# How many samples to train on
n = 1000

# How many repetitions
reps = 3

# experiment name
exp_name = "sparse_parity"
# exp_name = 'orthant'

# Tree parameters
n_estimators = 100
feature_combinations = 2
max_features = "auto"

# Test Classification Performance

In [18]:
# acc = test_python_of(n, reps, n_estimators, feature_combinations, max_features, exp_name)
# print(acc)

In [41]:
acc = test_rerf(n, reps, n_estimators, feature_combinations, max_features, exp_name)
print(acc)

[0.7043 0.6796 0.6941]


In [42]:
acc = test_rf(n, reps, n_estimators, exp_name)
print(acc)

[0.6396 0.5934 0.6273]


In [43]:
acc = test_cython_of(
    n, reps, n_estimators, feature_combinations, max_features, exp_name
)
print(acc)

[0.7191 0.673  0.6911]


# Test Actual Runtime

In [22]:
# keep track of a list of runtimes
n_list = collections.defaultdict(list)

In [25]:
clf = rfc(n_estimators=n_estimators, n_jobs=8)
rerf_clf = rerfClassifier(n_estimators=n_estimators, projection_matrix="RerF", n_jobs=8)
# py_of_clf = PythonObliqueForestClassifier(
#     n_estimators=n_estimators,
#     feature_combinations=feature_combinations,
#     max_features=max_features,
#     n_jobs=8,
# )
cy_of_clf = ObliqueForestClassifier(
    n_estimators=n_estimators,
    feature_combinations=feature_combinations,
    max_features=max_features,
    n_jobs=8,
)

## 1000 sample size

In [26]:
# run on higher sample size now
# How many samples to train on
n = 1000
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [27]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

87.8 ms ± 7.58 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

39.2 ms ± 4.21 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
time = %timeit -n 1 -o py_of_clf.fit(X_train, y_train)
n_list["Py-Sporf"].append(np.mean(time.timings))

19.4 s ± 319 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
# with pointer arrays (old)
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

124 ms ± 4.61 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

209 ms ± 4.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# with std::vectors
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

212 ms ± 9.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 5000 sample size

In [30]:
# run on higher sample size now
# How many samples to train on
n = 5000

In [31]:
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [32]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

245 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

199 ms ± 7.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [34]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

508 ms ± 8.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
# with pointer arrays 
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

680 ms ± 6.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# with std::vectors
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

579 ms ± 8.19 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## 10000 sample size

In [35]:
# run on higher sample size now
# How many samples to train on
n = 10000

In [36]:
X_train, y_train, X_test, y_test = load_data(n, data_path, exp_name)

In [37]:
time = %timeit -n 1 -o clf.fit(X_train, y_train)
n_list["BaseRF"].append(np.mean(time.timings))

456 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [38]:
time = %timeit -n 1 -o rerf_clf.fit(X_train, y_train)
n_list["ReRF-Sporf"].append(np.mean(time.timings))

407 ms ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

971 ms ± 13.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [32]:
# with pointer arrays
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

1.18 s ± 12.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# with pointer arrays (c++ std vector)
time = %timeit -n 1 -o cy_of_clf.fit(X_train, y_train)
n_list["Cy-Sporf"].append(np.mean(time.timings))

1.04 s ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
