In [None]:
# The Platonic Ideal: Verify _empirically_ that
# - train, test, oob mutually disjoint
# - train U test U oob = entire sample
# - all oob observations get a leaf assignment
# - all observations within leaf cell bounds
# - any way to verify optimal splits subject to constraints?
#
# The Capitulation to Reality:
# quite a bit of shenanigans to work around the fact that the base
# DecisionTreeClassifier does not retain training indices in the nodes,
# and therefore node membership by index cannot be verified post hoc
#
# instead we settle for the following procedure
# - eliminate randomness
# - train on untampered data to identify purported honest, structure, and oob
#   sample indices
# - shuffle y values among honest samples. if the altered y values are considered
#   (thereby violating honesty), the splits should change
# - train again from scratch on data with altered honest set
# - verify that splits remain the same
# - we only test unstratified sampling here so that we can shuffle the honest y values
# - we test stratified sampling at the forest level

In [5]:
import numpy as np

from treeple.datasets import make_trunk_classification

from sklearn.tree import DecisionTreeClassifier


# --- Experiment configuration ---
N_ITER = 100        # how many label reshuffles the check loop performs
SAMPLE_SIZE = 256   # total number of observations
RANDOM_STATE = 1    # seed passed to each DecisionTreeClassifier

# One-dimensional trunk data set. Within this cell only X is consumed; the
# labels used for fitting are rebuilt by hand just below.
X, y = make_trunk_classification(
    n_samples=SAMPLE_SIZE, n_dim=1, n_informative=1, seed=0
)

# Labels: first half of the rows are class 0, second half class 1.
y_t = np.repeat([0.0, 1.0], SAMPLE_SIZE // 2)
# Features: the two halves of X stitched back together in their original order.
X_t = np.concatenate([X[: SAMPLE_SIZE // 2], X[SAMPLE_SIZE // 2 :]])

# Even positions form the structure set; everything else is the honest set.
all_indices = list(range(SAMPLE_SIZE))
structure_indices = list(range(0, SAMPLE_SIZE, 2))
honest_indices = np.setdiff1d(all_indices, structure_indices)

# Per-sample weights: honest rows get weight 0, structure rows keep weight 1.
w = np.ones(SAMPLE_SIZE)
w[honest_indices] = 0

# Honesty check: rows in `honest_indices` carry zero sample weight (see `w`),
# so permuting their labels must leave the fitted split thresholds unchanged.

# Baseline fit on the untampered labels; remember its thresholds and labels.
tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
y_perm = y_t.ravel().copy()
tree.fit(X_t, y_perm, sample_weight=w)
old_threshold = tree.tree_.threshold.copy()
old_y = y_perm.copy()

# Dedicated seeded generator so the label shuffles are reproducible on a
# fresh kernel instead of depending on the global np.random state.
shuffle_rng = np.random.default_rng(RANDOM_STATE)

for it in range(N_ITER):
    tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
    y_perm = y_t.ravel().copy()

    # Permute the labels among the honest rows only; structure rows keep theirs.
    honest_shuffled = shuffle_rng.permutation(honest_indices)
    y_perm[honest_indices] = y_t[honest_shuffled]

    # The check is only meaningful if the labels really changed, both relative
    # to the untampered labels and relative to the previous iteration.
    assert not np.array_equal(y_t, y_perm), (
        f"iteration {it}: shuffle left the labels unchanged"
    )
    assert not np.array_equal(old_y, y_perm), (
        f"iteration {it}: shuffle reproduced the previous iteration's labels"
    )

    tree.fit(X_t, y_perm, sample_weight=w)
    assert np.array_equal(old_threshold, tree.tree_.threshold), (
        f"iteration {it}: split thresholds changed after shuffling honest labels"
    )

    # Carry this iteration's state forward for the next comparison.
    old_threshold = tree.tree_.threshold.copy()
    old_y = y_perm.copy()

print("Done.")

Done.
