In [1]:
# The Platonic Ideal: Verify _empirically_ that
# - train, test, oob mutually disjoint
# - train U test U oob = entire sample
# - all oob observations get a leaf assignment
# - all observations within leaf cell bounds
# - any way to verify optimal splits subject to constraints?
#
# The Capitulation to Reality:
# quite a bit of shenanigans to work around the fact that the base
# DecisionTreeClassifier does not retain training indices in the nodes,
# and therefore node membership by index cannot be verified post hoc
#
# instead we settle for the following procedure
# - eliminate randomness
# - train on untampered data to identify purported honest, structure, and oob
#   sample indices
# - shuffle y values among honest samples. if the altered y values are considered
#   when choosing splits (thereby violating honesty), the splits should change
# - train again from scratch on data with altered honest set
# - verify that splits remain the same
# - we only test unstratified sampling here so that we can shuffle the honest y values
# - we test stratified sampling at the forest level


In [4]:
import numpy as np

from treeple.datasets import make_trunk_classification

from sklearn.tree import DecisionTreeClassifier, HonestDecisionTree
#from treeple.ensemble import HonestForestClassifier


# --- Experiment configuration ---
# Number of shuffle-and-refit rounds run by the honesty check.
N_ITER = 100
# Number of observations requested from the trunk simulation.
SAMPLE_SIZE = 1024
# Seed passed to the structure tree as its `random_state`.
RANDOM_STATE = 1
# Forwarded as `honest_prior` / `honest_fraction` to the honest tree.
HONEST_PRIOR = "ignore"
HONEST_FRACTION = 0.9

# --- Data ---
half_size = SAMPLE_SIZE // 2

X, y = make_trunk_classification(
    n_samples=SAMPLE_SIZE, n_dim=1, n_informative=1, seed=0
)

# Re-joining the two halves yields the rows of X in their original order
# (as a fresh array).
X_t = np.concatenate([X[:half_size], X[half_size:]])

# Labels are assigned by position: first half 0.0, second half 1.0.
y_t = np.repeat([0.0, 1.0], half_size)


def _fit_honest_tree(labels):
    """Build a fresh HonestDecisionTree and fit it on ``X_t`` with ``labels``.

    Every call uses the same constructor arguments (gini criterion and
    ``RANDOM_STATE`` for the target tree, ``HONEST_PRIOR`` and
    ``HONEST_FRACTION`` for the honest wrapper), so the label vector is the
    only thing that varies between fits.

    Parameters
    ----------
    labels : numpy.ndarray
        Class labels aligned with the rows of ``X_t``.

    Returns
    -------
    HonestDecisionTree
        The fitted estimator.
    """
    fitted = HonestDecisionTree(
        target_tree_class=DecisionTreeClassifier,
        target_tree_kwargs={
            "criterion": "gini",
            "random_state": RANDOM_STATE,
        },
        honest_prior=HONEST_PRIOR,
        honest_fraction=HONEST_FRACTION,
    )
    fitted.fit(X_t, labels.ravel())
    return fitted


# Baseline: fit on the untampered labels and record the split thresholds and
# the indices the estimator reports as its honest set.
tree = _fit_honest_tree(y_t)
honest_tree = tree.tree_
structure_tree = honest_tree.target_tree
old_threshold = structure_tree.threshold.copy()
old_y = y_t.copy()

honest_indices = tree.honest_indices_

# Seeded generator so the label permutations (and therefore any failure) are
# reproducible; the global np.random state would differ from run to run.
rng = np.random.default_rng(RANDOM_STATE)

for _ in range(N_ITER):
    # Permute the labels among the honest samples only; all other positions
    # keep their original label.
    honest_shuffled = rng.permutation(honest_indices)
    y_perm = y_t.copy()
    y_perm[honest_indices] = y_t[honest_shuffled]

    # Guard against a degenerate permutation that would make the check vacuous.
    assert not np.array_equal(y_t, y_perm), \
        "permutation left the labels unchanged"
    assert not np.array_equal(old_y, y_perm), \
        "permutation repeated the previous round's labels"

    tree = _fit_honest_tree(y_perm)
    honest_tree = tree.tree_
    structure_tree = honest_tree.target_tree

    # The comparison below is only meaningful if the refit selected the same
    # honest set that was shuffled, so check that premise explicitly.
    assert np.array_equal(
        np.sort(honest_indices), np.sort(tree.honest_indices_)
    ), "refit selected a different honest set"

    # Honesty: labels of honest samples must not influence the splits.
    assert np.array_equal(old_threshold, structure_tree.threshold), \
        "split thresholds changed after shuffling honest labels"
    old_threshold = structure_tree.threshold.copy()
    old_y = y_perm.copy()

print("done")


done
