In [1]:
# The Platonic Ideal: Verify _empirically_ that
# - train, test, oob mutually disjoint
# - train U test U oob = entire sample
# - all oob observations get a leaf assignment
# - all observations within leaf cell bounds
# - any way to verify optimal splits subject to constraints?
#
# The Capitulation to Reality:
# quite a bit of shenanigans to work around the fact that the base
# DecisionTreeClassifier does not retain training indices in the nodes,
# and therefore node membership by index cannot be verified post hoc
#
# instead we settle for the following procedure
# - eliminate randomness
# - train on untampered data to identify purported honest, structure, and oob
#   sample indices
# - shuffle y values among honest samples. if the altered y values are considered
#   (thereby violating honesty), the splits should change
# - train again from scratch on data with altered honest set
# - verify that splits remain the same
# - we only test unstratified sampling here so that we can shuffle the honest y values
# - we test stratified sampling at the forest level


In [1]:
import numpy as np

def make_trunk_classification(
    n_samples,
    n_dim,
    n_informative=1,
    simulation: str = "trunk",
    mu_0: float = 0,
    mu_1: float = 1,
    rho: int = 0,
    band_type: str = "ma",
    return_params: bool = False,
    mix: float = 0.5,
    seed=None,
):
    """Generate a two-class Gaussian ("trunk"-style) classification sample.

    The first ``n_samples // 2`` rows are labelled 0 and the next
    ``n_samples // 2`` rows are labelled 1, so an odd ``n_samples`` yields
    ``n_samples - 1`` rows.

    Parameters
    ----------
    n_samples : int
        Requested total number of samples; each class gets ``n_samples // 2``.
    n_dim : int
        Total number of columns in ``X``.
    n_informative : int, default=1
        Number of leading columns drawn from the class-dependent Gaussians.
        The remaining ``n_dim - n_informative`` columns are independent
        standard-normal noise.
    simulation : {"trunk", "trunk_overlap", "trunk_mix"}, default="trunk"
        - ``"trunk"``: class 0 ~ N(m0, cov), class 1 ~ N(m1, cov).
        - ``"trunk_overlap"``: both halves are drawn, one row at a time, from
          the two-component mixture {N(m0, cov), N(m1, cov)} using the same
          component sequence for both halves.
        - ``"trunk_mix"``: first half ~ N(0, cov); second half drawn from the
          two-component mixture.
    mu_0, mu_1 : float, default=0 and 1
        Base means. The i-th informative coordinate (1-based) of ``m0`` /
        ``m1`` is ``mu_0 / sqrt(i)`` / ``mu_1 / sqrt(i)``.
    rho : default=0
        When non-zero, the covariance is built by ``_moving_avg_cov`` or
        ``_autoregressive_cov`` (defined outside this block); otherwise the
        identity matrix is used.
    band_type : {"ma", "ar"}, default="ma"
        Selects the covariance helper when ``rho != 0``.
    return_params : bool, default=False
        If True, return a list with the generating parameters appended.
    mix : float, default=0.5
        Probability of picking the ``m0`` component in the mixture
        simulations; must lie in [0, 1].
    seed : optional
        Passed to ``np.random.default_rng``.

    Returns
    -------
    (X, y) when ``return_params`` is False, otherwise a list:
        - ``"trunk"``: ``[X, y, [m0, m1], [cov, cov]]``
        - ``"trunk_overlap"``: ``[X, y, [zeros, zeros], [cov, cov]]``
        - ``"trunk_mix"``: ``[X, y, (m0, m1), (cov, cov), X_mixture]``

    Raises
    ------
    ValueError
        If ``n_dim < n_informative``, ``band_type`` is unknown (with
        ``rho != 0``), ``mix`` is outside [0, 1], or ``simulation`` is unknown.
    """
    if n_dim < n_informative:
        raise ValueError(
            f"Number of informative dimensions {n_informative} must be less than or "
            f"equal to number of dimensions, {n_dim}"
        )

    # Two generators built from the same seed: for a non-None seed they emit
    # identical streams, which is what the original implementation relied on.
    rng = np.random.default_rng(seed=seed)
    rng1 = np.random.default_rng(seed=seed)

    # Per-coordinate means shrink as 1 / sqrt(i), i = 1..n_informative.
    decay = np.sqrt(np.arange(1, n_informative + 1))
    mean_0 = mu_0 / decay
    mean_1 = mu_1 / decay

    if rho != 0:
        if band_type == "ma":
            cov = _moving_avg_cov(n_informative, rho)
        elif band_type == "ar":
            cov = _autoregressive_cov(n_informative, rho)
        else:
            raise ValueError(f'Band type {band_type} must be one of "ma", or "ar".')
    else:
        cov = np.identity(n_informative)

    if mix < 0 or mix > 1:
        raise ValueError("Mix must be between 0 and 1.")

    # Cholesky is the fastest factorisation offered by multivariate_normal, so
    # it is used for large covariance matrices; SVD (slower, more robust) is
    # kept for the small ones.
    method = "cholesky" if n_informative > 1000 else "svd"

    half = n_samples // 2
    norm_params = [[mean_0, cov], [mean_1, cov]]

    def _draw_mixture(generator, components):
        # One multivariate-normal draw per entry of `components`, each from
        # the component that entry selects. The explicit shape tuple keeps
        # the sub-array dtype well-defined even when n_informative == 1.
        draws = np.fromiter(
            (
                generator.multivariate_normal(*norm_params[k], size=1, method=method)
                for k in components
            ),
            dtype=np.dtype((float, (n_informative,))),
        )
        return draws.reshape(half, n_informative)

    if simulation == "trunk":
        X = np.vstack(
            (
                rng.multivariate_normal(mean_0, cov, half, method=method),
                rng1.multivariate_normal(mean_1, cov, half, method=method),
            )
        )
    elif simulation == "trunk_overlap":
        mixture_idx = rng.choice(2, half, replace=True, shuffle=True, p=[mix, 1 - mix])
        X = np.vstack(
            (
                _draw_mixture(rng, mixture_idx),
                _draw_mixture(rng1, mixture_idx),
            )
        )
    elif simulation == "trunk_mix":
        mixture_idx = rng.choice(2, half, replace=True, shuffle=True, p=[mix, 1 - mix])
        X_mixture = _draw_mixture(rng1, mixture_idx)
        X = np.vstack(
            (
                rng.multivariate_normal(
                    np.zeros(n_informative), cov, half, method=method
                ),
                X_mixture,
            )
        )
    else:
        raise ValueError("Simulation must be: trunk, trunk_overlap, trunk_mix")

    if n_dim > n_informative:
        # Pad with uninformative standard-normal columns.
        noise = rng.normal(loc=0, scale=1, size=(X.shape[0], n_dim - n_informative))
        X = np.hstack((X, noise))

    y = np.concatenate((np.zeros(half), np.ones(half)))

    if not return_params:
        return X, y

    returns = [X, y]
    if simulation == "trunk":
        returns += [[mean_0, mean_1], [cov, cov]]
    elif simulation == "trunk_overlap":
        # These names previously used hyphens and could never match, so the
        # parameters were silently dropped for the mixture simulations.
        # NOTE(review): zero means are reported here rather than m0/m1 —
        # kept as originally written; confirm this is intended.
        returns += [[np.zeros(n_informative), np.zeros(n_informative)], [cov, cov]]
    elif simulation == "trunk_mix":
        returns += [*zip(*norm_params), X_mixture]
    return returns

In [4]:
from sklearn.tree import DecisionTreeClassifier, HonestDecisionTree
#from treeple.ensemble import HonestForestClassifier


# --- configuration -----------------------------------------------------------
N_ITER = 100            # number of independent label shuffles to try
SAMPLE_SIZE = 1024
RANDOM_STATE = 1        # seeds both the structure tree and the label shuffles
HONEST_PRIOR = "ignore"
HONEST_FRACTION = 0.9


def _fit_honest_tree(X_fit, y_fit):
    """Build and fit a fresh HonestDecisionTree with this check's fixed settings.

    A new estimator is created on every call so no state carries over
    between fits.
    """
    estimator = HonestDecisionTree(
        target_tree_class=DecisionTreeClassifier,
        target_tree_kwargs={
            "criterion": "gini",
            "random_state": RANDOM_STATE,
        },
        honest_prior=HONEST_PRIOR,
        honest_fraction=HONEST_FRACTION,
    )
    estimator.fit(X_fit, y_fit.ravel())
    return estimator


# --- data --------------------------------------------------------------------
X, y = make_trunk_classification(
    n_samples=SAMPLE_SIZE,
    n_dim=1,
    n_informative=1,
    seed=0,
)
# Working copies: rows are already ordered class 0 then class 1, with
# SAMPLE_SIZE // 2 rows per class.
X_t = X.copy()
y_t = y.copy()

# --- reference fit on untampered labels --------------------------------------
tree = _fit_honest_tree(X_t, y_t)
honest_tree = tree.tree_
structure_tree = honest_tree.target_tree
old_threshold = structure_tree.threshold.copy()
old_y = y_t.copy()

# Indices the reference fit reports as its honest subset.
honest_indices = np.asarray(tree.honest_indices_)

# Seeded generator so the shuffles (and therefore the whole check) are
# reproducible under Restart & Run All.
shuffle_rng = np.random.default_rng(RANDOM_STATE)

# --- refit on labels permuted within the honest subset -----------------------
for _ in range(N_ITER):
    # Permute labels among the honest samples only; all other labels stay put.
    honest_shuffled = shuffle_rng.permutation(honest_indices)
    y_perm = y_t.copy()
    y_perm[honest_indices] = y_t[honest_shuffled]

    assert not np.array_equal(y_t, y_perm), "shuffle left the labels unchanged"
    assert not np.array_equal(old_y, y_perm), "shuffle repeated the previous labels"

    tree = _fit_honest_tree(X_t, y_perm)
    honest_tree = tree.tree_
    structure_tree = honest_tree.target_tree

    # If honest labels influenced structure learning, the thresholds would move.
    assert np.array_equal(old_threshold, structure_tree.threshold), (
        "split thresholds changed after shuffling honest labels"
    )
    old_threshold = structure_tree.threshold.copy()
    old_y = y_perm.copy()

print("done")


done
