In [25]:
import sys
from pathlib import Path

# Resolve the local dt-distance source directory to an absolute path (the
# relative path is interpreted against the current working directory) and put
# it at the front of sys.path, presumably so the dt_distance imports that
# follow resolve to this checkout.
src_path = Path("../src/dt-distance").resolve()

# Only insert once so re-running the cell does not keep growing sys.path.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from dt_distance.data_processor import DataProcessor  # correct!
from dt_distance.tree_parser import TreeParser
from dt_distance.distance_calculator import DistanceCalculator
from dt_distance.problem_params import ProblemParams

In [None]:
# Hyper-parameter grids taken from the paper.
# Candidate maximum tree depths: 3 through 12 inclusive.
depths = [*range(3, 13)]
# Candidate minimum-sample settings.
min_samples = [3, 5, 10, 30, 50]

## Step 1: Split Train and Test set

In [None]:
# Step 1: Split data into two batches
def random_train_split(X, y):
    '''
    Return a random half of the training data.

    A random permutation of the row indices is drawn from NumPy's global
    random state and the first ``N // 2`` indices are kept, where
    ``N = X.shape[0]``. Only that half is returned; the remaining rows are
    not returned to the caller.

    Args:
        X: Feature array exposing ``.shape`` and supporting indexing with an
            integer index array (e.g. a NumPy array).
        y: Target array indexed the same way as ``X``.

    Returns:
        Tuple ``(X0, y0)`` with the selected rows of ``X`` and the matching
        entries of ``y``.
    '''
    N = X.shape[0]
    # Use one index array for both X and y so rows and labels stay aligned.
    half_indices = np.random.permutation(N)[: N // 2]
    return X[half_indices], y[half_indices]

## Step 2: Training the decision trees

In [None]:
def train_decision_tree(X, y, depth, min_samples_leaf):
    '''
    Helper that fits and returns a sklearn DecisionTreeClassifier.

    Args:
        X: Training features passed to ``fit``.
        y: Training targets passed to ``fit``.
        depth: Value used for the classifier's ``max_depth``.
        min_samples_leaf: Value used for the classifier's
            ``min_samples_leaf``.

    Returns:
        The fitted ``DecisionTreeClassifier`` instance.
    '''
    clf = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=min_samples_leaf)
    clf.fit(X, y)
    return clf

## Step 3: Bootstrap and Train $T_{0}$ Tree Set
- Trained on the subset $N_0$ of the full training data $N$

In [None]:
def bootstrap_trees(X, y, depths, min_samples, B):
    '''
    Train B decision trees, each on its own bootstrap resample of (X, y).

    The same routine builds either collection: pass the half batch
    (X_0, y_0) or the full training data. For every tree the data is sampled
    with replacement, and the depth and minimum leaf size are drawn at random
    from ``depths`` and ``min_samples``.

    Args:
        X: Training features to resample from.
        y: Training targets to resample from.
        depths: Candidate values for the tree's maximum depth.
        min_samples: Candidate values for the tree's ``min_samples_leaf``.
        B: Number of trees to train.

    Returns:
        List of B fitted trees (empty when ``B`` is 0).
    '''
    trees = []
    for _ in range(B):
        X_sample, y_sample = resample(X, y, replace=True)
        # np.random.choice returns NumPy scalars; store plain Python ints as
        # the classifier's hyper-parameters.
        depth = int(np.random.choice(depths))
        min_leaf = int(np.random.choice(min_samples))
        trees.append(train_decision_tree(X_sample, y_sample, depth, min_leaf))
    return trees

## Step 4: Train Second Tree Collection: $\mathcal{T}$ (Call Bootstrap trees on X)
- full training data $N$


## Step 5: Compute the mean distance for each $T \in \mathcal{T}$
- For each tree in $\mathcal{T}$, compute the `dt-distance` to every tree in $T_{0}$ and average over all $B$ reference trees
- Compute the AUC score on the test data to measure out-of-sample predictive power
- Return the $B$ average distances
- Intuition for the larger set: suppose we receive new data in the future. How much do the new trees (the entire set $\mathcal{T}$) deviate from the trees $T_{0}$ previously trained on the smaller set?
- Only structural differences (via path definitions) matter for the problem params, so the path conversion does not depend on the dataset itself, only on the bounds on features, the quantification of categories, and the assigned class labels as a sequence of splits

In [None]:

def compute_centroid_trees(trees_ref, trees_target, X, y):
    '''
    Return, for each target tree, its mean distance to the reference trees.

    Each (target, reference) pair is scored with
    ``DistanceCalculator(target, reference, X=X, y=y).compute_tree_distance()``
    and the scores are averaged over all reference trees.
    NOTE(review): the DistanceCalculator call form is taken from the original
    draft of this cell; confirm it against the dt_distance package.

    Args:
        trees_ref: Reference collection of fitted trees (the T_0 set).
        trees_target: Collection of fitted trees to score.
        X: Feature data forwarded to ``DistanceCalculator``.
        y: Target data forwarded to ``DistanceCalculator``.

    Returns:
        List with one mean distance per tree in ``trees_target``, in order.

    Raises:
        ValueError: If there are target trees but ``trees_ref`` is empty.
    '''
    trees_ref = list(trees_ref)
    trees_target = list(trees_target)
    if not trees_target:
        return []
    if not trees_ref:
        raise ValueError("trees_ref must contain at least one tree")

    distances = []
    for target_tree in trees_target:
        distance_accum = 0
        for ref_tree in trees_ref:
            distance_calculator = DistanceCalculator(target_tree, ref_tree, X=X, y=y)
            distance_accum += distance_calculator.compute_tree_distance()
        distances.append(distance_accum / len(trees_ref))
    return distances

In [None]:
def compute_tree_stability(trees_ref, trees_target, problem_params, X=None, y=None):
    '''
    Return, for each target tree, its mean distance to the reference trees.

    The earlier body called ``dt-distance(...)``, which Python parses as the
    subtraction ``dt - distance(...)`` and therefore fails with NameError.
    Distances are now computed with
    ``DistanceCalculator(target, reference, X=X, y=y).compute_tree_distance()``.
    NOTE(review): that call form is the one used elsewhere in this notebook;
    confirm it against the dt_distance package.

    Args:
        trees_ref: Reference collection of fitted trees (the T_0 set).
        trees_target: Collection of fitted trees to score.
        problem_params: Kept so existing positional calls still bind; it is
            not used by this implementation.
        X: Feature data forwarded to ``DistanceCalculator``.
        y: Target data forwarded to ``DistanceCalculator``.

    Returns:
        List with one mean distance per tree in ``trees_target``, in order.

    Raises:
        ValueError: If there are target trees but ``trees_ref`` is empty, or
            if ``X`` / ``y`` were not supplied.
    '''
    trees_ref = list(trees_ref)
    trees_target = list(trees_target)
    if not trees_target:
        return []
    if not trees_ref:
        raise ValueError("trees_ref must contain at least one tree")
    if X is None or y is None:
        raise ValueError("X and y are required to compute tree distances")

    distances = []
    for target_tree in trees_target:
        distance_accum = 0
        for ref_tree in trees_ref:
            calculator = DistanceCalculator(target_tree, ref_tree, X=X, y=y)
            distance_accum += calculator.compute_tree_distance()
        distances.append(distance_accum / len(trees_ref))
    return distances


def evaluate_predictive_power(trees, X_holdout, y_holdout):
    '''
    Score every tree on the hold-out data with ROC AUC.

    For each tree, the second column of ``predict_proba(X_holdout)`` is used
    as the score passed to ``roc_auc_score`` together with ``y_holdout``.

    Returns:
        List of AUC values, one per tree, in the order of ``trees``.
    '''
    def holdout_auc(model):
        scores = model.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, scores)

    return [holdout_auc(model) for model in trees]


def pareto_optimal_trees(distances, auc_scores):
    '''
    Return the indices of the non-dominated (distance, AUC) pairs.

    A pair is dominated when some other pair is no worse on both criteria
    (distance not larger, AUC not smaller) and strictly better on at least
    one of them. Identical pairs do not dominate each other.

    Returns:
        List of indices into ``distances`` / ``auc_scores``, in ascending
        order, of the pairs that no other pair dominates.
    '''
    points = list(zip(distances, auc_scores))

    def is_dominated(idx):
        dist, auc = points[idx]
        return any(
            other != idx
            and ((o_dist <= dist and o_auc > auc) or (o_dist < dist and o_auc >= auc))
            for other, (o_dist, o_auc) in enumerate(points)
        )

    return [idx for idx in range(len(points)) if not is_dominated(idx)]


def select_final_tree(distances, auc_scores, pareto_indices, epsilon=0.01):
    '''
    Choose one index from the Pareto set.

    Pareto indices whose AUC is at least ``(1 - epsilon)`` times the best AUC
    in ``auc_scores`` form the shortlist; if none qualify, every Pareto index
    is considered. The winner maximises ``auc - distance``; on ties the
    earliest candidate wins.

    Returns:
        The selected index into ``distances`` / ``auc_scores``.
    '''
    auc_floor = (1 - epsilon) * max(auc_scores)
    shortlist = [idx for idx in pareto_indices if auc_scores[idx] >= auc_floor]
    pool = shortlist if shortlist else pareto_indices

    def trade_off(idx):
        return auc_scores[idx] - distances[idx]

    return max(pool, key=trade_off)



# Main method implementing the training of stable trees
def train_stable_tree(X, y, X_holdout, y_holdout, B=20):
    '''
    Train two bootstrap tree collections and pick one stable, accurate tree.

    Pipeline:
      1. Draw a random half batch (X0, y0) of the training data.
      2. Bootstrap B trees on the half batch (reference collection).
      3. Bootstrap B trees on the full training data (candidate collection).
      4. For each candidate, average its distance to every reference tree
         and compute its hold-out AUC.
      5. Keep the Pareto-optimal candidates (low distance, high AUC).
      6. Select the final tree among them.

    NOTE(review): distances use
    ``DistanceCalculator(tree_a, tree_b, X=X, y=y).compute_tree_distance()``,
    the call form used elsewhere in this notebook; confirm it against the
    dt_distance package.

    Args:
        X: Training features; must support ``.shape`` and integer-array
            indexing because it is passed to ``random_train_split``.
        y: Training targets aligned with ``X``.
        X_holdout: Hold-out features used for the AUC scores.
        y_holdout: Hold-out targets used for the AUC scores.
        B: Number of trees in each collection (must be at least 1).

    Returns:
        Tuple ``(stable_tree, distance, auc)`` for the selected candidate.

    Raises:
        ValueError: If ``B`` is smaller than 1.
    '''
    if B < 1:
        raise ValueError("B must be at least 1")

    # Parameters
    depths = list(range(3, 13))
    min_samples = [3, 5, 10, 30, 50]

    # Step 1: Draw the half batch. Previously X0 / y0 were used without ever
    # being defined, so the function failed with NameError.
    X0, y0 = random_train_split(X, y)

    # Step 2: Train initial collection of trees
    trees_batch_0 = bootstrap_trees(X0, y0, depths, min_samples, B)

    # Step 3: Train second collection of trees on entire data
    trees_full_batch = bootstrap_trees(X, y, depths, min_samples, B)

    # Step 4: Compute stability and predictive performance
    distances = []
    for target_tree in trees_full_batch:
        total = 0
        for ref_tree in trees_batch_0:
            calculator = DistanceCalculator(target_tree, ref_tree, X=X, y=y)
            total += calculator.compute_tree_distance()
        distances.append(total / len(trees_batch_0))
    auc_scores = evaluate_predictive_power(trees_full_batch, X_holdout, y_holdout)

    # Step 5: Pareto frontier
    pareto_indices = pareto_optimal_trees(distances, auc_scores)

    # Step 6: Select optimal stable tree
    best_tree_idx = select_final_tree(distances, auc_scores, pareto_indices)

    stable_tree = trees_full_batch[best_tree_idx]

    return stable_tree, distances[best_tree_idx], auc_scores[best_tree_idx]


# Example usage (User to replace X, y, X_holdout, y_holdout with actual data)
# stable_tree, stability_score, auc_score = train_stable_tree(X, y, X_holdout, y_holdout)
