In [27]:
import sys
from pathlib import Path

src_path = Path("../src/dt-distance").resolve()

if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score# Analy## Visu# d
from sklearn.utils import resample
from dt_distance.data_processor import DataProcessor  # correct!
from dt_distance.tree_parser import TreeParser
from dt_distance.distance_calculator import DistanceCalculator
from dt_distance.problem_params import ProblemParams
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score


In [None]:
# Hyperparameter grids taken from the paper.
depths = [*range(3, 13)]          # tree depths 3, 4, ..., 12
min_samples = [3, 5, 10, 30, 50]  # minimum-samples settings

## Step 1: Split Train and Test set

In [None]:
# Step 1: Split data into two batches
def random_train_split(X, y, random_state=None):
    """Randomly draw half of the training rows.

    Parameters
    ----------
    X : array-like with a ``shape`` attribute
        Feature matrix with one sample per row. NumPy arrays and pandas
        DataFrames are both supported.
    y : array-like
        Labels aligned row-by-row with ``X``. NumPy arrays, pandas Series and
        plain sequences are supported.
    random_state : int or None, default=None
        Seed for a reproducible split. ``None`` draws the permutation from
        NumPy's global random state, as before.

    Returns
    -------
    X0, y0
        ``N // 2`` randomly selected rows of ``X`` and the matching entries
        of ``y``, where ``N = X.shape[0]``.
    """
    N = X.shape[0]
    if random_state is None:
        indices = np.random.permutation(N)
    else:
        indices = np.random.RandomState(random_state).permutation(N)
    half = indices[:N // 2]

    def _take_rows(data):
        # pandas objects treat ``data[int_array]`` as a label/column lookup,
        # so positional row selection has to go through ``.iloc``.
        if hasattr(data, "iloc"):
            return data.iloc[half]
        return np.asarray(data)[half]

    return _take_rows(X), _take_rows(y)

## Step 2: Training the decision trees

In [26]:
def train_decision_tree(X, y, depth, min_samples_leaf):
    """Fit an sklearn ``DecisionTreeClassifier`` and return the fitted model.

    ``depth`` is forwarded as ``max_depth`` and ``min_samples_leaf`` under the
    same name; every other hyperparameter keeps its sklearn default.
    """
    return DecisionTreeClassifier(
        max_depth=depth,
        min_samples_leaf=min_samples_leaf,
    ).fit(X, y)

## Step 3: Bootstrap and Train $T_{0}$ Tree Set
- subset $N_0$ of $N$

In [None]:
def bootstrap_trees(X, y, depths, min_samples, B):
    """Train ``B`` decision trees, each on a bootstrap resample of ``(X, y)``.

    Works for either the subset ``X_0`` or the full training data. For every
    tree the data is sampled with replacement, then a depth is drawn from
    ``depths`` and a minimum leaf size from ``min_samples`` using NumPy's
    global random state.

    Returns the list of fitted trees, in training order.
    """
    def _fit_one():
        # Keep this order (resample, depth draw, leaf draw, fit): all of them
        # consume the global random state.
        X_boot, y_boot = resample(X, y, replace=True)
        max_depth = np.random.choice(depths)
        leaf_size = np.random.choice(min_samples)
        return train_decision_tree(X_boot, y_boot, max_depth, leaf_size)

    return [_fit_one() for _ in range(B)]

## Step 4: Train Second Tree Collection: $\mathcal{T}$ (Call Bootstrap trees on X)
- full training data $N$


## Step 5.1: Compute Mean Distance for each $T \in \mathcal{T}$
- For each tree in $\mathcal{T}$, compute `dt-distance` for all $T \in T_{0}$ and average over all B
- Compute AUC score from Test Data to get out-of-sample predictive power
- Return $B$ average distances 
- Intuition for the larger set: say we get new data in the future — how much do the new trees (the entire set $\mathcal{T}$) deviate from the previously trained, smaller set of trees $T_{0}$?
- Only structural differences (via path definitions) matter for the problem params, so the path conversion does not depend on the dataset itself, only on the bounds on features, the quantification of categories, and the assigned class labels as a sequence of splits.

In [None]:

def compute_centroid_trees(trees_ref, trees_target, X,y):
    """Average distance from each target tree to the reference tree set.

    Parameters
    ----------
    trees_ref : sequence of fitted trees
        Reference collection (the T_0 trees).
    trees_target : sequence of fitted trees
        Trees to score (the full collection T).
    X, y
        Data forwarded unchanged to ``DistanceCalculator``.

    Returns
    -------
    list
        One value per tree in ``trees_target``, in the same order: the mean of
        ``DistanceCalculator(target, ref, X=X, y=y).compute_tree_distance()``
        over all ``ref`` in ``trees_ref``.

    Raises
    ------
    ZeroDivisionError
        If ``trees_ref`` is empty while ``trees_target`` is not.
    """
    distances = []
    for target_tree in trees_target:
        # Compare the *current* target tree with every reference tree. The
        # previous code passed the whole ``trees_target`` list here.
        pair_distances = [
            DistanceCalculator(target_tree, ref_tree, X=X, y=y).compute_tree_distance()
            for ref_tree in trees_ref
        ]
        distances.append(sum(pair_distances) / len(trees_ref))
    return distances

## Step 5.2: Compute out-of-sample Predictive Performance
- ROC_AUC score on test-set (our validation set)

In [None]:
def evaluate_predictive_power(trees, X_holdout, y_holdout):
    """Score every tree on the holdout set with ROC AUC.

    For each tree, column 1 of ``predict_proba(X_holdout)`` is passed to
    ``roc_auc_score`` together with ``y_holdout``.
    NOTE(review): column 1 is presumably the positive class of a binary
    problem; confirm the labels are binary.

    Returns a list of AUC values in the same order as ``trees``.
    """
    def _holdout_auc(model):
        positive_scores = model.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_scores)

    return [_holdout_auc(model) for model in trees]

## Step 6: Find the Pareto Optimal Set $\mathcal{T}^{*}$ from $\mathcal{T}$
- multi-objective function to find the Pareto optimal tree set from $\mathcal{T}$ based on the average distance $d_{b}$, $\forall b \in \mathcal{T}$, and the out-of-sample ROC AUC score $\alpha_{b}$, $\forall b \in \mathcal{T}$
- **Pareto Optimal Definition:** tree $b$ is *dominated* by tree $b'$ if $(d_{b'} \leq d_b \text{ and } \alpha_{b'} > \alpha_b) \text{ or } (d_{b'} < d_b \text{ and } \alpha_{b'} \geq \alpha_b)$; $b$ is Pareto optimal if no tree $b' \in \mathcal{T}$ dominates it

In [None]:

def pareto_optimal_trees(distances, auc_scores):
    """Return the indices of the non-dominated (distance, AUC) pairs.

    Pair ``i`` is dominated when some other pair ``j`` is no worse on both
    objectives (distance not larger, AUC not smaller) and strictly better on
    at least one. Indices are returned in ascending order.
    """
    points = list(zip(distances, auc_scores))

    def _is_dominated(i):
        d_i, a_i = points[i]
        return any(
            (d_j <= d_i and a_j > a_i) or (d_j < d_i and a_j >= a_i)
            for j, (d_j, a_j) in enumerate(points)
            if j != i
        )

    return [i for i in range(len(points)) if not _is_dominated(i)]

## Step 7: Find the Optimal Tree from the Pareto Optimal Set, $\mathcal{T^{*}}$
-  $\mathbb{T}^\star = \underset{\mathbb{T}_b \in \mathcal{T}^\star}{\text{argmax}} \ f(d_b, \alpha_b)$
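- need to consider here what we value: stability or predictive power.
- the current function keeps the "good enough" performers (Pareto trees whose AUC is at least $(1 - \epsilon)$ times the best AUC) and, among them, picks the tree maximizing $\alpha_b - d_b$.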
- Can modify to find optimal trade-off for accuracy-stability
- Indicator function where:
    - 1 if $\alpha_{b}$ is within ε of the best score
    - 0 otherwise

### Optional Later step: Impose interpretability constraints
- increases dimensionality of the multi-objective function

In [None]:
def select_final_tree(distances, auc_scores, pareto_indices, epsilon=0.01):
    """Pick one tree index from the Pareto set.

    Pareto trees whose AUC is at least ``(1 - epsilon)`` times the best AUC in
    ``auc_scores`` form the shortlist; if none qualifies, the whole Pareto set
    is used. The returned index maximizes ``auc_scores[i] - distances[i]``
    over that pool.
    """
    threshold = (1 - epsilon) * max(auc_scores)
    shortlisted = [idx for idx in pareto_indices if auc_scores[idx] >= threshold]
    pool = shortlisted if shortlisted else pareto_indices

    def _tradeoff(idx):
        return auc_scores[idx] - distances[idx]

    return max(pool, key=_tradeoff)

### Step 7 Variation Benchmark: AUC maximizing Pareto Criteria

In [None]:
def select_best_auc_tree(trees, auc_scores):
    """Return ``(index, tree, auc)`` for the entry with the highest AUC.

    On ties the first maximum wins (``argmax`` semantics).
    """
    winner = np.asarray(auc_scores).argmax()
    return winner, trees[winner], auc_scores[winner]

### Step 7 Variation Benchmark: Distance minimizing Criteria 

In [None]:
def select_best_distance_tree(trees, distances):
    """Return ``(index, tree, distance)`` for the entry with the smallest distance.

    On ties the first minimum wins (``argmin`` semantics).
    """
    winner = np.asarray(distances).argmin()
    return winner, trees[winner], distances[winner]

# Main Stable Tree Trainer Pipeline

In [None]:

# Main method implementing the training of stable trees
def generate_pareto_set(X, y, X_holdout, y_holdout, B=20, depths=None, min_samples=None):
    """Train both tree collections and compute the Pareto frontier.

    Parameters
    ----------
    X, y
        Full training data.
    X_holdout, y_holdout
        Holdout data used to score predictive power.
    B : int, default=20
        Number of bootstrap trees per collection.
    depths : sequence of int or None, default=None
        Candidate tree depths; ``None`` uses ``range(3, 13)``.
    min_samples : sequence of int or None, default=None
        Candidate minimum leaf sizes; ``None`` uses ``[3, 5, 10, 30, 50]``.

    Returns
    -------
    tuple
        ``(trees_full_batch, pareto_indices, distances, auc_scores)``, where
        the last three refer to the trees in ``trees_full_batch``.
    """
    # Parameters
    if depths is None:
        depths = list(range(3, 13))
    if min_samples is None:
        min_samples = [3, 5, 10, 30, 50]

    # Step 1: split data
    X0, y0 = random_train_split(X, y)

    # Step 2: Train initial collection of trees
    trees_batch_0 = bootstrap_trees(X0, y0, depths, min_samples, B)

    # Step 3: Train second collection of trees on entire data
    trees_full_batch = bootstrap_trees(X, y, depths, min_samples, B)

    # Step 4: Compute stability and predictive performance.
    # compute_centroid_trees requires the data; the call used to omit X and y.
    distances = compute_centroid_trees(trees_batch_0, trees_full_batch, X, y)
    auc_scores = evaluate_predictive_power(trees_full_batch, X_holdout, y_holdout)

    # TODO: later save as a model class attribute to visualize
    # Step 5: Pareto frontier
    pareto_indices = pareto_optimal_trees(distances, auc_scores)

    # The previous code returned the undefined name ``scores``.
    return (trees_full_batch, pareto_indices, distances, auc_scores)

def stable_tree_trainer(trees_tuple, pareto_indices, objective=None):
    """Return the best tree from the Pareto optimal set for a single objective.

    Parameters
    ----------
    trees_tuple : tuple
        NOTE(review): layout inferred, confirm against the caller. The first
        element is read as the list of trees and the last two as
        ``distances`` and ``auc_scores``. This accepts both the 4-tuple
        returned by ``generate_pareto_set`` and a plain
        ``(trees, distances, auc_scores)`` triple.
    pareto_indices : sequence of int
        Indices of the Pareto optimal trees.
    objective : {None, "balanced", "distance", "auc"}, default=None
        ``None`` / ``"balanced"`` delegates to ``select_final_tree``;
        ``"distance"`` picks the Pareto tree with the smallest distance;
        ``"auc"`` picks the Pareto tree with the largest AUC.

    Returns
    -------
    tuple
        ``(stable_tree, distance, auc)`` for the selected tree.

    Raises
    ------
    ValueError
        If ``objective`` is not one of the supported values.
    """
    trees_full_batch = trees_tuple[0]
    distances, auc_scores = trees_tuple[-2], trees_tuple[-1]

    # Step 6: Select optimal stable tree
    if objective is None or objective == "balanced":
        best_tree_idx = select_final_tree(distances, auc_scores, pareto_indices)
    elif objective == "distance":
        best_tree_idx = min(pareto_indices, key=lambda i: distances[i])
    elif objective == "auc":
        best_tree_idx = max(pareto_indices, key=lambda i: auc_scores[i])
    else:
        raise ValueError(
            "objective must be None, 'balanced', 'distance' or 'auc', got %r" % (objective,)
        )

    # Step 7: Choose the best stable tree
    stable_tree = trees_full_batch[best_tree_idx]
    return stable_tree, distances[best_tree_idx], auc_scores[best_tree_idx]

# Benchmarking Performance across Pareto-AUC, Pareto-Dist, CVCART, and RF

In [None]:

def benchmark_models(trees_full_batch, distances, auc_scores, X_holdout, y_holdout):
    """Compare the Pareto-selected trees against CART and Random Forest baselines.

    Benchmarks the following:
    - CART Pareto AUC: AUC of the Pareto-optimal tree with max AUC
    - CART Pareto Distance: AUC of the Pareto-optimal tree with min distance
    - CART CV: AUC of out-of-fold predictions from a default
      ``DecisionTreeClassifier`` under stratified 5-fold CV on the holdout data
    - Random Forest: AUC of out-of-fold predictions from a 100-tree forest
      under the same 5-fold CV

    The Random Forest score used to be computed on the same rows the forest
    was fitted on, which gives an in-sample AUC. It now uses out-of-fold
    predictions, like the CART CV baseline.

    Returns
    -------
    dict
        The four AUC values under the keys listed above, plus ``"models"``
        holding the two selected Pareto trees and a Random Forest fitted on
        the full holdout data.
    """
    pareto_indices = pareto_optimal_trees(distances, auc_scores)

    # Best AUC in Pareto
    auc_max_idx = max(pareto_indices, key=lambda i: auc_scores[i])
    auc_max_tree = trees_full_batch[auc_max_idx]
    auc_max_auc = auc_scores[auc_max_idx]

    # Best distance in Pareto
    dist_min_idx = min(pareto_indices, key=lambda i: distances[i])
    dist_min_tree = trees_full_batch[dist_min_idx]
    dist_min_auc = auc_scores[dist_min_idx]

    # Same splitter for both baselines so their scores are comparable.
    cv = StratifiedKFold(n_splits=5)

    # CART CV benchmark (out-of-fold predictions)
    dt_cv = DecisionTreeClassifier()
    cv_probs = cross_val_predict(dt_cv, X_holdout, y_holdout, method='predict_proba',
                                 cv=cv, n_jobs=-1)
    cv_auc = roc_auc_score(y_holdout, cv_probs[:, 1])

    # Random Forest benchmark (out-of-fold predictions, not in-sample)
    rf = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_probs = cross_val_predict(rf, X_holdout, y_holdout, method='predict_proba',
                                 cv=cv, n_jobs=-1)
    rf_auc = roc_auc_score(y_holdout, rf_probs[:, 1])

    # cross_val_predict fits clones, so fit ``rf`` itself before returning it.
    rf.fit(X_holdout, y_holdout)

    return {
        "CART Pareto AUC": auc_max_auc,
        "CART Pareto Distance": dist_min_auc,
        "CART CV": cv_auc,
        "Random Forest": rf_auc,
        "models": {
            "pareto_auc_tree": auc_max_tree,
            "pareto_dist_tree": dist_min_tree,
            "random_forest": rf
        }
    }

