In [1]:
# Enable the autoreload extension in mode 2 so edits to imported local modules are picked up live
%load_ext autoreload
%autoreload 2
#%matplotlib inline

In [2]:
#Import Tree Models from scratch functions
import sys
import os

# Parent directory of the current working directory
module_path = os.path.abspath(os.path.join('..'))

# Folder that holds the from-scratch tree implementations imported below.
# The guard tests the exact entry that gets appended: testing the parent directory
# instead would skip the append whenever the parent is already on sys.path and
# would add a duplicate entry every time this cell is re-run.
scratch_models_path = os.path.join(module_path, "TreeModelsFromScratch")
if scratch_models_path not in sys.path:
    sys.path.append(scratch_models_path)

from DecisionTree import DecisionTree
from RandomForest import RandomForest
#from SmoothShap import verify_shap_model, smooth_shap

In [3]:
# Import other libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
#import seaborn as sns
from imodels import HSTreeClassifier, HSTreeRegressor, HSTreeClassifierCV, HSTreeRegressorCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from copy import deepcopy
#from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, KFold
#import shap
#from shap.explainers._tree import SingleTree

In [4]:
# Plot style defaults: switch off the top and right axes spines
mpl.rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [5]:
# Change the working directory to the imodels-experiments folder when the notebook
# is started from inside a `notebooks` directory.
# basename() is used instead of split('/') so the check does not depend on the path separator.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('../../imodels-experiments')

module_path_imodels = os.getcwd()

# Make the shrinkage experiment config importable. The guard tests the very entry
# that is appended, so this cell does not rely on a variable defined in another
# cell and does not pile up duplicate sys.path entries when it is re-run.
shrinkage_config_path = os.path.join(module_path_imodels, "config", "shrinkage")
if shrinkage_config_path not in sys.path:
    sys.path.append(shrinkage_config_path)
    
from imodels.util.data_util import get_clean_dataset
from datasets import DATASETS_CLASSIFICATION, DATASETS_REGRESSION
from util import DATASET_PATH

#pd.options.display.max_rows = 100

# Recreation of results from hierarchical shrinkage paper

Hierarchical Shrinkage: Improving the accuracy and interpretability of tree-based models <br>
[Link to paper](https://proceedings.mlr.press/v162/agarwal22b.html)


> HS is integrated into the imodels package [imodels](https://github.com/csinva/imodels) (Singh et al., 2021) with an sklearn-compatible API. Experiments for reproducing the results here can be found at [imodels-experiments](https://github.com/Yu-Group/imodels-experiments).

Unfortunately, not all experiments and figures from the paper are available in the aforementioned GitHub repository; therefore, some of the charts will not be identical to those in the paper.

## HS performance across various datasets (Fig. 4)

> *Hierarchical Shrinkage (solid lines) often improves predictive performance across various datasets, particularly for small datasets. (A) Top two rows show results for classification datasets (measured by AUC of the ROC curve) and (B) the next two rows show results for regression datasets (measured by $R^2$). HS often significantly improves the performance over CART, CART with CCP, and (C) leaf-based shrinkage. (D) HS even improves results for Random Forests as a function of the number of trees. Across all panels, error bars show standard error of the mean computed over 10 random data splits. Note that the y-axis scales differ across plots.*

### Create functions to recreate plot

In [77]:
def cross_val_score_scratch(estimator, X, y, cv=10, scoring_func=roc_auc_score, shuffle=True, random_state=None):
    """Score an estimator with K-fold cross-validation, fitting a fresh copy on every fold.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. It is deep-copied for each
        fold; the instance passed in is never fitted by this function.
    X, y : indexable
        Features and targets. Rows are selected by integer position: objects that
        expose ``.iloc`` (e.g. pandas) are indexed through it, everything else with ``[]``.
    cv : int, default=10
        Number of folds passed to ``KFold`` as ``n_splits``.
    scoring_func : callable, default=roc_auc_score
        Called as ``scoring_func(y_true, y_pred)`` on every held-out fold.
    shuffle : bool, default=True
        Forwarded to ``KFold``.
    random_state : int or None, default=None
        Seed forwarded to ``KFold``. Ignored when ``shuffle`` is False.

    Returns
    -------
    list
        One score per fold, in the order the folds are produced.
    """

    def take_rows(data, positions):
        # Plain [] on a pandas object looks up labels/columns, so use positional .iloc when it exists
        return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

    # KFold raises a ValueError when a seed is combined with shuffle=False,
    # so the seed is only forwarded when shuffling is actually requested.
    kf = KFold(n_splits=cv, shuffle=shuffle, random_state=random_state if shuffle else None)
    scores = []

    for train_index, test_index in kf.split(X):

        # Create true copy of estimator (refitting of scratch models is not possible)
        estimator_copy = deepcopy(estimator)

        # split data
        X_train, X_test = take_rows(X, train_index), take_rows(X, test_index)
        y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

        # fit estimator, predict & score
        estimator_copy.fit(X_train, y_train)
        # NOTE(review): the score is computed on the output of `predict`; for a ranking
        # metric such as ROC AUC, probability scores may be what is intended - confirm.
        y_pred = estimator_copy.predict(X_test)
        scores.append(scoring_func(y_test, y_pred))

    return scores

In [78]:
def _scores_with_sem(fold_scores):
    """Stack per-setting fold scores into a 2-D array and return it with the row-wise standard error of the mean."""
    data = np.array(fold_scores)
    sem = np.std(data, ddof=1, axis=1) / np.sqrt(np.size(data, axis=1))
    return data, sem


def _metric_label(scoring_func):
    """Return the y-axis label for a scoring callable: "AUC" for roc_auc_score, otherwise its name."""
    # __name__ is used instead of parsing str(func); callables without a name fall back to their type name
    name = getattr(scoring_func, "__name__", type(scoring_func).__name__)
    return "AUC" if name == "roc_auc_score" else name


def _draw_cv_curve(ax, x_values, data, sem, color, alpha):
    """Draw the mean score per setting on ``ax`` with the SEM as error bars."""
    ax.errorbar(x=x_values, y=data.mean(axis=1), yerr=sem, color=color,
                alpha=alpha, linewidth=3, marker="o")


def create_performance_plot_clf(X, y, cv=10, scoring_func=roc_auc_score, reg_param=100, 
                                  shuffle=True, random_state=42, dset_name=None, show_adapted_plot=False):
    """Cross-validate CART against hierarchical-shrinkage CART and plot the scores.

    The left panel compares sklearn's ``DecisionTreeClassifier`` with imodels'
    ``HSTreeClassifier`` over a fixed list of ``max_leaf_nodes`` values. When
    ``show_adapted_plot`` is True, the right panel repeats the comparison over
    ``max_depth`` 1..6 and adds the scratch ``DecisionTree`` with and without
    ``HShrinkage``.

    Parameters
    ----------
    X, y : array-like
        Features and targets, passed on to ``cross_val_score_scratch``.
        ``X.shape`` is read for the figure title.
    cv : int, default=10
        Number of cross-validation folds.
    scoring_func : callable, default=roc_auc_score
        Metric called as ``scoring_func(y_true, y_pred)``; its name labels the y-axis.
    reg_param : default=100
        Shrinkage strength, passed as ``reg_param`` to ``HSTreeClassifier`` and as
        ``HS_lambda`` to the scratch ``DecisionTree``.
    shuffle : bool, default=True
        Forwarded to the cross-validation helper.
    random_state : int or None, default=42
        Seed used for both the estimators and the cross-validation split.
    dset_name : str or None
        Dataset name shown in the figure title.
    show_adapted_plot : bool, default=False
        If False, only the left panel is drawn and the right axes are switched off.

    Returns
    -------
    list or tuple
        With ``show_adapted_plot=False``: ``[[data_sk, sem_sk], [data_im, sem_im]]``.
        With ``show_adapted_plot=True``: a 2-tuple of that list and
        ``[[sk], [im], [scratch], [scratchHS]]`` for the depth-based runs, where every
        inner entry is ``[scores, sem]``. Note that the shape depends on the flag.
    """

    def run_cv(estimator):
        # Every estimator is scored with the same fold settings
        return cross_val_score_scratch(estimator, X, y, cv=cv, scoring_func=scoring_func,
                                       shuffle=shuffle, random_state=random_state)

    # Original plot with n_leaves as x-axis
    n_leafs = [2,4,6,12,15,18,23,26,27,28,29,30]

    cv_res_sk = []
    cv_res_im = []

    for n_leaf_nodes in n_leafs:

        #sklearn
        clf_tree_sk = DecisionTreeClassifier(max_leaf_nodes=n_leaf_nodes, random_state=random_state)
        cv_res_sk.append(run_cv(clf_tree_sk))

        #imodels
        clf_tree_im = HSTreeClassifier(deepcopy(clf_tree_sk), reg_param=reg_param)
        cv_res_im.append(run_cv(clf_tree_im))

    #Compute standard error of mean
    data_sk, sem_data_sk = _scores_with_sem(cv_res_sk)
    data_im, sem_data_im = _scores_with_sem(cv_res_im)

    #Create plot
    fig, axs = plt.subplots(1,2, figsize=(15,5))

    fig.suptitle(f"{dset_name} (n = {X.shape[0]}, p = {X.shape[1]})")

    y_label = _metric_label(scoring_func)

    # Same colour for both curves; plain CART is drawn semi-transparent so it can be
    # told apart from hsCART (same convention as in the adapted plot).
    _draw_cv_curve(axs[0], n_leafs, data_im, sem_data_im, color="tab:blue", alpha=1.)
    _draw_cv_curve(axs[0], n_leafs, data_sk, sem_data_sk, color="tab:blue", alpha=.5)

    axs[0].set_xlabel("number of leaves")
    axs[0].set_ylabel(y_label)
    axs[0].legend(["hsCART", "CART"])
    axs[0].set_title("Original plot: n_leaves as x-axis")

    # if only original plot should be shown
    if not show_adapted_plot:
        axs[-1].axis('off')
        plt.show()
        return [[data_sk, sem_data_sk], [data_im, sem_data_im]]

    #Adapted plot with tree_depth as x-axis
    tree_depths = range(1,7)

    new_cv_res_sk = []
    new_cv_res_im = []
    new_cv_res_scr = []
    new_cv_res_scrHS = []

    for depth in tree_depths:

        #sklearn
        clf_tree_sk = DecisionTreeClassifier(max_depth=depth, random_state=random_state)
        new_cv_res_sk.append(run_cv(clf_tree_sk))

        #imodels
        clf_tree_im = HSTreeClassifier(deepcopy(clf_tree_sk), reg_param=reg_param)
        new_cv_res_im.append(run_cv(clf_tree_im))

        #scratch
        clf_tree_scr = DecisionTree(max_depth=depth, random_state=random_state)
        new_cv_res_scr.append(run_cv(clf_tree_scr))

        #scratchHS
        clf_tree_scrHS = DecisionTree(max_depth=depth, HShrinkage=True, HS_lambda=reg_param, random_state=random_state)
        new_cv_res_scrHS.append(run_cv(clf_tree_scrHS))

    #Compute standard error of mean
    new_data_sk, new_sem_data_sk = _scores_with_sem(new_cv_res_sk)
    new_data_im, new_sem_data_im = _scores_with_sem(new_cv_res_im)
    new_data_scr, new_sem_data_scr = _scores_with_sem(new_cv_res_scr)
    new_data_scrHS, new_sem_data_scrHS = _scores_with_sem(new_cv_res_scrHS)

    #Create adapted plot (drawing order must match the legend order below)
    _draw_cv_curve(axs[1], tree_depths, new_data_im, new_sem_data_im, color="tab:blue", alpha=1.)
    _draw_cv_curve(axs[1], tree_depths, new_data_scrHS, new_sem_data_scrHS, color="orange", alpha=1.)
    _draw_cv_curve(axs[1], tree_depths, new_data_sk, new_sem_data_sk, color="tab:blue", alpha=.5)
    _draw_cv_curve(axs[1], tree_depths, new_data_scr, new_sem_data_scr, color="orange", alpha=.5)

    axs[1].set_xlabel("Max tree depth")
    axs[1].set_ylabel(y_label)
    axs[1].legend(["hsCART", "hsCART scratch", "CART", "CART scratch"])
    axs[1].set_title("Adapted plot: maximum tree depth as x-axis")

    plt.show()
    return [[data_sk, sem_data_sk], [data_im, sem_data_im]], [[new_data_sk, new_sem_data_sk], [new_data_im, new_sem_data_im], [new_data_scr, new_sem_data_scr], [new_data_scrHS, new_sem_data_scrHS]]
    

In [79]:
# Load dataset: unpack entry 1 of the classification dataset list and read it via get_clean_dataset
(dset_name, dset_file, data_source) = DATASETS_CLASSIFICATION[1]
(X, y, feat_names) = get_clean_dataset(
    dset_file,
    data_source,
    DATASET_PATH,
)

In [None]:
# Score the loaded dataset and draw both panels (leaf-count plot and depth-based plot)
cv_results_orig, cv_results_new = create_performance_plot_clf(
    X,
    y,
    scoring_func=roc_auc_score,
    reg_param=100,
    shuffle=True,
    random_state=42,
    dset_name=dset_name,
    show_adapted_plot=True,
)

In its current state, `TreeModelsFromScratch` has no parameter to specify the number of leaves per tree. Therefore, we use the maximum tree depth as the x-axis scale instead.