In [None]:
import numpy as np 
import pandas as pd 

In [None]:
# Load the aggregated single-tree results; later cells read the columns
# n_nan_preds, test_set_size, base_tree, dataset, NaN_condition, miss_prob,
# balance and prebin_missingness from this frame.
AGGREGATED_RESULTS_CSV = 'single_tree/aggregated.csv'
df = pd.read_csv(AGGREGATED_RESULTS_CSV)

In [None]:
# Per-row fraction: n_nan_preds divided by test_set_size.
# presumably the share of test predictions that came back NaN — TODO confirm
df['proportion_nan'] = df['n_nan_preds'].div(df['test_set_size'])

In [None]:
# Preview the first five rows, including the freshly added proportion_nan column.
df.head(n=5)

In [None]:
# One group per experimental configuration; the order of these keys is the
# order of the MultiIndex levels that the plotting code uses in table.loc[...].
groups = df.groupby(
    by=[
        'base_tree',
        'dataset',
        'NaN_condition',
        'miss_prob',
        'balance',
        'prebin_missingness',
    ]
)

In [None]:
# Summarise proportion_nan within each configuration: mean, std and row count.
table = groups['proportion_nan'].aggregate(['mean', 'std', 'count'])

In [None]:
# Standard error of the mean: std / sqrt(count).
table['std_err'] = table['std'].div(np.sqrt(table['count']))

In [None]:
# Complement of the mean NaN proportion, i.e. 1 - mean.
table['proportion_valid_predictions'] = table['mean'].rsub(1)

In [None]:
# Maps each dataset file name (as it appears in the `dataset` column / index
# level) to the human-readable name used in figure titles.
printable_dataset_names = dict([
    ('compas_complete.csv', 'COMPAS'),
    ('wine_quality.csv', 'Wine Quality'),
    ('wisconsin.csv', 'Wisconsin'),
    ('coupon_carryout.csv', 'Coupon Carryout'),
    ('coupon_rest20.csv', 'Coupon Restaurant'),
    ('coupon_full.csv', 'Coupon'),
    ('fico_complete.csv', 'FICO'),
    ('netherlands.csv', 'Netherlands'),
    ('spiral.csv', 'Spiral'),
    ('tic-tac-toe.csv', 'Tic-Tac-Toe'),
    ('iris_virginica.csv', 'Iris Virginica'),
    ('iris_versicolor.csv', 'Iris Versicolor'),
    ('iris_setosa.csv', 'Iris Setosa'),
    ('broward_general_2y.csv', 'Broward General'),
    ('higgs.csv', 'Higgs (1M)'),
])

In [None]:
import matplotlib.pyplot as plt
# Per-feature missingness probabilities looked up in `table`: 0.1, 0.2, ..., 0.9.
# k / 10 yields exactly the same floats as the literals 0.1 ... 0.9.
MISS_PROBS = [tenths / 10 for tenths in range(1, 10)]
def plot_dataset(dataset='compas.csv', base_tree = 'gosdt', balance=True, prebin_missingness=False):
    """Plot valid-prediction proportion vs. missingness probability and save it as a PNG.

    Draws on the current pyplot figure. One error-bar curve is drawn for each
    NaN condition ('ours', 'path-based', 'used-features'); its interior points
    come from the module-level `table` at every probability in `MISS_PROBS`,
    and it is anchored at (0, 1) and (1, 0) with zero error. The figure is
    written to ``plots/single_tree_{dataset}.png``.

    NOTE(review): the next cell redefines `plot_dataset` with an `ax` first
    parameter, so this pyplot-based version is shadowed once that cell runs.

    Parameters
    ----------
    dataset, base_tree, balance, prebin_missingness
        Together with the NaN condition and the missingness probability, these
        form the key ``(base_tree, dataset, NaN_condition, miss_prob, balance,
        prebin_missingness)`` used to index `table`.
    """
    x_points = [0] + MISS_PROBS + [1]

    for condition in ['ours', 'path-based', 'used-features']:
        # Fetch every row for this condition before drawing anything for it.
        rows = [
            table.loc[(base_tree, dataset, condition, miss_prob, balance, prebin_missingness), :]
            for miss_prob in MISS_PROBS
        ]
        proportions = [1] + [row['proportion_valid_predictions'] for row in rows] + [0]
        errors = [0] + [row['std_err'] for row in rows] + [0]
        plt.errorbar(x_points, proportions, yerr=errors, label=condition)

    plt.xlim(0, 1)
    plt.ylim(0, 1)

    plt.xlabel('Missingness Probability per Feature')
    plt.ylabel('Proportion Predictions Identified as Unaffected')
    plt.legend(loc='upper right')
    plt.title(f'Proportion of Predictions Completely Robust to Missingness in {dataset} \n for a {base_tree} tree learned on Complete data')
    plt.savefig(f'plots/single_tree_{dataset}.png')
    

In [None]:
def plot_dataset(ax, dataset='compas.csv', base_tree='gosdt', balance=True, prebin_missingness=False, legend=False):
    """Draw the percentage of valid predictions vs. missingness probability on `ax`.

    One error-bar curve is drawn for each NaN condition ('ours', 'path-based',
    'used-features'). Interior points are read from the module-level `table`
    at every probability in `MISS_PROBS` and scaled to percent; each curve is
    anchored at (0, 100) and (1, 0) with zero error.

    Parameters
    ----------
    ax
        Axes-like object to draw on (its `errorbar`, `set_*` and `legend`
        methods are used).
    dataset, base_tree, balance, prebin_missingness
        Together with the NaN condition and the missingness probability, these
        form the key ``(base_tree, dataset, NaN_condition, miss_prob, balance,
        prebin_missingness)`` used to index `table`.
    legend
        When true, a legend is added in the upper-right corner.

    The axes title is set to `base_tree`.
    """
    x_points = [0] + MISS_PROBS + [1]

    for condition in ['ours', 'path-based', 'used-features']:
        # Fetch every row for this condition before drawing anything for it.
        rows = [
            table.loc[(base_tree, dataset, condition, miss_prob, balance, prebin_missingness), :]
            for miss_prob in MISS_PROBS
        ]
        percents = [100] + [row['proportion_valid_predictions']*100 for row in rows] + [0]
        errors = [0] + [row['std_err']*100 for row in rows] + [0]
        ax.errorbar(x_points, percents, yerr=errors, label=condition)

    ax.set_ylim(0, 100)
    ax.set_xlim(0, 1)
    ax.set_xlabel('Missingness Prob per Feature')
    ax.set_ylabel('% Predictions Proven \n Unaffected')
    if legend:
        ax.legend(loc='upper right')
    ax.set_title(f'{base_tree}')

In [None]:
# Figure: one row per dataset, one column per tree learner (sklearn, gosdt, dl85).
BALANCE_TO_PLOT = False
PREBIN_TO_PLOT = False

plt.rcParams.update({'font.size': 22})
datasets = ['compas_complete.csv', 'wine_quality.csv', 'wisconsin.csv', 'coupon_full.csv']
fig, axs = plt.subplots(len(datasets), 3, figsize=(21, 5 * len(datasets)))

tree_learners = ['sklearn', 'gosdt', 'dl85']
for i, dataset in enumerate(datasets):
    # Only the top-left panel carries the legend.
    for col, learner in enumerate(tree_learners):
        plot_dataset(
            axs[i, col],
            dataset,
            learner,
            balance=BALANCE_TO_PLOT,
            prebin_missingness=PREBIN_TO_PLOT,
            legend=(i == 0 and col == 0),
        )

    # The y axis is described once per row, on the left-most panel.
    for inner_ax in axs[i, 1:]:
        inner_ax.set_ylabel('')
        inner_ax.set_yticklabels([])

    for col, learner in enumerate(tree_learners):
        axs[i, col].set_title(f'{printable_dataset_names[dataset]} - {learner}')

    # Keep x labels on the bottom row only.
    if i != len(datasets) - 1:
        for row_ax in axs[i]:
            row_ax.set_xlabel('')

fig.suptitle('% Predictions Completely Robust to Missingness', fontsize='xx-large')
# Reserve the top 2% of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, .98])
plt.savefig(f'plots/single_tree_all_{BALANCE_TO_PLOT}_{PREBIN_TO_PLOT}.pdf')
plt.show()


In [None]:
# Figure: one row per dataset, one column per sklearn tree variant
# ('sklearn', 'sklearn-4', 'sklearn-5'; titled as depth 3, 4 and 5).
BALANCE_TO_PLOT = False
PREBIN_TO_PLOT = False

plt.rcParams.update({'font.size': 22})
datasets = ['compas_complete.csv', 'wine_quality.csv', 'wisconsin.csv', 'coupon_full.csv']
fig, axs = plt.subplots(len(datasets), 3, figsize=(21, 5 * len(datasets)))

for i, dataset in enumerate(datasets):
    plot_dataset(axs[i, 0], dataset, 'sklearn', balance=BALANCE_TO_PLOT, prebin_missingness=PREBIN_TO_PLOT, legend = i==0)
    plot_dataset(axs[i, 1], dataset, 'sklearn-4', balance=BALANCE_TO_PLOT, prebin_missingness=PREBIN_TO_PLOT)
    try:
        plot_dataset(axs[i, 2], dataset, 'sklearn-5', balance=BALANCE_TO_PLOT, prebin_missingness=PREBIN_TO_PLOT)
    except KeyError as missing_key:
        # Best effort: presumably 'sklearn-5' results are not present for every
        # dataset/probability, in which case the table lookup raises KeyError.
        # Only that case is tolerated, and it is reported, so that any other
        # failure (typo, interrupt, plotting error) is no longer silently hidden.
        print(f"Skipping 'sklearn-5' panel for {dataset}: no aggregated results for {missing_key}")

    # The y axis is described once per row, on the left-most panel.
    axs[i, 1].set_ylabel('')
    axs[i, 1].set_yticklabels([])
    axs[i, 2].set_ylabel('')
    axs[i, 2].set_yticklabels([])
    axs[i, 0].set_title(f'{printable_dataset_names[dataset]} - sklearn depth 3')
    axs[i, 1].set_title(f'{printable_dataset_names[dataset]} - sklearn depth 4')
    axs[i, 2].set_title(f'{printable_dataset_names[dataset]} - sklearn depth 5')

    # Keep x labels on the bottom row only.
    if i != len(datasets) - 1:
        axs[i, 0].set_xlabel('')
        axs[i, 1].set_xlabel('')
        axs[i, 2].set_xlabel('')

fig.suptitle('% Predictions Completely Robust to Missingness', fontsize='xx-large')
# Reserve the top 2% of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, .98])
plt.savefig(f'plots/single_tree_depths_{BALANCE_TO_PLOT}_{PREBIN_TO_PLOT}.pdf')
plt.show()


In [None]:
# Values of the `balance` / `prebin_missingness` index levels to plot in this cell.
BALANCE_TO_PLOT, PREBIN_TO_PLOT = False, False
def plot_dataset(ax, dataset='compas.csv', base_tree='gosdt', balance=False, prebin_missingness=False, legend=False):
    """Draw the percentage of valid predictions vs. missingness probability on `ax`.

    One error-bar curve is drawn for each NaN condition ('ours', 'path-based',
    'used-features'). Interior points are read from the module-level `table`
    at every probability in `MISS_PROBS` and scaled to percent; each curve is
    anchored at (0, 100) and (1, 0) with zero error.

    Parameters
    ----------
    ax
        Axes-like object to draw on (its `errorbar`, `set_*` and `legend`
        methods are used).
    dataset, base_tree, balance, prebin_missingness
        Together with the NaN condition and the missingness probability, these
        form the key ``(base_tree, dataset, NaN_condition, miss_prob, balance,
        prebin_missingness)`` used to index `table`.
    legend
        When true, a legend is added in the upper-right corner.

    The axes title is the entry for `dataset` in `printable_dataset_names`.
    """
    x_points = [0] + MISS_PROBS + [1]

    for condition in ['ours', 'path-based', 'used-features']:
        # Fetch every row for this condition before drawing anything for it.
        rows = [
            table.loc[(base_tree, dataset, condition, miss_prob, balance, prebin_missingness), :]
            for miss_prob in MISS_PROBS
        ]
        percents = [100] + [row['proportion_valid_predictions']*100 for row in rows] + [0]
        errors = [0] + [row['std_err']*100 for row in rows] + [0]
        ax.errorbar(x_points, percents, yerr=errors, label=condition)

    ax.set_ylim(0, 100)
    ax.set_xlim(0, 1)
    ax.set_xlabel('Missingness Prob per Feature')
    ax.set_ylabel('% Predictions Proven \n Unaffected')
    if legend:
        ax.legend(loc='upper right')
    ax.set_title(f'{printable_dataset_names[dataset]}')

# Figure: 3x3 grid of 'sklearn' results, one panel per dataset, filled row by row.
plt.rcParams.update({'font.size': 24})
datasets = [
    'fico_complete.csv', 'netherlands.csv', 'spiral.csv',
    'tic-tac-toe.csv', 'iris_virginica.csv', 'iris_versicolor.csv',
    'iris_setosa.csv', 'broward_general_2y.csv', 'higgs.csv',
]
fig, axs = plt.subplots(3, 3, figsize=(20, 15))

for i, dataset in enumerate(datasets):
    # axs.flat walks the grid in row-major order, i.e. axs[i // 3, i % 3].
    panel = axs.flat[i]
    plot_dataset(panel, dataset, 'sklearn', balance=BALANCE_TO_PLOT, prebin_missingness=PREBIN_TO_PLOT, legend=(i == 0))
    # Keep x labels on the last three panels (bottom row) only.
    if i < len(datasets) - 3:
        panel.set_xlabel('')

fig.suptitle('% Samples such that CART trees \n can Predict without Imputation', fontsize='x-large')
fig.tight_layout()
plt.savefig(f'plots/sklearn_app_{BALANCE_TO_PLOT}_{PREBIN_TO_PLOT}.pdf')
plt.show()

In [None]:
# Values of the `balance` / `prebin_missingness` index levels to plot in this cell.
BALANCE_TO_PLOT, PREBIN_TO_PLOT = False, True
def plot_dataset(ax, dataset='compas.csv', base_tree='gosdt', balance=False, prebin_missingness=False, legend=False):
    """Draw the percentage of valid predictions vs. missingness probability on `ax`.

    One error-bar curve is drawn for each NaN condition ('ours', 'path-based',
    'used-features'). Interior points are read from the module-level `table`
    at every probability in `MISS_PROBS` and scaled to percent; each curve is
    anchored at (0, 100) and (1, 0) with zero error.

    Parameters
    ----------
    ax
        Axes-like object to draw on (its `errorbar`, `set_*` and `legend`
        methods are used).
    dataset, base_tree, balance, prebin_missingness
        Together with the NaN condition and the missingness probability, these
        form the key ``(base_tree, dataset, NaN_condition, miss_prob, balance,
        prebin_missingness)`` used to index `table`.
    legend
        When true, a legend is added in the upper-right corner.

    The axes title is the entry for `dataset` in `printable_dataset_names`.
    """
    x_points = [0] + MISS_PROBS + [1]

    for condition in ['ours', 'path-based', 'used-features']:
        # Fetch every row for this condition before drawing anything for it.
        rows = [
            table.loc[(base_tree, dataset, condition, miss_prob, balance, prebin_missingness), :]
            for miss_prob in MISS_PROBS
        ]
        percents = [100] + [row['proportion_valid_predictions']*100 for row in rows] + [0]
        errors = [0] + [row['std_err']*100 for row in rows] + [0]
        ax.errorbar(x_points, percents, yerr=errors, label=condition)

    ax.set_ylim(0, 100)
    ax.set_xlim(0, 1)
    ax.set_xlabel('Missingness Prob per Feature')
    ax.set_ylabel('% Predictions Proven \n Unaffected')
    if legend:
        ax.legend(loc='upper right')
    ax.set_title(f'{printable_dataset_names[dataset]}')

# Figure: 2x2 grid of 'sklearn' results, one panel per dataset, filled row by row.
plt.rcParams.update({'font.size': 24})
datasets = ['compas_complete.csv', 'wine_quality.csv', 'wisconsin.csv', 'coupon_full.csv']
fig, axs = plt.subplots(2, 2, figsize=(15, 12))

for i, dataset in enumerate(datasets):
    # axs.flat walks the grid in row-major order, i.e. axs[i // 2, i % 2].
    panel = axs.flat[i]
    plot_dataset(panel, dataset, 'sklearn', balance=BALANCE_TO_PLOT, prebin_missingness=PREBIN_TO_PLOT, legend=(i == 0))
    # Keep x labels on the last two panels (bottom row) only.
    if i < len(datasets) - 2:
        panel.set_xlabel('')

fig.suptitle('% Samples such that CART trees \n can Predict without Imputation', fontsize='x-large')
fig.tight_layout()
plt.savefig(f'plots/sklearn_{BALANCE_TO_PLOT}_{PREBIN_TO_PLOT}.pdf')
plt.show()