### Perform explanation based on randomly flagging one or more branches.
For n_branches_active_i in n_branches:
   &nbsp; - For trial_i in n_trials:
   &emsp;    - Randomly select n_branches_active_i branches and flag them
   &emsp;    - Compute precision, recall and other performance metrics
   &nbsp; - Compute mean of performance metrics across trials

In [1]:
# import random
import pandas as pd
import numpy as np
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [24]:
def random_n_choice(n, n_examples, n_groupings):
    """ Build a random binary explanation table with exactly `n` flags per example.

    For every example, `n` distinct groupings of features are drawn at random
    (without replacement) and marked with 1; all remaining entries stay 0.

    :param n: int, number of groupings of features flagged for each example.
    :param n_examples: int, number of examples (rows of the table).
    :param n_groupings: int, number of groupings of features (columns of the table).
    :return:
        NumPy array of shape (n_examples, n_groupings) holding 0.0/1.0 flags.
    """

    grouping_ids = np.arange(0, n_groupings)
    flags = np.zeros([n_examples, n_groupings])

    for row in flags:
        # `row` is a view into `flags`, so the assignment fills the table in place
        picked = np.random.choice(grouping_ids, size=n, replace=False)
        row[picked] = 1

    return flags

In [36]:
# Smoke-test the random flag generator:
# 3063 examples x 5 feature groupings, with 2 groupings flagged per example.
test_2=random_n_choice(2, 3063, 5)

In [28]:
# Read SHAP scores from CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
test=pd.read_csv('/nobackup/khauskne/kdd/all_shap_scores.csv')

In [29]:
# Read Robovetter Minor Flags from CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
robo_flags=pd.read_csv('/nobackup/khauskne/kdd/robovetter_flags_all_DR25.csv')

In [30]:
# Merge Robovetter Minor Flags with SHAP scores.
# Left join on (target_id, tce_plnt_num): every SHAP row is kept, and a row with no
# matching Robovetter entry gets NaN in the flag columns.
# validate='one_to_one' makes pandas raise if the key pair is duplicated on either side.
test_csv=pd.merge(test, robo_flags, on=['target_id', 'tce_plnt_num'], how='left', validate='one_to_one')

In [31]:
# choose only columns of interest (selected by position in the merged frame,
# so this depends on the column order of both CSV files)
# columns 4..10: seven score columns -- presumably one SHAP score per branch; TODO confirm
shap_array=np.array(test_csv[test_csv.columns[4:11]])
# columns 20 onward: presumably the Robovetter flag columns added by the merge; TODO confirm
robo_array=np.array(test_csv[test_csv.columns[20:]])

In [32]:
def post_process_flags(robo_flags, branch_explanations):
    """ Reduce flag and explanation tables to the five branches that are compared.

    Both inputs are treated as 2-D tables with one row per example and one
    column per branch. Columns 1, 2, 3, 4 and 6 are kept. In the explanation
    table, the first kept column is replaced by the per-example maximum of
    columns 0 and 1 (global and local flux), so both flux branches are
    represented by a single merged column.

    The inputs are not modified; the returned arrays are new copies.

    :param robo_flags: array-like of shape (n_examples, n_branches), Robovetter flags.
    :param branch_explanations: array-like of shape (n_examples, n_branches),
        per-branch explanation scores.
    :return:
        tuple (robo_new, exp_new) of NumPy arrays, each of shape (n_examples, 5).
    """

    branch_inds_to_keep = [1, 2, 3, 4, 6]

    robo_flags = np.asarray(robo_flags)
    branch_explanations = np.asarray(branch_explanations)

    robo_new = robo_flags[:, branch_inds_to_keep]

    # Per-example maximum over the two flux columns. The slice must select
    # columns (`[:, 0:2]`); slicing rows (`[0:2]`) yields a length-2 vector that
    # cannot be written into a column of length n_examples.
    greater_flux = np.amax(branch_explanations[:, 0:2], axis=1)

    # Fancy indexing returns a copy, so the caller's array stays untouched.
    exp_new = branch_explanations[:, branch_inds_to_keep]
    exp_new[:, 0] = greater_flux

    return robo_new, exp_new

In [33]:
# Reduce the Robovetter flags to the five kept branches; the processed explanation
# array is discarded here. `robo` is read as a global by run_x_trials.
robo, _=post_process_flags(robo_array, shap_array)

In [35]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

In [43]:
def run_x_trials(x, n):
    """ Run random flag generating trials and print the mean performance metrics.

    Each trial draws a fresh random explanation table with `random_n_choice`
    and scores it against the module-level `robo` flags (used as ground truth).
    The table size is taken from the shape of `robo`, so the function keeps
    working if the number of examples or branches changes.

    Printed output, in order (means over the `x` trials):
        1. micro precision ' \\& ' micro recall (rounded to 4 decimals)
        2. macro precision ' \\& ' macro recall (rounded to 4 decimals)
        3. weighted precision ' \\& ' weighted recall (rounded to 4 decimals)
        4. Hamming loss
        5. accuracy score
        6. weighted Jaccard score

    Args:
        x: int, number of trials
        n: int, number of groupings of features set as explaining the score for any example.

    Returns:
        None. Results are only printed.
    """

    n_examples, n_groupings = np.shape(robo)
    averages = ('micro', 'macro', 'weighted')

    prec = {avg: [] for avg in averages}
    rec = {avg: [] for avg in averages}
    hamming = []
    accuracy = []
    jacc = []

    for _ in range(x):
        rand_pred = random_n_choice(n, n_examples, n_groupings)

        for avg in averages:
            prec[avg].append(precision_score(robo, rand_pred, average=avg))
            rec[avg].append(recall_score(robo, rand_pred, average=avg))

        hamming.append(hamming_loss(robo, rand_pred))
        accuracy.append(accuracy_score(robo, rand_pred))
        jacc.append(jaccard_score(robo, rand_pred, average='weighted'))

    for avg in averages:
        # ' \\& ' prints a literal backslash-ampersand separator
        print(str(np.round(np.mean(prec[avg]), 4)) + ' \\& ' + str(np.round(np.mean(rec[avg]), 4)))
    print(np.mean(hamming))
    print(np.mean(accuracy))
    print(np.mean(jacc))

    return
    

In [46]:
# 10 random trials with 1 grouping flagged per example
run_x_trials(10, 1)

0.1973 \& 0.1991
0.1963 \& 0.1987
0.5124 \& 0.1991
0.3192033953640222
0.13114593535749264
0.14806107163348456


In [47]:
# 10 random trials with 2 groupings flagged per example
run_x_trials(10, 2)

0.1989 \& 0.4015
0.1987 \& 0.3957
0.5186 \& 0.4015
0.4390205680705191
0.013679399281749915
0.2646140680292526


In [48]:
# 10 random trials with 3 groupings flagged per example
run_x_trials(10, 3)

0.1978 \& 0.5991
0.1984 \& 0.6031
0.5143 \& 0.5991
0.5607182500816192
0.0020894547828925886
0.3565656463027673


In [49]:
def per_class_prec_rec_top_n(robo_array, occlusion_array, n, branch_ind):
    """ Print and return precision and recall of top-`n` explanations for one branch.

    Both tables are first reduced with `post_process_flags`. For every example,
    the `n` branches with the largest explanation scores are flagged with 1 and
    all others with 0. The flags of branch `branch_ind` are then scored against
    the Robovetter flags of the same branch with `precision_score` and
    `recall_score` (default averaging, i.e. a single-branch binary comparison).

    Args:
        robo_array: array of shape (n_examples, n_branches), Robovetter flags.
        occlusion_array: array of shape (n_examples, n_branches), explanation scores.
        n: int >= 0, number of top-scoring branches flagged per example.
            With n == 0 no branch is flagged.
        branch_ind: int, index (after post-processing) of the branch to evaluate.

    Returns:
        tuple (precision, recall) for the selected branch.

    Raises:
        ValueError: if `n` is negative.
    """

    if n < 0:
        raise ValueError('n must be non-negative, got ' + str(n))

    robo, occlusion = post_process_flags(robo_array, occlusion_array)

    flags = np.zeros_like(occlusion)
    if n > 0:
        # Guarded because `[-0:]` would select every branch instead of none.
        for row_scores, row_flags in zip(occlusion, flags):
            # `row_flags` is a view into `flags`, so this writes in place
            row_flags[np.argsort(row_scores)[-n:]] = 1

    robo_branch = robo[:, branch_ind]
    flagged_branch = flags[:, branch_ind]
    branch_prec = precision_score(robo_branch, flagged_branch)
    branch_rec = recall_score(robo_branch, flagged_branch)

    # ' \\& ' prints a literal backslash-ampersand separator
    print(str(np.round(branch_prec, 4)) + ' \\& ' + str(np.round(branch_rec, 4)))

    return branch_prec, branch_rec