import warnings

import numpy as np


class SubmodularPick(object):
    """Class for submodular pick

    Saves a representative sample of explanation objects using SP-LIME,
    as well as saving all generated explanations.

    First, a collection of candidate explanations is generated
    (see explain_instance). From these candidates, num_exps_desired are
    chosen using submodular pick (see Ribeiro et al., "'Why Should I Trust
    You?': Explaining the Predictions of Any Classifier", KDD 2016).
    """
    def __init__(self,
                 explainer,
                 data,
                 predict_fn,
                 method='sample',
                 sample_size=1000,
                 num_exps_desired=5,
                 num_features=10,
                 **kwargs):
"""
Args:
data: a numpy array where each row is a single input into predict_fn
predict_fn: prediction function. For classifiers, this should be a
function that takes a numpy array and outputs prediction
probabilities. For regressors, this takes a numpy array and
returns the predictions. For ScikitClassifiers, this is
`classifier.predict_proba()`. For ScikitRegressors, this
is `regressor.predict()`. The prediction function needs to work
on multiple feature vectors (the vectors randomly perturbed
from the data_row).
method: The method to use to generate candidate explanations
method == 'sample' will sample the data uniformly at
random. The sample size is given by sample_size. Otherwise
if method == 'full' then explanations will be generated for the
entire data. l
sample_size: The number of instances to explain if method == 'sample'
num_exps_desired: The number of explanation objects returned
num_features: maximum number of features present in explanation
Sets value:
sp_explanations: A list of explanation objects that has a high coverage
explanations: All the candidate explanations saved for potential future use.
"""
        # Parse args
        top_labels = kwargs.pop('top_labels', 1)
        if method == 'sample':
            if sample_size > len(data):
                warnings.warn("""Requested sample size larger than
                              size of input data. Using all data""")
                sample_size = len(data)
            all_indices = np.arange(len(data))
            np.random.shuffle(all_indices)
            sample_indices = all_indices[:sample_size]
        elif method == 'full':
            sample_indices = np.arange(len(data))
        else:
            raise ValueError('Method must be \'sample\' or \'full\'')
        # Generate candidate explanations for each selected instance
        self.explanations = []
        for i in sample_indices:
            self.explanations.append(
                explainer.explain_instance(
                    data[i], predict_fn, num_features=num_features,
                    top_labels=top_labels,
                    **kwargs))
        # Validate num_exps_desired (returning a value from __init__ is an
        # error, so raise instead; int() raises ValueError on bad strings)
        try:
            num_exps_desired = int(num_exps_desired)
        except (TypeError, ValueError):
            raise TypeError("Requested number of explanations should be an integer")
        if num_exps_desired > len(self.explanations):
            warnings.warn("""Requested number of explanations larger than
                          total number of explanations, returning all
                          explanations instead.""")
        num_exps_desired = min(num_exps_desired, len(self.explanations))
        # Find all the explanation model features used. Defines the dimension d'
        features_dict = {}
        feature_iter = 0
        for exp in self.explanations:
            # Regression explanations carry a single implicit label
            labels = exp.available_labels() if exp.mode == 'classification' else [1]
            for label in labels:
                for feature, _ in exp.as_list(label=label):
                    if feature not in features_dict:
                        features_dict[feature] = feature_iter
                        feature_iter += 1
        d_prime = len(features_dict)
        # Create the n x d' dimensional 'explanation matrix', W
        W = np.zeros((len(self.explanations), d_prime))
        for i, exp in enumerate(self.explanations):
            labels = exp.available_labels() if exp.mode == 'classification' else [1]
            for label in labels:
                for feature, value in exp.as_list(label=label):
                    W[i, features_dict[feature]] += value
        # Create the global importance vector described in the paper:
        # I_j = sqrt(sum_i |W_ij|)
        importance = np.sum(abs(W), axis=0)**.5
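        # The greedy step below maximizes the coverage function from the
        # paper: c(V, W, I) = sum_j 1[exists i in V with |W_ij| > 0] * I_j,
        # i.e. the total importance of all features touched by the picked
        # set V of explanations.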
        # Now run the SP-LIME greedy algorithm
        remaining_indices = set(range(len(self.explanations)))
        V = []
        for _ in range(num_exps_desired):
            best = 0
            best_ind = None
            for i in remaining_indices:
                current = np.dot(
                        (np.sum(abs(W)[V + [i]], axis=0) > 0), importance
                        )  # coverage function
                if current >= best:
                    best = current
                    best_ind = i
            V.append(best_ind)
            remaining_indices -= {best_ind}
        self.sp_explanations = [self.explanations[i] for i in V]
        self.V = V
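

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module's API): drives
# SubmodularPick with lime's LimeTabularExplainer on a scikit-learn dataset.
# The dataset, model, and parameter values below are assumptions chosen for
# the example, not requirements of this class.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier

    from lime.lime_tabular import LimeTabularExplainer

    iris = load_iris()
    model = RandomForestClassifier(n_estimators=100).fit(iris.data, iris.target)
    explainer = LimeTabularExplainer(
        iris.data,
        feature_names=iris.feature_names,
        class_names=iris.target_names,
        mode='classification')
    # Explain 20 randomly sampled rows, then pick the 3 most representative
    sp = SubmodularPick(
        explainer, iris.data, model.predict_proba,
        method='sample', sample_size=20,
        num_exps_desired=3, num_features=4)
    for exp in sp.sp_explanations:
        # Print the (feature, weight) pairs for the explanation's top label
        print(exp.as_list(label=exp.available_labels()[0]))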