# classifier_bs_funcs.py
import warnings
import numpy as np
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import assert_all_finite
from sklearn.utils import check_array
from sklearn.utils import check_consistent_length
from sklearn.utils import column_or_1d
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import _num_samples
from sklearn.utils.sparsefuncs import count_nonzero
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics._classification import _check_targets, _check_zero_division, _weighted_sum, precision_recall_fscore_support
import joblib
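#Note: _check_targets, _check_zero_division and _weighted_sum are private sklearn
#helpers (sklearn.metrics._classification); being private, they can move or
#disappear between sklearn releases, so this module is tied to the sklearn
#version it was written against.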
###The real MVP###
def gen_bs_idx(y_true, n_shuffles = 10000, random_state = None):
    """
    Generate an array of bootstrapped indices for computing classification metrics.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    n_shuffles : int (default = 10,000)
        Number of bootstrapped samples to use for generation of the distribution estimate.
    random_state : int or None (default = None)
        Seed for reproducibility.

    Returns
    -------
    idx_bs : 2d array-like, shape = [len(y_true), n_shuffles]
        Array of indices to be included for classifier metric calculation, one
        bootstrap resample per column.
    """
    rs = np.random.RandomState(random_state)
    idx_bs = rs.choice(np.arange(0, len(y_true)), [len(y_true), n_shuffles], replace = True)
    return idx_bs
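#Illustrative usage sketch (not part of the original module): draws bootstrap
#index columns for a made-up label vector and takes one resample, just to show
#the shapes involved.
def _example_gen_bs_idx():
    y = np.array([0, 1, 1, 0, 1])
    idx = gen_bs_idx(y, n_shuffles = 100, random_state = 0)
    #One resample per column: idx.shape == (len(y), n_shuffles) == (5, 100)
    return y[idx[:, 0]]  #the first bootstrap resample of y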
###Everyone else###
def metrics_bs(metric, y_true, y_pred, ci = 95, return_raw = False, n_shuffles = 10000, n_jobs = 1, random_state = None, verbose = 0):
    """
    Generate a bootstrap estimate of a classification metric.

    Parameters
    ----------
    metric : sklearn classification metric
        Metric to bootstrap.
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Predicted values to compare against y_true.
    ci : int (default = 95)
        Confidence interval to calculate (two tailed).
    return_raw : bool (default = False)
        If True, also return the raw bootstrapped metric values.
    n_shuffles : int (default = 10,000)
        Number of bootstrapped samples to use for generation of the distribution estimate.
    n_jobs : int (default = 1)
        Number of cores to use to produce bootstrapped samples.
    random_state : int or None (default = None)
        Seed for reproducibility.
    verbose : int (default = 0)
        Verbosity level passed to joblib.Parallel.

    Returns
    -------
    report : string
        Formatted summary of the observed metric and its confidence interval.
    results : 1d array-like, shape = [n_shuffles]
        Bootstrapped metric values (only returned when return_raw is True).
    """
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    #e.g. "<function accuracy_score at 0x...>" -> "accuracy_score"
    metric_name = str(metric).split(' ')[1]
    bs_idx = gen_bs_idx(y_true, n_shuffles, random_state)
    obs = metric(y_true, y_pred)
    #Recompute the metric on each bootstrap resample (one column of bs_idx per draw)
    results = np.array(joblib.Parallel(n_jobs = n_jobs, verbose = verbose)(joblib.delayed(metric)(y_true[n], y_pred[n]) for n in bs_idx.T))
    #Calc two-tailed CI, e.g. ci = 95 -> 2.5th and 97.5th percentiles
    l = (100 - ci) / 2
    u = 100 - l
    l_ci, u_ci = np.percentile(results, [l, u])
    report_fmt = '***{}: {:.{digits}f} ({:.{digits}f} - {:.{digits}f})***'
    report = report_fmt.format(metric_name, obs, l_ci, u_ci, digits = 2)
    if return_raw:
        return report, results
    else:
        return report
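#Illustrative usage sketch (not part of the original module): bootstraps a 95% CI
#for sklearn's accuracy_score on made-up labels. Any metric with the signature
#metric(y_true, y_pred) should work the same way.
def _example_metrics_bs():
    from sklearn.metrics import accuracy_score
    y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1])
    y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1])
    #Returns a formatted string, e.g. '***accuracy_score: 0.75 (lower - upper)***'
    return metrics_bs(accuracy_score, y_true, y_pred, ci = 95, n_shuffles = 1000, random_state = 0)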
def classification_report_bs(y_true, y_pred, labels=None, target_names=None,
                             sample_weight=None, digits=2, output_dict=False,
                             zero_division="warn", ci = 95, n_shuffles = 10000,
                             return_raw = True, n_jobs = 1, verbose = 0, random_state = None):
    """
    Build a text report showing the main classification metrics with
    bootstrapped confidence intervals.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.
    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.
    labels : array, shape = [n_labels]
        Optional list of label indices to include in the report.
    target_names : list of strings
        Optional display names matching the labels (same order).
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.
    digits : int
        Number of digits for formatting output floating point values.
        When ``output_dict`` is ``True``, this will be ignored and the
        returned values will not be rounded.
    output_dict : bool (default = False)
        If True, return output as dict.
    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division. If set to
        "warn", this acts as 0, but warnings are also raised.
    ci : int (default = 95)
        Confidence interval to compute (two tailed).
    n_shuffles : int (default = 10,000)
        Number of bootstrapped samples to use for generation of the distribution estimate.
    return_raw : bool (default = True)
        If True, also return the raw bootstrapped precision, recall, F1-score
        and support values.
    n_jobs : int (default = 1)
        Number of cores to use to produce bootstrapped samples.
    verbose : int (default = 0)
        Verbosity level passed to joblib.Parallel.
    random_state : int or None (default = None)
        Seed for reproducibility.

    Returns
    -------
    report : string / dict
        Text summary of the precision, recall and F1 score for each class, each
        cell reported as ``observed (lower CI - upper CI)``.
        Dictionary returned if output_dict is True. Dictionary has the
        following structure, where each value is an array of
        ``[observed, lower CI, upper CI]``::

            {'label 1': {'precision': [observed, lower CI, upper CI],
                         'recall': [...],
                         'f1-score': [...],
                         'support': [...]},
             'label 2': { ... },
             ...
            }

        The micro/macro/weighted/sample averages produced by sklearn's
        ``classification_report`` are not currently included (see the
        commented-out block below).
        See also :func:`precision_recall_fscore_support` for more details
        on averages.
        Note that in binary classification, recall of the positive class
        is also known as "sensitivity"; recall of the negative class is
        "specificity".
    report_bs : array, shape = [n_shuffles, 4, n_labels]
        Raw bootstrapped precision, recall, F1-score and support values for
        each class. Only returned when ``return_raw`` is True.
    """
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    labels_given = True
    if labels is None:
        labels = unique_labels(y_true, y_pred)
        labels_given = False
    else:
        labels = np.asarray(labels)
    # labelled micro average
    micro_is_accuracy = ((y_type == 'multiclass' or y_type == 'binary') and
                         (not labels_given or
                          (set(labels) == set(unique_labels(y_true, y_pred)))))
    if target_names is not None and len(labels) != len(target_names):
        if labels_given:
            warnings.warn(
                "labels size, {0}, does not match size of target_names, {1}"
                .format(len(labels), len(target_names))
            )
        else:
            raise ValueError(
                "Number of classes, {0}, does not match size of "
                "target_names, {1}. Try specifying the labels "
                "parameter".format(len(labels), len(target_names))
            )
    if target_names is None:
        target_names = ['%s' % l for l in labels]
    headers = ["precision", "recall", "f1-score", "support"]
    # compute per-class results without averaging
    #Compute observed values:
    p, r, f1, s = precision_recall_fscore_support(y_true, y_pred,
                                                  labels=labels,
                                                  average=None,
                                                  sample_weight=sample_weight,
                                                  zero_division=zero_division)
    #Generate bootstrapped values: one precision_recall_fscore_support call per
    #bootstrap resample; sample_weight, when given, is resampled with the same indices
    bs_idx = gen_bs_idx(y_true, n_shuffles, random_state)
    report_bs = np.array(joblib.Parallel(n_jobs = n_jobs, verbose = verbose)(
        joblib.delayed(precision_recall_fscore_support)(
            y_true[n], y_pred[n], labels=labels, average=None,
            sample_weight=None if sample_weight is None else np.asarray(sample_weight)[n],
            zero_division=zero_division)
        for n in bs_idx.T))
    #Extract bootstrapped values into separate arrays, each of shape [n_shuffles, n_labels]
    p_bs = report_bs[:, 0, :]
    r_bs = report_bs[:, 1, :]
    f1_bs = report_bs[:, 2, :]
    s_bs = report_bs[:, 3, :]
    #Calc two-tailed CI percentiles, e.g. ci = 95 -> 2.5 and 97.5
    l = (100 - ci) / 2
    u = 100 - l
    #Join observed values and their CIs: one [observed, lower, upper] triple per class
    p_ci = np.percentile(p_bs, [l, u], axis = 0)
    p_stack = [np.hstack([p[n], np.ravel(p_ci[:, n])]) for n in range(0, len(p))]
    r_ci = np.percentile(r_bs, [l, u], axis = 0)
    r_stack = [np.hstack([r[n], np.ravel(r_ci[:, n])]) for n in range(0, len(r))]
    f1_ci = np.percentile(f1_bs, [l, u], axis = 0)
    f1_stack = [np.hstack([f1[n], np.ravel(f1_ci[:, n])]) for n in range(0, len(f1))]
    s_ci = np.percentile(s_bs, [l, u], axis = 0)
    s_stack = [np.hstack([s[n], np.ravel(s_ci[:, n])]) for n in range(0, len(s))]
    rows = zip(target_names, p_stack, r_stack, f1_stack, s_stack)
    if y_type.startswith('multilabel'):
        average_options = ('micro', 'macro', 'weighted', 'samples')
    else:
        average_options = ('micro', 'macro', 'weighted')
    if output_dict:
        report_dict = {label[0]: label[1:] for label in rows}
        for label, scores in report_dict.items():
            report_dict[label] = dict(zip(headers,
                                          [i for i in scores]))
    else:
        longest_last_line_heading = 'weighted avg'
        name_width = max(len(cn) for cn in target_names)
        width = max(name_width, len(longest_last_line_heading), digits)
        head_fmt = ' {:>{width}s} ' + ' {:<20}' * len(headers)
        report = head_fmt.format('', *headers, width=width)
        report += '\n\n'
        #Each cell is rendered as "observed (lower CI - upper CI)"
        row_fmt = '{:<{width}s} ' + '{:^1.{digits}f} ({:^1.{digits}f} - {:^1.{digits}f}) ' * 4 + '\n'
        for row in rows:
            in_row = []
            in_row.append(row[0])
            #Flatten each [observed, lower, upper] triple into the format arguments
            for r in row[1:]:
                if len(r) < 2:
                    in_row.append(r)
                else:
                    for i in r:
                        in_row.append(i)
            report += row_fmt.format(*in_row, width=width, digits=digits)
        report += '\n'
    # ###IGNORE AVG ACCURACY FOR THE TIME BEING###
    # # compute all applicable averages
    # for average in average_options:
    #     if average.startswith('micro') and micro_is_accuracy:
    #         line_heading = 'accuracy'
    #     else:
    #         line_heading = average + ' avg'
    #     # compute averages with specified averaging method
    #     avg_p, avg_r, avg_f1, _ = precision_recall_fscore_support(
    #         y_true, y_pred, labels=labels,
    #         average=average, sample_weight=sample_weight,
    #         zero_division=zero_division)
    #     avg = [avg_p, avg_r, avg_f1, np.sum(s)]
    #     if output_dict:
    #         report_dict[line_heading] = dict(
    #             zip(headers, [i.item() for i in avg]))
    #     else:
    #         if line_heading == 'accuracy':
    #             row_fmt_accuracy = '{:>{width}s} ' + \
    #                 ' {:>9.{digits}}' * 2 + ' {:>9.{digits}f}' + \
    #                 ' {:>9}\n'
    #             report += row_fmt_accuracy.format(line_heading, '', '',
    #                                               *avg[2:], width=width,
    #                                               digits=digits)
    #         else:
    #             report += row_fmt.format(line_heading, *avg,
    #                                      width=width, digits=digits)
    if output_dict:
        if 'accuracy' in report_dict.keys():
            report_dict['accuracy'] = report_dict['accuracy']['precision']
        if return_raw:
            return report_dict, report_bs
        return report_dict
    if return_raw:
        #Also return the raw bootstrapped (precision, recall, f1, support) arrays
        return report, report_bs
    return report
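#Illustrative usage sketch (not part of the original module): builds the
#bootstrapped classification report for a toy binary problem. The labels and
#target names are made up purely for demonstration.
def _example_classification_report_bs():
    y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 0])
    y_pred = np.array([0, 1, 0, 0, 1, 1, 1, 1, 0, 0])
    report = classification_report_bs(y_true, y_pred,
                                      target_names = ['neg', 'pos'],
                                      n_shuffles = 1000,
                                      return_raw = False,
                                      random_state = 0)
    print(report)  #per-class precision/recall/F1/support, each with its 95% CI
    return report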