forked from scikit-learn-contrib/hdbscan
/
_hdbscan_tree.pyx
528 lines (409 loc) · 16.8 KB
/
_hdbscan_tree.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
#cython: boundscheck=False, nonecheck=False, initializedcheck=False
# Tree handling (condensing, finding stable clusters) for hdbscan
# Authors: Leland McInnes
# License: 3-clause BSD
import numpy as np
cimport numpy as np
cdef np.double_t INFTY = np.inf
cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, np.intp_t bfs_root):
    """
    Return every node reachable from ``bfs_root``, in breadth-first order.

    ``hierarchy`` is a scipy ``linkage``-format tree: row ``i`` describes
    the merge that creates internal node ``i + num_points``.
    """
    cdef list frontier
    cdef list visited
    cdef np.intp_t max_node
    cdef np.intp_t num_points
    cdef np.intp_t dim

    dim = hierarchy.shape[0]
    max_node = 2 * dim
    num_points = max_node - dim + 1

    visited = []
    frontier = [bfs_root]

    while frontier:
        visited.extend(frontier)
        # Internal nodes (id >= num_points) expand into their two children
        # via their hierarchy row; leaf nodes simply drop out of the frontier.
        internal = [node - num_points for node in frontier if node >= num_points]
        if internal:
            frontier = hierarchy[internal, :2].flatten().astype(np.intp).tolist()
        else:
            frontier = []

    return visited
cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
                               np.intp_t min_cluster_size=10):
    """
    Condense a scipy linkage-format ``hierarchy`` into a simpler tree that
    only keeps track of clusters with at least ``min_cluster_size`` points.

    Returns a structured array with fields ``parent``, ``child``,
    ``lambda_val`` (1 / merge distance) and ``child_size``.  When both
    sides of a split are large enough, two fresh cluster labels are
    created; when a side is too small, its points are recorded
    individually (size 1) as "falling out" of the surviving cluster.
    """
    cdef np.intp_t root
    cdef np.intp_t num_points
    cdef np.intp_t next_label
    cdef list node_list
    cdef list result_list
    cdef np.ndarray[np.intp_t, ndim=1] relabel
    cdef np.ndarray[np.intp_t, ndim=1] ignore
    cdef np.ndarray[np.double_t, ndim=1] children
    cdef np.intp_t node
    cdef np.intp_t sub_node
    cdef np.intp_t left
    cdef np.intp_t right
    cdef double lambda_value
    cdef np.intp_t left_count
    cdef np.intp_t right_count

    root = 2 * hierarchy.shape[0]
    num_points = root // 2 + 1
    next_label = num_points + 1

    # Top-down traversal: every parent is relabelled before its children
    # are visited.
    node_list = bfs_from_hierarchy(hierarchy, root)

    relabel = np.empty(root + 1, dtype=np.intp)
    relabel[root] = num_points
    result_list = []
    # BUG FIX: dtype=np.int was a deprecated alias for the builtin int,
    # removed in NumPy 1.24 (raises AttributeError).  np.intp also matches
    # the buffer declaration above.
    ignore = np.zeros(len(node_list), dtype=np.intp)

    for node in node_list:
        if ignore[node] or node < num_points:
            continue

        children = hierarchy[node - num_points]
        left = <np.intp_t> children[0]
        right = <np.intp_t> children[1]
        # lambda is the inverse merge distance; a zero distance maps to
        # an infinite lambda.
        if children[2] > 0.0:
            lambda_value = 1.0 / children[2]
        else:
            lambda_value = INFTY

        if left >= num_points:
            left_count = <np.intp_t> hierarchy[left - num_points][3]
        else:
            left_count = 1

        if right >= num_points:
            right_count = <np.intp_t> hierarchy[right - num_points][3]
        else:
            right_count = 1

        if left_count >= min_cluster_size and right_count >= min_cluster_size:
            # True split: both sides become new clusters.
            relabel[left] = next_label
            next_label += 1
            result_list.append((relabel[node], relabel[left], lambda_value, left_count))

            relabel[right] = next_label
            next_label += 1
            result_list.append((relabel[node], relabel[right], lambda_value, right_count))

        elif left_count < min_cluster_size and right_count < min_cluster_size:
            # Both sides too small: all of their points fall out of the
            # parent cluster at this lambda.
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

        elif left_count < min_cluster_size:
            # Left side too small: the parent cluster persists through the
            # right child; left-side points fall out.
            relabel[right] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

        else:
            # Right side too small: symmetric to the previous case.
            relabel[left] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

    return np.array(result_list, dtype=[
        ('parent', np.intp),
        ('child', np.intp),
        ('lambda_val', float),
        ('child_size', np.intp)
    ])
cpdef dict compute_stability(np.ndarray condensed_tree):
    """
    Compute the stability of every cluster in a condensed tree.

    A cluster's stability is the sum over its rows of
    ``(lambda_val - birth_lambda) * child_size``, where ``birth_lambda``
    is the smallest lambda at which the cluster appears as a child.
    Returns a dict mapping cluster id -> stability.
    """
    cdef np.ndarray[np.double_t, ndim=1] result_arr
    cdef np.ndarray sorted_child_data
    cdef np.ndarray[np.intp_t, ndim=1] sorted_children
    cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas
    cdef np.ndarray[np.intp_t, ndim=1] parents
    cdef np.ndarray[np.intp_t, ndim=1] sizes
    cdef np.ndarray[np.double_t, ndim=1] lambdas
    cdef np.intp_t child
    cdef np.intp_t parent
    cdef np.intp_t child_size
    cdef np.intp_t result_index
    cdef np.intp_t current_child
    cdef np.float64_t lambda_
    cdef np.float64_t min_lambda
    cdef np.ndarray[np.double_t, ndim=1] births_arr
    cdef np.double_t *births

    cdef np.intp_t largest_child = condensed_tree['child'].max()
    cdef np.intp_t smallest_cluster = condensed_tree['parent'].min()
    cdef np.intp_t num_clusters = condensed_tree['parent'].max() - smallest_cluster + 1

    # Sort by child id so each child's rows are contiguous.
    sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0)
    births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double)
    births = (<np.double_t *> births_arr.data)
    sorted_children = sorted_child_data['child']
    sorted_lambdas = sorted_child_data['lambda_val']

    parents = condensed_tree['parent']
    sizes = condensed_tree['child_size']
    lambdas = condensed_tree['lambda_val']

    # Birth lambda of a child is the minimum lambda over its rows.
    current_child = -1
    min_lambda = 0
    for row in range(sorted_child_data.shape[0]):
        child = <np.intp_t> sorted_children[row]
        lambda_ = sorted_lambdas[row]

        if child == current_child:
            min_lambda = min(min_lambda, lambda_)
        elif current_child != -1:
            births[current_child] = min_lambda
            current_child = child
            min_lambda = lambda_
        else:
            # Initialize
            current_child = child
            min_lambda = lambda_

    # BUG FIX: flush the final child group.  The original loop never
    # recorded the last child's birth, leaving it NaN (births is
    # NaN-initialized) and propagating NaN into that cluster's stability.
    if current_child != -1:
        births[current_child] = min_lambda

    result_arr = np.zeros(num_clusters, dtype=np.double)

    for i in range(condensed_tree.shape[0]):
        parent = parents[i]
        lambda_ = lambdas[i]
        child_size = sizes[i]
        result_index = parent - smallest_cluster

        result_arr[result_index] += (lambda_ - births[parent]) * child_size

    return dict(np.vstack((np.arange(smallest_cluster,
                                     condensed_tree['parent'].max() + 1),
                           result_arr)).T)
cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
    """
    Breadth-first traversal of a condensed cluster tree; returns every
    node reachable from ``bfs_root`` (including ``bfs_root`` itself).
    """
    cdef list visited
    cdef np.ndarray[np.intp_t, ndim=1] frontier

    visited = []
    frontier = np.array([bfs_root], dtype=np.intp)

    while frontier.shape[0] > 0:
        visited.extend(frontier.tolist())
        # Next frontier: children of every node currently in the frontier.
        frontier = tree['child'][np.in1d(tree['parent'], frontier)]

    return visited
cdef max_lambdas(np.ndarray tree):
    """
    Return an array mapping each parent id to the largest lambda value
    among its rows (the cluster's "death" lambda).  Parents that never
    appear keep a death of 0.
    """
    cdef np.ndarray sorted_parent_data
    cdef np.ndarray[np.intp_t, ndim=1] sorted_parents
    cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas
    cdef np.intp_t parent
    cdef np.intp_t current_parent
    cdef np.float64_t lambda_
    cdef np.float64_t max_lambda
    cdef np.ndarray[np.double_t, ndim=1] deaths_arr
    cdef np.double_t *deaths

    cdef np.intp_t largest_parent = tree['parent'].max()

    # Sort by parent id so each parent's rows are contiguous.
    sorted_parent_data = np.sort(tree[['parent', 'lambda_val']], axis=0)
    deaths_arr = np.zeros(largest_parent + 1, dtype=np.double)
    deaths = (<np.double_t *> deaths_arr.data)
    sorted_parents = sorted_parent_data['parent']
    sorted_lambdas = sorted_parent_data['lambda_val']

    current_parent = -1
    max_lambda = 0
    for row in range(sorted_parent_data.shape[0]):
        parent = <np.intp_t> sorted_parents[row]
        lambda_ = sorted_lambdas[row]

        if parent == current_parent:
            max_lambda = max(max_lambda, lambda_)
        elif current_parent != -1:
            deaths[current_parent] = max_lambda
            current_parent = parent
            max_lambda = lambda_
        else:
            # Initialize
            current_parent = parent
            max_lambda = lambda_

    # BUG FIX: flush the final parent group.  The original loop never
    # recorded the last parent's death, leaving it at 0.
    if current_parent != -1:
        deaths[current_parent] = max_lambda

    return deaths_arr
cdef class TreeUnionFind (object):
    """
    Union-find over node ids with union-by-rank and path compression.

    Column 0 of ``_data`` holds each element's parent pointer, column 1
    its rank.  ``is_component`` tracks which elements are still roots of
    their own component (see ``components``).
    """

    cdef np.ndarray _data_arr
    cdef np.intp_t[:,::1] _data
    cdef np.ndarray is_component

    def __init__(self, size):
        self._data_arr = np.zeros((size, 2), dtype=np.intp)
        # Every element starts as its own parent (rank 0).
        self._data_arr.T[0] = np.arange(size)
        self._data = (<np.intp_t[:size, :2:1]> (<np.intp_t *> self._data_arr.data))
        # BUG FIX: dtype=np.bool was a deprecated alias for the builtin
        # bool, removed in NumPy 1.24 (raises AttributeError); np.bool_
        # is the supported scalar type.
        self.is_component = np.ones(size, dtype=np.bool_)

    cdef union_(self, np.intp_t x, np.intp_t y):
        """Merge the components containing x and y (union by rank)."""
        cdef np.intp_t x_root = self.find(x)
        cdef np.intp_t y_root = self.find(y)

        if self._data[x_root, 1] < self._data[y_root, 1]:
            self._data[x_root, 0] = y_root
        elif self._data[x_root, 1] > self._data[y_root, 1]:
            self._data[y_root, 0] = x_root
        else:
            # Equal ranks: x_root absorbs y_root and its rank grows.
            self._data[y_root, 0] = x_root
            self._data[x_root, 1] += 1
        return

    cdef find(self, np.intp_t x):
        """Return the root of x's component, compressing the path."""
        if self._data[x, 0] != x:
            self._data[x, 0] = self.find(self._data[x, 0])
            # x now points at a different root, so it is no longer a
            # component representative.
            self.is_component[x] = False
        return self._data[x, 0]

    cdef np.ndarray[np.intp_t, ndim=1] components(self):
        """Ids of elements that are still component roots."""
        return self.is_component.nonzero()[0]
cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
                                                     np.double_t cut,
                                                     np.intp_t min_cluster_size):
    """
    Flat clustering of a scipy linkage tree cut at distance ``cut``.

    Every pair merged at a distance below ``cut`` ends up in the same
    cluster; clusters with fewer than ``min_cluster_size`` points are
    labelled -1 (noise).  Returns one label per original point.
    """
    cdef np.intp_t root
    cdef np.intp_t num_points
    cdef np.ndarray[np.intp_t, ndim=1] result_arr
    cdef np.ndarray[np.intp_t, ndim=1] unique_labels
    cdef np.ndarray[np.intp_t, ndim=1] cluster_size
    cdef np.intp_t *result
    cdef TreeUnionFind union_find
    cdef np.intp_t n
    cdef np.intp_t cluster
    cdef np.intp_t cluster_id

    root = 2 * linkage.shape[0]
    num_points = root // 2 + 1

    result_arr = np.empty(num_points, dtype=np.intp)
    # Raw pointer into result_arr for fast element writes.
    result = (<np.intp_t *> result_arr.data)

    # One union-find slot per node of the full tree (points + merges).
    union_find = TreeUnionFind(<np.intp_t> root + 1)

    # Each linkage row i creates internal node num_points + i; only apply
    # merges that happen below the cut distance (row[2]).
    cluster = num_points
    for row in linkage:
        if row[2] < cut:
            union_find.union_(<np.intp_t> row[0], cluster)
            union_find.union_(<np.intp_t> row[1], cluster)
        cluster += 1

    # Provisional label of each point is its component root; also count
    # component sizes (roots of merged components are always < num_points
    # here only for singleton points — NOTE(review): sizes are indexed by
    # root id and the array has num_points slots; assumes every root id
    # of a point's component is < num_points — verify against union_ order.
    cluster_size = np.zeros(num_points, dtype=np.intp)
    for n in range(num_points):
        cluster = union_find.find(n)
        cluster_size[cluster] += 1
        result[n] = cluster

    # Re-map surviving roots to compact labels 0..k-1; undersized
    # components become noise (-1).  The -1:-1 entry is a sentinel so a
    # pre-existing -1 passes through unchanged.
    cluster_label_map = {-1:-1}
    cluster_label = 0
    unique_labels = np.unique(result_arr)

    for cluster in unique_labels:
        if cluster_size[cluster] < min_cluster_size:
            cluster_label_map[cluster] = -1
        else:
            cluster_label_map[cluster] = cluster_label
            cluster_label += 1

    for n in range(num_points):
        result[n] = cluster_label_map[result[n]]

    return result_arr
cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree, set clusters, dict cluster_label_map):
    """
    Assign a label to every point of a condensed tree.

    Points whose union-find component is rooted at a cluster in
    ``clusters`` receive that cluster's mapped label from
    ``cluster_label_map``; all other points are labelled -1 (noise).
    """
    cdef np.intp_t root_cluster
    cdef np.ndarray[np.intp_t, ndim=1] result_arr
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.intp_t *result
    cdef TreeUnionFind union_find
    cdef np.intp_t parent
    cdef np.intp_t child
    cdef np.intp_t n
    cdef np.intp_t cluster

    child_array = tree['child']
    parent_array = tree['parent']

    # The smallest parent id is the root cluster; ids below it are points
    # (condense_tree labels the root with num_points).
    root_cluster = parent_array.min()
    result_arr = np.empty(root_cluster, dtype=np.intp)
    # Raw pointer into result_arr for fast element writes.
    result = (<np.intp_t *> result_arr.data)

    union_find = TreeUnionFind(parent_array.max() + 1)

    # Union every node into its parent EXCEPT selected clusters, so each
    # selected cluster stays the root of its own component.
    for n in range(tree.shape[0]):
        child = child_array[n]
        parent = parent_array[n]
        if child not in clusters:
            union_find.union_(parent, child)

    for n in range(root_cluster):
        cluster = union_find.find(n)
        if cluster <= root_cluster:
            # Component collapsed all the way to the root (or to a point
            # id): the point belongs to no selected cluster.
            result[n] = -1
        else:
            result[n] = cluster_label_map[cluster]

    return result_arr
cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
    """
    Compute the membership strength of each point in its assigned cluster.

    Strength is the point's lambda value divided by the cluster's maximum
    ("death") lambda, clamped to at most 1.  Noise points (label -1) keep
    a probability of 0.  ``cluster_map`` maps label number -> cluster id
    in the condensed tree.
    """
    cdef np.ndarray[np.double_t, ndim=1] result
    cdef np.ndarray[np.double_t, ndim=1] deaths
    cdef np.ndarray[np.double_t, ndim=1] lambda_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.intp_t root_cluster
    cdef np.intp_t n
    cdef np.intp_t point
    cdef np.intp_t cluster_num
    cdef np.intp_t cluster
    cdef np.double_t max_lambda
    cdef np.double_t lambda_

    child_array = tree['child']
    parent_array = tree['parent']
    lambda_array = tree['lambda_val']

    result = np.zeros(labels.shape[0])
    deaths = max_lambdas(tree)
    # Ids below the smallest parent are points; at or above are clusters.
    root_cluster = parent_array.min()

    for n in range(tree.shape[0]):
        point = child_array[n]
        # Skip rows whose child is a cluster rather than a point.
        if point >= root_cluster:
            continue

        cluster_num = labels[point]
        if cluster_num == -1:
            continue

        cluster = cluster_map[cluster_num]
        max_lambda = deaths[cluster]
        if max_lambda == 0.0:
            # Degenerate cluster death: treat membership as certain
            # rather than dividing by zero.
            result[point] = 1.0
        else:
            # Clamp so points with lambda past the cluster death get 1.0.
            lambda_ = min(lambda_array[n], max_lambda)
            result[point] = lambda_ / max_lambda

    return result
cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
    """
    Compute a per-point outlier score from a condensed tree.

    The score is ``(lambda_max - lambda_point) / lambda_max`` where
    ``lambda_max`` is the largest death lambda among the point's cluster
    and its descendant clusters — 0 means the point persists as long as
    its cluster (inlier), values near 1 mean it drops out early (outlier).
    """
    cdef np.ndarray[np.double_t, ndim=1] result
    cdef np.ndarray[np.double_t, ndim=1] deaths
    cdef np.ndarray[np.double_t, ndim=1] lambda_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.intp_t root_cluster
    cdef np.intp_t point
    cdef np.intp_t parent
    cdef np.intp_t cluster
    cdef np.double_t lambda_max

    child_array = tree['child']
    parent_array = tree['parent']
    lambda_array = tree['lambda_val']

    deaths = max_lambdas(tree)
    # Ids below the smallest parent are points; at or above are clusters.
    root_cluster = parent_array.min()
    result = np.zeros(root_cluster, dtype=np.double)

    # Propagate deaths upward: a parent's effective death is the max over
    # its subtree, visited in parent-sorted order.
    topological_sort_order = np.argsort(parent_array)
    #topologically_sorted_tree = tree[topological_sort_order]
    for n in topological_sort_order:
        cluster = child_array[n]
        # NOTE(review): assumes point-children only appear once all
        # cluster-children rows have been seen — TODO confirm row order.
        if cluster < root_cluster:
            break

        parent = parent_array[n]
        if deaths[cluster] > deaths[parent]:
            deaths[parent] = deaths[cluster]

    for n in range(tree.shape[0]):
        point = child_array[n]
        if point >= root_cluster:
            continue

        cluster = parent_array[n]
        lambda_max = deaths[cluster]
        # BUG FIX: guard against a zero death lambda, which previously
        # produced NaN/inf scores via division by zero; a cluster that
        # never "dies" contributes no outlierness, so score 0 (inlier),
        # consistent with get_probabilities' max_lambda == 0 handling.
        if lambda_max == 0.0:
            result[point] = 0.0
        else:
            result[point] = (lambda_max - lambda_array[n]) / lambda_max

    return result
cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters, dict stability, np.double_t max_lambda):
    """
    Normalize each selected cluster's stability to a score in [0, 1].

    ``clusters`` must be the same set object (hence the same iteration
    order) used to build the label mapping, so label ``n`` corresponds to
    the n-th cluster enumerated here.  ``max_lambda`` is the largest
    lambda in the whole tree.
    """
    cdef np.intp_t cluster_size

    result = np.empty(len(clusters), dtype=np.double)
    for n, c in enumerate(clusters):
        cluster_size = np.sum(labels == n)
        # BUG FIX: guard degenerate cases that previously divided by zero
        # (empty cluster or max_lambda == 0) or collapsed every score to
        # 0 (infinite max_lambda, produced by condense_tree for
        # zero-distance merges); report full confidence instead.
        if np.isinf(max_lambda) or max_lambda == 0.0 or cluster_size == 0:
            result[n] = 1.0
        else:
            result[n] = stability[c] / (cluster_size * max_lambda)

    return result
cpdef tuple get_clusters(np.ndarray tree, dict stability):
    """
    Select the flat clustering that maximizes total stability (excess of
    mass) and derive labels, membership probabilities and per-cluster
    stability scores.

    The tree is assumed to have numeric node ids such that a reverse
    numeric sort is equivalent to a topological sort (leaves first).

    Returns a tuple ``(labels, probabilities, stability_scores)``.
    NOTE: ``stability`` is updated in place while propagating subtree
    stabilities up the tree.
    """
    cdef list node_list
    cdef np.ndarray cluster_tree
    cdef np.ndarray child_selection
    cdef dict is_cluster
    # BUG FIX: was declared as C float (single precision); stabilities
    # are doubles, and truncating before the comparison below could flip
    # the select-vs-prune decision for near-equal values.
    cdef np.double_t subtree_stability
    cdef np.intp_t node
    cdef np.intp_t sub_node
    cdef np.intp_t cluster
    cdef np.intp_t num_points
    cdef np.ndarray labels
    cdef np.double_t max_lambda

    # Assume clusters are ordered by numeric id equivalent to
    # a topological sort of the tree; This is valid given the
    # current implementation above, so don't change that ... or
    # if you do, change this accordingly!
    node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root)
    # Rows with child_size > 1 connect clusters to clusters.
    cluster_tree = tree[tree['child_size'] > 1]
    is_cluster = {cluster:True for cluster in node_list}
    num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1
    max_lambda = np.max(tree['lambda_val'])

    for node in node_list:
        # Excess-of-mass rule: keep a node only if it is more stable than
        # the sum of its children; otherwise propagate the children's
        # stability upward and deselect this node.
        child_selection = (cluster_tree['parent'] == node)
        subtree_stability = np.sum([stability[child] for
                                    child in cluster_tree['child'][child_selection]])
        if subtree_stability > stability[node]:
            is_cluster[node] = False
            stability[node] = subtree_stability
        else:
            # Node selected: deselect everything beneath it.
            for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                if sub_node != node:
                    is_cluster[sub_node] = False

    clusters = set([c for c in is_cluster if is_cluster[c]])
    cluster_map = {c:n for n, c in enumerate(clusters)}
    reverse_cluster_map = {n:c for n, c in enumerate(clusters)}

    labels = do_labelling(tree, clusters, cluster_map)
    probs = get_probabilities(tree, reverse_cluster_map, labels)
    stabilities = get_stability_scores(labels, clusters, stability, max_lambda)

    return (labels, probs, stabilities)