forked from scikit-learn-contrib/hdbscan
/
_hdbscan_tree.pyx
528 lines (409 loc) · 16.8 KB
/
_hdbscan_tree.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
#cython: boundscheck=False, nonecheck=False, initializedcheck=False
# Tree handling (condensing, finding stable clusters) for hdbscan
# Authors: Leland McInnes
# License: 3-clause BSD
import numpy as np
cimport numpy as np
cdef np.double_t INFTY = np.inf
cdef list bfs_from_hierarchy(np.ndarray[np.double_t, ndim=2] hierarchy, np.intp_t bfs_root):
    """
    Return every node reachable from ``bfs_root``, in breadth-first order.

    ``hierarchy`` is a scipy ``linkage``-format tree: row ``i`` describes
    the merge that creates internal node ``i + num_points``.
    """
    cdef list frontier
    cdef list visited
    cdef np.intp_t max_node
    cdef np.intp_t num_points
    cdef np.intp_t dim

    dim = hierarchy.shape[0]
    max_node = 2 * dim
    num_points = max_node - dim + 1

    visited = []
    frontier = [bfs_root]

    while frontier:
        visited.extend(frontier)
        # Internal nodes (id >= num_points) expand into their two children
        # via their hierarchy row; leaf nodes simply drop out of the frontier.
        internal = [node - num_points for node in frontier if node >= num_points]
        if internal:
            frontier = hierarchy[internal, :2].flatten().astype(np.intp).tolist()
        else:
            frontier = []

    return visited
cpdef np.ndarray condense_tree(np.ndarray[np.double_t, ndim=2] hierarchy,
                               np.intp_t min_cluster_size=10):
    """
    Condense a scipy linkage-format ``hierarchy`` into a simpler tree that
    only keeps track of clusters with at least ``min_cluster_size`` points.

    Returns a structured array with fields ``parent``, ``child``,
    ``lambda_val`` (1 / merge distance) and ``child_size``.  When both
    sides of a split are large enough, two fresh cluster labels are
    created; when a side is too small, its points are recorded
    individually (size 1) as "falling out" of the surviving cluster.
    """
    cdef np.intp_t root
    cdef np.intp_t num_points
    cdef np.intp_t next_label
    cdef list node_list
    cdef list result_list
    cdef np.ndarray[np.intp_t, ndim=1] relabel
    cdef np.ndarray[np.intp_t, ndim=1] ignore
    cdef np.ndarray[np.double_t, ndim=1] children
    cdef np.intp_t node
    cdef np.intp_t sub_node
    cdef np.intp_t left
    cdef np.intp_t right
    cdef double lambda_value
    cdef np.intp_t left_count
    cdef np.intp_t right_count

    root = 2 * hierarchy.shape[0]
    num_points = root // 2 + 1
    next_label = num_points + 1

    # Top-down traversal: every parent is relabelled before its children
    # are visited.
    node_list = bfs_from_hierarchy(hierarchy, root)

    relabel = np.empty(root + 1, dtype=np.intp)
    relabel[root] = num_points
    result_list = []
    # BUG FIX: dtype=np.int was a deprecated alias for the builtin int,
    # removed in NumPy 1.24 (raises AttributeError).  np.intp also matches
    # the buffer declaration above.
    ignore = np.zeros(len(node_list), dtype=np.intp)

    for node in node_list:
        if ignore[node] or node < num_points:
            continue

        children = hierarchy[node - num_points]
        left = <np.intp_t> children[0]
        right = <np.intp_t> children[1]
        # lambda is the inverse merge distance; a zero distance maps to
        # an infinite lambda.
        if children[2] > 0.0:
            lambda_value = 1.0 / children[2]
        else:
            lambda_value = INFTY

        if left >= num_points:
            left_count = <np.intp_t> hierarchy[left - num_points][3]
        else:
            left_count = 1

        if right >= num_points:
            right_count = <np.intp_t> hierarchy[right - num_points][3]
        else:
            right_count = 1

        if left_count >= min_cluster_size and right_count >= min_cluster_size:
            # True split: both sides become new clusters.
            relabel[left] = next_label
            next_label += 1
            result_list.append((relabel[node], relabel[left], lambda_value, left_count))

            relabel[right] = next_label
            next_label += 1
            result_list.append((relabel[node], relabel[right], lambda_value, right_count))

        elif left_count < min_cluster_size and right_count < min_cluster_size:
            # Both sides too small: all of their points fall out of the
            # parent cluster at this lambda.
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

        elif left_count < min_cluster_size:
            # Left side too small: the parent cluster persists through the
            # right child; left-side points fall out.
            relabel[right] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, left):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

        else:
            # Right side too small: symmetric to the previous case.
            relabel[left] = relabel[node]
            for sub_node in bfs_from_hierarchy(hierarchy, right):
                if sub_node < num_points:
                    result_list.append((relabel[node], sub_node, lambda_value, 1))
                ignore[sub_node] = True

    return np.array(result_list, dtype=[
        ('parent', np.intp),
        ('child', np.intp),
        ('lambda_val', float),
        ('child_size', np.intp)
    ])
cpdef dict compute_stability(np.ndarray condensed_tree):
    """
    Compute the stability of every cluster in a condensed tree.

    A cluster's stability is the sum over its rows of
    ``(lambda_val - birth_lambda) * child_size``, where ``birth_lambda``
    is the smallest lambda at which the cluster appears as a child.
    Returns a dict mapping cluster id -> stability.
    """
    cdef np.ndarray[np.double_t, ndim=1] result_arr
    cdef np.ndarray sorted_child_data
    cdef np.ndarray[np.intp_t, ndim=1] sorted_children
    cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas
    cdef np.ndarray[np.intp_t, ndim=1] parents
    cdef np.ndarray[np.intp_t, ndim=1] sizes
    cdef np.ndarray[np.double_t, ndim=1] lambdas
    cdef np.intp_t child
    cdef np.intp_t parent
    cdef np.intp_t child_size
    cdef np.intp_t result_index
    cdef np.intp_t current_child
    cdef np.float64_t lambda_
    cdef np.float64_t min_lambda
    cdef np.ndarray[np.double_t, ndim=1] births_arr
    cdef np.double_t *births

    cdef np.intp_t largest_child = condensed_tree['child'].max()
    cdef np.intp_t smallest_cluster = condensed_tree['parent'].min()
    cdef np.intp_t num_clusters = condensed_tree['parent'].max() - smallest_cluster + 1

    # Sort by child id so each child's rows are contiguous.
    sorted_child_data = np.sort(condensed_tree[['child', 'lambda_val']], axis=0)
    births_arr = np.nan * np.ones(largest_child + 1, dtype=np.double)
    births = (<np.double_t *> births_arr.data)
    sorted_children = sorted_child_data['child']
    sorted_lambdas = sorted_child_data['lambda_val']

    parents = condensed_tree['parent']
    sizes = condensed_tree['child_size']
    lambdas = condensed_tree['lambda_val']

    # Birth lambda of a child is the minimum lambda over its rows.
    current_child = -1
    min_lambda = 0
    for row in range(sorted_child_data.shape[0]):
        child = <np.intp_t> sorted_children[row]
        lambda_ = sorted_lambdas[row]

        if child == current_child:
            min_lambda = min(min_lambda, lambda_)
        elif current_child != -1:
            births[current_child] = min_lambda
            current_child = child
            min_lambda = lambda_
        else:
            # Initialize
            current_child = child
            min_lambda = lambda_

    # BUG FIX: flush the final child group.  The original loop never
    # recorded the last child's birth, leaving it NaN (births is
    # NaN-initialized) and propagating NaN into that cluster's stability.
    if current_child != -1:
        births[current_child] = min_lambda

    result_arr = np.zeros(num_clusters, dtype=np.double)

    for i in range(condensed_tree.shape[0]):
        parent = parents[i]
        lambda_ = lambdas[i]
        child_size = sizes[i]
        result_index = parent - smallest_cluster

        result_arr[result_index] += (lambda_ - births[parent]) * child_size

    return dict(np.vstack((np.arange(smallest_cluster,
                                     condensed_tree['parent'].max() + 1),
                           result_arr)).T)
cdef list bfs_from_cluster_tree(np.ndarray tree, np.intp_t bfs_root):
    """
    Breadth-first traversal of a condensed cluster tree; returns every
    node reachable from ``bfs_root`` (including ``bfs_root`` itself).
    """
    cdef list visited
    cdef np.ndarray[np.intp_t, ndim=1] frontier

    visited = []
    frontier = np.array([bfs_root], dtype=np.intp)

    while frontier.shape[0] > 0:
        visited.extend(frontier.tolist())
        # Next frontier: children of every node currently in the frontier.
        frontier = tree['child'][np.in1d(tree['parent'], frontier)]

    return visited
cdef max_lambdas(np.ndarray tree):
    """
    Return an array mapping each parent id to the largest lambda value
    among its rows (the cluster's "death" lambda).  Parents that never
    appear keep a death of 0.
    """
    cdef np.ndarray sorted_parent_data
    cdef np.ndarray[np.intp_t, ndim=1] sorted_parents
    cdef np.ndarray[np.double_t, ndim=1] sorted_lambdas
    cdef np.intp_t parent
    cdef np.intp_t current_parent
    cdef np.float64_t lambda_
    cdef np.float64_t max_lambda
    cdef np.ndarray[np.double_t, ndim=1] deaths_arr
    cdef np.double_t *deaths

    cdef np.intp_t largest_parent = tree['parent'].max()

    # Sort by parent id so each parent's rows are contiguous.
    sorted_parent_data = np.sort(tree[['parent', 'lambda_val']], axis=0)
    deaths_arr = np.zeros(largest_parent + 1, dtype=np.double)
    deaths = (<np.double_t *> deaths_arr.data)
    sorted_parents = sorted_parent_data['parent']
    sorted_lambdas = sorted_parent_data['lambda_val']

    current_parent = -1
    max_lambda = 0
    for row in range(sorted_parent_data.shape[0]):
        parent = <np.intp_t> sorted_parents[row]
        lambda_ = sorted_lambdas[row]

        if parent == current_parent:
            max_lambda = max(max_lambda, lambda_)
        elif current_parent != -1:
            deaths[current_parent] = max_lambda
            current_parent = parent
            max_lambda = lambda_
        else:
            # Initialize
            current_parent = parent
            max_lambda = lambda_

    # BUG FIX: flush the final parent group.  The original loop never
    # recorded the last parent's death, leaving it at 0.
    if current_parent != -1:
        deaths[current_parent] = max_lambda

    return deaths_arr
cdef class TreeUnionFind (object):
    """
    Union-find over node ids with union-by-rank and path compression.

    Column 0 of ``_data`` holds each element's parent pointer, column 1
    its rank.  ``is_component`` tracks which elements are still roots of
    their own component (see ``components``).
    """

    cdef np.ndarray _data_arr
    cdef np.intp_t[:,::1] _data
    cdef np.ndarray is_component

    def __init__(self, size):
        self._data_arr = np.zeros((size, 2), dtype=np.intp)
        # Every element starts as its own parent (rank 0).
        self._data_arr.T[0] = np.arange(size)
        self._data = (<np.intp_t[:size, :2:1]> (<np.intp_t *> self._data_arr.data))
        # BUG FIX: dtype=np.bool was a deprecated alias for the builtin
        # bool, removed in NumPy 1.24 (raises AttributeError); np.bool_
        # is the supported scalar type.
        self.is_component = np.ones(size, dtype=np.bool_)

    cdef union_(self, np.intp_t x, np.intp_t y):
        """Merge the components containing x and y (union by rank)."""
        cdef np.intp_t x_root = self.find(x)
        cdef np.intp_t y_root = self.find(y)

        if self._data[x_root, 1] < self._data[y_root, 1]:
            self._data[x_root, 0] = y_root
        elif self._data[x_root, 1] > self._data[y_root, 1]:
            self._data[y_root, 0] = x_root
        else:
            # Equal ranks: x_root absorbs y_root and its rank grows.
            self._data[y_root, 0] = x_root
            self._data[x_root, 1] += 1
        return

    cdef find(self, np.intp_t x):
        """Return the root of x's component, compressing the path."""
        if self._data[x, 0] != x:
            self._data[x, 0] = self.find(self._data[x, 0])
            # x now points at a different root, so it is no longer a
            # component representative.
            self.is_component[x] = False
        return self._data[x, 0]

    cdef np.ndarray[np.intp_t, ndim=1] components(self):
        """Ids of elements that are still component roots."""
        return self.is_component.nonzero()[0]
cpdef np.ndarray[np.intp_t, ndim=1] labelling_at_cut(np.ndarray linkage,
                                                     np.double_t cut,
                                                     np.intp_t min_cluster_size):
    """
    Flat clustering of a scipy linkage tree cut at distance ``cut``.

    Every pair merged at a distance below ``cut`` ends up in the same
    cluster; clusters with fewer than ``min_cluster_size`` points are
    labelled -1 (noise).  Returns one label per original point.
    """
    cdef np.intp_t root
    cdef np.intp_t num_points
    cdef np.ndarray[np.intp_t, ndim=1] result_arr
    cdef np.ndarray[np.intp_t, ndim=1] unique_labels
    cdef np.ndarray[np.intp_t, ndim=1] cluster_size
    cdef np.intp_t *result
    cdef TreeUnionFind union_find
    cdef np.intp_t n
    cdef np.intp_t cluster
    cdef np.intp_t cluster_id

    root = 2 * linkage.shape[0]
    num_points = root // 2 + 1

    result_arr = np.empty(num_points, dtype=np.intp)
    # Raw pointer into result_arr for fast element writes.
    result = (<np.intp_t *> result_arr.data)

    # One union-find slot per node of the full tree (points + merges).
    union_find = TreeUnionFind(<np.intp_t> root + 1)

    # Each linkage row i creates internal node num_points + i; only apply
    # merges that happen below the cut distance (row[2]).
    cluster = num_points
    for row in linkage:
        if row[2] < cut:
            union_find.union_(<np.intp_t> row[0], cluster)
            union_find.union_(<np.intp_t> row[1], cluster)
        cluster += 1

    # Provisional label of each point is its component root; also count
    # component sizes (roots of merged components are always < num_points
    # here only for singleton points — NOTE(review): sizes are indexed by
    # root id and the array has num_points slots; assumes every root id
    # of a point's component is < num_points — verify against union_ order.
    cluster_size = np.zeros(num_points, dtype=np.intp)
    for n in range(num_points):
        cluster = union_find.find(n)
        cluster_size[cluster] += 1
        result[n] = cluster

    # Re-map surviving roots to compact labels 0..k-1; undersized
    # components become noise (-1).  The -1:-1 entry is a sentinel so a
    # pre-existing -1 passes through unchanged.
    cluster_label_map = {-1:-1}
    cluster_label = 0
    unique_labels = np.unique(result_arr)

    for cluster in unique_labels:
        if cluster_size[cluster] < min_cluster_size:
            cluster_label_map[cluster] = -1
        else:
            cluster_label_map[cluster] = cluster_label
            cluster_label += 1

    for n in range(num_points):
        result[n] = cluster_label_map[result[n]]

    return result_arr
cdef np.ndarray[np.intp_t, ndim=1] do_labelling(np.ndarray tree, set clusters, dict cluster_label_map):
    """
    Assign a label to every point of a condensed tree.

    Points whose union-find component is rooted at a cluster in
    ``clusters`` receive that cluster's mapped label from
    ``cluster_label_map``; all other points are labelled -1 (noise).
    """
    cdef np.intp_t root_cluster
    cdef np.ndarray[np.intp_t, ndim=1] result_arr
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.intp_t *result
    cdef TreeUnionFind union_find
    cdef np.intp_t parent
    cdef np.intp_t child
    cdef np.intp_t n
    cdef np.intp_t cluster

    child_array = tree['child']
    parent_array = tree['parent']

    # The smallest parent id is the root cluster; ids below it are points
    # (condense_tree labels the root with num_points).
    root_cluster = parent_array.min()
    result_arr = np.empty(root_cluster, dtype=np.intp)
    # Raw pointer into result_arr for fast element writes.
    result = (<np.intp_t *> result_arr.data)

    union_find = TreeUnionFind(parent_array.max() + 1)

    # Union every node into its parent EXCEPT selected clusters, so each
    # selected cluster stays the root of its own component.
    for n in range(tree.shape[0]):
        child = child_array[n]
        parent = parent_array[n]
        if child not in clusters:
            union_find.union_(parent, child)

    for n in range(root_cluster):
        cluster = union_find.find(n)
        if cluster <= root_cluster:
            # Component collapsed all the way to the root (or to a point
            # id): the point belongs to no selected cluster.
            result[n] = -1
        else:
            result[n] = cluster_label_map[cluster]

    return result_arr
cdef get_probabilities(np.ndarray tree, dict cluster_map, np.ndarray labels):
    """
    Compute the membership strength of each point in its assigned cluster.

    Strength is the point's lambda value divided by the cluster's maximum
    ("death") lambda, clamped to at most 1.  Noise points (label -1) keep
    a probability of 0.  ``cluster_map`` maps label number -> cluster id
    in the condensed tree.
    """
    cdef np.ndarray[np.double_t, ndim=1] result
    cdef np.ndarray[np.double_t, ndim=1] deaths
    cdef np.ndarray[np.double_t, ndim=1] lambda_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.intp_t root_cluster
    cdef np.intp_t n
    cdef np.intp_t point
    cdef np.intp_t cluster_num
    cdef np.intp_t cluster
    cdef np.double_t max_lambda
    cdef np.double_t lambda_

    child_array = tree['child']
    parent_array = tree['parent']
    lambda_array = tree['lambda_val']

    result = np.zeros(labels.shape[0])
    deaths = max_lambdas(tree)
    # Ids below the smallest parent are points; at or above are clusters.
    root_cluster = parent_array.min()

    for n in range(tree.shape[0]):
        point = child_array[n]
        # Skip rows whose child is a cluster rather than a point.
        if point >= root_cluster:
            continue

        cluster_num = labels[point]
        if cluster_num == -1:
            continue

        cluster = cluster_map[cluster_num]
        max_lambda = deaths[cluster]
        if max_lambda == 0.0:
            # Degenerate cluster death: treat membership as certain
            # rather than dividing by zero.
            result[point] = 1.0
        else:
            # Clamp so points with lambda past the cluster death get 1.0.
            lambda_ = min(lambda_array[n], max_lambda)
            result[point] = lambda_ / max_lambda

    return result
cpdef np.ndarray[np.double_t, ndim=1] outlier_scores(np.ndarray tree):
    """
    Compute a per-point outlier score from a condensed tree.

    The score is ``(lambda_max - lambda_point) / lambda_max`` where
    ``lambda_max`` is the largest death lambda among the point's cluster
    and its descendant clusters — 0 means the point persists as long as
    its cluster (inlier), values near 1 mean it drops out early (outlier).
    """
    cdef np.ndarray[np.double_t, ndim=1] result
    cdef np.ndarray[np.double_t, ndim=1] deaths
    cdef np.ndarray[np.double_t, ndim=1] lambda_array
    cdef np.ndarray[np.intp_t, ndim=1] child_array
    cdef np.ndarray[np.intp_t, ndim=1] parent_array
    cdef np.intp_t root_cluster
    cdef np.intp_t point
    cdef np.intp_t parent
    cdef np.intp_t cluster
    cdef np.double_t lambda_max

    child_array = tree['child']
    parent_array = tree['parent']
    lambda_array = tree['lambda_val']

    deaths = max_lambdas(tree)
    # Ids below the smallest parent are points; at or above are clusters.
    root_cluster = parent_array.min()
    result = np.zeros(root_cluster, dtype=np.double)

    # Propagate deaths upward: a parent's effective death is the max over
    # its subtree, visited in parent-sorted order.
    topological_sort_order = np.argsort(parent_array)
    #topologically_sorted_tree = tree[topological_sort_order]
    for n in topological_sort_order:
        cluster = child_array[n]
        # NOTE(review): assumes point-children only appear once all
        # cluster-children rows have been seen — TODO confirm row order.
        if cluster < root_cluster:
            break

        parent = parent_array[n]
        if deaths[cluster] > deaths[parent]:
            deaths[parent] = deaths[cluster]

    for n in range(tree.shape[0]):
        point = child_array[n]
        if point >= root_cluster:
            continue

        cluster = parent_array[n]
        lambda_max = deaths[cluster]
        # BUG FIX: guard against a zero death lambda, which previously
        # produced NaN/inf scores via division by zero; a cluster that
        # never "dies" contributes no outlierness, so score 0 (inlier),
        # consistent with get_probabilities' max_lambda == 0 handling.
        if lambda_max == 0.0:
            result[point] = 0.0
        else:
            result[point] = (lambda_max - lambda_array[n]) / lambda_max

    return result
cpdef np.ndarray get_stability_scores(np.ndarray labels, set clusters, dict stability, np.double_t max_lambda):
    """
    Normalize each selected cluster's stability to a score in [0, 1].

    ``clusters`` must be the same set object (hence the same iteration
    order) used to build the label mapping, so label ``n`` corresponds to
    the n-th cluster enumerated here.  ``max_lambda`` is the largest
    lambda in the whole tree.
    """
    cdef np.intp_t cluster_size

    result = np.empty(len(clusters), dtype=np.double)
    for n, c in enumerate(clusters):
        cluster_size = np.sum(labels == n)
        # BUG FIX: guard degenerate cases that previously divided by zero
        # (empty cluster or max_lambda == 0) or collapsed every score to
        # 0 (infinite max_lambda, produced by condense_tree for
        # zero-distance merges); report full confidence instead.
        if np.isinf(max_lambda) or max_lambda == 0.0 or cluster_size == 0:
            result[n] = 1.0
        else:
            result[n] = stability[c] / (cluster_size * max_lambda)

    return result
cpdef tuple get_clusters(np.ndarray tree, dict stability):
    """
    Select the flat clustering that maximizes total stability (excess of
    mass) and derive labels, membership probabilities and per-cluster
    stability scores.

    The tree is assumed to have numeric node ids such that a reverse
    numeric sort is equivalent to a topological sort (leaves first).

    Returns a tuple ``(labels, probabilities, stability_scores)``.
    NOTE: ``stability`` is updated in place while propagating subtree
    stabilities up the tree.
    """
    cdef list node_list
    cdef np.ndarray cluster_tree
    cdef np.ndarray child_selection
    cdef dict is_cluster
    # BUG FIX: was declared as C float (single precision); stabilities
    # are doubles, and truncating before the comparison below could flip
    # the select-vs-prune decision for near-equal values.
    cdef np.double_t subtree_stability
    cdef np.intp_t node
    cdef np.intp_t sub_node
    cdef np.intp_t cluster
    cdef np.intp_t num_points
    cdef np.ndarray labels
    cdef np.double_t max_lambda

    # Assume clusters are ordered by numeric id equivalent to
    # a topological sort of the tree; This is valid given the
    # current implementation above, so don't change that ... or
    # if you do, change this accordingly!
    node_list = sorted(stability.keys(), reverse=True)[:-1] # (exclude root)
    # Rows with child_size > 1 connect clusters to clusters.
    cluster_tree = tree[tree['child_size'] > 1]
    is_cluster = {cluster:True for cluster in node_list}
    num_points = np.max(tree[tree['child_size'] == 1]['child']) + 1
    max_lambda = np.max(tree['lambda_val'])

    for node in node_list:
        # Excess-of-mass rule: keep a node only if it is more stable than
        # the sum of its children; otherwise propagate the children's
        # stability upward and deselect this node.
        child_selection = (cluster_tree['parent'] == node)
        subtree_stability = np.sum([stability[child] for
                                    child in cluster_tree['child'][child_selection]])
        if subtree_stability > stability[node]:
            is_cluster[node] = False
            stability[node] = subtree_stability
        else:
            # Node selected: deselect everything beneath it.
            for sub_node in bfs_from_cluster_tree(cluster_tree, node):
                if sub_node != node:
                    is_cluster[sub_node] = False

    clusters = set([c for c in is_cluster if is_cluster[c]])
    cluster_map = {c:n for n, c in enumerate(clusters)}
    reverse_cluster_map = {n:c for n, c in enumerate(clusters)}

    labels = do_labelling(tree, clusters, cluster_map)
    probs = get_probabilities(tree, reverse_cluster_map, labels)
    stabilities = get_stability_scores(labels, clusters, stability, max_lambda)

    return (labels, probs, stabilities)