Skip to content

Commit

Permalink
Refactor common routines, improve RSL coverage, improve plots coverage
Browse files (browse the repository at this point in the history)
  • Loading branch information
lmcinnes committed Aug 20, 2016
1 parent a03ce02 commit 4243ef1
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 142 deletions.
2 changes: 1 addition & 1 deletion ci_scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ popd
# Configure the conda environment and put it in the path using the
# provided versions
conda create -n testenv --yes python=$PYTHON_VERSION pip nose \
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION
numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION matplotlib pandas networkx

source activate testenv

Expand Down
76 changes: 11 additions & 65 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,10 @@ def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10, allow_single_cl
def _hdbscan_generic(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=None, gen_min_span_tree=False, **kwargs):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

distance_matrix = pairwise_distances(X, metric=metric, p=p)
else:
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

if issparse(distance_matrix):
# raise TypeError('Sparse distance matrices not yet supported')
return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples, alpha, metric, p,
Expand Down Expand Up @@ -105,21 +95,10 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40,
gen_min_span_tree=False, **kwargs):

if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

assert(issparse(X))

lil_matrix = X.tolil()

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

# Compute sparse mutual reachability graph
mutual_reachability_ = sparse_mutual_reachability(lil_matrix, min_points=min_samples)

Expand Down Expand Up @@ -148,23 +127,11 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,

def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False, **kwargs):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

# TO DO: Deal with p for minkowski appropriately
Expand All @@ -188,23 +155,11 @@ def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,

def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40, gen_min_span_tree=False, **kwargs):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

dist_metric = DistanceMetric.get_metric(metric, **kwargs)
Expand All @@ -229,23 +184,13 @@ def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
approx_min_span_tree=True,
gen_min_span_tree=False,
core_dist_n_jobs=4, **kwargs):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

if leaf_size < 3:
leaf_size = 3

if core_dist_n_jobs < 1:
raise ValueError('Parallel core distance computation requires 1 or more jobs!')

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, leaf_size=leaf_size // 3,
approx_min_span_tree=approx_min_span_tree, n_jobs=core_dist_n_jobs, **kwargs)
Expand All @@ -267,23 +212,13 @@ def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
approx_min_span_tree=True,
gen_min_span_tree=False,
core_dist_n_jobs=4, **kwargs):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

if leaf_size < 3:
leaf_size = 3

if core_dist_n_jobs < 1:
raise ValueError('Parallel core distance computation requires 1 or more jobs!')

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, leaf_size=leaf_size // 3,
approx_min_span_tree=approx_min_span_tree, n_jobs=core_dist_n_jobs, **kwargs)
Expand Down Expand Up @@ -441,12 +376,23 @@ def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
if leaf_size < 1:
raise ValueError('Leaf size must be greater than 0!')

if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

# Checks input and converts to an nd-array where possible
X = check_array(X, accept_sparse='csr')
# Python 2 and 3 compliant string_type checking
if isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)

size = X.shape[0]
min_samples = min(size - 1, min_samples)
if min_samples == 0:
min_samples = 1

if algorithm != 'best':
if algorithm == 'generic':
(single_linkage_tree,
Expand Down
90 changes: 33 additions & 57 deletions hdbscan/robust_single_linkage_.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,6 @@

def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

distance_matrix = pairwise_distances(X, metric=metric, p=p)
else:
distance_matrix = pairwise_distances(X, metric=metric)
Expand All @@ -50,14 +45,7 @@ def _rsl_generic(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
return single_linkage_tree


def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean
def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='minkowski', **kwargs):

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
Expand All @@ -66,11 +54,11 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2)
dim = X.shape[0]
k = min(dim - 1, k)

tree = KDTree(X, metric=metric)
tree = KDTree(X, metric=metric, **kwargs)

dist_metric = DistanceMetric.get_metric(metric)
dist_metric = DistanceMetric.get_metric(metric, **kwargs)

core_distances = tree.query(X, k=k)[0][:, -1]
core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

single_linkage_tree = label(min_spanning_tree)
Expand All @@ -79,14 +67,7 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2)
return single_linkage_tree


def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean
def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', **kwargs):

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
Expand All @@ -95,11 +76,11 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=
dim = X.shape[0]
k = min(dim - 1, k)

tree = BallTree(X, metric=metric)
tree = BallTree(X, metric=metric, **kwargs)

dist_metric = DistanceMetric.get_metric(metric)
dist_metric = DistanceMetric.get_metric(metric, **kwargs)

core_distances = tree.query(X, k=k)[0][:, -1]
core_distances = tree.query(X, k=k)[0][:, -1].copy(order='C')
min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, alpha)

single_linkage_tree = label(min_spanning_tree)
Expand All @@ -109,19 +90,14 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=


def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
metric='minkowski', leaf_size=40, **kwargs):

dim = X.shape[0]
min_samples = min(dim - 1, k)

tree = KDTree(X, metric=metric, leaf_size=leaf_size)
tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
alpha=alpha, leaf_size=leaf_size)
alpha=alpha, leaf_size=leaf_size, **kwargs)
min_spanning_tree = alg.spanning_tree()

single_linkage_tree = label(min_spanning_tree)
Expand All @@ -131,19 +107,14 @@ def _rsl_boruvka_kdtree(X, k=5, alpha=1.0,


def _rsl_boruvka_balltree(X, k=5, alpha=1.0,
metric='minkowski', p=2, leaf_size=40):
if metric == 'minkowski':
if p is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if p < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')
metric='minkowski', leaf_size=40, **kwargs):

dim = X.shape[0]
min_samples = min(dim - 1, k)

tree = BallTree(X, metric=metric, leaf_size=leaf_size)
tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
alpha=alpha, leaf_size=leaf_size)
alpha=alpha, leaf_size=leaf_size, **kwargs)
min_spanning_tree = alg.spanning_tree()

single_linkage_tree = label(min_spanning_tree)
Expand All @@ -153,8 +124,8 @@ def _rsl_boruvka_balltree(X, k=5, alpha=1.0,


def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
gamma=5, metric='minkowski', p=2, algorithm='best',
memory=Memory(cachedir=None, verbose=0), leaf_size=40):
gamma=5, metric='euclidean', algorithm='best',
memory=Memory(cachedir=None, verbose=0), leaf_size=40, **kwargs):
"""Perform robust single linkage clustering from a vector array
or distance matrix.
Expand Down Expand Up @@ -242,48 +213,54 @@ def robust_single_linkage(X, cut, k=5, alpha=1.4142135623730951,
if type(leaf_size) is not int or leaf_size < 1:
raise ValueError('Leaf size must be at least one!')

if metric == 'minkowski':
if 'p' not in kwargs or kwargs['p'] is None:
raise TypeError('Minkowski metric given but no p value supplied!')
if kwargs['p'] < 0:
raise ValueError('Minkowski metric with negative p value is not defined!')

X = check_array(X, accept_sparse='csr')
if isinstance(memory, six.string_types):
memory = Memory(cachedir=memory, verbose=0)

if algorithm != 'best':
if algorithm == 'generic':
single_linkage_tree = \
memory.cache(_rsl_generic)(X, k, alpha, metric, p)
memory.cache(_rsl_generic)(X, k, alpha, metric, **kwargs)
elif algorithm == 'prims_kdtree':
single_linkage_tree = \
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p)
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
elif algorithm == 'prims_balltree':
single_linkage_tree = \
memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, p)
memory.cache(_rsl_prims_balltree)(X, k, alpha, metric, **kwargs)
elif algorithm == 'boruvka_kdtree':
single_linkage_tree = \
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p, leaf_size)
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, leaf_size, **kwargs)
elif algorithm == 'boruvka_balltree':
single_linkage_tree = \
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p, leaf_size)
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, leaf_size, **kwargs)
else:
raise TypeError('Unknown algorithm type %s specified' % algorithm)
else:
if issparse(X) or metric not in FAST_METRICS: # We can't do much with sparse matrices ...
single_linkage_tree = \
memory.cache(_rsl_generic)(X, k, alpha, metric, p)
memory.cache(_rsl_generic)(X, k, alpha, metric, **kwargs)
elif metric in KDTree.valid_metrics:
# Need heuristic to decide when to go to boruvka; still debugging for now
if X.shape[1] > 128:
single_linkage_tree = \
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p)
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
else:
single_linkage_tree = \
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, p)
memory.cache(_rsl_boruvka_kdtree)(X, k, alpha, metric, **kwargs)
else: # Metric is a valid BallTree metric
# Need heuristic to decide when to go to boruvka; still debugging for now
if X.shape[1] > 128:
single_linkage_tree = \
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, p)
memory.cache(_rsl_prims_kdtree)(X, k, alpha, metric, **kwargs)
else:
single_linkage_tree = \
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, p)
memory.cache(_rsl_boruvka_balltree)(X, k, alpha, metric, **kwargs)

labels = single_linkage_tree.get_clusters(cut, gamma)

Expand Down Expand Up @@ -365,15 +342,14 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
"""

def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='euclidean',
algorithm='best', p=None):
algorithm='best'):

self.cut = cut
self.k = k
self.alpha = alpha
self.gamma = gamma
self.metric = metric
self.algorithm = algorithm
self.p = p

self._cluster_hierarchy_ = None

Expand Down
2 changes: 1 addition & 1 deletion hdbscan/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def test_hdbscan_boruvka_balltree_matches():
def test_condensed_tree_plot():
clusterer = HDBSCAN().fit(X)
clusterer.condensed_tree_.get_plot_data()
if_matplotlib(clusterer.condensed_tree_.plot)()
if_matplotlib(clusterer.condensed_tree_.plot)(select_clusters=True, selection_palette=('r','g','b'))

def test_tree_numpy_output_formats():

Expand Down
Loading

0 comments on commit 4243ef1

Please sign in to comment.