Skip to content

Commit

Permalink
Better coverage for RSL
Browse files Browse the repository at this point in the history
  • Loading branch information
lmcinnes committed Aug 20, 2016
1 parent a657a9b commit 5528e10
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 11 deletions.
12 changes: 11 additions & 1 deletion hdbscan/robust_single_linkage_.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ def _rsl_prims_kdtree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=2)
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')

dim = X.shape[0]
k = min(dim - 1, k)

Expand All @@ -90,6 +94,10 @@ def _rsl_prims_balltree(X, k=5, alpha=1.4142135623730951, metric='minkowski', p=
elif p is None:
p = 2 # Unused, but needs to be integer; assume euclidean

# The Cython routines used require contiguous arrays
if not X.flags['C_CONTIGUOUS']:
X = np.array(X, dtype=np.double, order='C')

dim = X.shape[0]
k = min(dim - 1, k)

Expand Down Expand Up @@ -343,13 +351,15 @@ class RobustSingleLinkage(BaseEstimator, ClusterMixin):
"""

def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='euclidean', p=None):
def __init__(self, cut=0.4, k=5, alpha=1.4142135623730951, gamma=5, metric='euclidean',
algorithm='best', p=None):

self.cut = cut
self.k = k
self.alpha = alpha
self.gamma = gamma
self.metric = metric
self.algorithm = algorithm
self.p = p

self._cluster_hierarchy_ = None
Expand Down
55 changes: 45 additions & 10 deletions hdbscan/tests/test_rsl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,47 +20,82 @@

from sklearn import datasets

from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler


# Shared fixture for every test in this module: 50 points generated by
# make_blobs, shuffled with a fixed seed, then standardised feature-wise.
# The seeds are fixed so the cluster counts asserted below are reproducible.
# NOTE(review): n_clusters = 3 presumably mirrors make_blobs' default number
# of centres -- confirm against the scikit-learn version in use.
n_clusters = 3
X, y = make_blobs(n_samples=50, random_state=1)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)

def test_rsl_distance_matrix():
    """Cluster from a precomputed distance matrix scaled to a maximum of 1.

    The functional API is called with an explicit cut of 0.4; the estimator
    is constructed without a cut argument.  Both are expected to report two
    clusters once the noise label (-1) is excluded.
    """
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    labels, tree = robust_single_linkage(D, 0.4, metric='precomputed')
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, 2)

    labels = RobustSingleLinkage(metric="precomputed").fit(D).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, 2)

def test_rsl_feature_vector():
    """Cluster directly from the feature matrix with the default metric.

    The functional API (cut 0.4) and the estimator built with no arguments
    must each report ``n_clusters`` clusters, not counting the noise
    label (-1).
    """
    labels, tree = robust_single_linkage(X, 0.4)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = RobustSingleLinkage().fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

def test_rsl_callable_metric():
    """Cluster with the metric passed as a callable instead of a name.

    The functional API (cut 0.4) and the estimator must each report
    ``n_clusters`` clusters, not counting the noise label (-1).
    """
    # metric is the function reference, not the string key.
    metric = distance.euclidean

    labels, tree = robust_single_linkage(X, 0.4, metric=metric)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    labels = RobustSingleLinkage(metric=metric).fit(X).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

def test_rsl_input_lists():
    """Fitting on a plain nested list (not an ndarray) must succeed."""
    points = [[1., 2.], [3., 4.]]
    estimator = RobustSingleLinkage()
    estimator.fit(points)  # must not raise exception

def test_rsl_boruvka_balltree():
    """Exercise the 'boruvka_balltree' algorithm at a cut of 0.45.

    The functional API and the estimator must each report ``n_clusters``
    clusters, not counting the noise label (-1).
    """
    func_labels, _ = robust_single_linkage(X, 0.45,
                                           algorithm='boruvka_balltree')
    found_by_function = len(set(func_labels)) - int(-1 in func_labels)
    assert_equal(found_by_function, n_clusters)

    estimator = RobustSingleLinkage(cut=0.45, algorithm='boruvka_balltree')
    est_labels = estimator.fit(X).labels_
    found_by_estimator = len(set(est_labels)) - int(-1 in est_labels)
    assert_equal(found_by_estimator, n_clusters)

# def test_rsl_prims_balltree():
# labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_balltree')
# n_clusters_1 = len(set(labels)) - int(-1 in labels)
# assert_equal(n_clusters_1, n_clusters)
#
# labels = RobustSingleLinkage(algorithm='prims_balltree').fit(X).labels_
# n_clusters_2 = len(set(labels)) - int(-1 in labels)
# assert_equal(n_clusters_2, n_clusters)
#
# def test_rsl_prims_kdtree():
# labels, tree = robust_single_linkage(X, 0.4, algorithm='prims_kdtree')
# n_clusters_1 = len(set(labels)) - int(-1 in labels)
# assert_equal(n_clusters_1, n_clusters)
#
# labels = RobustSingleLinkage(algorithm='prims_kdtree').fit(X).labels_
# n_clusters_2 = len(set(labels)) - int(-1 in labels)
# assert_equal(n_clusters_2, n_clusters)

def test_rsl_is_sklearn_estimator():
    """Run scikit-learn's estimator checks on RobustSingleLinkage."""
    # The class itself is handed to check_estimator, not an instance.
    estimator_cls = RobustSingleLinkage
    check_estimator(estimator_cls)

0 comments on commit 5528e10

Please sign in to comment.