forked from scikit-learn/scikit-learn
/
plot_clustering_toy_2D_circles.py
149 lines (123 loc) · 4.27 KB
/
plot_clustering_toy_2D_circles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
=================================================
Spectral clustering for non convex cluster shapes
=================================================
"""
# Echo the module docstring (the example's title) when the script is run.
# The parenthesized single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print function.
print(__doc__)
# Authors: Olivier Grisel
# License: BSD
from time import time
import numpy as np
import pylab as pl
from sklearn.cluster import k_means
from sklearn.cluster import affinity_propagation
from sklearn.cluster import mean_shift
from sklearn.cluster import spectral_clustering
from sklearn.cluster import Ward
from sklearn.cluster import power_iteration_clustering
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import homogeneity_completeness_v_measure
from sklearn.neighbors import kneighbors_graph
# Generate random samples roughly arranged as nested circles
circle_parameters = (
    # (center_x, center_y, radius, n_points)
    (0, 0, 10, 100),
    (8, 0, 25, 200),
    (8, 4, 55, 300),
)
# Standard deviation of the gaussian noise, as a fraction of each radius
noise_level = 0.05
# Fixed seed so that the generated dataset is reproducible
random_state = np.random.RandomState(42)

circles = []
labels = []
for i, (center_x, center_y, radius, n_points) in enumerate(circle_parameters):
    # Draw the angles uniformly over exactly one full turn. Both bounds are
    # given explicitly: a single positional argument would be taken as `low`
    # with `high` left at its default of 1.0, and NumPy documents the
    # high < low case as undefined behavior.
    t = random_state.uniform(0, 2 * np.pi, size=n_points)
    circle_x = center_x + radius * np.cos(t)
    circle_y = center_y + radius * np.sin(t)
    circle = np.array([circle_x, circle_y]).T  # shape (n_points, 2)
    noise = random_state.normal(scale=noise_level * radius,
                                size=(n_points, 2))
    circles.append(circle + noise)
    # The index of the circle is the ground truth label of its samples
    labels += [i] * n_points
X = np.concatenate(circles)
labels_true = np.array(labels)

# Shuffle the samples to ensure that the algo has no way of cheating
# (samples and labels are reordered with the same permutation)
indices = np.arange(X.shape[0])
random_state.shuffle(indices)
X = X[indices]
labels_true = labels_true[indices]
# Utility functions to report on the results of the various strategies
def plot_labels(title, labels):
    """Scatter plot the samples of ``X`` with one color per distinct label.

    The samples are read from the module-level array ``X``; ``labels`` is
    compared element-wise against each distinct label to select the rows
    to draw. Each label gets a color picked from the ``hsv`` colormap at
    position ``label / number_of_distinct_labels``. The plot is titled
    with ``title`` and both axes have their ticks removed.
    """
    distinct = np.unique(labels)
    n_distinct = distinct.shape[0]
    for label in distinct:
        members = X[labels == label, :]
        hue = float(label) / n_distinct
        pl.scatter(members[:, 0], members[:, 1], color=pl.cm.hsv(hue))
    pl.title(title)
    pl.xticks(())
    pl.yticks(())
def report(title, labels_true, labels_pred, duration, do_plot=True):
    """Print a clustering report on stdout and optionally plot it.

    Parameters
    ----------
    title : str
        Name of the clustering strategy being reported.
    labels_true : array-like
        Ground truth label of each sample.
    labels_pred : array-like
        Label assigned to each sample by the clustering strategy.
    duration : float
        Elapsed time in seconds, printed with the scores.
    do_plot : bool, default True
        When true, also draw the predicted labels with ``plot_labels``,
        using a title that embeds the V-measure and the duration.
    """
    h, c, v = homogeneity_completeness_v_measure(labels_true, labels_pred)
    # Single-argument parenthesized prints produce the same output under
    # the Python 2 print statement and the Python 3 print function.
    print(title)
    print("Homogeneity: %0.3f" % h)
    print("Completeness: %0.3f" % c)
    print("V-Measure: %0.3f" % v)
    print("Duration: %0.3fs" % duration)
    print("")
    if do_plot:
        plot_title = "%s\nv=%0.2f (%0.3fs)" % (title, v, duration)
        # Plot the labels that were actually scored, not whatever the
        # module-level ``labels`` variable happens to hold at call time.
        plot_labels(plot_title, labels_pred)
# Open the figure in which each report below draws its own subplot
pl.figure()
# Random assignment
# Baseline: draw one label per sample at random, among as many distinct
# values as there are distinct ground truth labels
t0 = time()
labels = random_state.randint(0, np.unique(labels_true).shape[0],
size=labels_true.shape)
duration = time() - t0
pl.subplot(331)
report("Random", labels_true, labels, duration)
# K-Means
# Run with k=3; the second returned value is used as the predicted labels,
# the first one is discarded and `inertia` is not used in the rest of the
# script as shown
t0 = time()
_, labels, inertia = k_means(X, k=3)
duration = time() - t0
pl.subplot(332)
report("K-Means", labels_true, labels, duration)
# Mean Shift
# Run with a hard-coded bandwidth of 28.0; the second returned value is used
# as the predicted labels, the first one is discarded
t0 = time()
_, labels = mean_shift(X, bandwidth=28.0)
duration = time() - t0
pl.subplot(333)
report("Mean Shift", labels_true, labels, duration)
# Build a knn graph as affinity matrix
# The graph (10 neighbors per sample) is shared by the Ward, Spectral and
# Power Iteration runs below. Its construction is timed separately so that
# `duration_affinity` can be added to each of their reported durations.
t0 = time()
affinity = kneighbors_graph(X, n_neighbors=10)
affinity = 0.5 * (affinity + affinity.T) # make affinity symmetric
duration_affinity = time() - t0
# Affinity propagation
# XXX: disabled because I cannot get it to work as expected on this data
#_, labels = affinity_propagation(affinity.toarray(), p=0.5)
#pl.subplot(334)
#plot_labels("Affinity propagation", labels)
# Ward clustering
# Uses the knn graph as connectivity and asks for 3 clusters; the reported
# duration includes the time spent building the graph
t0 = time()
labels = Ward(n_clusters=3, connectivity=affinity).fit(X).labels_
duration = time() - t0
pl.subplot(335)
report("Ward", labels_true, labels, duration + duration_affinity)
# Spectral Clustering
# XXX: the spectral clustering result is unstable with the amg-based method
# XXX: we should implement the fast_svd method too
# Runs on the knn graph with k=3 in 'arpack' mode and the shared random
# state; the reported duration includes the time spent building the graph
t0 = time()
labels = spectral_clustering(affinity, k=3, mode='arpack',
random_state=random_state)
duration = time() - t0
pl.subplot(337)
report("Spectral", labels_true, labels, duration + duration_affinity)
# Power iteration
# Runs on the knn graph with k=3 and the shared random state; the reported
# duration includes the time spent building the graph
t0 = time()
labels = power_iteration_clustering(
affinity, k=3, n_vectors=5, tol=1e-6, random_state=random_state,
verbose=False, plot_vector=False)
duration = time() - t0
pl.subplot(338)
report("Power Iteration", labels_true, labels, duration + duration_affinity)
# Display the figure holding all the subplots drawn above
pl.show()