-
Notifications
You must be signed in to change notification settings - Fork 2
/
stability_study.py
51 lines (40 loc) · 1.54 KB
/
stability_study.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import k_means_l as kml
# Number of independent trials. The plotting code further down also reads
# this value to choose the histogram bin count.
n = 100

# One kml.SSE score per trial for each centroid-initialization strategy.
bbox_performance = []
subsamp_performance = []

for _trial in tqdm.trange(n):
    # Randomize the problem on every trial. np.random.randint excludes the
    # upper bound, so k is drawn from [2, 6] and d from [2, 9].
    k = np.random.randint(2, 7)
    d = np.random.randint(2, 10)

    # Generate k test clusters in d-dimensional space.
    # NOTE(review): 200 is presumably a sample count -- confirm against
    # kml.generate_data.
    data = kml.generate_data(k, d, 200)

    # Cluster the same data twice; only the initialization function differs.
    # The second return value of kml.k_means is not needed for this study.
    bbox_centroids, _bbox_bins = kml.k_means(
        data, k,
        init_function=kml.generate_initial_centroids_from_bbox)
    subsamp_centroids, _subsamp_bins = kml.k_means(
        data, k,
        init_function=kml.generate_initial_centroids_from_data)

    # Score each fit against the data it was trained on.
    bbox_performance.append(kml.SSE(data, bbox_centroids))
    subsamp_performance.append(kml.SSE(data, subsamp_centroids))
# Overlay the two score distributions as semi-transparent, density-normalized
# histograms: red for the bbox initialization, blue for subsampling.
bin_count = n // 10 + 1
plt.hist(bbox_performance,
         color="r",
         label="bbox method",
         density=True,
         bins=bin_count,
         alpha=0.5)
plt.hist(subsamp_performance,
         color="b",
         label="subsampling method",
         density=True,
         bins=bin_count,
         alpha=0.5)

# NOTE(review): the plotted values come from kml.SSE, yet the axis label says
# "Root Mean Squared Error" -- confirm which one matches kml.SSE's definition.
plt.xlabel("Root Mean Squared Error, Goodness of Fit")
plt.ylabel("Normed Frequency of Occurrence")
plt.title("A Comparison of Initialization Techniques for K-Means")

# Dashed full-height vertical lines at each distribution's mean. Each legend
# label matches the colour and data of its own series (the labels used to be
# swapped between the two lines).
plt.axvline(np.mean(bbox_performance),
            0, 1, color="r", linestyle="dashed",
            label="bbox mean")
plt.axvline(np.mean(subsamp_performance),
            0, 1, color="b", linestyle="dashed",
            label="subsampling mean")
plt.legend(loc="best")

# Display the figure; without this a non-interactive run exits without ever
# showing the plot.
plt.show()