-
Notifications
You must be signed in to change notification settings - Fork 0
/
clustering.py
113 lines (83 loc) · 3.58 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
from os.path import exists, join
from subprocess import call
import numpy as np
from graph_tool.inference import minimize_blockmodel_dl
from matplotlib.cbook import mkdirs
from sklearn.metrics import silhouette_score
from induce_graph import induce_graph, cutoff, save_to_file
class SilhoutteSelection:
"""Wrap clustering methods that can't determine the optimal number of
clusters by themselves.
Try the method with several parameter for `n_clusters` and return the
labelling with the best silhouette score.
"""
def __init__(self, method, n_clusters=10, *args, **kwargs):
self.method = method
self.n_clusters = n_clusters
self.args = args
self.kwargs = kwargs
def fit_predict(self, data):
scored_labels = []
# Try different values for `n_clusters`
for n_clusters in range(2, self.n_clusters):
method = self.method(n_clusters, *self.args, **self.kwargs)
labels = method.fit_predict(data)
score = silhouette_score(data, labels)
scored_labels.append((score, labels))
# Find the labelling with the best score and return that
return np.array(max(scored_labels)[1])
class ClusteringWithCutoff:
"""Perform clustering using the stochastic block model.
Since we're working with full graphs, different threshold rates are checked
and the one that produces the best silhouette score is selected.
"""
def __init__(self, metric='manhattan_inv', threshold_interval=10):
self.metric = metric
self.cutoff_interval = threshold_interval
def fit_predict(self, data):
graph = induce_graph(data, distance=self.metric)
result_blocks = []
weights = graph.edge_properties['weights'].get_array()
for threshold in np.linspace(0, weights.max(), self.cutoff_interval):
working_graph = cutoff(graph, threshold, inplace=True)
# Apply the sbm to the pruned graph
blocks = minimize_blockmodel_dl(working_graph)
blocks = blocks.get_blocks().get_array()
# Silhouette doesn't work if there's only one cluster label
if len(np.unique(blocks)) > 1:
cutoff_score = silhouette_score(data, blocks)
result_blocks.append((cutoff_score, blocks))
return np.array(max(result_blocks)[1])
class WSBM:
def __init__(self, metric='manhattan_inv'):
self.metric = metric
def fit_predict(self, data):
graph = induce_graph(data, distance=self.metric)
fname = save_to_file(graph, force_integer=True)
output_directory = '_labellings'
if not exists(output_directory):
mkdirs(output_directory)
cwd = os.getcwd()
os.chdir('YWWTools/target')
result_blocks = []
for num_blocks in range(2, 15):
result_fname = '../../%s_blocks_%d' % (fname, num_blocks)
call(' '.join(
['java',
'-cp YWWTools.jar:deps.jar yang.weiwei.Tools',
'--tool wsbm',
'--nodes %d' % graph.num_vertices(),
'--blocks %d' % num_blocks,
'--graph ../../%s' % fname,
'--output %s' % result_fname,
'--no-verbose',
]
), shell=True)
blocks = np.genfromtxt(result_fname)
score = silhouette_score(data, blocks)
result_blocks.append((score, blocks))
os.remove(result_fname)
os.chdir(cwd)
os.remove(fname)
return np.array(max(result_blocks)[1])