forked from apache/madlib
/
kmeans_create.sql_in
248 lines (208 loc) · 7.71 KB
/
kmeans_create.sql_in
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/* ----------------------------------------------------------------------- *//**
*
* @file kmeans_create.sql_in
*
* @brief SQL functions for k-means clustering
* @date January 2011
*
* @sa For a brief introduction to k-means clustering, see the module
* description \ref grp_kmeans.
*
*//* ----------------------------------------------------------------------- */
/**
@addtogroup grp_kmeans
@about
K-means clustering algorithm helps in dividing a list of objects
into K groups based on the similarity of their attributes.
Both objects and groups are represented as points in n-dimensional
space. The similarity is expressed as distance between objects (points) and
group centers (centroids). Each object is assigned to a group with
the nearest centroid in a series of iterations, the goal of which is
to minimize the total distance between points and their centroids.
The distance is currently measured using l2-norm,
also know as
<a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean
distance</a>.
This method works on a set of data points accessible in a table or through a view.
Initial centroids are found according to
k-means++ algorithm [1].
Further adjustments are based on the Euclidean distance between
the current centroids and all available data points or a random subset of them
, such that there are at least 200 points from each initial cluster.
The algorithm stops when one of the following conditions is met:
- fraction of reassigned nodes is not growing
- fraction of reassigned nodes is smaller than the limit (default = 0.001)
- reached the maximum number of allowed iterations (default = 20)
@prereq
Requires MADlib \link svec svec module\endlink, which provides a sparse vector
data type.
@usage
Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>,
'<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt>
Parameters:
- <em>input_table</em> : table with the source data
- <em>k</em> : max number of clusters to consider
(must be smaller than number of input points)
- <em>goodness</em> : goodness of fit test flag [0,1]
- <em>run_id</em> : execution identifier
- <em>output_schema</em> : target schema for output tables (points, centroids)
@examp
1) Prepare the input table/view with the following structure. This method
currently requires the following column names, as well as the data types:
\code
pid INTEGER
position {SVEC|INT[]|FLOAT[]}
\endcode
where:
- <em>pid</em> is the name of the column with an ID of the object (point)
- <em>position</em> is the attribute vector (point coordinates)
2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file.
3) Call kmeans_run() stored procedure, e.g.:
\code
SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema');
\endcode
4) Sample output:
\code
INFO: Parameters:
INFO: * k = 10 (number of centroids)
INFO: * input_table = source_schema.source_name
INFO: * goodness = 1 (GOF test on)
INFO: * run_id = my_test_run
INFO: * output_schema = my_schema
INFO: Seeding 10 centroids...
INFO: Using sample data set for analysis... (9200 out of 10000 points)
INFO: ...Iteration 1
INFO: ...Iteration 2
INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001
INFO: Expanding cluster assignment to all points...
INFO: Calculating goodness of fit...
Finished k-means clustering on "my_schema"."kmeans_input_table" table.
Results:
* 10 centroids (goodness of fit = 2.9787570001).
Output:
* table : "my_schema"."kmeans_out_centroids_testrun"
* table : "my_schema"."kmeans_out_points_testrun"
Time elapsed: 0 minutes 4.335644 seconds.
\endcode
@sa file kmeans_create.sql_in (documenting the SQL functions)
@internal
@sa namespace kmeans (documenting the implementation in Python)
@endinternal
@literature
[1] Wikipedia, K-means++,
http://en.wikipedia.org/wiki/K-means%2B%2B
*/
BEGIN;
/**
* @internal
* Support function: takes a single SVEC (A) and an array of SVECs (B)
* and returns the index of (B) with the shortest distance to (A).
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID(
p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[]
)
RETURNS INTEGER
AS $$
DECLARE
minCID INTEGER := 1;
min_val FLOAT;
temp_val FLOAT;
BEGIN
-- Check the arguments
IF p_point is NULL or p_centroids is NULL THEN
RETURN null;
END IF;
min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]);
FOR i IN 2..array_upper( p_centroids, 1)
LOOP
temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]);
IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN
min_val = temp_val;
minCID = i;
END IF;
END LOOP;
RETURN minCID;
END
$$ LANGUAGE plpgsql;
-- Finalize function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC)
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
new_location FLOAT[];
new_location2 FLOAT[];
sum FLOAT;
BEGIN
new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid);
sum = new_location[array_upper(new_location, 1)];
FOR i in 1..(array_upper(new_location, 1)-1) LOOP
new_location2[i] = new_location[i]/sum;
END LOOP;
RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2);
END
$$ LANGUAGE plpgsql;
-- Sfunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC)
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
new_location MADLIB_SCHEMA.SVEC;
BEGIN
new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0));
IF ($1 IS NOT NULL) THEN
new_location = $1 + new_location;
END IF;
RETURN new_location;
END
$$ LANGUAGE plpgsql;
-- Prefunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
BEGIN
IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN
RETURN $1 + $2;
END IF;
IF ($1 IS NOT NULL) THEN
RETURN $1;
END IF;
IF ($2 IS NOT NULL) THEN
RETURN $2;
END IF;
END
$$ LANGUAGE plpgsql;
/**
* @internal
* @brief Compute a mean position for a set of SVECs.
*/
CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC )
(
stype = MADLIB_SCHEMA.SVEC,
sfunc = MADLIB_SCHEMA.__kmeans_mean_product,
ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,')
finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize
);
/**
* @brief Compute a k-means clustering
*
* @param input_table Name of relation containing the input data points
* @param k Number of centroids to generate
* @param goodness Goodness of fit test flag (allowed values: 0, 1)
* @param run_id Name/ID of the execution
* @param output_schema Target schema for the output tables
* @return Textual summary of the algorithm run, including the names of the
* created tables and run time statistics.
*
* @internal
* @sa This function is a wrapper for kmeans::kmeans_run()
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_table text)
RETURNS text
AS $$
import sys
try:
from madlib import kmeans
except:
sys.path.append("PLPYTHON_LIBDIR")
from madlib import kmeans
plpy.execute( 'set client_min_messages=warning');
return kmeans.kmeans_run( input_table, k, goodness, run_id, output_table);
$$ LANGUAGE plpythonu;
COMMIT;