methods/k-means/src/pg_gp/kmeans_create.sql_in

/* ----------------------------------------------------------------------- *//** 
 *
 * @file kmeans_create.sql_in
 *
 * @brief SQL functions for k-means clustering
 * @date January 2011
 *
 * @sa For a brief introduction to k-means clustering, see the module
 *     description \ref grp_kmeans.
 *
 *//* ----------------------------------------------------------------------- */

/**
@addtogroup grp_kmeans 

@about

K-means clustering algorithm helps in dividing a list of objects 
into K groups based on the similarity of their attributes. 

Both objects and groups are represented as points in n-dimensional 
space. The similarity is expressed as distance between objects (points) and
group centers (centroids). Each object is assigned to a group with 
the nearest centroid in a series of iterations, the goal of which is
to minimize the total distance between points and their centroids. 
The distance is currently measured using l2-norm, 
also know as
<a href="http://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm">Euclidean
distance</a>.


This method works on a set of data points accessible in a table or through a view. 
Initial centroids are found according to  
k-means++ algorithm [1]. 
Further adjustments are based on the Euclidean distance between 
the current centroids and all available data points or a random subset of them
, such that there are at least 200 points from each initial cluster.

The algorithm stops when one of the following conditions is met:
- fraction of reassigned nodes is not growing
- fraction of reassigned nodes is smaller than the limit (default = 0.001)
- reached the maximum number of allowed iterations (default = 20)

@prereq

Requires MADlib \link svec svec module\endlink, which provides a sparse vector
data type.

@usage

Function: <tt>kmeans_run( '<em>input_table</em>', <em>k</em>,
   '<em>goodness</em>', '<em>run_id</em>', '<em>output_schema</em>')</tt>

Parameters:
    - <em>input_table</em> :      table with the source data
    - <em>k</em> :                max number of clusters to consider 
                                  (must be smaller than number of input points)
    - <em>goodness</em> :         goodness of fit test flag [0,1]
    - <em>run_id</em> :           execution identifier
    - <em>output_schema</em> :    target schema for output tables (points, centroids)

@examp

1) Prepare the input table/view with the following structure. This method
currently requires the following column names, as well as the data types:

\code
	pid 		INTEGER                            
	position 	{SVEC|INT[]|FLOAT[]}   
\endcode
 
where: 
    - <em>pid</em> is the name of the column with an ID of the object (point)
    - <em>position</em> is the attribute vector (point coordinates)
    
2) Populate the input table with some data. You can use a helper function (create_test_table) from the sql/setup.sql file.
   
3) Call kmeans_run() stored procedure, e.g.:  

\code
SQL> select madlib.kmeans_run( 'source_schema.source_name', 10, 1, 'my_test_run', 'my_schema');
\endcode
  
4) Sample output:

\code
INFO: Parameters:
INFO:  * k = 10 (number of centroids)
INFO:  * input_table = source_schema.source_name
INFO:  * goodness = 1 (GOF test on)
INFO:  * run_id = my_test_run
INFO:  * output_schema = my_schema
INFO: Seeding 10 centroids...
INFO: Using sample data set for analysis... (9200 out of 10000 points)
INFO: ...Iteration 1
INFO: ...Iteration 2
INFO: Exit reason: fraction of reassigned nodes is smaller than the limit: 0.001
INFO: Expanding cluster assignment to all points...
INFO: Calculating goodness of fit...

Finished k-means clustering on "my_schema"."kmeans_input_table" table. 
Results: 
    * 10 centroids (goodness of fit = 2.9787570001).
Output:
    * table : "my_schema"."kmeans_out_centroids_testrun"
    * table : "my_schema"."kmeans_out_points_testrun"
Time elapsed: 0 minutes 4.335644 seconds.
\endcode

@sa file kmeans_create.sql_in (documenting the SQL functions)

@internal
@sa namespace kmeans (documenting the implementation in Python)
@endinternal

@literature

[1] Wikipedia, K-means++,
    http://en.wikipedia.org/wiki/K-means%2B%2B
*/

BEGIN;


/**
 * @internal
 * Support function: takes a single SVEC (A) and an array of SVECs (B)
 * and returns the index of (B) with the shortest distance to (A).
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_closestID( 
    p_point MADLIB_SCHEMA.SVEC, p_centroids MADLIB_SCHEMA.SVEC[]
) 
RETURNS INTEGER
AS $$
DECLARE
    minCID      INTEGER := 1;
    min_val     FLOAT;
    temp_val    FLOAT;
BEGIN

    -- Check the arguments
    IF p_point is NULL or p_centroids is NULL THEN
        RETURN null;
    END IF;

    min_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[1]);

    FOR i IN 2..array_upper( p_centroids, 1) 
    LOOP
        temp_val = MADLIB_SCHEMA.l2norm( p_point - p_centroids[i]);
        IF ( temp_val < coalesce( min_val, temp_val + 1) ) THEN
            min_val = temp_val;
            minCID = i;
        END IF;
    END LOOP;
    
    RETURN minCID;
END
$$ LANGUAGE plpgsql;

-- Finalize function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_finalize( p_centroid MADLIB_SCHEMA.SVEC) 
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
    new_location FLOAT[];
    new_location2 FLOAT[];
    sum FLOAT;
BEGIN
    new_location = MADLIB_SCHEMA.SVEC_return_array(p_centroid);
    sum = new_location[array_upper(new_location, 1)];
    FOR i in 1..(array_upper(new_location, 1)-1) LOOP
        new_location2[i] = new_location[i]/sum;
    END LOOP;
    RETURN MADLIB_SCHEMA.SVEC_cast_float8arr(new_location2);
END
$$ LANGUAGE plpgsql;

-- Sfunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_product(MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) 
RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
    new_location MADLIB_SCHEMA.SVEC;
BEGIN
    new_location = MADLIB_SCHEMA.SVEC_concat($2,MADLIB_SCHEMA.SVEC_cast_float8(1.0));
    IF ($1 IS NOT NULL) THEN
        new_location = $1 + new_location;
    END IF;
    RETURN new_location;
END
$$ LANGUAGE plpgsql;

-- Prefunc function for _kmeans_meanPosition() aggregate.
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__kmeans_mean_aggr( MADLIB_SCHEMA.SVEC, MADLIB_SCHEMA.SVEC) RETURNS MADLIB_SCHEMA.SVEC AS $$
DECLARE
BEGIN
    IF (($1 IS NOT NULL) AND ($2 IS NOT NULL)) THEN
        RETURN $1 + $2;
    END IF;
    IF ($1 IS NOT NULL) THEN
        RETURN $1;
    END IF;
    IF ($2 IS NOT NULL) THEN
        RETURN $2;
    END IF;
END
$$ LANGUAGE plpgsql;

/**
 * @internal
 * @brief Compute a mean position for a set of SVECs.
 */
CREATE AGGREGATE MADLIB_SCHEMA.__kmeans_meanPosition( MADLIB_SCHEMA.SVEC ) 
(
  stype = MADLIB_SCHEMA.SVEC,
  sfunc = MADLIB_SCHEMA.__kmeans_mean_product,
  ifdef(`GREENPLUM',`prefunc = MADLIB_SCHEMA.__kmeans_mean_aggr,')
  finalfunc = MADLIB_SCHEMA.__kmeans_mean_finalize
);

/**
 * @brief Compute a k-means clustering
 *
 * @param input_table Name of relation containing the input data points
 * @param k Number of centroids to generate
 * @param goodness Goodness of fit test flag (allowed values: 0, 1)
 * @param run_id Name/ID of the execution
 * @param output_schema Target schema for the output tables
 * @return Textual summary of the algorithm run, including the names of the
 *     created tables and run time statistics.
 *
 * @internal
 * @sa This function is a wrapper for kmeans::kmeans_run()
 */
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.kmeans_run( input_table text, k int, goodness int, run_id text, output_table text)
  RETURNS text
AS $$
    import sys
    try:
        from madlib import kmeans
    except:
        sys.path.append("PLPYTHON_LIBDIR")
        from madlib import kmeans
    
    plpy.execute( 'set client_min_messages=warning');
    return kmeans.kmeans_run( input_table, k, goodness, run_id, output_table);
$$ LANGUAGE plpythonu;

COMMIT;