Utilties: Refactor and clean cols2vec from 109be7d

JIRA: MADLIB-1239 Closes apache#288
madlib · Jul 13, 2018 · 625e537 · 625e537
1 parent 109be7d
commit 625e537
Show file tree

Hide file tree

Showing 5 changed files with 146 additions and 159 deletions.
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
@@ -262,6 +262,9 @@ Contains graph algorithms.
 
 
 @defgroup grp_other_functions Other Functions
+    @defgroup grp_cols2vec Columns to Vector
+    @ingroup grp_other_functions
+
     @defgroup grp_linear_solver Linear Solvers
     @ingroup grp_other_functions
     @{A collection of methods that implement solutions for systems of consistent linear equations. @}
@@ -284,8 +287,6 @@ Contains graph algorithms.
     @defgroup @grp_utilities Utilities
     @ingroup grp_other_functions
 
-    @defgroup grp_cols2vec Columns to Vector
-    @ingroup grp_utility_functions
 
 @defgroup grp_early_stage Early Stage Development
 @brief A collection of implementations which are in early stage of development.

diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in b/src/ports/postgres/modules/utilities/cols2vec.py_in
@@ -6,15 +6,17 @@
 """
 
 import plpy
-from utilities.control import MinWarning
-from utilities.utilities import split_quoted_delimited_str
-from utilities.utilities import _string_to_array
-from utilities.utilities import _assert
-from utilities.validate_args import columns_exist_in_table
-from utilities.validate_args import is_var_valid
-from utilities.validate_args import get_cols
-from utilities.validate_args import quote_ident
-from utilities.utilities import py_list_to_sql_string
+from control import MinWarning
+from internal.db_utils import quote_literal
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import _assert
+from utilities import add_postfix
+from validate_args import columns_exist_in_table
+from validate_args import is_var_valid
+from validate_args import get_cols
+from validate_args import quote_ident
+from utilities import py_list_to_sql_string
 
 
 m4_changequote(`<!', `!>')
@@ -31,81 +33,74 @@ def validate_cols2vec_args(source_table, output_table,
         _assert(
             columns_exist_in_table(
                 source_table, split_quoted_delimited_str(list_of_features)),
-            "Invalid columns to list of features {0}".format(list_of_features))
+            "Invalid columns in list_of_features {0}".format(list_of_features))
 
     if cols_to_output and cols_to_output.strip() != '*':
         _assert(
             columns_exist_in_table(
-                source_table, _string_to_array(cols_to_output)),
+                source_table, split_quoted_delimited_str(cols_to_output)),
             "Invalid columns to output list {0}".format(cols_to_output))
 
 
 def cols2vec(schema_madlib, source_table, output_table, list_of_features,
              list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
     """
     Args:
-        @param schema_madlib:               Name of MADlib schema
-        @param model:                       Name of table containing the tree model
-        @param source_table:                Name of table containing prediction data
-        @param output_table:                Name of table to output the results
-        @param list_of_features:            Comma-separated string of column names or
-                                            expressions to put into feature array.
-                                            Can also be a '*' implying all columns
-                                            are to be put into feature array.
+        @param schema_madlib: Name of MADlib schema
+        @param model: Name of table containing the tree model
+        @param source_table: Name of table containing prediction data
+        @param output_table: Name of table to output the results
+        @param list_of_features: Comma-separated string of column names or
+                                 expressions to put into feature array.
+                                 Can also be a '*' implying all columns
+                                 are to be put into feature array.
         @param list_of_features_to_exclude: Comma-separated string of column names
                                             to exclude from the feature array
-        @param cols_to_output:              Comma-separated string of column names
-                                            from the source table to keep in the output table,
-                                            in addition to the feature array.
+        @param cols_to_output: Comma-separated string of column names
+                               from the source table to keep in the output table,
+                               in addition to the feature array.
 
     Returns:
         None
 
     """
-
     with MinWarning('warning'):
         validate_cols2vec_args(source_table, output_table, list_of_features,
                                list_of_features_to_exclude, cols_to_output, **kwargs)
 
+        all_cols = get_cols(source_table, schema_madlib)
         if list_of_features.strip() == '*':
-            all_cols = get_cols(source_table, schema_madlib)
-            exclude_set = set(split_quoted_delimited_str(
-                list_of_features_to_exclude))
-            feature_list = [col for col in all_cols if col not in exclude_set]
-
+            exclude_set = set(split_quoted_delimited_str(list_of_features_to_exclude))
+            feature_list = [c for c in all_cols if c not in exclude_set]
         else:
             feature_list = split_quoted_delimited_str(list_of_features)
 
-        feature_cols = py_list_to_sql_string(
-            list(feature_list), "text", False)
-        filtered_list_of_features = ",".join(feature_list)
-
-        output_cols = ''
         if cols_to_output:
-            output_cols_list = [', '.join(get_cols(source_table, schema_madlib)) if col == '*' else col
-                                for col in split_quoted_delimited_str(cols_to_output)]
-            output_cols = ', '.join(output_cols_list) + ","
+            additional_cols = (all_cols if cols_to_output == '*' else
+                               split_quoted_delimited_str(cols_to_output))
+            additional_cols_str = ', '.join(additional_cols) + ","
+        else:
+            additional_cols_str = ''
 
+        feature_list_str = py_list_to_sql_string(feature_list, "TEXT[]", True)
         plpy.execute("""
-    		CREATE TABLE {output_table} AS
-    		select {output_cols}
-            array[{filtered_list_of_features}] as feature_vector
-    		from {source_table}
-    		""".format(**locals()))
+            CREATE TABLE {output_table} AS
+                SELECT {additional_cols_str}
+                       {feature_list_str} AS feature_vector
+                FROM {source_table}
+            """.format(**locals()))
 
-        plpy.execute("""
-            CREATE TABLE {output_table}_summary
-            (
-            source_table TEXT,
-            list_of_features TEXT,
-            list_of_features_to_exclude TEXT,
-            feature_cols TEXT[]
-            )""".format(output_table=output_table))
+        feature_cols = py_list_to_sql_string(
+            [quote_literal(f) for f in feature_list], "TEXT", True)
 
+        output_table_summary = add_postfix(output_table, "_summary")
+        # Dollar-quote the text to allow single-quotes without escaping
+        dq = "$__MADLIB_OUTER__$"
         plpy.execute("""
-            INSERT INTO {output_table}_summary
-            VALUES ('{source_table}','{list_of_features}',
-            '{list_of_features_to_exclude}',
-            (SELECT {feature_cols} as feature_names from {source_table} limit 1))
-            """.format( **locals()))
-        return
+            CREATE TABLE {output_table_summary} AS
+            SELECT
+                {dq}{source_table}{dq}::TEXT AS source_table,
+                {dq}{list_of_features}{dq}::TEXT AS list_of_features,
+                {dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude,
+                {feature_cols} AS feature_names
+            """.format(**locals()))
diff --git a/src/ports/postgres/modules/utilities/cols2vec.sql_in b/src/ports/postgres/modules/utilities/cols2vec.sql_in
@@ -1,5 +1,5 @@
-/* ----------------------------------------------------------------------- *//**
- *
+/* ----------------------------------------------------------------------- */
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -17,17 +17,16 @@
  * specific language governing permissions and limitations
  * under the License.
  *
- *//* ----------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------- *//**
-*
-*file cols2vec.sql_in
-*
-*brief A set of utilities to ease basic table transformations (such as *aggregating multiple columns in an array)
-*
-*
-*//* ----------------------------------------------------------------------- */
- m4_include(`SQLCommon.m4')
+ *
+ * @file cols2vec.sql_in
+ * @brief A set of utilities to ease basic table transformations
+ *  (such as *aggregating multiple columns in an array)
+ * @date July 2018
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
 
 
 /**
@@ -46,50 +45,60 @@
 @about
 Convert all feature columns in your table into an array in a single column.
 
-Given a table with varying number of columns, this function will create an output table that will contain the feature columns into an array. A summry table will be created and will the names
-of the features into array so that this process can be reversed using the function
-vec2cols from array_utilities in PDLTools.
+Given a table with varying number of columns, this function will create an
+output table that will contain the feature columns into an array. A summary
+table will be created and will contain the names of the features combined into
+array so that this process can be reversed using the function vec2cols.
 
-The columns that need NOT be included in the feature array need to be specified in the exclude_columns field.
+The columns that need NOT be included in the feature array need to be specified
+in the exclude_columns field.
 
 @anchor cols2vec_usage
 @usage
 
 <pre class="syntax">
 cols2vec(
-    source_table ,
-    output_table ,
+    source_table,
+    output_table,
     list_of_features,
-    list_of_features_to_exclude ,
-    cols_to_output  
-) 
+    list_of_features_to_exclude,
+    cols_to_output
+)
 </pre>
 
 \b Arguments
 <dl class="arglist">
 <dt>source_table</dt>
-<dd>TEXT. Name of the table containing the source data.</tt>.
+<dd>TEXT. Name of the table containing the source data.</dd>.
 
 <dt>output_table</dt>
-<dd>TEXT. Name of the generated table containing the output.</tt>
+<dd>TEXT. Name of the generated table containing the output.</dd>
 
 <dt>list_of_features</dt>
-<dd>TEXT. Comma-separated string of column names or expressions to put into feature array. Can also be a '*' implying all columns are to be put into feature array (except for the ones included in the next argument that lists exclusions). Array columns in the source table are not supported in the 'list_of_features'. </tt>
+<dd>TEXT.
+Comma-separated string of column names or expressions to put into feature array.
+Can also be a '*' implying all columns are to be put into feature array (except
+for the ones included in the next argument that lists exclusions). Array columns
+in the source table are not supported in the 'list_of_features'. </dd>
 
 <dt>list_of_features_to_exclude</dt>
-<dd>TEXT. Default NULL. Comma-separated string of column names to exclude from the feature array.  Use only when 'list_of_features' is '*'. </tt>
+<dd>TEXT. Default NULL.
+Comma-separated string of column names to exclude from the feature array.  Use
+only when 'list_of_features' is '*'. </dd>
 
 <dt>cols_to_output</dt>
-<dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array.  To keep all columns from the source table, use '*'. </tt>
-
-</dd>
+<dd>TEXT. Default NULL.
+Comma-separated string of column names from the source table to keep in the
+output table, in addition to the feature array.  To keep all columns from the
+source table, use '*'. </dd>
 
+</dl>
 
 @anchor cols2vec_example
-@examp
-
-<pre class="syntax">
+@par Examples
 
+-# Create an input dataset and run the function
+<pre class="example">
 DROP TABLE IF EXISTS cols2vec;
 CREATE TABLE cols2vec (
     id              bigint,
@@ -99,60 +108,42 @@ CREATE TABLE cols2vec (
     feat3           float,
     other_col       float
 );
-
 INSERT INTO cols2vec  VALUES
 (1, 0, 1, 1, 0.5, 0.9),
 (2, 1, 0, 1, 0.3, 0.3),
 (3, 0, 0, 0, 0.1, 1.1),
 (4, 1, 1, 0, 0.9, 0.4);
-
-
-<pre>
-
-<pre class="syntax">
-drop table if exists cols2vec_result;
-drop table if exists cols2vec_result_summary;
-
-
-select cols2vec(
+DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
+SELECT cols2vec(
       'cols2vec',                  -- input table
-      'cols2vec_result',		       -- Output table
+      'cols2vec_result',           -- Output table
       'feat1,feat2,feat3',         -- Comma Seperated List of Features
-      'id',                        --  Features To Exclude
-      'id,label'	 			           -- Output columns to be included in
-      								                output table
+      'id',                        -- Features To Exclude
+      'id,label'                   -- Columns to be included in output
 );
-
-select * from cols2vec_result;
+SELECT * FROM cols2vec_result;
 </pre>
 
-
 -# Expected output:
-
 <pre class="result">
-select * from cols2vec_result;
-
- id | label | feature_vector 
+ id | label | feature_vector
 ----+-------+----------------
   1 |     0 | {1,1,0.5}
   2 |     1 | {0,1,0.3}
   3 |     0 | {0,0,0.1}
   4 |     1 | {1,0,0.9}
 (4 rows)
-
-select * from cols2vec_result_summary;
-    feature_names    
----------------------
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
- {feat1,feat2,feat3}
-(4 rows)
-
 </pre>
 
-
-<pre class="syntax">
+-# Summary table
+<pre>
+SELECT * FROM cols2vec_result_summary
+</pre>
+<pre class="result">
+ source_table | list_of_features  | list_of_features_to_exclude |    feature_names
+--------------+-------------------+-----------------------------+---------------------
+ cols2vec     | feat1,feat2,feat3 | id                          | {feat1,feat2,feat3}
+(1 row)
 </pre>
 */
 
@@ -162,11 +153,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     output_table VARCHAR,
     list_of_features VARCHAR,
     list_of_features_to_exclude VARCHAR,
-    cols_to_output VARCHAR 
+    cols_to_output VARCHAR
 ) RETURNS void AS $$
-    PythonFunction(cols_vec, cols2vec, cols2vec)
+    PythonFunction(utilities, cols2vec, cols2vec)
 $$ LANGUAGE plpythonu VOLATILE
-m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');    
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
@@ -175,8 +166,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     list_of_features VARCHAR,
     list_of_features_to_exclude VARCHAR
 ) RETURNS void AS $$
-SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,$4,NULL)
-
+    SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, $4, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -185,7 +175,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
     output_table VARCHAR,
     list_of_features VARCHAR
 ) RETURNS void AS $$
-SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,NULL,NULL)
-
+    SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL)
 $$ LANGUAGE SQL
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');