Skip to content

Commit

Permalink
Utilities: Refactor and clean cols2vec from 109be7d
Browse files Browse the repository at this point in the history
JIRA: MADLIB-1239

Closes apache#288
  • Loading branch information
iyerr3 committed Jul 13, 2018
1 parent 109be7d commit 625e537
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 159 deletions.
5 changes: 3 additions & 2 deletions doc/mainpage.dox.in
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,9 @@ Contains graph algorithms.


@defgroup grp_other_functions Other Functions
@defgroup grp_cols2vec Columns to Vector
@ingroup grp_other_functions

@defgroup grp_linear_solver Linear Solvers
@ingroup grp_other_functions
@{A collection of methods that implement solutions for systems of consistent linear equations. @}
Expand All @@ -284,8 +287,6 @@ Contains graph algorithms.
@defgroup grp_utilities Utilities
@ingroup grp_other_functions

@defgroup grp_cols2vec Columns to Vector
@ingroup grp_utility_functions

@defgroup grp_early_stage Early Stage Development
@brief A collection of implementations which are in early stage of development.
Expand Down
105 changes: 50 additions & 55 deletions src/ports/postgres/modules/utilities/cols2vec.py_in
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,17 @@
"""

import plpy
from utilities.control import MinWarning
from utilities.utilities import split_quoted_delimited_str
from utilities.utilities import _string_to_array
from utilities.utilities import _assert
from utilities.validate_args import columns_exist_in_table
from utilities.validate_args import is_var_valid
from utilities.validate_args import get_cols
from utilities.validate_args import quote_ident
from utilities.utilities import py_list_to_sql_string
from control import MinWarning
from internal.db_utils import quote_literal
from utilities import split_quoted_delimited_str
from utilities import _string_to_array
from utilities import _assert
from utilities import add_postfix
from validate_args import columns_exist_in_table
from validate_args import is_var_valid
from validate_args import get_cols
from validate_args import quote_ident
from utilities import py_list_to_sql_string


m4_changequote(`<!', `!>')
Expand All @@ -31,81 +33,74 @@ def validate_cols2vec_args(source_table, output_table,
_assert(
columns_exist_in_table(
source_table, split_quoted_delimited_str(list_of_features)),
"Invalid columns to list of features {0}".format(list_of_features))
"Invalid columns in list_of_features {0}".format(list_of_features))

if cols_to_output and cols_to_output.strip() != '*':
_assert(
columns_exist_in_table(
source_table, _string_to_array(cols_to_output)),
source_table, split_quoted_delimited_str(cols_to_output)),
"Invalid columns to output list {0}".format(cols_to_output))


def cols2vec(schema_madlib, source_table, output_table, list_of_features,
list_of_features_to_exclude=None, cols_to_output=None, **kwargs):
"""
Args:
@param schema_madlib: Name of MADlib schema
@param model: Name of table containing the tree model
@param source_table: Name of table containing prediction data
@param output_table: Name of table to output the results
@param list_of_features: Comma-separated string of column names or
expressions to put into feature array.
Can also be a '*' implying all columns
are to be put into feature array.
@param schema_madlib: Name of MADlib schema
@param model: Name of table containing the tree model
@param source_table: Name of table containing prediction data
@param output_table: Name of table to output the results
@param list_of_features: Comma-separated string of column names or
expressions to put into feature array.
Can also be a '*' implying all columns
are to be put into feature array.
@param list_of_features_to_exclude: Comma-separated string of column names
to exclude from the feature array
@param cols_to_output: Comma-separated string of column names
from the source table to keep in the output table,
in addition to the feature array.
@param cols_to_output: Comma-separated string of column names
from the source table to keep in the output table,
in addition to the feature array.

Returns:
None

"""

with MinWarning('warning'):
validate_cols2vec_args(source_table, output_table, list_of_features,
list_of_features_to_exclude, cols_to_output, **kwargs)

all_cols = get_cols(source_table, schema_madlib)
if list_of_features.strip() == '*':
all_cols = get_cols(source_table, schema_madlib)
exclude_set = set(split_quoted_delimited_str(
list_of_features_to_exclude))
feature_list = [col for col in all_cols if col not in exclude_set]

exclude_set = set(split_quoted_delimited_str(list_of_features_to_exclude))
feature_list = [c for c in all_cols if c not in exclude_set]
else:
feature_list = split_quoted_delimited_str(list_of_features)

feature_cols = py_list_to_sql_string(
list(feature_list), "text", False)
filtered_list_of_features = ",".join(feature_list)

output_cols = ''
if cols_to_output:
output_cols_list = [', '.join(get_cols(source_table, schema_madlib)) if col == '*' else col
for col in split_quoted_delimited_str(cols_to_output)]
output_cols = ', '.join(output_cols_list) + ","
additional_cols = (all_cols if cols_to_output == '*' else
split_quoted_delimited_str(cols_to_output))
additional_cols_str = ', '.join(additional_cols) + ","
else:
additional_cols_str = ''

feature_list_str = py_list_to_sql_string(feature_list, "TEXT[]", True)
plpy.execute("""
CREATE TABLE {output_table} AS
select {output_cols}
array[{filtered_list_of_features}] as feature_vector
from {source_table}
""".format(**locals()))
CREATE TABLE {output_table} AS
SELECT {additional_cols_str}
{feature_list_str} AS feature_vector
FROM {source_table}
""".format(**locals()))

plpy.execute("""
CREATE TABLE {output_table}_summary
(
source_table TEXT,
list_of_features TEXT,
list_of_features_to_exclude TEXT,
feature_cols TEXT[]
)""".format(output_table=output_table))
feature_cols = py_list_to_sql_string(
[quote_literal(f) for f in feature_list], "TEXT", True)

output_table_summary = add_postfix(output_table, "_summary")
# Dollar-quote the text to allow single-quotes without escaping
dq = "$__MADLIB_OUTER__$"
plpy.execute("""
INSERT INTO {output_table}_summary
VALUES ('{source_table}','{list_of_features}',
'{list_of_features_to_exclude}',
(SELECT {feature_cols} as feature_names from {source_table} limit 1))
""".format( **locals()))
return
CREATE TABLE {output_table_summary} AS
SELECT
{dq}{source_table}{dq}::TEXT AS source_table,
{dq}{list_of_features}{dq}::TEXT AS list_of_features,
{dq}{list_of_features_to_exclude}{dq}::TEXT AS list_of_features_to_exclude,
{feature_cols} AS feature_names
""".format(**locals()))
135 changes: 62 additions & 73 deletions src/ports/postgres/modules/utilities/cols2vec.sql_in
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------- *//**
*
/* ----------------------------------------------------------------------- */
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
Expand All @@ -17,17 +17,16 @@
* specific language governing permissions and limitations
* under the License.
*
*//* ----------------------------------------------------------------------- */

/* ----------------------------------------------------------------------- *//**
*
*file cols2vec.sql_in
*
*brief A set of utilities to ease basic table transformations (such as *aggregating multiple columns in an array)
*
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
*
* @file cols2vec.sql_in
* @brief A set of utilities to ease basic table transformations
* (such as aggregating multiple columns in an array)
* @date July 2018
*
*/
/* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4')


/**
Expand All @@ -46,50 +45,60 @@
@about
Convert all feature columns in your table into an array in a single column.

Given a table with varying number of columns, this function will create an output table that will contain the feature columns into an array. A summry table will be created and will the names
of the features into array so that this process can be reversed using the function
vec2cols from array_utilities in PDLTools.
Given a table with a varying number of columns, this function will create an
output table that combines the feature columns into an array. A summary
table will also be created containing the names of the features combined into
the array, so that this process can be reversed using the function vec2cols.

The columns that need NOT be included in the feature array need to be specified in the exclude_columns field.
Columns that should NOT be included in the feature array must be specified
in the list_of_features_to_exclude argument.

@anchor cols2vec_usage
@usage

<pre class="syntax">
cols2vec(
source_table ,
output_table ,
source_table,
output_table,
list_of_features,
list_of_features_to_exclude ,
cols_to_output
)
list_of_features_to_exclude,
cols_to_output
)
</pre>

\b Arguments
<dl class="arglist">
<dt>source_table</dt>
<dd>TEXT. Name of the table containing the source data.</tt>.
<dd>TEXT. Name of the table containing the source data.</dd>

<dt>output_table</dt>
<dd>TEXT. Name of the generated table containing the output.</tt>
<dd>TEXT. Name of the generated table containing the output.</dd>

<dt>list_of_features</dt>
<dd>TEXT. Comma-separated string of column names or expressions to put into feature array. Can also be a '*' implying all columns are to be put into feature array (except for the ones included in the next argument that lists exclusions). Array columns in the source table are not supported in the 'list_of_features'. </tt>
<dd>TEXT.
Comma-separated string of column names or expressions to put into feature array.
Can also be a '*' implying all columns are to be put into feature array (except
for the ones included in the next argument that lists exclusions). Array columns
in the source table are not supported in the 'list_of_features'. </dd>

<dt>list_of_features_to_exclude</dt>
<dd>TEXT. Default NULL. Comma-separated string of column names to exclude from the feature array. Use only when 'list_of_features' is '*'. </tt>
<dd>TEXT. Default NULL.
Comma-separated string of column names to exclude from the feature array. Use
only when 'list_of_features' is '*'. </dd>

<dt>cols_to_output</dt>
<dd>TEXT. Default NULL. Comma-separated string of column names from the source table to keep in the output table, in addition to the feature array. To keep all columns from the source table, use '*'. </tt>

</dd>
<dd>TEXT. Default NULL.
Comma-separated string of column names from the source table to keep in the
output table, in addition to the feature array. To keep all columns from the
source table, use '*'. </dd>

</dl>

@anchor cols2vec_example
@examp

<pre class="syntax">
@par Examples

-# Create an input dataset and run the function
<pre class="example">
DROP TABLE IF EXISTS cols2vec;
CREATE TABLE cols2vec (
id bigint,
Expand All @@ -99,60 +108,42 @@ CREATE TABLE cols2vec (
feat3 float,
other_col float
);

INSERT INTO cols2vec VALUES
(1, 0, 1, 1, 0.5, 0.9),
(2, 1, 0, 1, 0.3, 0.3),
(3, 0, 0, 0, 0.1, 1.1),
(4, 1, 1, 0, 0.9, 0.4);


<pre>

<pre class="syntax">
drop table if exists cols2vec_result;
drop table if exists cols2vec_result_summary;


select cols2vec(
DROP TABLE IF EXISTS cols2vec_result, cols2vec_result_summary;
SELECT cols2vec(
'cols2vec', -- input table
'cols2vec_result', -- Output table
'cols2vec_result', -- Output table
'feat1,feat2,feat3', -- Comma Separated List of Features
'id', -- Features To Exclude
'id,label' -- Output columns to be included in
output table
'id', -- Features To Exclude
'id,label' -- Columns to be included in output
);

select * from cols2vec_result;
SELECT * FROM cols2vec_result;
</pre>


-# Expected output:

<pre class="result">
select * from cols2vec_result;

id | label | feature_vector
id | label | feature_vector
----+-------+----------------
1 | 0 | {1,1,0.5}
2 | 1 | {0,1,0.3}
3 | 0 | {0,0,0.1}
4 | 1 | {1,0,0.9}
(4 rows)

select * from cols2vec_result_summary;
feature_names
---------------------
{feat1,feat2,feat3}
{feat1,feat2,feat3}
{feat1,feat2,feat3}
{feat1,feat2,feat3}
(4 rows)

</pre>


<pre class="syntax">
-# Summary table
<pre class="example">
SELECT * FROM cols2vec_result_summary;
</pre>
<pre class="result">
source_table | list_of_features | list_of_features_to_exclude | feature_names
--------------+-------------------+-----------------------------+---------------------
cols2vec | feat1,feat2,feat3 | id | {feat1,feat2,feat3}
(1 row)
</pre>
*/

Expand All @@ -162,11 +153,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
output_table VARCHAR,
list_of_features VARCHAR,
list_of_features_to_exclude VARCHAR,
cols_to_output VARCHAR
cols_to_output VARCHAR
) RETURNS void AS $$
PythonFunction(cols_vec, cols2vec, cols2vec)
PythonFunction(utilities, cols2vec, cols2vec)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
Expand All @@ -175,8 +166,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
list_of_features VARCHAR,
list_of_features_to_exclude VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,$4,NULL)

SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, $4, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

Expand All @@ -185,7 +175,6 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.cols2vec(
output_table VARCHAR,
list_of_features VARCHAR
) RETURNS void AS $$
SELECT MADLIB_SCHEMA.cols2vec($1,$2,$3,NULL,NULL)

SELECT MADLIB_SCHEMA.cols2vec($1, $2, $3, NULL, NULL)
$$ LANGUAGE SQL
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');

0 comments on commit 625e537

Please sign in to comment.