diff --git a/examples/gpce/logistic-regress.sql b/examples/gpce/logistic-regress.sql index cba22040a..9c587c6ec 100644 --- a/examples/gpce/logistic-regress.sql +++ b/examples/gpce/logistic-regress.sql @@ -96,6 +96,6 @@ SELECT * FROM artificiallogreg LIMIT 5; \qecho === Calculate Coefficients from artificial data: ================ -SELECT madlib.logreg_coef( +SELECT madlib.logregr_coef( 'artificiallogreg', 'y', 'x', 20, 'irls', 0.001 )::REAL[]; diff --git a/methods/bayes/src/pg_gp/bayes.py_in b/methods/bayes/src/pg_gp/bayes.py_in index 7877f5a9f..4495c9171 100644 --- a/methods/bayes/src/pg_gp/bayes.py_in +++ b/methods/bayes/src/pg_gp/bayes.py_in @@ -15,526 +15,526 @@ Laplacian smoothing). Naive Bayes: Setup Functions @internal - @implementation - - For the Naive Bayes Classification, we need a product over probabilities. - However, multiplying lots of small numbers can lead to an exponent overflow. - E.g., multiplying more than 324 numbers at most 0.1 will yield a product of 0 - in machine arithmetic. A safer way is therefore summing logarithms. - - By the IEEE 754 standard, the smallest number representable as - DOUBLE PRECISION (64bit) is $2^{-1022}$, i.e., approximately 2.225e-308. - See, e.g., http://en.wikipedia.org/wiki/Double_precision - Hence, log(x) = log_10(x) for any non-zero DOUBLE PRECISION @f$x \ge -308@f$. - - Note for theorists: - - Even adding infinitely many \f$ \log_{10}(x)@f$ for @f$0 < x \le 1 \f$ will - never cause an overflow because addition will have no effect once the sum - reaches approx $308 * 2^{53}$ (correspnding to the machine precision). - - The functions __get_*_sql are private because we do not want to commit ourselves - to a particular interface. We might want to be able to change implementation - details should the need arise. + @implementation + + For the Naive Bayes Classification, we need a product over probabilities. + However, multiplying lots of small numbers can lead to an exponent overflow. + E.g., multiplying more than 324 numbers at most 0.1 will yield a product of 0 + in machine arithmetic. A safer way is therefore summing logarithms. + + By the IEEE 754 standard, the smallest number representable as + DOUBLE PRECISION (64bit) is $2^{-1022}$, i.e., approximately 2.225e-308. + See, e.g., http://en.wikipedia.org/wiki/Double_precision + Hence, log(x) = log_10(x) for any non-zero DOUBLE PRECISION @f$x \ge -308@f$. + + Note for theorists: + - Even adding infinitely many \f$ \log_{10}(x)@f$ for @f$0 < x \le 1 \f$ will + never cause an overflow because addition will have no effect once the sum + reaches approx $308 * 2^{53}$ (correspnding to the machine precision). + + The functions __get_*_sql are private because we do not want to commit ourselves + to a particular interface. We might want to be able to change implementation + details should the need arise. @endinternal """ import plpy def __get_feature_probs_sql(**kwargs): - """Return SQL query with columns (class, attr, value, cnt, attr_cnt). - - For class c, attr i, and value a, cnt is #(c,i,a) and attr_cnt is \#i. - - Note that the query will contain a row for every pair (class, value) - occuring in the training data (so it might also contain rows where - \#(c,i,a) = 0). - - @param classPriorsSource Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of rows in - \em trainingSource - @param attrValuesSource Relation (attr, value) containing all distinct - attribute, value pairs. 
If omitted, will use __get_attr_values_sql() - @param attrCountsSource Relation (attr, attr_cnt) where attr is i and - attr_cnt is \#i. If omitted, will use __get_attr_counts_sql() - @param trainingSource name of relation containing training data - @param trainingClassColumn name of column with class - @param trainingAttrColumn name of column with attributes array - @param numAttrs Number of attributes to use for classification - - For meanings of \#(c,i,a), \#c, and \#i see the general description of - \ref bayes. - """ - - if not 'attrValuesSource' in kwargs: - kwargs.update(dict( - attrValuesSource = "(" + __get_attr_values_sql(**kwargs) + ")" - )) - if not 'attrCountsSource' in kwargs: - kwargs.update(dict( - attrCountsSource = "(" + __get_attr_counts_sql(**kwargs) + ")" - )) + """Return SQL query with columns (class, attr, value, cnt, attr_cnt). + + For class c, attr i, and value a, cnt is #(c,i,a) and attr_cnt is \#i. + + Note that the query will contain a row for every pair (class, value) + occuring in the training data (so it might also contain rows where + \#(c,i,a) = 0). + + @param classPriorsSource Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of rows in + \em trainingSource + @param attrValuesSource Relation (attr, value) containing all distinct + attribute, value pairs. If omitted, will use __get_attr_values_sql() + @param attrCountsSource Relation (attr, attr_cnt) where attr is i and + attr_cnt is \#i. If omitted, will use __get_attr_counts_sql() + @param trainingSource name of relation containing training data + @param trainingClassColumn name of column with class + @param trainingAttrColumn name of column with attributes array + @param numAttrs Number of attributes to use for classification + + For meanings of \#(c,i,a), \#c, and \#i see the general description of + \ref bayes. + """ + + if not 'attrValuesSource' in kwargs: + kwargs.update(dict( + attrValuesSource = "(" + __get_attr_values_sql(**kwargs) + ")" + )) + if not 'attrCountsSource' in kwargs: + kwargs.update(dict( + attrCountsSource = "(" + __get_attr_counts_sql(**kwargs) + ")" + )) # {trainingSource} cannot be a subquery, because we use it more than once in # our generated SQL. 
- return """ - SELECT - class, - attr, - value, - coalesce(cnt, 0) AS cnt, - attr_cnt - FROM - ( - SELECT * - FROM - {classPriorsSource} AS classes - CROSS JOIN - {attrValuesSource} AS attr_values - ) AS required_triples - LEFT OUTER JOIN - ( - SELECT - trainingSource.{trainingClassColumn} AS class, - attr.attr, - trainingSource.{trainingAttrColumn}[attr.attr] AS value, - count(*) AS cnt - FROM - generate_series(1, {numAttrs}) AS attr, - {trainingSource} AS trainingSource - GROUP BY + return """ + SELECT + class, + attr, + value, + coalesce(cnt, 0) AS cnt, + attr_cnt + FROM + ( + SELECT * + FROM + {classPriorsSource} AS classes + CROSS JOIN + {attrValuesSource} AS attr_values + ) AS required_triples + LEFT OUTER JOIN + ( + SELECT + trainingSource.{trainingClassColumn} AS class, + attr.attr, + trainingSource.{trainingAttrColumn}[attr.attr] AS value, + count(*) AS cnt + FROM + generate_series(1, {numAttrs}) AS attr, + {trainingSource} AS trainingSource + GROUP BY trainingSource.{trainingClassColumn}, attr.attr, trainingSource.{trainingAttrColumn}[attr.attr] - ) AS triple_counts - USING (class, attr, value) - INNER JOIN - {attrCountsSource} AS attr_counts - USING (attr) - """.format(**kwargs) + ) AS triple_counts + USING (class, attr, value) + INNER JOIN + {attrCountsSource} AS attr_counts + USING (attr) + """.format(**kwargs) def __get_attr_values_sql(**kwargs): - """ - Return SQL query with columns (attr, value). - - The query contains a row for each pair that occurs in the training data. - - @param trainingSource Name of relation containing the training data - @param trainingAttrColumn Name of attributes-array column in training data - @param numAttrs Number of attributes to use for classification - - @internal - \par Implementation Notes: - If PostgreSQL supported count(DISTINCT ...) for window functions, we could - consolidate this function with __get_attr_counts_sql(): - @verbatim - [...] count(DISTINCT value) OVER (PARTITION BY attr) [...] - @endverbatim - @endinternal - - """ - - return """ - SELECT DISTINCT - attr.attr, - trainingSource.{trainingAttrColumn}[attr.attr] AS value - FROM - generate_series(1, {numAttrs}) AS attr, - {trainingSource} AS trainingSource - """.format(**kwargs) + """ + Return SQL query with columns (attr, value). + + The query contains a row for each pair that occurs in the training data. + + @param trainingSource Name of relation containing the training data + @param trainingAttrColumn Name of attributes-array column in training data + @param numAttrs Number of attributes to use for classification + + @internal + \par Implementation Notes: + If PostgreSQL supported count(DISTINCT ...) for window functions, we could + consolidate this function with __get_attr_counts_sql(): + @verbatim + [...] count(DISTINCT value) OVER (PARTITION BY attr) [...] + @endverbatim + @endinternal + + """ + + return """ + SELECT DISTINCT + attr.attr, + trainingSource.{trainingAttrColumn}[attr.attr] AS value + FROM + generate_series(1, {numAttrs}) AS attr, + {trainingSource} AS trainingSource + """.format(**kwargs) def __get_attr_counts_sql(**kwargs): - """ - Return SQL query with columns (attr, attr_cnt) - - For attr i, attr_cnt is \#i. 
- - @param trainingSource Name of relation containing the training data - @param trainingAttrColumn Name of attributes-array column in training data - @param numAttrs Number of attributes to use for classification - - """ - - return """ - SELECT - attr.attr, - count( + """ + Return SQL query with columns (attr, attr_cnt) + + For attr i, attr_cnt is \#i. + + @param trainingSource Name of relation containing the training data + @param trainingAttrColumn Name of attributes-array column in training data + @param numAttrs Number of attributes to use for classification + + """ + + return """ + SELECT + attr.attr, + count( DISTINCT trainingSource.{trainingAttrColumn}[attr.attr] ) AS attr_cnt - FROM - generate_series(1, {numAttrs}) AS attr, - {trainingSource} AS trainingSource - GROUP BY attr.attr - """.format(**kwargs) + FROM + generate_series(1, {numAttrs}) AS attr, + {trainingSource} AS trainingSource + GROUP BY attr.attr + """.format(**kwargs) def __get_class_priors_sql(**kwargs): - """ - Return SQL query with columns (class, class_cnt, all_cnt) - - For class c, class_cnt is \#c. all_cnt is the total number of records in the - training data. - - @param trainingSource Name of relation containing the training data - @param trainingClassColumn Name of class column in training data - - """ - - return """ - SELECT - trainingSource.{trainingClassColumn} AS class, - count(*) AS class_cnt, - sum(count(*)) OVER () AS all_cnt - FROM {trainingSource} AS trainingSource - GROUP BY trainingSource.{trainingClassColumn} - """.format(**kwargs) + """ + Return SQL query with columns (class, class_cnt, all_cnt) + + For class c, class_cnt is \#c. all_cnt is the total number of records in the + training data. + + @param trainingSource Name of relation containing the training data + @param trainingClassColumn Name of class column in training data + + """ + + return """ + SELECT + trainingSource.{trainingClassColumn} AS class, + count(*) AS class_cnt, + sum(count(*)) OVER () AS all_cnt + FROM {trainingSource} AS trainingSource + GROUP BY trainingSource.{trainingClassColumn} + """.format(**kwargs) def __get_keys_and_prob_values_sql(**kwargs): - """ - Return SQL query with columns (key, class, log_prob). - - For class c and the attribute array identified by key k, log_prob is - log( P(C = c) * P(A = a(k)[] | C = c) ). - - For each key k and class c, the query also contains a row (k, c, NULL). This - is for technical reasons (we want every key-class pair to appear in the - query. NULL serves as a default value if there is insufficient training data - to compute a probability value). - - @param numAttrs Number of attributes to use for classification - @param classifySource Name of the relation that contains data to be classified - @param classifyKeyColumn Name of column in \em classifySource that can - serve as unique identifier - @param classifyAttrColumn Name of attributes-array column in \em classifySource - @param classPriorsSource - Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of training - samples. - @param featureProbsSource - Relation (class, attr, value, cnt, attr_cnt) where - (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i - @param smoothingFactor Smoothing factor for computing feature - feature probabilities. Default value: 1.0 (Laplacian Smoothing). - - """ + """ + Return SQL query with columns (key, class, log_prob). + + For class c and the attribute array identified by key k, log_prob is + log( P(C = c) * P(A = a(k)[] | C = c) ). 
+ + For each key k and class c, the query also contains a row (k, c, NULL). This + is for technical reasons (we want every key-class pair to appear in the + query. NULL serves as a default value if there is insufficient training data + to compute a probability value). + + @param numAttrs Number of attributes to use for classification + @param classifySource Name of the relation that contains data to be classified + @param classifyKeyColumn Name of column in \em classifySource that can + serve as unique identifier + @param classifyAttrColumn Name of attributes-array column in \em classifySource + @param classPriorsSource + Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of training + samples. + @param featureProbsSource + Relation (class, attr, value, cnt, attr_cnt) where + (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i + @param smoothingFactor Smoothing factor for computing feature + feature probabilities. Default value: 1.0 (Laplacian Smoothing). + + """ # {classifySource} cannot be a subquery, because we use it more than once in # our generated SQL. - return """ - SELECT - classify.key, - classPriors.class, - CASE WHEN count(*) < {numAttrs} THEN NULL - ELSE - log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt) - + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor}) - / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) ) - END - AS log_prob - FROM - {featureProbsSource} AS featureProbs, - {classPriorsSource} AS classPriors, - ( - SELECT - classifySource.{classifyKeyColumn} AS key, - attr.attr, - classifySource.{classifyAttrColumn}[attr.attr] AS value - FROM - {classifySource} AS classifySource, - generate_series(1, {numAttrs}) AS attr - ) AS classify - WHERE - featureProbs.class = classPriors.class AND - featureProbs.attr = classify.attr AND - featureProbs.value = classify.value AND - ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0 - GROUP BY - classify.key, classPriors.class, classPriors.class_cnt, classPriors.all_cnt - - UNION - - SELECT - classify.{classifyKeyColumn} AS key, - classes.class, - NULL - FROM - {classifySource} AS classify, - {classPriorsSource} AS classes - GROUP BY classify.{classifyKeyColumn}, classes.class - """.format(**kwargs) + return """ + SELECT + classify.key, + classPriors.class, + CASE WHEN count(*) < {numAttrs} THEN NULL + ELSE + log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt) + + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor}) + / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) ) + END + AS log_prob + FROM + {featureProbsSource} AS featureProbs, + {classPriorsSource} AS classPriors, + ( + SELECT + classifySource.{classifyKeyColumn} AS key, + attr.attr, + classifySource.{classifyAttrColumn}[attr.attr] AS value + FROM + {classifySource} AS classifySource, + generate_series(1, {numAttrs}) AS attr + ) AS classify + WHERE + featureProbs.class = classPriors.class AND + featureProbs.attr = classify.attr AND + featureProbs.value = classify.value AND + ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0 + GROUP BY + classify.key, classPriors.class, classPriors.class_cnt, classPriors.all_cnt + + UNION + + SELECT + classify.{classifyKeyColumn} AS key, + classes.class, + NULL + FROM + {classifySource} AS classify, + {classPriorsSource} AS classes + GROUP BY classify.{classifyKeyColumn}, classes.class + """.format(**kwargs) def 
__get_prob_values_sql(**kwargs): - """ - Return SQL query with columns (class, log_prob), given an array of - attributes. - - The query binds to an attribute array a[]. For every class c, log_prob - is log( P(C = c) * P(A = a[] | C = c) ). - - @param classifyAttrColumn Array of attributes to bind to. This can be - a column name of an outer query or a literal. - @param smoothingFactor Smoothing factor to use for estimating the feature - probabilities. - @param numAttrs Number of attributes to use for classification - @param classPriorsSource - Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of training - samples. - @param featureProbsSource - Relation (class, attr, value, cnt, attr_cnt) where - (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i - - Note that unless \em classifyAttrColumn is a literal, the SQL query will - become a correlated subquery and will not work in Greenplum. - - """ + """ + Return SQL query with columns (class, log_prob), given an array of + attributes. + + The query binds to an attribute array a[]. For every class c, log_prob + is log( P(C = c) * P(A = a[] | C = c) ). + + @param classifyAttrColumn Array of attributes to bind to. This can be + a column name of an outer query or a literal. + @param smoothingFactor Smoothing factor to use for estimating the feature + probabilities. + @param numAttrs Number of attributes to use for classification + @param classPriorsSource + Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of training + samples. + @param featureProbsSource + Relation (class, attr, value, cnt, attr_cnt) where + (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i + + Note that unless \em classifyAttrColumn is a literal, the SQL query will + become a correlated subquery and will not work in Greenplum. + + """ # {classifyAttrColumn} binds to a names declared outside of the following # SQL. We need to ensure that ther are no conflicting names with # {classifyAttrColumn}. Therefore, we only introduce the unusual name # __attr. Note that by the structure of the query, there can be no other # name conflicts. 
- return """ - SELECT - classPriors.class, - CASE WHEN count(*) < {numAttrs} THEN NULL - ELSE - log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt) - + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor}) - / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) ) - END - AS log_prob - FROM - {featureProbsSource} AS featureProbs, - {classPriorsSource} AS classPriors, - ( - SELECT - __attr.__attr, - {classifyAttrColumn}[__attr.__attr] AS value - FROM - generate_series(1, {numAttrs}) AS __attr - ) AS classify - WHERE - featureProbs.class = classPriors.class AND - featureProbs.attr = classify.__attr AND featureProbs.value = classify.value AND - ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0 - GROUP BY classPriors.class, classPriors.class_cnt, classPriors.all_cnt - - UNION - - SELECT - classes.class, - NULL - FROM - {classPriorsSource} AS classes - """.format(**kwargs) + return """ + SELECT + classPriors.class, + CASE WHEN count(*) < {numAttrs} THEN NULL + ELSE + log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt) + + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor}) + / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) ) + END + AS log_prob + FROM + {featureProbsSource} AS featureProbs, + {classPriorsSource} AS classPriors, + ( + SELECT + __attr.__attr, + {classifyAttrColumn}[__attr.__attr] AS value + FROM + generate_series(1, {numAttrs}) AS __attr + ) AS classify + WHERE + featureProbs.class = classPriors.class AND + featureProbs.attr = classify.__attr AND featureProbs.value = classify.value AND + ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0 + GROUP BY classPriors.class, classPriors.class_cnt, classPriors.all_cnt + + UNION + + SELECT + classes.class, + NULL + FROM + {classPriorsSource} AS classes + """.format(**kwargs) def __get_classification_sql(**kwargs): - """ - Return SQL query with columns (key, nb_classification, nb_log_probability) - - @param keys_and_prob_values Relation (key, class, log_prob) - - """ - - return """ - SELECT - key, - MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification, - max(log_prob) AS nb_log_probability - FROM {keys_and_prob_values} AS keys_and_nb_values - GROUP BY key - """.format( - keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" - ) + """ + Return SQL query with columns (key, nb_classification, nb_log_probability) + + @param keys_and_prob_values Relation (key, class, log_prob) + + """ + + return """ + SELECT + key, + MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification, + max(log_prob) AS nb_log_probability + FROM {keys_and_prob_values} AS keys_and_nb_values + GROUP BY key + """.format( + keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" + ) def create_prepared_data(**kwargs): - """Precompute all class priors and feature probabilities. - - When the precomputations are stored in a table, this function will create - indices that speed up lookups necessary for Naive Bayes classification. - Moreover, it runs ANALYZE on the new tables to allow for optimized query - plans. - - Class priors are stored in a relation with columns - (class, class_cnt, all_cnt). 
- - @param trainingSource Name of relation containing the training data - @param trainingClassColumn Name of class column in training data - @param trainingAttrColumn Name of attributes-array column in training data - @param numAttrs Number of attributes to use for classification - - @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). - @param classPriorsDestName Name of class-priors relation to create - @param featureProbsDestName Name of feature-probabilities relation to create - - """ - - if kwargs['whatToCreate'] == 'TABLE': - # FIXME: ANALYZE is not portable. - kwargs.update(dict( - attrCountsSource = '_madlib_nb_attr_counts', - attrValuesSource = '_madlib_nb_attr_values' - )) + """Precompute all class priors and feature probabilities. + + When the precomputations are stored in a table, this function will create + indices that speed up lookups necessary for Naive Bayes classification. + Moreover, it runs ANALYZE on the new tables to allow for optimized query + plans. + + Class priors are stored in a relation with columns + (class, class_cnt, all_cnt). + + @param trainingSource Name of relation containing the training data + @param trainingClassColumn Name of class column in training data + @param trainingAttrColumn Name of attributes-array column in training data + @param numAttrs Number of attributes to use for classification + + @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). + @param classPriorsDestName Name of class-priors relation to create + @param featureProbsDestName Name of feature-probabilities relation to create + + """ + + if kwargs['whatToCreate'] == 'TABLE': + # FIXME: ANALYZE is not portable. + kwargs.update(dict( + attrCountsSource = '_madlib_nb_attr_counts', + attrValuesSource = '_madlib_nb_attr_values' + )) plpy.execute(""" DROP TABLE IF EXISTS {attrCountsSource}; - CREATE TEMPORARY TABLE {attrCountsSource} - AS - {attr_counts_sql}; - ALTER TABLE {attrCountsSource} ADD PRIMARY KEY (attr); - ANALYZE {attrCountsSource}; - + CREATE TEMPORARY TABLE {attrCountsSource} + AS + {attr_counts_sql}; + ALTER TABLE {attrCountsSource} ADD PRIMARY KEY (attr); + ANALYZE {attrCountsSource}; + DROP TABLE IF EXISTS {attrValuesSource}; - CREATE TEMPORARY TABLE {attrValuesSource} - AS - {attr_values_sql}; - ALTER TABLE {attrValuesSource} ADD PRIMARY KEY (attr, value); - ANALYZE {attrValuesSource}; - """.format( + CREATE TEMPORARY TABLE {attrValuesSource} + AS + {attr_values_sql}; + ALTER TABLE {attrValuesSource} ADD PRIMARY KEY (attr, value); + ANALYZE {attrValuesSource}; + """.format( attrCountsSource = kwargs['attrCountsSource'], attrValuesSource = kwargs['attrValuesSource'], - attr_counts_sql = "(" + __get_attr_counts_sql(**kwargs) + ")", - attr_values_sql = "(" + __get_attr_values_sql(**kwargs) + ")" - ) - ) - - - kwargs.update(dict( - sql = __get_class_priors_sql(**kwargs) - )) - plpy.execute(""" - CREATE {whatToCreate} {classPriorsDestName} - AS - {sql} - """.format(**kwargs) - ) - if kwargs['whatToCreate'] == 'TABLE': - plpy.execute(""" - ALTER TABLE {classPriorsDestName} ADD PRIMARY KEY (class); - ANALYZE {classPriorsDestName}; - """.format(**kwargs)) - - kwargs.update(dict( - classPriorsSource = kwargs['classPriorsDestName'] - )) - kwargs.update(dict( - sql = __get_feature_probs_sql(**kwargs) - )) - plpy.execute(""" - CREATE {whatToCreate} {featureProbsDestName} AS - {sql} - """.format(**kwargs) - ) - if kwargs['whatToCreate'] == 'TABLE': - plpy.execute(""" - ALTER TABLE {featureProbsDestName} ADD PRIMARY KEY (class, attr, 
value); - ANALYZE {featureProbsDestName}; - DROP TABLE {attrCountsSource}; - DROP TABLE {attrValuesSource}; - """.format(**kwargs)) + attr_counts_sql = "(" + __get_attr_counts_sql(**kwargs) + ")", + attr_values_sql = "(" + __get_attr_values_sql(**kwargs) + ")" + ) + ) + + + kwargs.update(dict( + sql = __get_class_priors_sql(**kwargs) + )) + plpy.execute(""" + CREATE {whatToCreate} {classPriorsDestName} + AS + {sql} + """.format(**kwargs) + ) + if kwargs['whatToCreate'] == 'TABLE': + plpy.execute(""" + ALTER TABLE {classPriorsDestName} ADD PRIMARY KEY (class); + ANALYZE {classPriorsDestName}; + """.format(**kwargs)) + + kwargs.update(dict( + classPriorsSource = kwargs['classPriorsDestName'] + )) + kwargs.update(dict( + sql = __get_feature_probs_sql(**kwargs) + )) + plpy.execute(""" + CREATE {whatToCreate} {featureProbsDestName} AS + {sql} + """.format(**kwargs) + ) + if kwargs['whatToCreate'] == 'TABLE': + plpy.execute(""" + ALTER TABLE {featureProbsDestName} ADD PRIMARY KEY (class, attr, value); + ANALYZE {featureProbsDestName}; + DROP TABLE {attrCountsSource}; + DROP TABLE {attrValuesSource}; + """.format(**kwargs)) def create_classification(**kwargs): - """ - Create a view/table with columns (key, nb_classification). - - The created relation will be - - {TABLE|VIEW} destName (key, nb_classification) - - where \c nb_classification is an array containing the most likely - class(es) of the record in \em classifySource identified by \c key. - - There are two sets of arguments this function can be called with. The - following parameters are always needed: - @param numAttrs Number of attributes to use for classification - @param destName Name of the table or view to create - @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). - @param smoothingFactor (Optional) Smoothing factor for computing feature - feature probabilities. Default value: 1.0 (Laplacian Smoothing). - @param classifySource Name of the relation that contains data to be classified - @param classifyKeyColumn Name of column in \em classifySource that can - serve as unique identifier - @param classifyAttrColumn Name of attributes-array column in \em classifySource - - Furthermore, provide either: - @param classPriorsSource - Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of training - samples. - @param featureProbsSource - Relation (class, attr, value, cnt, attr_cnt) where - (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i - - Or have this function operate on the "raw" training data: - @param trainingSource - Name of relation containing the training data - @param trainingClassColumn - Name of class column in training data - @param trainingAttrColumn - Name of attributes-array column in \em trainingSource - - """ - - __init_prepared_data(kwargs) - kwargs.update(dict( - keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" - )) - plpy.execute(""" - CREATE {whatToCreate} {destName} AS - SELECT - key, - MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification - FROM {keys_and_prob_values} AS keys_and_nb_values - GROUP BY key - """.format(**kwargs)) + """ + Create a view/table with columns (key, nb_classification). + + The created relation will be + + {TABLE|VIEW} destName (key, nb_classification) + + where \c nb_classification is an array containing the most likely + class(es) of the record in \em classifySource identified by \c key. + + There are two sets of arguments this function can be called with. 
The + following parameters are always needed: + @param numAttrs Number of attributes to use for classification + @param destName Name of the table or view to create + @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). + @param smoothingFactor (Optional) Smoothing factor for computing feature + feature probabilities. Default value: 1.0 (Laplacian Smoothing). + @param classifySource Name of the relation that contains data to be classified + @param classifyKeyColumn Name of column in \em classifySource that can + serve as unique identifier + @param classifyAttrColumn Name of attributes-array column in \em classifySource + + Furthermore, provide either: + @param classPriorsSource + Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of training + samples. + @param featureProbsSource + Relation (class, attr, value, cnt, attr_cnt) where + (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i + + Or have this function operate on the "raw" training data: + @param trainingSource + Name of relation containing the training data + @param trainingClassColumn + Name of class column in training data + @param trainingAttrColumn + Name of attributes-array column in \em trainingSource + + """ + + __init_prepared_data(kwargs) + kwargs.update(dict( + keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" + )) + plpy.execute(""" + CREATE {whatToCreate} {destName} AS + SELECT + key, + MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification + FROM {keys_and_prob_values} AS keys_and_nb_values + GROUP BY key + """.format(**kwargs)) def create_bayes_probabilities(**kwargs): - """Create table/view with columns (key, class, nb_prob) - - The created relation will be - - {TABLE|VIEW} destName (key, class, nb_prob) - - where \c nb_prob is the Naive-Bayes probability that \c class is the true - class of the record in \em classifySource identified by \c key. - - There are two sets of arguments this function can be called with. The - following parameters are always needed: - @param numAttrs Number of attributes to use for classification - @param destName Name of the table or view to create - @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). - @param smoothingFactor (Optional) Smoothing factor for computing feature - feature probabilities. Default value: 1.0 (Laplacian Smoothing). - - Furthermore, provide either: - @param classPriorsSource - Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of training - samples. - @param featureProbsSource - Relation (class, attr, value, cnt, attr_cnt) where - (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i - - Or have this function operate on the "raw" training data: - @param trainingSource - Name of relation containing the training data - @param trainingClassColumn - Name of class column in training data - @param trainingAttrColumn - Name of attributes-array column in training data - - @internal - \par Implementation Notes: - - We have two numerical problems when copmuting the probabilities - @verbatim + """Create table/view with columns (key, class, nb_prob) + + The created relation will be + + {TABLE|VIEW} destName (key, class, nb_prob) + + where \c nb_prob is the Naive-Bayes probability that \c class is the true + class of the record in \em classifySource identified by \c key. + + There are two sets of arguments this function can be called with. 
The + following parameters are always needed: + @param numAttrs Number of attributes to use for classification + @param destName Name of the table or view to create + @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default). + @param smoothingFactor (Optional) Smoothing factor for computing feature + feature probabilities. Default value: 1.0 (Laplacian Smoothing). + + Furthermore, provide either: + @param classPriorsSource + Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of training + samples. + @param featureProbsSource + Relation (class, attr, value, cnt, attr_cnt) where + (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i + + Or have this function operate on the "raw" training data: + @param trainingSource + Name of relation containing the training data + @param trainingClassColumn + Name of class column in training data + @param trainingAttrColumn + Name of attributes-array column in training data + + @internal + \par Implementation Notes: + + We have two numerical problems when copmuting the probabilities + @verbatim P(C = c) * P(A = a | C = c) P(C = c) = --------------------------------- (*) -- @@ -544,128 +544,128 @@ def create_bayes_probabilities(**kwargs): __ where P(A = a | C = c) = || P(A_i = a_i | C = c). i - @endverbatim - - 1. P(A = a | C = c) could be a very small number not representable in - double-precision floating-point arithmetic. - - Solution: We have log( P(C = c) * P(A = a | C = c) ) as indermediate - results. We will add the maximum absolute value of these intermediate - results to all of them. This corresponds to multiplying numerator and - denominator of (*) with the same factor. The "normalization" ensures - that the numerator of (*) can never be 0 (in FP arithmetic) for all c. - - 2. PostgreSQL raises an error in case of underflows, even when 0 is the - desirable outcome. - - Solution: if log_10 ( P(A = a | C = c) ) < -300, we interprete + @endverbatim + + 1. P(A = a | C = c) could be a very small number not representable in + double-precision floating-point arithmetic. + - Solution: We have log( P(C = c) * P(A = a | C = c) ) as indermediate + results. We will add the maximum absolute value of these intermediate + results to all of them. This corresponds to multiplying numerator and + denominator of (*) with the same factor. The "normalization" ensures + that the numerator of (*) can never be 0 (in FP arithmetic) for all c. + + 2. PostgreSQL raises an error in case of underflows, even when 0 is the + desirable outcome. + - Solution: if log_10 ( P(A = a | C = c) ) < -300, we interprete P(A = a | C = c) = 0. Note here that 1e-300 is roughly in the order of - magnitude of the smallest double precision FP number. - @endinternal - """ - - __init_prepared_data(kwargs) - kwargs.update(dict( - keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" - )) - plpy.execute(""" - CREATE {whatToCreate} {destName} AS - SELECT - key, - class, - nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob - FROM - ( - SELECT - key, - class, - CASE WHEN max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key) < -300 THEN 0 - ELSE pow(10, max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key)) - END AS nb_prob - FROM - {keys_and_prob_values} AS keys_and_nb_values - GROUP BY - key, class - ) AS keys_and_nb_values - ORDER BY - key, class - """.format(**kwargs)) + magnitude of the smallest double precision FP number. 
+ @endinternal + """ + + __init_prepared_data(kwargs) + kwargs.update(dict( + keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")" + )) + plpy.execute(""" + CREATE {whatToCreate} {destName} AS + SELECT + key, + class, + nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob + FROM + ( + SELECT + key, + class, + CASE WHEN max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key) < -300 THEN 0 + ELSE pow(10, max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key)) + END AS nb_prob + FROM + {keys_and_prob_values} AS keys_and_nb_values + GROUP BY + key, class + ) AS keys_and_nb_values + ORDER BY + key, class + """.format(**kwargs)) def create_classification_function(**kwargs): - """Create a SQL function mapping arrays of attribute values to the Naive - Bayes classification. - - The created SQL function will be: - - - FUNCTION destName (attributes INTEGER[], smoothingFactor DOUBLE PRECISION) - RETURNS INTEGER[] - - There are two sets of arguments this function can be called with. The - following parameters are always needed: - @param classifyAttrColumn Array of attributes to bind to. This can be - a column name of an outer query or a literal. - @param smoothingFactor Smoothing factor to use for estimating the feature - probabilities. - @param numAttrs Number of attributes to use for classification - - Furthermore, provide either: - @param classPriorsSource - Relation (class, class_cnt, all_cnt) where - class is c, class_cnt is \#c, all_cnt is the number of training - samples. - @param featureProbsSource - Relation (class, attr, value, cnt, attr_cnt) where - (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i - - Or have this function operate on the "raw" training data: - @param trainingSource Name of relation containing the training data - @param trainingClassColumn Name of class column in training data - @param trainingAttrColumn Name of attributes-array column in training data - - Note: Greenplum does not support executing STABLE and VOLATILE functions on - segments. The created function can therefore only be called on the master. - """ - - kwargs.update(dict( - classifyAttrColumn = "$1", - smoothingFactor = "$2" - )) - __init_prepared_data(kwargs) - kwargs.update(dict( - keys_and_prob_values = "(" + __get_prob_values_sql(**kwargs) + ")" - )) - plpy.execute(""" - CREATE FUNCTION {destName} (inAttributes INTEGER[], inSmoothingFactor DOUBLE PRECISION) - RETURNS INTEGER[] AS - $$ - SELECT - MADLIB_SCHEMA.argmax(class, log_prob) - FROM {keys_and_prob_values} AS key_and_nb_values - $$ - LANGUAGE sql STABLE - """.format(**kwargs)) + """Create a SQL function mapping arrays of attribute values to the Naive + Bayes classification. + + The created SQL function will be: + + + FUNCTION destName (attributes INTEGER[], smoothingFactor DOUBLE PRECISION) + RETURNS INTEGER[] + + There are two sets of arguments this function can be called with. The + following parameters are always needed: + @param classifyAttrColumn Array of attributes to bind to. This can be + a column name of an outer query or a literal. + @param smoothingFactor Smoothing factor to use for estimating the feature + probabilities. + @param numAttrs Number of attributes to use for classification + + Furthermore, provide either: + @param classPriorsSource + Relation (class, class_cnt, all_cnt) where + class is c, class_cnt is \#c, all_cnt is the number of training + samples. 
+ @param featureProbsSource + Relation (class, attr, value, cnt, attr_cnt) where + (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i + + Or have this function operate on the "raw" training data: + @param trainingSource Name of relation containing the training data + @param trainingClassColumn Name of class column in training data + @param trainingAttrColumn Name of attributes-array column in training data + + Note: Greenplum does not support executing STABLE and VOLATILE functions on + segments. The created function can therefore only be called on the master. + """ + + kwargs.update(dict( + classifyAttrColumn = "$1", + smoothingFactor = "$2" + )) + __init_prepared_data(kwargs) + kwargs.update(dict( + keys_and_prob_values = "(" + __get_prob_values_sql(**kwargs) + ")" + )) + plpy.execute(""" + CREATE FUNCTION {destName} (inAttributes INTEGER[], inSmoothingFactor DOUBLE PRECISION) + RETURNS INTEGER[] AS + $$ + SELECT + MADLIB_SCHEMA.argmax(class, log_prob) + FROM {keys_and_prob_values} AS key_and_nb_values + $$ + LANGUAGE sql STABLE + """.format(**kwargs)) def __init_prepared_data(kwargs): - """ - Fill in values for optional parameters: Create subqueries instead of using - a relation. - - """ - - if not 'classPriorsSource' in kwargs: - kwargs.update(dict( - classPriorsSource = "(" + __get_class_priors_sql(**kwargs) + ")" - )) - if not 'featureProbsSource' in kwargs: - kwargs.update(dict( - featureProbsSource = "(" + __get_feature_probs_sql(**kwargs) + ")" - )) - if not 'smoothingFactor' in kwargs: - kwargs.update(dict( - smoothingFactor = 1 - )) - + """ + Fill in values for optional parameters: Create subqueries instead of using + a relation. + + """ + + if not 'classPriorsSource' in kwargs: + kwargs.update(dict( + classPriorsSource = "(" + __get_class_priors_sql(**kwargs) + ")" + )) + if not 'featureProbsSource' in kwargs: + kwargs.update(dict( + featureProbsSource = "(" + __get_feature_probs_sql(**kwargs) + ")" + )) + if not 'smoothingFactor' in kwargs: + kwargs.update(dict( + smoothingFactor = 1 + )) + # The m4 preprocessor complains if eof is reach in quoted mode. \ No newline at end of file diff --git a/methods/regress/src/pg_gp/regression.sql_in b/methods/regress/src/pg_gp/regression.sql_in index b4e709109..42893af23 100644 --- a/methods/regress/src/pg_gp/regression.sql_in +++ b/methods/regress/src/pg_gp/regression.sql_in @@ -155,8 +155,8 @@ http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf. @internal @sa file regress.c (documenting the implementation in C), function - float8_mregr_compute() (documenting the formulas used for coefficients, - $R^2$, t-statistics, and p-values, implemented in C) + float8_mregr_compute() (documenting the formulas used for coefficients, + $R^2$, t-statistics, and p-values, implemented in C) @endinternal @literature @@ -207,7 +207,7 @@ LANGUAGE C STRICT; * independentVariables array to 1. * * @return Array of coefficients, which has the same length as the array of - * independent variables. + * independent variables. 
* * @examp SELECT mregr_coef(y, [1, x1, x2]) FROM data; */ @@ -215,11 +215,11 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_coef( /*+ "dependentVariable" */ DOUBLE PRECISION, /*+ "independentVariables" */ DOUBLE PRECISION[]) ( - SFUNC=MADLIB_SCHEMA.float8_mregr_accum, - STYPE=float8[], - FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef, + SFUNC=MADLIB_SCHEMA.float8_mregr_accum, + STYPE=float8[], + FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef, ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,) - INITCOND='{0}' + INITCOND='{0}' ); /** @@ -229,11 +229,11 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2( /*+ "dependentVariable" */ DOUBLE PRECISION, /*+ "independentVariables" */ DOUBLE PRECISION[]) ( - SFUNC=MADLIB_SCHEMA.float8_mregr_accum, - STYPE=float8[], - FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2, + SFUNC=MADLIB_SCHEMA.float8_mregr_accum, + STYPE=float8[], + FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2, ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,) - INITCOND='{0}' + INITCOND='{0}' ); /** @@ -245,17 +245,17 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2( * @param dependentVariable Dependent variable * @param independentVariables Array of independent variables * @return Array of t-statistics for each coefficient. The returned array has - * the same length as the array of independent variables. + * the same length as the array of independent variables. */ CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats( /*+ "dependentVariable" */ DOUBLE PRECISION, /*+ "independentVariables" */ DOUBLE PRECISION[]) ( - SFUNC=MADLIB_SCHEMA.float8_mregr_accum, - STYPE=float8[], - FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats, + SFUNC=MADLIB_SCHEMA.float8_mregr_accum, + STYPE=float8[], + FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats, ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,) - INITCOND='{0}' + INITCOND='{0}' ); /** @@ -264,17 +264,17 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats( * @param dependentVariable Dependent variable * @param independentVariables Array of independent variables * @return Array of p-values for each coefficient. The returned array has - * the same length as the array of independent variables. + * the same length as the array of independent variables. */ CREATE AGGREGATE MADLIB_SCHEMA.mregr_pvalues( /*+ "dependentVariable" */ DOUBLE PRECISION, /*+ "independentVariables" */ DOUBLE PRECISION[]) ( - SFUNC=MADLIB_SCHEMA.float8_mregr_accum, - STYPE=float8[], - FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues, + SFUNC=MADLIB_SCHEMA.float8_mregr_accum, + STYPE=float8[], + FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues, ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,) - INITCOND='{0}' + INITCOND='{0}' ); /** @@ -331,7 +331,7 @@ By looking at the Hessian, we can verify that \f$l(\boldsymbol c)\f$ is convex. There are many techniques for solving convex optimization problems. Currently, logistic regression in MADlib can use one of two algorithms: - Iteratively Reweighted Least Squares -- A conjugate-gradient approach, also known as Fletcher–Reeves method in the +- A conjugate-gradient approach, also known as Fletcher-Reeves method in the literature, where we use the Hestenes-Stiefel rule for calculating the step size. @@ -381,11 +381,11 @@ PostgreSQL/Greenplum. 
@internal @sa namespace logRegress (documenting the driver/outer loop implemented in - Python), function float8_cg_update_final() (documenting the - conjugate-gradient update/iteration steps, implemented in C), function - float8_cg_update_accum() (documenting the - iteratively-reweighted-least-squares update/iteration steps, implemented in - C) + Python), function float8_cg_update_final() (documenting the + conjugate-gradient update/iteration steps, implemented in C), function + float8_cg_update_accum() (documenting the + iteratively-reweighted-least-squares update/iteration steps, implemented in + C) @endinternal @literature @@ -406,38 +406,38 @@ further literature: */ CREATE TYPE MADLIB_SCHEMA.logregr_cg_state AS ( - iteration INTEGER, - len INTEGER, - coef DOUBLE PRECISION[], - dir DOUBLE PRECISION[], - grad DOUBLE PRECISION[], - beta DOUBLE PRECISION, - - count BIGINT, - gradNew DOUBLE PRECISION[], - dTHd DOUBLE PRECISION, - logLikelihood DOUBLE PRECISION + iteration INTEGER, + len INTEGER, + coef DOUBLE PRECISION[], + dir DOUBLE PRECISION[], + grad DOUBLE PRECISION[], + beta DOUBLE PRECISION, + + count BIGINT, + gradNew DOUBLE PRECISION[], + dTHd DOUBLE PRECISION, + logLikelihood DOUBLE PRECISION ); CREATE TYPE MADLIB_SCHEMA.logregr_irls_state AS ( - coef DOUBLE PRECISION[], - logLikelihood DOUBLE PRECISION + coef DOUBLE PRECISION[], + logLikelihood DOUBLE PRECISION ); CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_accum( - MADLIB_SCHEMA.logregr_cg_state, - BOOLEAN, - DOUBLE PRECISION[], - MADLIB_SCHEMA.logregr_cg_state) + MADLIB_SCHEMA.logregr_cg_state, + BOOLEAN, + DOUBLE PRECISION[], + MADLIB_SCHEMA.logregr_cg_state) RETURNS MADLIB_SCHEMA.logregr_cg_state AS 'MODULE_PATHNAME' LANGUAGE C; CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_accum( - DOUBLE PRECISION[], - BOOLEAN, - DOUBLE PRECISION[], - MADLIB_SCHEMA.logregr_irls_state) + DOUBLE PRECISION[], + BOOLEAN, + DOUBLE PRECISION[], + MADLIB_SCHEMA.logregr_irls_state) RETURNS DOUBLE PRECISION[] AS 'MODULE_PATHNAME' LANGUAGE C; @@ -462,9 +462,9 @@ CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step( DOUBLE PRECISION[], MADLIB_SCHEMA.logregr_cg_state) ( - SFUNC=MADLIB_SCHEMA.float8_cg_update_accum, - STYPE=MADLIB_SCHEMA.logregr_cg_state, - FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final + SFUNC=MADLIB_SCHEMA.float8_cg_update_accum, + STYPE=MADLIB_SCHEMA.logregr_cg_state, + FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final ); /** @@ -477,18 +477,18 @@ CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step( DOUBLE PRECISION[], MADLIB_SCHEMA.logregr_irls_state) ( - SFUNC=MADLIB_SCHEMA.float8_irls_update_accum, - STYPE=float8[], - PREFUNC=MADLIB_SCHEMA.float8_mregr_combine, - FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final, - INITCOND='{0}' + SFUNC=MADLIB_SCHEMA.float8_irls_update_accum, + STYPE=float8[], + PREFUNC=MADLIB_SCHEMA.float8_mregr_combine, + FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final, + INITCOND='{0}' ); CREATE FUNCTION MADLIB_SCHEMA.logregr_should_terminate( - DOUBLE PRECISION[], - DOUBLE PRECISION[], - VARCHAR, - DOUBLE PRECISION) + DOUBLE PRECISION[], + DOUBLE PRECISION[], + VARCHAR, + DOUBLE PRECISION) RETURNS BOOLEAN AS 'MODULE_PATHNAME' LANGUAGE C STRICT; @@ -497,9 +497,9 @@ LANGUAGE C STRICT; -- We only need to document the last one (unfortunately, in Greenplum we have to -- use function overloading instead of default arguments). 
CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( - "source" VARCHAR, - "depColumn" VARCHAR, - "indepColumn" VARCHAR) + "source" VARCHAR, + "depColumn" VARCHAR, + "indepColumn" VARCHAR) RETURNS DOUBLE PRECISION[] AS $$ import sys try: @@ -508,14 +508,14 @@ RETURNS DOUBLE PRECISION[] AS $$ sys.path.append("PLPYTHON_LIBDIR") ifdef(DEBUG,,from madlib )import logRegress - return logRegress.compute_logregr_coef(**globals()) + return logRegress.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( - "source" VARCHAR, - "depColumn" VARCHAR, - "indepColumn" VARCHAR, - "numIterations" INTEGER) + "source" VARCHAR, + "depColumn" VARCHAR, + "indepColumn" VARCHAR, + "numIterations" INTEGER) RETURNS DOUBLE PRECISION[] AS $$ import sys try: @@ -524,15 +524,15 @@ RETURNS DOUBLE PRECISION[] AS $$ sys.path.append("PLPYTHON_LIBDIR") ifdef(DEBUG,,from madlib )import logRegress - return logRegress.compute_logregr_coef(**globals()) + return logRegress.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( - "source" VARCHAR, - "depColumn" VARCHAR, - "indepColumn" VARCHAR, - "numIterations" INTEGER, - "optimizer" VARCHAR) + "source" VARCHAR, + "depColumn" VARCHAR, + "indepColumn" VARCHAR, + "numIterations" INTEGER, + "optimizer" VARCHAR) RETURNS DOUBLE PRECISION[] AS $$ import sys try: @@ -541,7 +541,7 @@ RETURNS DOUBLE PRECISION[] AS $$ sys.path.append("PLPYTHON_LIBDIR") ifdef(DEBUG,,from madlib )import logRegress - return logRegress.compute_logregr_coef(**globals()) + return logRegress.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; @@ -554,33 +554,33 @@ $$ LANGUAGE plpythonu VOLATILE; * @param source Name of the source relation containing the training data * @param depColumn Name of the dependent column (of type BOOLEAN) * @param indepColumn Name of the independent column (of type DOUBLE - * PRECISION[]) + * PRECISION[]) * @param numIterations The maximum number of iterations * @param optimizer The optimizer to use (either - * 'ilrs'/'newton' for iteratively reweighted least - * squares or 'cg' for conjugent gradient) + * 'ilrs'/'newton' for iteratively reweighted least + * squares or 'cg' for conjugent gradient) * @param precision The difference between log-likelihood values in successive - * iterations that should indicate convergence, or 0 indicating that - * log-likelihood values should be ignored + * iterations that should indicate convergence, or 0 indicating that + * log-likelihood values should be ignored * * @note This function starts an iterative algorithm. It is not an aggregate - * function. Source and column names have to be passed as strings (due to - * limitations of the SQL syntax). + * function. Source and column names have to be passed as strings (due to + * limitations of the SQL syntax). * * @examp SELECT logregr_coef('data', 'y', 'array[1, x1, x2]', 20, 'cg', - * 0.001); + * 0.001); * * @internal * @sa This function is a wrapper for logRegress::compute_logregr_coef(), which * sets the default values. 
*/ CREATE FUNCTION MADLIB_SCHEMA.logregr_coef( - "source" VARCHAR, - "depColumn" VARCHAR, - "indepColumn" VARCHAR, - "numIterations" INTEGER /*+ DEFAULT 20 */, - "optimizer" VARCHAR /*+ DEFAULT 'irls' */, - "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) + "source" VARCHAR, + "depColumn" VARCHAR, + "indepColumn" VARCHAR, + "numIterations" INTEGER /*+ DEFAULT 20 */, + "optimizer" VARCHAR /*+ DEFAULT 'irls' */, + "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */) RETURNS DOUBLE PRECISION[] AS $$ import sys try: @@ -589,7 +589,7 @@ RETURNS DOUBLE PRECISION[] AS $$ sys.path.append("PLPYTHON_LIBDIR") ifdef(DEBUG,,from madlib )import logRegress - return logRegress.compute_logregr_coef(**globals()) + return logRegress.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; @@ -602,21 +602,21 @@ ifdef(PGXS, CREATE FUNCTION MADLIB_SCHEMA.init_python_paths() RETURNS VOID AS $$ - # FIXME: The following code should be common code and not reside in a specialized module - import sys - - dyld_paths = plpy.execute( - "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':') - before_default = True - count = 0 - for path in dyld_paths: - if path == "$libdir": - before_default = False - else: - if before_default: - sys.path.insert(count, path) - count += 1 - else: - sys.path.append(path) + # FIXME: The following code should be common code and not reside in a specialized module + import sys + + dyld_paths = plpy.execute( + "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':') + before_default = True + count = 0 + for path in dyld_paths: + if path == "$libdir": + before_default = False + else: + if before_default: + sys.path.insert(count, path) + count += 1 + else: + sys.path.append(path) $$ LANGUAGE plpythonu VOLATILE; )
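
The IRLS path selected by logregr_coef(source, depColumn, indepColumn, numIterations, 'irls', precision) is computed by the logregr_irls_step aggregate defined above (SFUNC float8_irls_update_accum, FINALFUNC float8_irls_update_final, both in C), with the outer loop driven from Python by logRegress.compute_logregr_coef(). As a rough guide to what those pieces compute, here is a minimal pure-NumPy sketch of the same Newton/IRLS iteration, including a convergence test in the spirit of logregr_should_terminate(); the function name, toy data, and default values below are illustrative assumptions taken from the wrapper's documented defaults, not MADlib code.

# Minimal NumPy sketch of the IRLS/Newton iteration behind
# logregr_coef(..., 'irls', ...).  Illustrative only: the function name,
# toy data, and defaults are assumptions, not the MADlib implementation.
import numpy as np

def irls_logregr_coef(X, y, num_iterations=20, precision=0.0001):
    """Return logistic-regression coefficients for design matrix X and
    0/1 outcome vector y."""
    coef = np.zeros(X.shape[1])
    last_log_likelihood = -np.inf
    for _ in range(num_iterations):
        p = np.clip(1.0 / (1.0 + np.exp(-X @ coef)), 1e-12, 1.0 - 1e-12)
        log_likelihood = np.sum(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))
        # Analogue of logregr_should_terminate(): stop once successive
        # log-likelihood values differ by less than `precision`
        # (precision = 0 disables the test, as in the SQL wrapper).
        if abs(log_likelihood - last_log_likelihood) < precision:
            break
        last_log_likelihood = log_likelihood
        # One Newton/IRLS step: coef += (X^T W X)^{-1} X^T (y - p),
        # with W = diag(p * (1 - p)).
        weights = p * (1.0 - p)
        hessian = X.T @ (weights[:, None] * X)
        coef = coef + np.linalg.solve(hessian, X.T @ (y - p))
    return coef

# Toy usage with synthetic data (names and values are made up):
rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
X = np.column_stack([np.ones(200), x1])        # intercept plus one feature
true_coef = np.array([0.5, 2.0])
y = (rng.random(200) < 1.0 / (1.0 + np.exp(-(X @ true_coef)))).astype(float)
print(irls_logregr_coef(X, y))                 # should land near true_coef

Each pass through the loop body corresponds to one scan of the source relation by the logregr_irls_step aggregate; keeping the termination test outside the per-row update mirrors the module's split between the C aggregate functions and the Python driver that runs the outer iteration.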