diff --git a/examples/gpce/logistic-regress.sql b/examples/gpce/logistic-regress.sql
index cba22040a..9c587c6ec 100644
--- a/examples/gpce/logistic-regress.sql
+++ b/examples/gpce/logistic-regress.sql
@@ -96,6 +96,6 @@ SELECT * FROM artificiallogreg LIMIT 5;
\qecho === Calculate Coefficients from artificial data: ================
-SELECT madlib.logreg_coef(
+SELECT madlib.logregr_coef(
'artificiallogreg', 'y', 'x', 20, 'irls', 0.001
)::REAL[];
diff --git a/methods/bayes/src/pg_gp/bayes.py_in b/methods/bayes/src/pg_gp/bayes.py_in
index 7877f5a9f..4495c9171 100644
--- a/methods/bayes/src/pg_gp/bayes.py_in
+++ b/methods/bayes/src/pg_gp/bayes.py_in
@@ -15,526 +15,526 @@ Laplacian smoothing).
Naive Bayes: Setup Functions
@internal
- @implementation
-
- For the Naive Bayes Classification, we need a product over probabilities.
- However, multiplying lots of small numbers can lead to an exponent overflow.
- E.g., multiplying more than 324 numbers at most 0.1 will yield a product of 0
- in machine arithmetic. A safer way is therefore summing logarithms.
-
- By the IEEE 754 standard, the smallest number representable as
- DOUBLE PRECISION (64bit) is $2^{-1022}$, i.e., approximately 2.225e-308.
- See, e.g., http://en.wikipedia.org/wiki/Double_precision
- Hence, log(x) = log_10(x) for any non-zero DOUBLE PRECISION @f$x \ge -308@f$.
-
- Note for theorists:
- - Even adding infinitely many \f$ \log_{10}(x)@f$ for @f$0 < x \le 1 \f$ will
- never cause an overflow because addition will have no effect once the sum
- reaches approx $308 * 2^{53}$ (correspnding to the machine precision).
-
- The functions __get_*_sql are private because we do not want to commit ourselves
- to a particular interface. We might want to be able to change implementation
- details should the need arise.
+ @implementation
+
+ For the Naive Bayes Classification, we need a product over probabilities.
+ However, multiplying lots of small numbers can lead to an exponent overflow.
+ E.g., multiplying more than 324 numbers, each at most 0.1, will yield a product of 0
+ in machine arithmetic. A safer way is therefore summing logarithms.
+
+ By the IEEE 754 standard, the smallest positive normalized number representable
+ as DOUBLE PRECISION (64 bit) is $2^{-1022}$, i.e., approximately 2.225e-308.
+ See, e.g., http://en.wikipedia.org/wiki/Double_precision
+ Hence, \f$ \log_{10}(x) \ge -308 \f$ for any positive normalized DOUBLE
+ PRECISION \f$ x \f$.
+
+ Note for theorists:
+ - Even adding infinitely many \f$ \log_{10}(x) \f$ for \f$ 0 < x \le 1 \f$ will
+ never cause an overflow because addition will have no effect once the sum
+ reaches approx $308 * 2^{53}$ in magnitude (corresponding to the machine precision).
+
+ The functions __get_*_sql are private because we do not want to commit ourselves
+ to a particular interface. We might want to be able to change implementation
+ details should the need arise.
@endinternal
"""
import plpy
def __get_feature_probs_sql(**kwargs):
- """Return SQL query with columns (class, attr, value, cnt, attr_cnt).
-
- For class c, attr i, and value a, cnt is #(c,i,a) and attr_cnt is \#i.
-
- Note that the query will contain a row for every pair (class, value)
- occuring in the training data (so it might also contain rows where
- \#(c,i,a) = 0).
-
- @param classPriorsSource Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of rows in
- \em trainingSource
- @param attrValuesSource Relation (attr, value) containing all distinct
- attribute, value pairs. If omitted, will use __get_attr_values_sql()
- @param attrCountsSource Relation (attr, attr_cnt) where attr is i and
- attr_cnt is \#i. If omitted, will use __get_attr_counts_sql()
- @param trainingSource name of relation containing training data
- @param trainingClassColumn name of column with class
- @param trainingAttrColumn name of column with attributes array
- @param numAttrs Number of attributes to use for classification
-
- For meanings of \#(c,i,a), \#c, and \#i see the general description of
- \ref bayes.
- """
-
- if not 'attrValuesSource' in kwargs:
- kwargs.update(dict(
- attrValuesSource = "(" + __get_attr_values_sql(**kwargs) + ")"
- ))
- if not 'attrCountsSource' in kwargs:
- kwargs.update(dict(
- attrCountsSource = "(" + __get_attr_counts_sql(**kwargs) + ")"
- ))
+ """Return SQL query with columns (class, attr, value, cnt, attr_cnt).
+
+    For class c, attr i, and value a, cnt is \#(c,i,a) and attr_cnt is \#i.
+
+    Note that the query will contain a row for every pair (class, value)
+    occurring in the training data (so it might also contain rows where
+ \#(c,i,a) = 0).
+
+ @param classPriorsSource Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of rows in
+ \em trainingSource
+ @param attrValuesSource Relation (attr, value) containing all distinct
+ attribute, value pairs. If omitted, will use __get_attr_values_sql()
+ @param attrCountsSource Relation (attr, attr_cnt) where attr is i and
+ attr_cnt is \#i. If omitted, will use __get_attr_counts_sql()
+ @param trainingSource name of relation containing training data
+ @param trainingClassColumn name of column with class
+ @param trainingAttrColumn name of column with attributes array
+ @param numAttrs Number of attributes to use for classification
+
+ For meanings of \#(c,i,a), \#c, and \#i see the general description of
+ \ref bayes.
+ """
+
+    if 'attrValuesSource' not in kwargs:
+        kwargs.update(dict(
+            attrValuesSource = "(" + __get_attr_values_sql(**kwargs) + ")"
+        ))
+    if 'attrCountsSource' not in kwargs:
+        kwargs.update(dict(
+            attrCountsSource = "(" + __get_attr_counts_sql(**kwargs) + ")"
+        ))
# {trainingSource} cannot be a subquery, because we use it more than once in
# our generated SQL.
- return """
- SELECT
- class,
- attr,
- value,
- coalesce(cnt, 0) AS cnt,
- attr_cnt
- FROM
- (
- SELECT *
- FROM
- {classPriorsSource} AS classes
- CROSS JOIN
- {attrValuesSource} AS attr_values
- ) AS required_triples
- LEFT OUTER JOIN
- (
- SELECT
- trainingSource.{trainingClassColumn} AS class,
- attr.attr,
- trainingSource.{trainingAttrColumn}[attr.attr] AS value,
- count(*) AS cnt
- FROM
- generate_series(1, {numAttrs}) AS attr,
- {trainingSource} AS trainingSource
- GROUP BY
+ return """
+ SELECT
+ class,
+ attr,
+ value,
+ coalesce(cnt, 0) AS cnt,
+ attr_cnt
+ FROM
+ (
+ SELECT *
+ FROM
+ {classPriorsSource} AS classes
+ CROSS JOIN
+ {attrValuesSource} AS attr_values
+ ) AS required_triples
+ LEFT OUTER JOIN
+ (
+ SELECT
+ trainingSource.{trainingClassColumn} AS class,
+ attr.attr,
+ trainingSource.{trainingAttrColumn}[attr.attr] AS value,
+ count(*) AS cnt
+ FROM
+ generate_series(1, {numAttrs}) AS attr,
+ {trainingSource} AS trainingSource
+ GROUP BY
trainingSource.{trainingClassColumn},
attr.attr,
trainingSource.{trainingAttrColumn}[attr.attr]
- ) AS triple_counts
- USING (class, attr, value)
- INNER JOIN
- {attrCountsSource} AS attr_counts
- USING (attr)
- """.format(**kwargs)
+ ) AS triple_counts
+ USING (class, attr, value)
+ INNER JOIN
+ {attrCountsSource} AS attr_counts
+ USING (attr)
+ """.format(**kwargs)
def __get_attr_values_sql(**kwargs):
- """
- Return SQL query with columns (attr, value).
-
- The query contains a row for each pair that occurs in the training data.
-
- @param trainingSource Name of relation containing the training data
- @param trainingAttrColumn Name of attributes-array column in training data
- @param numAttrs Number of attributes to use for classification
-
- @internal
- \par Implementation Notes:
- If PostgreSQL supported count(DISTINCT ...) for window functions, we could
- consolidate this function with __get_attr_counts_sql():
- @verbatim
- [...] count(DISTINCT value) OVER (PARTITION BY attr) [...]
- @endverbatim
- @endinternal
-
- """
-
- return """
- SELECT DISTINCT
- attr.attr,
- trainingSource.{trainingAttrColumn}[attr.attr] AS value
- FROM
- generate_series(1, {numAttrs}) AS attr,
- {trainingSource} AS trainingSource
- """.format(**kwargs)
+ """
+ Return SQL query with columns (attr, value).
+
+ The query contains a row for each pair that occurs in the training data.
+
+ @param trainingSource Name of relation containing the training data
+ @param trainingAttrColumn Name of attributes-array column in training data
+ @param numAttrs Number of attributes to use for classification
+
+ @internal
+ \par Implementation Notes:
+ If PostgreSQL supported count(DISTINCT ...) for window functions, we could
+ consolidate this function with __get_attr_counts_sql():
+ @verbatim
+ [...] count(DISTINCT value) OVER (PARTITION BY attr) [...]
+ @endverbatim
+ @endinternal
+
+ """
+
+ return """
+ SELECT DISTINCT
+ attr.attr,
+ trainingSource.{trainingAttrColumn}[attr.attr] AS value
+ FROM
+ generate_series(1, {numAttrs}) AS attr,
+ {trainingSource} AS trainingSource
+ """.format(**kwargs)
def __get_attr_counts_sql(**kwargs):
- """
- Return SQL query with columns (attr, attr_cnt)
-
- For attr i, attr_cnt is \#i.
-
- @param trainingSource Name of relation containing the training data
- @param trainingAttrColumn Name of attributes-array column in training data
- @param numAttrs Number of attributes to use for classification
-
- """
-
- return """
- SELECT
- attr.attr,
- count(
+ """
+ Return SQL query with columns (attr, attr_cnt)
+
+ For attr i, attr_cnt is \#i.
+
+ @param trainingSource Name of relation containing the training data
+ @param trainingAttrColumn Name of attributes-array column in training data
+ @param numAttrs Number of attributes to use for classification
+
+ """
+
+ return """
+ SELECT
+ attr.attr,
+ count(
DISTINCT trainingSource.{trainingAttrColumn}[attr.attr]
) AS attr_cnt
- FROM
- generate_series(1, {numAttrs}) AS attr,
- {trainingSource} AS trainingSource
- GROUP BY attr.attr
- """.format(**kwargs)
+ FROM
+ generate_series(1, {numAttrs}) AS attr,
+ {trainingSource} AS trainingSource
+ GROUP BY attr.attr
+ """.format(**kwargs)
def __get_class_priors_sql(**kwargs):
- """
- Return SQL query with columns (class, class_cnt, all_cnt)
-
- For class c, class_cnt is \#c. all_cnt is the total number of records in the
- training data.
-
- @param trainingSource Name of relation containing the training data
- @param trainingClassColumn Name of class column in training data
-
- """
-
- return """
- SELECT
- trainingSource.{trainingClassColumn} AS class,
- count(*) AS class_cnt,
- sum(count(*)) OVER () AS all_cnt
- FROM {trainingSource} AS trainingSource
- GROUP BY trainingSource.{trainingClassColumn}
- """.format(**kwargs)
+ """
+ Return SQL query with columns (class, class_cnt, all_cnt)
+
+ For class c, class_cnt is \#c. all_cnt is the total number of records in the
+ training data.
+
+ @param trainingSource Name of relation containing the training data
+ @param trainingClassColumn Name of class column in training data
+
+ """
+
+ return """
+ SELECT
+ trainingSource.{trainingClassColumn} AS class,
+ count(*) AS class_cnt,
+ sum(count(*)) OVER () AS all_cnt
+ FROM {trainingSource} AS trainingSource
+ GROUP BY trainingSource.{trainingClassColumn}
+ """.format(**kwargs)
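+# Illustrative example (hypothetical data, not produced by this module): for
+# a training relation with 3 rows of class 1 and 7 rows of class 2, the query
+# returned by __get_class_priors_sql() above yields
+#
+#     class | class_cnt | all_cnt
+#     ------+-----------+---------
+#         1 |         3 |      10
+#         2 |         7 |      10
+#
+# Here sum(count(*)) OVER () is a window function evaluated over the grouped
+# result, so all_cnt repeats the total number of training rows on every row.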
def __get_keys_and_prob_values_sql(**kwargs):
- """
- Return SQL query with columns (key, class, log_prob).
-
- For class c and the attribute array identified by key k, log_prob is
- log( P(C = c) * P(A = a(k)[] | C = c) ).
-
- For each key k and class c, the query also contains a row (k, c, NULL). This
- is for technical reasons (we want every key-class pair to appear in the
- query. NULL serves as a default value if there is insufficient training data
- to compute a probability value).
-
- @param numAttrs Number of attributes to use for classification
- @param classifySource Name of the relation that contains data to be classified
- @param classifyKeyColumn Name of column in \em classifySource that can
- serve as unique identifier
- @param classifyAttrColumn Name of attributes-array column in \em classifySource
- @param classPriorsSource
- Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of training
- samples.
- @param featureProbsSource
- Relation (class, attr, value, cnt, attr_cnt) where
- (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
- @param smoothingFactor Smoothing factor for computing feature
- feature probabilities. Default value: 1.0 (Laplacian Smoothing).
-
- """
+ """
+ Return SQL query with columns (key, class, log_prob).
+
+ For class c and the attribute array identified by key k, log_prob is
+ log( P(C = c) * P(A = a(k)[] | C = c) ).
+
+    For each key k and class c, the query also contains a row (k, c, NULL). This
+    is for technical reasons: we want every key-class pair to appear in the
+    query, and NULL serves as a default value if there is insufficient training
+    data to compute a probability value.
+
+ @param numAttrs Number of attributes to use for classification
+ @param classifySource Name of the relation that contains data to be classified
+ @param classifyKeyColumn Name of column in \em classifySource that can
+ serve as unique identifier
+ @param classifyAttrColumn Name of attributes-array column in \em classifySource
+ @param classPriorsSource
+ Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of training
+ samples.
+ @param featureProbsSource
+ Relation (class, attr, value, cnt, attr_cnt) where
+ (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
+    @param smoothingFactor Smoothing factor for computing feature
+        probabilities. Default value: 1.0 (Laplacian Smoothing).
+
+ """
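+    # The CASE expression below computes, per key and class, the smoothed
+    # log-probability log( P(C = c) * P(A = a | C = c) ), where each feature
+    # probability is estimated as (s denotes {smoothingFactor}, notation as
+    # in \ref bayes):
+    #
+    #     P(A_i = a | C = c) ~= (#(c,i,a) + s) / (#c + s * #i)
+    #
+    # For s = 1 this is Laplacian smoothing; for s = 0 it degenerates to the
+    # maximum-likelihood estimate, which is why the WHERE clause must then
+    # exclude rows with cnt = 0.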
# {classifySource} cannot be a subquery, because we use it more than once in
# our generated SQL.
- return """
- SELECT
- classify.key,
- classPriors.class,
- CASE WHEN count(*) < {numAttrs} THEN NULL
- ELSE
- log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt)
- + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor})
- / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) )
- END
- AS log_prob
- FROM
- {featureProbsSource} AS featureProbs,
- {classPriorsSource} AS classPriors,
- (
- SELECT
- classifySource.{classifyKeyColumn} AS key,
- attr.attr,
- classifySource.{classifyAttrColumn}[attr.attr] AS value
- FROM
- {classifySource} AS classifySource,
- generate_series(1, {numAttrs}) AS attr
- ) AS classify
- WHERE
- featureProbs.class = classPriors.class AND
- featureProbs.attr = classify.attr AND
- featureProbs.value = classify.value AND
- ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0
- GROUP BY
- classify.key, classPriors.class, classPriors.class_cnt, classPriors.all_cnt
-
- UNION
-
- SELECT
- classify.{classifyKeyColumn} AS key,
- classes.class,
- NULL
- FROM
- {classifySource} AS classify,
- {classPriorsSource} AS classes
- GROUP BY classify.{classifyKeyColumn}, classes.class
- """.format(**kwargs)
+ return """
+ SELECT
+ classify.key,
+ classPriors.class,
+ CASE WHEN count(*) < {numAttrs} THEN NULL
+ ELSE
+ log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt)
+ + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor})
+ / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) )
+ END
+ AS log_prob
+ FROM
+ {featureProbsSource} AS featureProbs,
+ {classPriorsSource} AS classPriors,
+ (
+ SELECT
+ classifySource.{classifyKeyColumn} AS key,
+ attr.attr,
+ classifySource.{classifyAttrColumn}[attr.attr] AS value
+ FROM
+ {classifySource} AS classifySource,
+ generate_series(1, {numAttrs}) AS attr
+ ) AS classify
+ WHERE
+ featureProbs.class = classPriors.class AND
+ featureProbs.attr = classify.attr AND
+ featureProbs.value = classify.value AND
+ ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0
+ GROUP BY
+ classify.key, classPriors.class, classPriors.class_cnt, classPriors.all_cnt
+
+ UNION
+
+ SELECT
+ classify.{classifyKeyColumn} AS key,
+ classes.class,
+ NULL
+ FROM
+ {classifySource} AS classify,
+ {classPriorsSource} AS classes
+ GROUP BY classify.{classifyKeyColumn}, classes.class
+ """.format(**kwargs)
def __get_prob_values_sql(**kwargs):
- """
- Return SQL query with columns (class, log_prob), given an array of
- attributes.
-
- The query binds to an attribute array a[]. For every class c, log_prob
- is log( P(C = c) * P(A = a[] | C = c) ).
-
- @param classifyAttrColumn Array of attributes to bind to. This can be
- a column name of an outer query or a literal.
- @param smoothingFactor Smoothing factor to use for estimating the feature
- probabilities.
- @param numAttrs Number of attributes to use for classification
- @param classPriorsSource
- Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of training
- samples.
- @param featureProbsSource
- Relation (class, attr, value, cnt, attr_cnt) where
- (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
-
- Note that unless \em classifyAttrColumn is a literal, the SQL query will
- become a correlated subquery and will not work in Greenplum.
-
- """
+ """
+ Return SQL query with columns (class, log_prob), given an array of
+ attributes.
+
+ The query binds to an attribute array a[]. For every class c, log_prob
+ is log( P(C = c) * P(A = a[] | C = c) ).
+
+ @param classifyAttrColumn Array of attributes to bind to. This can be
+ a column name of an outer query or a literal.
+ @param smoothingFactor Smoothing factor to use for estimating the feature
+ probabilities.
+ @param numAttrs Number of attributes to use for classification
+ @param classPriorsSource
+ Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of training
+ samples.
+ @param featureProbsSource
+ Relation (class, attr, value, cnt, attr_cnt) where
+ (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
+
+ Note that unless \em classifyAttrColumn is a literal, the SQL query will
+ become a correlated subquery and will not work in Greenplum.
+
+ """
# {classifyAttrColumn} binds to names declared outside of the following
# SQL. We need to ensure that there are no conflicting names with
# {classifyAttrColumn}. Therefore, we only introduce the unusual name
# __attr. Note that by the structure of the query, there can be no other
# name conflicts.
- return """
- SELECT
- classPriors.class,
- CASE WHEN count(*) < {numAttrs} THEN NULL
- ELSE
- log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt)
- + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor})
- / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) )
- END
- AS log_prob
- FROM
- {featureProbsSource} AS featureProbs,
- {classPriorsSource} AS classPriors,
- (
- SELECT
- __attr.__attr,
- {classifyAttrColumn}[__attr.__attr] AS value
- FROM
- generate_series(1, {numAttrs}) AS __attr
- ) AS classify
- WHERE
- featureProbs.class = classPriors.class AND
- featureProbs.attr = classify.__attr AND featureProbs.value = classify.value AND
- ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0
- GROUP BY classPriors.class, classPriors.class_cnt, classPriors.all_cnt
-
- UNION
-
- SELECT
- classes.class,
- NULL
- FROM
- {classPriorsSource} AS classes
- """.format(**kwargs)
+ return """
+ SELECT
+ classPriors.class,
+ CASE WHEN count(*) < {numAttrs} THEN NULL
+ ELSE
+ log(classPriors.class_cnt::DOUBLE PRECISION / classPriors.all_cnt)
+ + sum( log((featureProbs.cnt::DOUBLE PRECISION + {smoothingFactor})
+ / (classPriors.class_cnt + {smoothingFactor} * featureProbs.attr_cnt)) )
+ END
+ AS log_prob
+ FROM
+ {featureProbsSource} AS featureProbs,
+ {classPriorsSource} AS classPriors,
+ (
+ SELECT
+ __attr.__attr,
+ {classifyAttrColumn}[__attr.__attr] AS value
+ FROM
+ generate_series(1, {numAttrs}) AS __attr
+ ) AS classify
+ WHERE
+ featureProbs.class = classPriors.class AND
+ featureProbs.attr = classify.__attr AND featureProbs.value = classify.value AND
+ ({smoothingFactor} > 0 OR featureProbs.cnt > 0) -- prevent division by 0
+ GROUP BY classPriors.class, classPriors.class_cnt, classPriors.all_cnt
+
+ UNION
+
+ SELECT
+ classes.class,
+ NULL
+ FROM
+ {classPriorsSource} AS classes
+ """.format(**kwargs)
def __get_classification_sql(**kwargs):
- """
- Return SQL query with columns (key, nb_classification, nb_log_probability)
-
- @param keys_and_prob_values Relation (key, class, log_prob)
-
- """
-
- return """
- SELECT
- key,
- MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification,
- max(log_prob) AS nb_log_probability
- FROM {keys_and_prob_values} AS keys_and_nb_values
- GROUP BY key
- """.format(
- keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
- )
+ """
+ Return SQL query with columns (key, nb_classification, nb_log_probability)
+
+ @param keys_and_prob_values Relation (key, class, log_prob)
+
+ """
+
+ return """
+ SELECT
+ key,
+ MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification,
+ max(log_prob) AS nb_log_probability
+ FROM {keys_and_prob_values} AS keys_and_nb_values
+ GROUP BY key
+ """.format(
+ keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
+ )
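+# Note: MADLIB_SCHEMA.argmax(class, log_prob) in the query above returns the
+# class value(s) attaining the maximum log_prob within each group, which is
+# why nb_classification is an array (ties yield several classes).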
def create_prepared_data(**kwargs):
- """Precompute all class priors and feature probabilities.
-
- When the precomputations are stored in a table, this function will create
- indices that speed up lookups necessary for Naive Bayes classification.
- Moreover, it runs ANALYZE on the new tables to allow for optimized query
- plans.
-
- Class priors are stored in a relation with columns
- (class, class_cnt, all_cnt).
-
- @param trainingSource Name of relation containing the training data
- @param trainingClassColumn Name of class column in training data
- @param trainingAttrColumn Name of attributes-array column in training data
- @param numAttrs Number of attributes to use for classification
-
- @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
- @param classPriorsDestName Name of class-priors relation to create
- @param featureProbsDestName Name of feature-probabilities relation to create
-
- """
-
- if kwargs['whatToCreate'] == 'TABLE':
- # FIXME: ANALYZE is not portable.
- kwargs.update(dict(
- attrCountsSource = '_madlib_nb_attr_counts',
- attrValuesSource = '_madlib_nb_attr_values'
- ))
+ """Precompute all class priors and feature probabilities.
+
+ When the precomputations are stored in a table, this function will create
+ indices that speed up lookups necessary for Naive Bayes classification.
+ Moreover, it runs ANALYZE on the new tables to allow for optimized query
+ plans.
+
+ Class priors are stored in a relation with columns
+ (class, class_cnt, all_cnt).
+
+ @param trainingSource Name of relation containing the training data
+ @param trainingClassColumn Name of class column in training data
+ @param trainingAttrColumn Name of attributes-array column in training data
+ @param numAttrs Number of attributes to use for classification
+
+    @param whatToCreate (Optional) Either \c 'TABLE' or \c 'VIEW' (the default).
+ @param classPriorsDestName Name of class-priors relation to create
+ @param featureProbsDestName Name of feature-probabilities relation to create
+
+ """
+
+ if kwargs['whatToCreate'] == 'TABLE':
+ # FIXME: ANALYZE is not portable.
+ kwargs.update(dict(
+ attrCountsSource = '_madlib_nb_attr_counts',
+ attrValuesSource = '_madlib_nb_attr_values'
+ ))
plpy.execute("""
DROP TABLE IF EXISTS {attrCountsSource};
- CREATE TEMPORARY TABLE {attrCountsSource}
- AS
- {attr_counts_sql};
- ALTER TABLE {attrCountsSource} ADD PRIMARY KEY (attr);
- ANALYZE {attrCountsSource};
-
+ CREATE TEMPORARY TABLE {attrCountsSource}
+ AS
+ {attr_counts_sql};
+ ALTER TABLE {attrCountsSource} ADD PRIMARY KEY (attr);
+ ANALYZE {attrCountsSource};
+
DROP TABLE IF EXISTS {attrValuesSource};
- CREATE TEMPORARY TABLE {attrValuesSource}
- AS
- {attr_values_sql};
- ALTER TABLE {attrValuesSource} ADD PRIMARY KEY (attr, value);
- ANALYZE {attrValuesSource};
- """.format(
+ CREATE TEMPORARY TABLE {attrValuesSource}
+ AS
+ {attr_values_sql};
+ ALTER TABLE {attrValuesSource} ADD PRIMARY KEY (attr, value);
+ ANALYZE {attrValuesSource};
+ """.format(
attrCountsSource = kwargs['attrCountsSource'],
attrValuesSource = kwargs['attrValuesSource'],
- attr_counts_sql = "(" + __get_attr_counts_sql(**kwargs) + ")",
- attr_values_sql = "(" + __get_attr_values_sql(**kwargs) + ")"
- )
- )
-
-
- kwargs.update(dict(
- sql = __get_class_priors_sql(**kwargs)
- ))
- plpy.execute("""
- CREATE {whatToCreate} {classPriorsDestName}
- AS
- {sql}
- """.format(**kwargs)
- )
- if kwargs['whatToCreate'] == 'TABLE':
- plpy.execute("""
- ALTER TABLE {classPriorsDestName} ADD PRIMARY KEY (class);
- ANALYZE {classPriorsDestName};
- """.format(**kwargs))
-
- kwargs.update(dict(
- classPriorsSource = kwargs['classPriorsDestName']
- ))
- kwargs.update(dict(
- sql = __get_feature_probs_sql(**kwargs)
- ))
- plpy.execute("""
- CREATE {whatToCreate} {featureProbsDestName} AS
- {sql}
- """.format(**kwargs)
- )
- if kwargs['whatToCreate'] == 'TABLE':
- plpy.execute("""
- ALTER TABLE {featureProbsDestName} ADD PRIMARY KEY (class, attr, value);
- ANALYZE {featureProbsDestName};
- DROP TABLE {attrCountsSource};
- DROP TABLE {attrValuesSource};
- """.format(**kwargs))
+ attr_counts_sql = "(" + __get_attr_counts_sql(**kwargs) + ")",
+ attr_values_sql = "(" + __get_attr_values_sql(**kwargs) + ")"
+ )
+ )
+
+
+ kwargs.update(dict(
+ sql = __get_class_priors_sql(**kwargs)
+ ))
+ plpy.execute("""
+ CREATE {whatToCreate} {classPriorsDestName}
+ AS
+ {sql}
+ """.format(**kwargs)
+ )
+ if kwargs['whatToCreate'] == 'TABLE':
+ plpy.execute("""
+ ALTER TABLE {classPriorsDestName} ADD PRIMARY KEY (class);
+ ANALYZE {classPriorsDestName};
+ """.format(**kwargs))
+
+ kwargs.update(dict(
+ classPriorsSource = kwargs['classPriorsDestName']
+ ))
+ kwargs.update(dict(
+ sql = __get_feature_probs_sql(**kwargs)
+ ))
+ plpy.execute("""
+ CREATE {whatToCreate} {featureProbsDestName} AS
+ {sql}
+ """.format(**kwargs)
+ )
+ if kwargs['whatToCreate'] == 'TABLE':
+ plpy.execute("""
+ ALTER TABLE {featureProbsDestName} ADD PRIMARY KEY (class, attr, value);
+ ANALYZE {featureProbsDestName};
+ DROP TABLE {attrCountsSource};
+ DROP TABLE {attrValuesSource};
+ """.format(**kwargs))
def create_classification(**kwargs):
- """
- Create a view/table with columns (key, nb_classification).
-
- The created relation will be
-
- {TABLE|VIEW} destName (key, nb_classification)
-
- where \c nb_classification is an array containing the most likely
- class(es) of the record in \em classifySource identified by \c key.
-
- There are two sets of arguments this function can be called with. The
- following parameters are always needed:
- @param numAttrs Number of attributes to use for classification
- @param destName Name of the table or view to create
- @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
- @param smoothingFactor (Optional) Smoothing factor for computing feature
- feature probabilities. Default value: 1.0 (Laplacian Smoothing).
- @param classifySource Name of the relation that contains data to be classified
- @param classifyKeyColumn Name of column in \em classifySource that can
- serve as unique identifier
- @param classifyAttrColumn Name of attributes-array column in \em classifySource
-
- Furthermore, provide either:
- @param classPriorsSource
- Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of training
- samples.
- @param featureProbsSource
- Relation (class, attr, value, cnt, attr_cnt) where
- (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
-
- Or have this function operate on the "raw" training data:
- @param trainingSource
- Name of relation containing the training data
- @param trainingClassColumn
- Name of class column in training data
- @param trainingAttrColumn
- Name of attributes-array column in \em trainingSource
-
- """
-
- __init_prepared_data(kwargs)
- kwargs.update(dict(
- keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
- ))
- plpy.execute("""
- CREATE {whatToCreate} {destName} AS
- SELECT
- key,
- MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification
- FROM {keys_and_prob_values} AS keys_and_nb_values
- GROUP BY key
- """.format(**kwargs))
+ """
+ Create a view/table with columns (key, nb_classification).
+
+ The created relation will be
+
+ {TABLE|VIEW} destName (key, nb_classification)
+
+ where \c nb_classification is an array containing the most likely
+ class(es) of the record in \em classifySource identified by \c key.
+
+ There are two sets of arguments this function can be called with. The
+ following parameters are always needed:
+ @param numAttrs Number of attributes to use for classification
+ @param destName Name of the table or view to create
+    @param whatToCreate (Optional) Either \c 'TABLE' or \c 'VIEW' (the default).
+    @param smoothingFactor (Optional) Smoothing factor for computing feature
+        probabilities. Default value: 1.0 (Laplacian Smoothing).
+ @param classifySource Name of the relation that contains data to be classified
+ @param classifyKeyColumn Name of column in \em classifySource that can
+ serve as unique identifier
+ @param classifyAttrColumn Name of attributes-array column in \em classifySource
+
+ Furthermore, provide either:
+ @param classPriorsSource
+ Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of training
+ samples.
+ @param featureProbsSource
+ Relation (class, attr, value, cnt, attr_cnt) where
+ (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
+
+ Or have this function operate on the "raw" training data:
+ @param trainingSource
+ Name of relation containing the training data
+ @param trainingClassColumn
+ Name of class column in training data
+ @param trainingAttrColumn
+ Name of attributes-array column in \em trainingSource
+
+ """
+
+ __init_prepared_data(kwargs)
+ kwargs.update(dict(
+ keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
+ ))
+ plpy.execute("""
+ CREATE {whatToCreate} {destName} AS
+ SELECT
+ key,
+ MADLIB_SCHEMA.argmax(class, log_prob) AS nb_classification
+ FROM {keys_and_prob_values} AS keys_and_nb_values
+ GROUP BY key
+ """.format(**kwargs))
def create_bayes_probabilities(**kwargs):
- """Create table/view with columns (key, class, nb_prob)
-
- The created relation will be
-
- {TABLE|VIEW} destName (key, class, nb_prob)
-
- where \c nb_prob is the Naive-Bayes probability that \c class is the true
- class of the record in \em classifySource identified by \c key.
-
- There are two sets of arguments this function can be called with. The
- following parameters are always needed:
- @param numAttrs Number of attributes to use for classification
- @param destName Name of the table or view to create
- @param whatToCreate (Optional) Either \c 'TABLE' OR \c 'VIEW' (the default).
- @param smoothingFactor (Optional) Smoothing factor for computing feature
- feature probabilities. Default value: 1.0 (Laplacian Smoothing).
-
- Furthermore, provide either:
- @param classPriorsSource
- Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of training
- samples.
- @param featureProbsSource
- Relation (class, attr, value, cnt, attr_cnt) where
- (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
-
- Or have this function operate on the "raw" training data:
- @param trainingSource
- Name of relation containing the training data
- @param trainingClassColumn
- Name of class column in training data
- @param trainingAttrColumn
- Name of attributes-array column in training data
-
- @internal
- \par Implementation Notes:
-
- We have two numerical problems when copmuting the probabilities
- @verbatim
+ """Create table/view with columns (key, class, nb_prob)
+
+ The created relation will be
+
+ {TABLE|VIEW} destName (key, class, nb_prob)
+
+ where \c nb_prob is the Naive-Bayes probability that \c class is the true
+ class of the record in \em classifySource identified by \c key.
+
+ There are two sets of arguments this function can be called with. The
+ following parameters are always needed:
+ @param numAttrs Number of attributes to use for classification
+ @param destName Name of the table or view to create
+    @param whatToCreate (Optional) Either \c 'TABLE' or \c 'VIEW' (the default).
+    @param smoothingFactor (Optional) Smoothing factor for computing feature
+        probabilities. Default value: 1.0 (Laplacian Smoothing).
+
+ Furthermore, provide either:
+ @param classPriorsSource
+ Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of training
+ samples.
+ @param featureProbsSource
+ Relation (class, attr, value, cnt, attr_cnt) where
+ (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
+
+ Or have this function operate on the "raw" training data:
+ @param trainingSource
+ Name of relation containing the training data
+ @param trainingClassColumn
+ Name of class column in training data
+ @param trainingAttrColumn
+ Name of attributes-array column in training data
+
+ @internal
+ \par Implementation Notes:
+
+    We have two numerical problems when computing the probabilities
+ @verbatim
P(C = c) * P(A = a | C = c)
P(C = c) = --------------------------------- (*)
--
@@ -544,128 +544,128 @@ def create_bayes_probabilities(**kwargs):
__
where P(A = a | C = c) = || P(A_i = a_i | C = c).
i
- @endverbatim
-
- 1. P(A = a | C = c) could be a very small number not representable in
- double-precision floating-point arithmetic.
- - Solution: We have log( P(C = c) * P(A = a | C = c) ) as indermediate
- results. We will add the maximum absolute value of these intermediate
- results to all of them. This corresponds to multiplying numerator and
- denominator of (*) with the same factor. The "normalization" ensures
- that the numerator of (*) can never be 0 (in FP arithmetic) for all c.
-
- 2. PostgreSQL raises an error in case of underflows, even when 0 is the
- desirable outcome.
- - Solution: if log_10 ( P(A = a | C = c) ) < -300, we interprete
+ @endverbatim
+
+ 1. P(A = a | C = c) could be a very small number not representable in
+ double-precision floating-point arithmetic.
+    - Solution: We have log( P(C = c) * P(A = a | C = c) ) as intermediate
+    results. We subtract the maximum of these intermediate results from all of
+    them (for each key). This corresponds to multiplying numerator and
+    denominator of (*) with the same factor. The "normalization" ensures that
+    the numerator of (*) can never be 0 (in FP arithmetic) for all c.
+
+ 2. PostgreSQL raises an error in case of underflows, even when 0 is the
+ desirable outcome.
+    - Solution: if log_10 ( P(A = a | C = c) ) < -300, we interpret
P(A = a | C = c) = 0. Note here that 1e-300 is roughly in the order of
- magnitude of the smallest double precision FP number.
- @endinternal
- """
-
- __init_prepared_data(kwargs)
- kwargs.update(dict(
- keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
- ))
- plpy.execute("""
- CREATE {whatToCreate} {destName} AS
- SELECT
- key,
- class,
- nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob
- FROM
- (
- SELECT
- key,
- class,
- CASE WHEN max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key) < -300 THEN 0
- ELSE pow(10, max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key))
- END AS nb_prob
- FROM
- {keys_and_prob_values} AS keys_and_nb_values
- GROUP BY
- key, class
- ) AS keys_and_nb_values
- ORDER BY
- key, class
- """.format(**kwargs))
+ magnitude of the smallest double precision FP number.
+ @endinternal
+ """
+
+ __init_prepared_data(kwargs)
+ kwargs.update(dict(
+ keys_and_prob_values = "(" + __get_keys_and_prob_values_sql(**kwargs) + ")"
+ ))
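+    # Numerical sketch of the normalization below (illustrative values): if
+    # one key has log-probabilities {-310, -312}, subtracting the per-key
+    # maximum (-310) gives {0, -2}, so pow(10, ...) yields {1, 0.01} instead
+    # of underflowing, and the normalized probabilities are {1/1.01,
+    # 0.01/1.01}. Shifting all logarithms of a key by the same constant
+    # multiplies numerator and denominator of (*) by the same factor, so the
+    # result is unchanged.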
+ plpy.execute("""
+ CREATE {whatToCreate} {destName} AS
+ SELECT
+ key,
+ class,
+ nb_prob / sum(nb_prob) OVER (PARTITION BY key) AS nb_prob
+ FROM
+ (
+ SELECT
+ key,
+ class,
+ CASE WHEN max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key) < -300 THEN 0
+ ELSE pow(10, max(log_prob) - max(max(log_prob)) OVER (PARTITION BY key))
+ END AS nb_prob
+ FROM
+ {keys_and_prob_values} AS keys_and_nb_values
+ GROUP BY
+ key, class
+ ) AS keys_and_nb_values
+ ORDER BY
+ key, class
+ """.format(**kwargs))
def create_classification_function(**kwargs):
- """Create a SQL function mapping arrays of attribute values to the Naive
- Bayes classification.
-
- The created SQL function will be:
-
-
- FUNCTION destName (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
- RETURNS INTEGER[]
-
- There are two sets of arguments this function can be called with. The
- following parameters are always needed:
- @param classifyAttrColumn Array of attributes to bind to. This can be
- a column name of an outer query or a literal.
- @param smoothingFactor Smoothing factor to use for estimating the feature
- probabilities.
- @param numAttrs Number of attributes to use for classification
-
- Furthermore, provide either:
- @param classPriorsSource
- Relation (class, class_cnt, all_cnt) where
- class is c, class_cnt is \#c, all_cnt is the number of training
- samples.
- @param featureProbsSource
- Relation (class, attr, value, cnt, attr_cnt) where
- (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
-
- Or have this function operate on the "raw" training data:
- @param trainingSource Name of relation containing the training data
- @param trainingClassColumn Name of class column in training data
- @param trainingAttrColumn Name of attributes-array column in training data
-
- Note: Greenplum does not support executing STABLE and VOLATILE functions on
- segments. The created function can therefore only be called on the master.
- """
-
- kwargs.update(dict(
- classifyAttrColumn = "$1",
- smoothingFactor = "$2"
- ))
- __init_prepared_data(kwargs)
- kwargs.update(dict(
- keys_and_prob_values = "(" + __get_prob_values_sql(**kwargs) + ")"
- ))
- plpy.execute("""
- CREATE FUNCTION {destName} (inAttributes INTEGER[], inSmoothingFactor DOUBLE PRECISION)
- RETURNS INTEGER[] AS
- $$
- SELECT
- MADLIB_SCHEMA.argmax(class, log_prob)
- FROM {keys_and_prob_values} AS key_and_nb_values
- $$
- LANGUAGE sql STABLE
- """.format(**kwargs))
+ """Create a SQL function mapping arrays of attribute values to the Naive
+ Bayes classification.
+
+ The created SQL function will be:
+
+
+ FUNCTION destName (attributes INTEGER[], smoothingFactor DOUBLE PRECISION)
+ RETURNS INTEGER[]
+
+ There are two sets of arguments this function can be called with. The
+ following parameters are always needed:
+ @param classifyAttrColumn Array of attributes to bind to. This can be
+ a column name of an outer query or a literal.
+ @param smoothingFactor Smoothing factor to use for estimating the feature
+ probabilities.
+ @param numAttrs Number of attributes to use for classification
+
+ Furthermore, provide either:
+ @param classPriorsSource
+ Relation (class, class_cnt, all_cnt) where
+ class is c, class_cnt is \#c, all_cnt is the number of training
+ samples.
+ @param featureProbsSource
+ Relation (class, attr, value, cnt, attr_cnt) where
+ (class, attr, value) = (c,i,a), cnt = \#(c,i,a), and attr_cnt = \#i
+
+ Or have this function operate on the "raw" training data:
+ @param trainingSource Name of relation containing the training data
+ @param trainingClassColumn Name of class column in training data
+ @param trainingAttrColumn Name of attributes-array column in training data
+
+ Note: Greenplum does not support executing STABLE and VOLATILE functions on
+ segments. The created function can therefore only be called on the master.
+ """
+
+ kwargs.update(dict(
+ classifyAttrColumn = "$1",
+ smoothingFactor = "$2"
+ ))
+ __init_prepared_data(kwargs)
+ kwargs.update(dict(
+ keys_and_prob_values = "(" + __get_prob_values_sql(**kwargs) + ")"
+ ))
+ plpy.execute("""
+ CREATE FUNCTION {destName} (inAttributes INTEGER[], inSmoothingFactor DOUBLE PRECISION)
+ RETURNS INTEGER[] AS
+ $$
+ SELECT
+ MADLIB_SCHEMA.argmax(class, log_prob)
+ FROM {keys_and_prob_values} AS key_and_nb_values
+ $$
+ LANGUAGE sql STABLE
+ """.format(**kwargs))
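+    # Hypothetical usage sketch (the function name here is a placeholder, not
+    # part of this module): after
+    #     create_classification_function(destName = 'nb_classify', ...)
+    # the created function can be invoked on the master, e.g.
+    #     SELECT nb_classify(ARRAY[1, 2, 3], 1.0);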
def __init_prepared_data(kwargs):
- """
- Fill in values for optional parameters: Create subqueries instead of using
- a relation.
-
- """
-
- if not 'classPriorsSource' in kwargs:
- kwargs.update(dict(
- classPriorsSource = "(" + __get_class_priors_sql(**kwargs) + ")"
- ))
- if not 'featureProbsSource' in kwargs:
- kwargs.update(dict(
- featureProbsSource = "(" + __get_feature_probs_sql(**kwargs) + ")"
- ))
- if not 'smoothingFactor' in kwargs:
- kwargs.update(dict(
- smoothingFactor = 1
- ))
-
+ """
+ Fill in values for optional parameters: Create subqueries instead of using
+ a relation.
+
+ """
+
+    if 'classPriorsSource' not in kwargs:
+        kwargs.update(dict(
+            classPriorsSource = "(" + __get_class_priors_sql(**kwargs) + ")"
+        ))
+    if 'featureProbsSource' not in kwargs:
+        kwargs.update(dict(
+            featureProbsSource = "(" + __get_feature_probs_sql(**kwargs) + ")"
+        ))
+    if 'smoothingFactor' not in kwargs:
+        kwargs.update(dict(
+            smoothingFactor = 1
+        ))
+
# The m4 preprocessor complains if EOF is reached in quoted mode.
\ No newline at end of file
diff --git a/methods/regress/src/pg_gp/regression.sql_in b/methods/regress/src/pg_gp/regression.sql_in
index b4e709109..42893af23 100644
--- a/methods/regress/src/pg_gp/regression.sql_in
+++ b/methods/regress/src/pg_gp/regression.sql_in
@@ -155,8 +155,8 @@ http://www.stat.columbia.edu/~martin/W2110/SAS_7.pdf.
@internal
@sa file regress.c (documenting the implementation in C), function
- float8_mregr_compute() (documenting the formulas used for coefficients,
- $R^2$, t-statistics, and p-values, implemented in C)
+ float8_mregr_compute() (documenting the formulas used for coefficients,
+ $R^2$, t-statistics, and p-values, implemented in C)
@endinternal
@literature
@@ -207,7 +207,7 @@ LANGUAGE C STRICT;
* independentVariables array to 1.
*
* @return Array of coefficients, which has the same length as the array of
- * independent variables.
+ * independent variables.
*
 * @examp SELECT mregr_coef(y, array[1, x1, x2]) FROM data;
*/
@@ -215,11 +215,11 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_coef(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
- SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
- STYPE=float8[],
- FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef,
+ SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
+ STYPE=float8[],
+ FINALFUNC=MADLIB_SCHEMA.float8_mregr_coef,
ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,)
- INITCOND='{0}'
+ INITCOND='{0}'
);
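+
+-- Note on the aggregate clauses above (general PostgreSQL/Greenplum facts,
+-- added for orientation): SFUNC is the per-row state-transition function,
+-- STYPE the transition-state type, FINALFUNC maps the final state to the
+-- result, INITCOND is the initial state, and the Greenplum-only prefunc
+-- merges partial states computed on different segments in parallel.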
/**
@@ -229,11 +229,11 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
- SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
- STYPE=float8[],
- FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2,
+ SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
+ STYPE=float8[],
+ FINALFUNC=MADLIB_SCHEMA.float8_mregr_r2,
ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,)
- INITCOND='{0}'
+ INITCOND='{0}'
);
/**
@@ -245,17 +245,17 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_r2(
* @param dependentVariable Dependent variable
* @param independentVariables Array of independent variables
* @return Array of t-statistics for each coefficient. The returned array has
- * the same length as the array of independent variables.
+ * the same length as the array of independent variables.
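+ *
+ * @examp An illustrative call, mirroring the mregr_coef example above:
+ *        SELECT mregr_tstats(y, array[1, x1, x2]) FROM data;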
*/
CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
- SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
- STYPE=float8[],
- FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats,
+ SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
+ STYPE=float8[],
+ FINALFUNC=MADLIB_SCHEMA.float8_mregr_tstats,
ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,)
- INITCOND='{0}'
+ INITCOND='{0}'
);
/**
@@ -264,17 +264,17 @@ CREATE AGGREGATE MADLIB_SCHEMA.mregr_tstats(
* @param dependentVariable Dependent variable
* @param independentVariables Array of independent variables
* @return Array of p-values for each coefficient. The returned array has
- * the same length as the array of independent variables.
+ * the same length as the array of independent variables.
*/
CREATE AGGREGATE MADLIB_SCHEMA.mregr_pvalues(
/*+ "dependentVariable" */ DOUBLE PRECISION,
/*+ "independentVariables" */ DOUBLE PRECISION[]) (
- SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
- STYPE=float8[],
- FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues,
+ SFUNC=MADLIB_SCHEMA.float8_mregr_accum,
+ STYPE=float8[],
+ FINALFUNC=MADLIB_SCHEMA.float8_mregr_pvalues,
ifdef(GREENPLUM,prefunc=MADLIB_SCHEMA.float8_mregr_combine,)
- INITCOND='{0}'
+ INITCOND='{0}'
);
/**
@@ -331,7 +331,7 @@ By looking at the Hessian, we can verify that \f$l(\boldsymbol c)\f$ is convex.
There are many techniques for solving convex optimization problems. Currently,
logistic regression in MADlib can use one of two algorithms:
- Iteratively Reweighted Least Squares
-- A conjugate-gradient approach, also known as Fletcher–Reeves method in the
+- A conjugate-gradient approach, also known as the Fletcher-Reeves method in the
literature, where we use the Hestenes-Stiefel rule for calculating the step
size.
@@ -381,11 +381,11 @@ PostgreSQL/Greenplum.
@internal
@sa namespace logRegress (documenting the driver/outer loop implemented in
- Python), function float8_cg_update_final() (documenting the
- conjugate-gradient update/iteration steps, implemented in C), function
- float8_cg_update_accum() (documenting the
- iteratively-reweighted-least-squares update/iteration steps, implemented in
- C)
+ Python), function float8_cg_update_final() (documenting the
+ conjugate-gradient update/iteration steps, implemented in C), function
+ float8_cg_update_accum() (documenting the
+ iteratively-reweighted-least-squares update/iteration steps, implemented in
+ C)
@endinternal
@literature
@@ -406,38 +406,38 @@ further literature:
*/
CREATE TYPE MADLIB_SCHEMA.logregr_cg_state AS (
- iteration INTEGER,
- len INTEGER,
- coef DOUBLE PRECISION[],
- dir DOUBLE PRECISION[],
- grad DOUBLE PRECISION[],
- beta DOUBLE PRECISION,
-
- count BIGINT,
- gradNew DOUBLE PRECISION[],
- dTHd DOUBLE PRECISION,
- logLikelihood DOUBLE PRECISION
+ iteration INTEGER,
+ len INTEGER,
+ coef DOUBLE PRECISION[],
+ dir DOUBLE PRECISION[],
+ grad DOUBLE PRECISION[],
+ beta DOUBLE PRECISION,
+
+ count BIGINT,
+ gradNew DOUBLE PRECISION[],
+ dTHd DOUBLE PRECISION,
+ logLikelihood DOUBLE PRECISION
);
CREATE TYPE MADLIB_SCHEMA.logregr_irls_state AS (
- coef DOUBLE PRECISION[],
- logLikelihood DOUBLE PRECISION
+ coef DOUBLE PRECISION[],
+ logLikelihood DOUBLE PRECISION
);
CREATE FUNCTION MADLIB_SCHEMA.float8_cg_update_accum(
- MADLIB_SCHEMA.logregr_cg_state,
- BOOLEAN,
- DOUBLE PRECISION[],
- MADLIB_SCHEMA.logregr_cg_state)
+ MADLIB_SCHEMA.logregr_cg_state,
+ BOOLEAN,
+ DOUBLE PRECISION[],
+ MADLIB_SCHEMA.logregr_cg_state)
RETURNS MADLIB_SCHEMA.logregr_cg_state
AS 'MODULE_PATHNAME'
LANGUAGE C;
CREATE FUNCTION MADLIB_SCHEMA.float8_irls_update_accum(
- DOUBLE PRECISION[],
- BOOLEAN,
- DOUBLE PRECISION[],
- MADLIB_SCHEMA.logregr_irls_state)
+ DOUBLE PRECISION[],
+ BOOLEAN,
+ DOUBLE PRECISION[],
+ MADLIB_SCHEMA.logregr_irls_state)
RETURNS DOUBLE PRECISION[]
AS 'MODULE_PATHNAME'
LANGUAGE C;
@@ -462,9 +462,9 @@ CREATE AGGREGATE MADLIB_SCHEMA.logregr_cg_step(
DOUBLE PRECISION[],
MADLIB_SCHEMA.logregr_cg_state) (
- SFUNC=MADLIB_SCHEMA.float8_cg_update_accum,
- STYPE=MADLIB_SCHEMA.logregr_cg_state,
- FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final
+ SFUNC=MADLIB_SCHEMA.float8_cg_update_accum,
+ STYPE=MADLIB_SCHEMA.logregr_cg_state,
+ FINALFUNC=MADLIB_SCHEMA.float8_cg_update_final
);
/**
@@ -477,18 +477,18 @@ CREATE AGGREGATE MADLIB_SCHEMA.logregr_irls_step(
DOUBLE PRECISION[],
MADLIB_SCHEMA.logregr_irls_state) (
- SFUNC=MADLIB_SCHEMA.float8_irls_update_accum,
- STYPE=float8[],
- PREFUNC=MADLIB_SCHEMA.float8_mregr_combine,
- FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final,
- INITCOND='{0}'
+ SFUNC=MADLIB_SCHEMA.float8_irls_update_accum,
+ STYPE=float8[],
+ PREFUNC=MADLIB_SCHEMA.float8_mregr_combine,
+ FINALFUNC=MADLIB_SCHEMA.float8_irls_update_final,
+ INITCOND='{0}'
);
CREATE FUNCTION MADLIB_SCHEMA.logregr_should_terminate(
- DOUBLE PRECISION[],
- DOUBLE PRECISION[],
- VARCHAR,
- DOUBLE PRECISION)
+ DOUBLE PRECISION[],
+ DOUBLE PRECISION[],
+ VARCHAR,
+ DOUBLE PRECISION)
RETURNS BOOLEAN
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
@@ -497,9 +497,9 @@ LANGUAGE C STRICT;
-- We only need to document the last one (unfortunately, in Greenplum we have to
-- use function overloading instead of default arguments).
CREATE FUNCTION MADLIB_SCHEMA.logregr_coef(
- "source" VARCHAR,
- "depColumn" VARCHAR,
- "indepColumn" VARCHAR)
+ "source" VARCHAR,
+ "depColumn" VARCHAR,
+ "indepColumn" VARCHAR)
RETURNS DOUBLE PRECISION[] AS $$
import sys
try:
@@ -508,14 +508,14 @@ RETURNS DOUBLE PRECISION[] AS $$
sys.path.append("PLPYTHON_LIBDIR")
ifdef(DEBUG,,from madlib )import logRegress
- return logRegress.compute_logregr_coef(**globals())
+ return logRegress.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.logregr_coef(
- "source" VARCHAR,
- "depColumn" VARCHAR,
- "indepColumn" VARCHAR,
- "numIterations" INTEGER)
+ "source" VARCHAR,
+ "depColumn" VARCHAR,
+ "indepColumn" VARCHAR,
+ "numIterations" INTEGER)
RETURNS DOUBLE PRECISION[] AS $$
import sys
try:
@@ -524,15 +524,15 @@ RETURNS DOUBLE PRECISION[] AS $$
sys.path.append("PLPYTHON_LIBDIR")
ifdef(DEBUG,,from madlib )import logRegress
- return logRegress.compute_logregr_coef(**globals())
+ return logRegress.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;
CREATE FUNCTION MADLIB_SCHEMA.logregr_coef(
- "source" VARCHAR,
- "depColumn" VARCHAR,
- "indepColumn" VARCHAR,
- "numIterations" INTEGER,
- "optimizer" VARCHAR)
+ "source" VARCHAR,
+ "depColumn" VARCHAR,
+ "indepColumn" VARCHAR,
+ "numIterations" INTEGER,
+ "optimizer" VARCHAR)
RETURNS DOUBLE PRECISION[] AS $$
import sys
try:
@@ -541,7 +541,7 @@ RETURNS DOUBLE PRECISION[] AS $$
sys.path.append("PLPYTHON_LIBDIR")
ifdef(DEBUG,,from madlib )import logRegress
- return logRegress.compute_logregr_coef(**globals())
+ return logRegress.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;
@@ -554,33 +554,33 @@ $$ LANGUAGE plpythonu VOLATILE;
* @param source Name of the source relation containing the training data
* @param depColumn Name of the dependent column (of type BOOLEAN)
* @param indepColumn Name of the independent column (of type DOUBLE
- * PRECISION[])
+ * PRECISION[])
* @param numIterations The maximum number of iterations
* @param optimizer The optimizer to use (either
- * 'ilrs'/'newton' for iteratively reweighted least
- * squares or 'cg' for conjugent gradient)
+ *              'irls'/'newton' for iteratively reweighted least
+ *              squares or 'cg' for conjugate gradient)
* @param precision The difference between log-likelihood values in successive
- * iterations that should indicate convergence, or 0 indicating that
- * log-likelihood values should be ignored
+ * iterations that should indicate convergence, or 0 indicating that
+ * log-likelihood values should be ignored
*
* @note This function starts an iterative algorithm. It is not an aggregate
- * function. Source and column names have to be passed as strings (due to
- * limitations of the SQL syntax).
+ * function. Source and column names have to be passed as strings (due to
+ * limitations of the SQL syntax).
*
* @examp SELECT logregr_coef('data', 'y', 'array[1, x1, x2]', 20, 'cg',
- * 0.001);
+ * 0.001);
*
* @internal
* @sa This function is a wrapper for logRegress::compute_logregr_coef(), which
* sets the default values.
*/
CREATE FUNCTION MADLIB_SCHEMA.logregr_coef(
- "source" VARCHAR,
- "depColumn" VARCHAR,
- "indepColumn" VARCHAR,
- "numIterations" INTEGER /*+ DEFAULT 20 */,
- "optimizer" VARCHAR /*+ DEFAULT 'irls' */,
- "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
+ "source" VARCHAR,
+ "depColumn" VARCHAR,
+ "indepColumn" VARCHAR,
+ "numIterations" INTEGER /*+ DEFAULT 20 */,
+ "optimizer" VARCHAR /*+ DEFAULT 'irls' */,
+ "precision" DOUBLE PRECISION /*+ DEFAULT 0.0001 */)
RETURNS DOUBLE PRECISION[] AS $$
import sys
try:
@@ -589,7 +589,7 @@ RETURNS DOUBLE PRECISION[] AS $$
sys.path.append("PLPYTHON_LIBDIR")
ifdef(DEBUG,,from madlib )import logRegress
- return logRegress.compute_logregr_coef(**globals())
+ return logRegress.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;
@@ -602,21 +602,21 @@ ifdef(PGXS,
CREATE FUNCTION MADLIB_SCHEMA.init_python_paths()
RETURNS VOID AS
$$
- # FIXME: The following code should be common code and not reside in a specialized module
- import sys
-
- dyld_paths = plpy.execute(
- "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':')
- before_default = True
- count = 0
- for path in dyld_paths:
- if path == "$libdir":
- before_default = False
- else:
- if before_default:
- sys.path.insert(count, path)
- count += 1
- else:
- sys.path.append(path)
+ # FIXME: The following code should be common code and not reside in a specialized module
+ import sys
+
+ dyld_paths = plpy.execute(
+ "SHOW dynamic_library_path")[0]["dynamic_library_path"].split(':')
+ before_default = True
+ count = 0
+ for path in dyld_paths:
+ if path == "$libdir":
+ before_default = False
+ else:
+ if before_default:
+ sys.path.insert(count, path)
+ count += 1
+ else:
+ sys.path.append(path)
$$ LANGUAGE plpythonu VOLATILE;
)