Skip to content

Commit

Permalink
Order mozfun.glam.histogram_fill_buckets by string sorting (#1497)
Browse files Browse the repository at this point in the history
* Order fill buckets by string sorting

* Add buckets in the correct order

* Simplify histogram_fill_buckets into single statement

* Remove extra casting
  • Loading branch information
acmiyaguchi committed Oct 29, 2020
1 parent 1529510 commit 0e23866
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 49 deletions.
34 changes: 13 additions & 21 deletions sql/mozfun/glam/histogram_fill_buckets/udf.sql
Expand Up @@ -5,26 +5,18 @@ CREATE OR REPLACE FUNCTION glam.histogram_fill_buckets(
)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS (
-- Given a MAP `input_map`, fill in any missing keys with value `0.0`
(
WITH total_counts AS (
SELECT
key,
COALESCE(e.value, 0.0) AS value
FROM
UNNEST(buckets) AS key
LEFT JOIN
UNNEST(input_map) AS e
ON
SAFE_CAST(key AS STRING) = e.key
)
SELECT
ARRAY_AGG(
STRUCT<key STRING, value FLOAT64>(SAFE_CAST(key AS STRING), value)
ORDER BY
CAST(key AS int64)
)
ARRAY(
SELECT AS STRUCT
key,
COALESCE(e.value, 0.0) AS value
FROM
total_counts
UNNEST(buckets) AS key
LEFT JOIN
UNNEST(input_map) AS e
ON
key = e.key
ORDER BY
key
)
);

Expand All @@ -45,9 +37,9 @@ SELECT
["0"]
)
),
-- return ordered keys
-- return ordered keys by string value
assert.array_equals(
ARRAY<STRUCT<key STRING, value FLOAT64>>[("2", 1.0), ("11", 0.0)],
ARRAY<STRUCT<key STRING, value FLOAT64>>[("11", 0.0), ("2", 1.0)],
glam.histogram_fill_buckets(
ARRAY<STRUCT<key STRING, value FLOAT64>>[("0", 1.0), ("2", 1.0)],
["11", "2"]
Expand Down
48 changes: 20 additions & 28 deletions sql/mozfun/glam/histogram_fill_buckets_dirichlet/udf.sql
Expand Up @@ -6,33 +6,25 @@ CREATE OR REPLACE FUNCTION glam.histogram_fill_buckets_dirichlet(
)
RETURNS ARRAY<STRUCT<key STRING, value FLOAT64>> AS (
-- Given a MAP `input_map`, fill in any missing keys with value `0.0`
(
WITH total_counts AS (
SELECT
key,
-- Dirichlet distribution density for each bucket in a histogram.
-- Given {k1: p1,k2:p2} where p’s are proportions(and p1, p2 sum to 1)
-- return {k1: (P1+1/K) / (nreporting+1), k2:(P2+1/K) / (nreporting+1)}.
-- https://docs.google.com/document/d/1ipy1oFIKDvHr3R6Ku0goRjS11R1ZH1z2gygOGkSdqUg
SAFE_DIVIDE(
COALESCE(e.value, 0.0) + SAFE_DIVIDE(1, ARRAY_LENGTH(buckets)),
total_users + 1
) AS value
FROM
UNNEST(buckets) AS key
LEFT JOIN
UNNEST(input_map) AS e
ON
SAFE_CAST(key AS STRING) = e.key
)
SELECT
ARRAY_AGG(
STRUCT<key STRING, value FLOAT64>(SAFE_CAST(key AS STRING), value)
ORDER BY
CAST(key AS INT64)
)
ARRAY(
SELECT AS STRUCT
key,
-- Dirichlet distribution density for each bucket in a histogram.
-- Given {k1: p1,k2:p2} where p’s are proportions(and p1, p2 sum to 1)
-- return {k1: (P1+1/K) / (nreporting+1), k2:(P2+1/K) / (nreporting+1)}.
-- https://docs.google.com/document/d/1ipy1oFIKDvHr3R6Ku0goRjS11R1ZH1z2gygOGkSdqUg
SAFE_DIVIDE(
COALESCE(e.value, 0.0) + SAFE_DIVIDE(1, ARRAY_LENGTH(buckets)),
total_users + 1
) AS value
FROM
total_counts
UNNEST(buckets) AS key
LEFT JOIN
UNNEST(input_map) AS e
ON
key = e.key
ORDER BY
key
)
);

Expand All @@ -59,9 +51,9 @@ SELECT
2
)
),
-- out of order keys
-- keys may not non-integer values, so we must make do with string ordering
assert.array_equals(
ARRAY<STRUCT<key STRING, value FLOAT64>>[("2", (1 + (1 / 2)) / 3), ("11", (1 + (1 / 2)) / 3)],
ARRAY<STRUCT<key STRING, value FLOAT64>>[("11", (1 + (1 / 2)) / 3), ("2", (1 + (1 / 2)) / 3)],
glam.histogram_fill_buckets_dirichlet(
ARRAY<STRUCT<key STRING, value FLOAT64>>[("11", 1.0), ("2", 1.0)],
["11", "2"],
Expand Down

0 comments on commit 0e23866

Please sign in to comment.