Add core_clients_daily_v1 #8

Merged: 8 commits, Mar 7, 2019
95 changes: 95 additions & 0 deletions sql/core_clients_daily_v1.sql
@@ -0,0 +1,95 @@
CREATE TEMP FUNCTION udf_mode_last(x ANY TYPE) AS ((
SELECT
val
FROM (
SELECT
val,
COUNT(val) AS n,
MAX(offset) AS max_offset
FROM
UNNEST(x) AS val
WITH OFFSET AS offset
GROUP BY
val
ORDER BY
n DESC,
max_offset DESC
)
LIMIT 1
));
Contributor Author:

The mode UDF here is also codified in the `udf/` directory. We don't yet have a way to inject it automatically.


WITH
numbered_duplicates AS (
SELECT
* REPLACE(LOWER(client_id) AS client_id),
ROW_NUMBER() OVER (PARTITION BY client_id, submission_date_s3, metadata.document_id ORDER BY metadata.timestamp DESC) AS n
Contributor:

I thought I saw a PR float by where an "internal-use" field was aliased to begin with an underscore. I thought that was a nice idea, so it might be worth using here for a temporary field we remove in the next CTE step, in case someone adds an `n` field to the core ping.

Contributor Author:

@relud has advocated for this, too. You two are probably right. I've just seen a lot of SQL queries using ROW_NUMBER and WHERE n = 1, so it feels right to me. I have no proof this is a wider standard in the community, though, and you make a good point about potentially running into difficulty with a field named n in the source table.

Breaking out this discussion to #10
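
For reference, a minimal sketch of the underscore convention discussed in this thread (the `_n` alias is hypothetical; the query in this PR keeps `n`):

```sql
-- Hypothetical variant of the dedupe CTEs using an underscore-prefixed
-- temporary field, so a future `n` column in the core ping cannot collide.
numbered_duplicates AS (
  SELECT
    * REPLACE (LOWER(client_id) AS client_id),
    ROW_NUMBER() OVER (
      PARTITION BY client_id, submission_date_s3, metadata.document_id
      ORDER BY metadata.timestamp DESC) AS _n
  FROM
    telemetry_core_parquet_v3
  WHERE
    client_id IS NOT NULL ),
deduplicated AS (
  SELECT
    * EXCEPT (_n)
  FROM
    numbered_duplicates
  WHERE
    _n = 1 )
```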

FROM
telemetry_core_parquet_v3
WHERE
client_id IS NOT NULL ),
-- Deduplicating on document_id is necessary to get valid SUM values.
Collaborator:

❤️

Collaborator:

this is an excellent pattern, and I think we should use it everywhere, and maybe provide it as a non-materialized view

Contributor:

In the future, when we're working with the data ingested directly to BQ, should we consider having a query upstream in the DAG that runs a DELETE of duplicates in the base table?

Contributor Author:

> In the future, when we're working with the data ingested directly to BQ, should we consider having a query upstream in the DAG that runs a DELETE of duplicates in the base table?

Yes, I think this is something we should continue to consider and at some point do a focused test to make sure we have that logic 100% correct. For the time being, though, this non-destructive dedupe within the query is a safe and effective alternative.
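
A sketch of what such an upstream DELETE might look like in BigQuery DML. This is hypothetical and untested; table and field names follow this PR, and since it is destructive it would need exactly the kind of focused validation described above:

```sql
-- Hypothetical destructive dedupe: keep only the newest row
-- per document_id for the day being processed.
DELETE FROM
  telemetry_core_parquet_v3 AS t
WHERE
  t.submission_date_s3 = @submission_date
  AND EXISTS (
    SELECT 1
    FROM telemetry_core_parquet_v3 AS newer
    WHERE newer.submission_date_s3 = t.submission_date_s3
      AND newer.metadata.document_id = t.metadata.document_id
      AND newer.metadata.timestamp > t.metadata.timestamp)
```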

deduplicated AS (
SELECT
* EXCEPT (n)
FROM
numbered_duplicates
WHERE
n = 1 ),
windowed AS (
SELECT
@submission_date AS submission_date,
CURRENT_DATETIME() AS generated_time,
client_id,
ROW_NUMBER() OVER w1 AS n,
-- For now, we're ignoring the following RECORD type fields:
-- accessibility_services
-- experiments
-- searches
--
-- Take the min observed profile creation date.
SAFE.DATE_FROM_UNIX_DATE(MIN(profile_date) OVER w1) AS profile_date,
-- These integer fields are already sums over sessions since last upload,
-- so we sum to represent all uploads in the given day;
-- we set an upper limit of 100K, which covers the 99.9th percentile of durations
-- while avoiding integer overflow on pathological input.
SUM(IF(sessions BETWEEN 0 AND 100000, sessions, 0)) OVER w1 AS sessions,
SUM(IF(durations BETWEEN 0 AND 100000, durations, 0)) OVER w1 AS durations,
SUM(IF(flash_usage BETWEEN 0 AND 100000, flash_usage, 0)) OVER w1 AS flash_usage,
-- For all other dimensions, we use the mode of observed values in the day.
udf_mode_last(ARRAY_AGG(app_name) OVER w1) AS app_name,
udf_mode_last(ARRAY_AGG(os) OVER w1) AS os,
udf_mode_last(ARRAY_AGG(metadata.geo_country) OVER w1) AS country,
udf_mode_last(ARRAY_AGG(metadata.geo_city) OVER w1) AS city,
udf_mode_last(ARRAY_AGG(metadata.app_build_id) OVER w1) AS app_build_id,
udf_mode_last(ARRAY_AGG(metadata.normalized_channel) OVER w1) AS normalized_channel,
udf_mode_last(ARRAY_AGG(locale) OVER w1) AS locale,
udf_mode_last(ARRAY_AGG(osversion) OVER w1) AS osversion,
udf_mode_last(ARRAY_AGG(device) OVER w1) AS device,
udf_mode_last(ARRAY_AGG(arch) OVER w1) AS arch,
udf_mode_last(ARRAY_AGG(default_search) OVER w1) AS default_search,
udf_mode_last(ARRAY_AGG(distribution_id) OVER w1) AS distribution_id,
udf_mode_last(ARRAY_AGG(campaign) OVER w1) AS campaign,
udf_mode_last(ARRAY_AGG(campaign_id) OVER w1) AS campaign_id,
udf_mode_last(ARRAY_AGG(default_browser) OVER w1) AS default_browser,
udf_mode_last(ARRAY_AGG(show_tracker_stats_share) OVER w1) AS show_tracker_stats_share,
udf_mode_last(ARRAY_AGG(metadata_app_version) OVER w1) AS metadata_app_version,
udf_mode_last(ARRAY_AGG(bug_1501329_affected) OVER w1) AS bug_1501329_affected
FROM
deduplicated
WHERE
submission_date_s3 = @submission_date
-- Bug 1501329: avoid the pathological "canary" client_id
AND client_id != 'c0ffeec0-ffee-c0ff-eec0-ffeec0ffeec0'
WINDOW
w1 AS (
PARTITION BY
client_id,
submission_date_s3
ORDER BY
metadata.timestamp DESC) )
SELECT
* EXCEPT (n)
FROM
windowed
WHERE
n = 1
13 changes: 13 additions & 0 deletions sql/core_clients_last_seen_v1.init.sql
@@ -0,0 +1,13 @@
SELECT
submission_date,
CURRENT_DATETIME() AS generated_time,
submission_date AS date_last_seen,
IF(country IN ('US', 'FR', 'DE', 'UK', 'CA'),
submission_date,
NULL) AS date_last_seen_in_tier1_country,
* EXCEPT (submission_date, generated_time)
FROM
core_clients_daily_v1
WHERE
-- 2017-01-01 is the first date in telemetry_core_parquet_v3.
submission_date = DATE('2017-01-01')
Collaborator:

❤️

47 changes: 47 additions & 0 deletions sql/core_clients_last_seen_v1.sql
@@ -0,0 +1,47 @@
WITH
current_sample AS (
SELECT
    -- Include dummy dates for the date_last_* fields so the schema matches `previous`.
DATE('2000-01-01') AS date_last_seen,
DATE('2000-01-01') AS date_last_seen_in_tier1_country,
* EXCEPT (submission_date,
generated_time)
FROM
core_clients_daily_v1
WHERE
submission_date = @submission_date ),
previous AS (
SELECT
* EXCEPT (submission_date,
generated_time)
FROM
core_clients_last_seen_v1
WHERE
submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY)
AND date_last_seen > DATE_SUB(@submission_date, INTERVAL 28 DAY) )
SELECT
@submission_date AS submission_date,
CURRENT_DATETIME() AS generated_time,
  -- Record the last day on which we received any core ping at all from this client.
IF(current_sample.client_id IS NOT NULL,
@submission_date,
previous.date_last_seen) AS date_last_seen,
-- Record the last day on which the client was in a "Tier 1" country;
-- this allows a variant of country-segmented MAU where we can still count
-- a client that appeared in one of the target countries in the previous
-- 28 days even if the most recent "country" value is not in this set.
IF(current_sample.client_id IS NOT NULL
AND current_sample.country IN ('US', 'FR', 'DE', 'UK', 'CA'),
@submission_date,
IF(previous.date_last_seen_in_tier1_country > DATE_SUB(@submission_date, INTERVAL 28 DAY),
previous.date_last_seen_in_tier1_country,
NULL)) AS date_last_seen_in_tier1_country,
IF(current_sample.client_id IS NOT NULL,
current_sample,
previous).* EXCEPT (date_last_seen, date_last_seen_in_tier1_country)
FROM
current_sample
FULL JOIN
previous
USING
(client_id)
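
As an illustration of the country-segmented MAU variant the comment above describes, a hypothetical downstream rollup over this table might look like the following (a sketch only, not part of this PR):

```sql
-- Hypothetical MAU rollup over core_clients_last_seen_v1; rows for a
-- given submission_date already reflect a trailing 28-day window.
SELECT
  submission_date,
  COUNT(DISTINCT client_id) AS mau,
  COUNT(DISTINCT IF(date_last_seen_in_tier1_country >
      DATE_SUB(submission_date, INTERVAL 28 DAY),
    client_id,
    NULL)) AS tier1_mau
FROM
  core_clients_last_seen_v1
WHERE
  submission_date = @submission_date
GROUP BY
  submission_date
```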
39 changes: 39 additions & 0 deletions udf/udf_mode_last.sql
@@ -0,0 +1,39 @@
CREATE TEMP FUNCTION udf_mode_last(x ANY TYPE) AS ((
SELECT
val
FROM (
SELECT
val,
COUNT(val) AS n,
MAX(offset) AS max_offset
FROM
UNNEST(x) AS val
WITH OFFSET AS offset
GROUP BY
val
ORDER BY
n DESC,
max_offset DESC
)
LIMIT 1
));

/*

Returns the most frequently occurring element in an array.

In the case of multiple values tied for the highest count, it returns the value
that appears latest in the array. Nulls are ignored.

Examples:

SELECT udf_mode_last(['foo', 'bar', 'baz', 'bar', 'fred']);
-- bar

SELECT udf_mode_last(['foo', 'bar', 'baz', 'bar', 'baz', 'fred']);
-- baz

SELECT udf_mode_last([null, 'foo', null]);
-- foo

*/