From 2dd0e0b7b4f3d9b9726ce27e4fec7d7732341b4d Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Thu, 18 Apr 2024 14:26:36 -1000 Subject: [PATCH 1/9] initial add of query.sql WIP for ga downloads_with_attr_v3 --- .../downloads_with_attribution_v3/query.sql | 201 ++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql new file mode 100644 index 00000000000..92853f99977 --- /dev/null +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -0,0 +1,201 @@ +CREATE TEMP FUNCTION normalize_browser(browser STRING) AS ( + CASE + WHEN `moz-fx-data-shared-prod.udf.ga_is_mozilla_browser`(browser) + THEN 'Firefox' + WHEN browser IN ('Internet Explorer') + THEN 'MSIE' + WHEN browser IN ('Edge') + THEN 'Edge' + WHEN browser IN ('Chrome') + THEN 'Chrome' + WHEN browser IN ('Safari') + THEN 'Safari' + WHEN browser IN ('(not set)') + THEN NULL + WHEN browser IS NULL + THEN NULL + ELSE 'Other' + END +); + +CREATE TEMP FUNCTION normalize_ga_os(os STRING, nrows INTEGER) AS ( + CASE + WHEN nrows > 1 + THEN NULL + WHEN os IS NULL + THEN NULL + WHEN os LIKE 'Macintosh%' + THEN 'Mac' -- these values are coming from GA. + ELSE mozfun.norm.os(os) + END +); + + +WITH +-- Extract all the download rows, de-duping and tracking number of duplicates per download token. 
+stub_downloads AS ( + SELECT + stub.jsonPayload.fields.visit_id AS stub_visit_id, + stub.jsonPayload.fields.session_id AS stub_download_session_id, + stub.jsonPayload.fields.dltoken AS dltoken, + (COUNT(*) - 1) AS count_dltoken_duplicates, + -- DATE(@download_date) AS download_date + DATE("2024-02-14") AS download_date + FROM + `moz-fx-stubattribut-prod-32a5.stubattribution_prod.stdout` AS stub + WHERE + -- DATE(stub.timestamp) = @download_date + DATE(stub.timestamp) = "2024-02-15" + -- AND DATE(stub.timestamp) <= "2024-02-20" + AND stub.jsonPayload.fields.log_type = 'download_started' + GROUP BY + stub_visit_id, + stub_download_session_id, + dltoken +), + +multiple_downloads_in_session AS ( + SELECT + stub_visit_id, + stub_download_session_id, + IF(COUNT(*) > 1, TRUE, FALSE) AS additional_download_occurred + FROM + stub_downloads + GROUP BY + stub_visit_id, + stub_download_session_id +), + +stub_downloads_with_download_tracking AS ( + SELECT + s1.stub_visit_id, + s1.stub_download_session_id, + dltoken, + count_dltoken_duplicates, + additional_download_occurred, + download_date + FROM + stub_downloads s1 + JOIN + multiple_downloads_in_session s2 + ON ( + s1.stub_visit_id = s2.stub_visit_id + AND IFNULL(s1.stub_download_session_id, "null") = IFNULL(s2.stub_download_session_id, "null") + ) + +), + +-- Extract all the stub_session_ids from GA +stub_session_ids AS ( + SELECT DISTINCT + user_pseudo_id AS full_visitor_id, + CAST(((SELECT `value` FROM UNNEST(event_params) WHERE key = 'id' LIMIT 1).int_value) AS STRING) AS stub_session_id + FROM `moz-fx-data-marketing-prod.analytics_313696158.events_*` + WHERE event_name = 'stub_session_set' + -- will need to update this + AND _TABLE_SUFFIX + BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB("2024-02-15", INTERVAL 2 DAY)) + AND FORMAT_DATE('%Y%m%d', DATE_ADD("2024-02-15", INTERVAL 1 DAY)) + ), + +-- join stub_download_session_ids with ga_stub_session_ids +stub_download_ids_ga_session_ids AS ( +SELECT sd.stub_visit_id, 
+sd.stub_download_session_id, +sd.dltoken, +sd.count_dltoken_duplicates, +sd.additional_download_occurred, +sd.download_date, +ssi.full_visitor_id +FROM stub_downloads_with_download_tracking sd +LEFT JOIN stub_session_ids ssi +ON sd.stub_download_session_id = ssi.stub_session_id +) +, + +ga_sessions_time_on_site AS ( + SELECT CONCAT(ga_client_id, "-", ga_session_id) AS visit_identifier, + time_on_site + FROM `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v2` + WHERE session_date = '2024-02-14' + AND had_download_event IS TRUE +), + +page_hits AS ( +SELECT + ph.full_visitor_id, + ph.visit_identifier, + ph.date AS submission_date, + ph.country, + ph.ad_content, + ph.campaign, + ph.medium, + ph.source, + ph.device_category, + ph.operating_system AS os, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path, + gav.time_on_site, + SUM(CASE WHEN ph.hit_type = 'PAGE' then 1 else 0 end) AS page_hits, + COUNT(distinct(CASE WHEN ph.hit_type = 'PAGE' THEN ph.page_path ELSE NULL END)) AS unique_page_hits, + MAX(CASE WHEN ph.is_entrance is true then ph.page_path ELSE NULL END) as landing_page +FROM `moz-fx-data-marketing-prod.ga_derived.www_site_hits_v2` ph +LEFT JOIN ga_sessions_time_on_site gav + ON gav.visit_identifier = ph.visit_identifier +WHERE date = "2024-02-14" + GROUP BY + ph.full_visitor_id, + ph.visit_identifier, + ph.date, + ph.country, + ph.ad_content, + ph.campaign, + ph.medium, + ph.source, + ph.device_category, + ph.operating_system, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path, + gav.time_on_site +) +, +-- join stub_sessions with ga_sessions using full_visitor_id +downloads_with_ga_sessions AS ( + SELECT + sd.stub_visit_id, + sd.stub_download_session_id, + sd.dltoken, + sd.count_dltoken_duplicates, + sd.additional_download_occurred, + sd.download_date, + sd.full_visitor_id, + ph.full_visitor_id as full_visitor_id_check, + ph.visit_identifier, + ph.submission_date, + ph.country, + ph.ad_content, + ph.campaign, + 
ph.medium, + ph.source, + ph.device_category, + ph.os, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path, + ph.time_on_site, + ph.page_hits, + ph.unique_page_hits, + ph.landing_page + + FROM stub_download_ids_ga_session_ids sd + LEFT JOIN page_hits ph ON + ph.full_visitor_id = sd.full_visitor_id + AND sd.download_date = ph.submission_date +) + +select * from downloads_with_ga_sessions where stub_visit_id = "1436287147.1707680916" \ No newline at end of file From 34e234452593de83967a8014178a1319695a7be9 Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Thu, 18 Apr 2024 14:30:45 -1000 Subject: [PATCH 2/9] take out filter from test table --- .../ga_derived/downloads_with_attribution_v3/query.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 92853f99977..1f6cd3673d7 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -198,4 +198,4 @@ downloads_with_ga_sessions AS ( AND sd.download_date = ph.submission_date ) -select * from downloads_with_ga_sessions where stub_visit_id = "1436287147.1707680916" \ No newline at end of file +select * from downloads_with_ga_sessions \ No newline at end of file From cba4bab7dd9d18e256a922668cc952e42ccfb824 Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Thu, 18 Apr 2024 14:42:46 -1000 Subject: [PATCH 3/9] add in has_ga_download_event column --- .../downloads_with_attribution_v3/query.sql | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 1f6cd3673d7..62d8f956cec 100644 --- 
a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -115,7 +115,8 @@ ON sd.stub_download_session_id = ssi.stub_session_id ga_sessions_time_on_site AS ( SELECT CONCAT(ga_client_id, "-", ga_session_id) AS visit_identifier, - time_on_site + time_on_site, + had_download_event FROM `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v2` WHERE session_date = '2024-02-14' AND had_download_event IS TRUE @@ -138,6 +139,7 @@ SELECT ph.language, ph.page_path, gav.time_on_site, + gav.had_download_event, SUM(CASE WHEN ph.hit_type = 'PAGE' then 1 else 0 end) AS page_hits, COUNT(distinct(CASE WHEN ph.hit_type = 'PAGE' THEN ph.page_path ELSE NULL END)) AS unique_page_hits, MAX(CASE WHEN ph.is_entrance is true then ph.page_path ELSE NULL END) as landing_page @@ -160,7 +162,8 @@ WHERE date = "2024-02-14" ph.browser_version, ph.language, ph.page_path, - gav.time_on_site + gav.time_on_site, + gav.had_download_event ) , -- join stub_sessions with ga_sessions using full_visitor_id @@ -190,12 +193,35 @@ downloads_with_ga_sessions AS ( ph.time_on_site, ph.page_hits, ph.unique_page_hits, - ph.landing_page - + ph.landing_page, + ph.had_download_event FROM stub_download_ids_ga_session_ids sd LEFT JOIN page_hits ph ON ph.full_visitor_id = sd.full_visitor_id AND sd.download_date = ph.submission_date ) -select * from downloads_with_ga_sessions \ No newline at end of file +SELECT dltoken, + time_on_site, + ad_content, + campaign, + medium, + source, + landing_page, + country, + -- normalized_country_code, + device_category, + os, + -- normalized_os, + browser, + -- normalized_browser, + browser_version, + -- browser_major_version, + language, + page_hits AS pageviews, + unique_page_hits AS unique_pageviews, + had_download_event AS has_ga_download_event, + count_dltoken_duplicates, + additional_download_occurred, + download_date +FROM downloads_with_ga_sessions From 
c12ffeaadee4d9b17c22a1ed3b6bf0ff472ff506 Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Thu, 18 Apr 2024 14:50:36 -1000 Subject: [PATCH 4/9] add in final table structure, i.e. only select specific columns --- .../ga_derived/downloads_with_attribution_v3/query.sql | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 62d8f956cec..7463728df40 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -225,3 +225,4 @@ SELECT dltoken, additional_download_occurred, download_date FROM downloads_with_ga_sessions +order by time_on_site desc From 1fe505baebc5f40cee999f8148ede2ebbae88e40 Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Thu, 18 Apr 2024 14:53:18 -1000 Subject: [PATCH 5/9] take out unnecessary order by statement --- .../ga_derived/downloads_with_attribution_v3/query.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 7463728df40..62d8f956cec 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -225,4 +225,3 @@ SELECT dltoken, additional_download_occurred, download_date FROM downloads_with_ga_sessions -order by time_on_site desc From 249799ef1a23a55b59d7aa0d79f72e21e888182a Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Mon, 29 Apr 2024 07:28:53 -1000 Subject: [PATCH 6/9] update with filters on stub_downloads, refactor join to be in next CTE after calculation CTE --- .../downloads_with_attribution_v3/query.sql | 133 ++++++++++-------- 1 file changed, 72 
insertions(+), 61 deletions(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 62d8f956cec..1c93115af64 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -1,38 +1,39 @@ -CREATE TEMP FUNCTION normalize_browser(browser STRING) AS ( - CASE - WHEN `moz-fx-data-shared-prod.udf.ga_is_mozilla_browser`(browser) - THEN 'Firefox' - WHEN browser IN ('Internet Explorer') - THEN 'MSIE' - WHEN browser IN ('Edge') - THEN 'Edge' - WHEN browser IN ('Chrome') - THEN 'Chrome' - WHEN browser IN ('Safari') - THEN 'Safari' - WHEN browser IN ('(not set)') - THEN NULL - WHEN browser IS NULL - THEN NULL - ELSE 'Other' - END -); +-- CREATE TEMP FUNCTION normalize_browser(browser STRING) AS ( +-- CASE +-- WHEN `moz-fx-data-shared-prod.udf.ga_is_mozilla_browser`(browser) +-- THEN 'Firefox' +-- WHEN browser IN ('Internet Explorer') +-- THEN 'MSIE' +-- WHEN browser IN ('Edge') +-- THEN 'Edge' +-- WHEN browser IN ('Chrome') +-- THEN 'Chrome' +-- WHEN browser IN ('Safari') +-- THEN 'Safari' +-- WHEN browser IN ('(not set)') +-- THEN NULL +-- WHEN browser IS NULL +-- THEN NULL +-- ELSE 'Other' +-- END +-- ); -CREATE TEMP FUNCTION normalize_ga_os(os STRING, nrows INTEGER) AS ( - CASE - WHEN nrows > 1 - THEN NULL - WHEN os IS NULL - THEN NULL - WHEN os LIKE 'Macintosh%' - THEN 'Mac' -- these values are coming from GA. - ELSE mozfun.norm.os(os) - END -); +-- CREATE TEMP FUNCTION normalize_ga_os(os STRING, nrows INTEGER) AS ( +-- CASE +-- WHEN nrows > 1 +-- THEN NULL +-- WHEN os IS NULL +-- THEN NULL +-- WHEN os LIKE 'Macintosh%' +-- THEN 'Mac' -- these values are coming from GA. +-- ELSE mozfun.norm.os(os) +-- END +-- ); WITH -- Extract all the download rows, de-duping and tracking number of duplicates per download token. 
+-- Also filter out null/empty strings for stub_visit_ids and stub_download_session_ids stub_downloads AS ( SELECT stub.jsonPayload.fields.visit_id AS stub_visit_id, @@ -48,6 +49,10 @@ stub_downloads AS ( DATE(stub.timestamp) = "2024-02-15" -- AND DATE(stub.timestamp) <= "2024-02-20" AND stub.jsonPayload.fields.log_type = 'download_started' + AND stub.jsonPayload.fields.visit_id NOT LIKE ("(not set)") + AND stub.jsonPayload.fields.session_id NOT LIKE ("(not set)") + AND NULLIF(stub.jsonPayload.fields.visit_id, "") IS NOT NULL + AND NULLIF(stub.jsonPayload.fields.session_id, "") IS NOT NULL GROUP BY stub_visit_id, stub_download_session_id, @@ -138,14 +143,10 @@ SELECT ph.browser_version, ph.language, ph.page_path, - gav.time_on_site, - gav.had_download_event, SUM(CASE WHEN ph.hit_type = 'PAGE' then 1 else 0 end) AS page_hits, COUNT(distinct(CASE WHEN ph.hit_type = 'PAGE' THEN ph.page_path ELSE NULL END)) AS unique_page_hits, MAX(CASE WHEN ph.is_entrance is true then ph.page_path ELSE NULL END) as landing_page FROM `moz-fx-data-marketing-prod.ga_derived.www_site_hits_v2` ph -LEFT JOIN ga_sessions_time_on_site gav - ON gav.visit_identifier = ph.visit_identifier WHERE date = "2024-02-14" GROUP BY ph.full_visitor_id, @@ -161,9 +162,7 @@ WHERE date = "2024-02-14" ph.browser, ph.browser_version, ph.language, - ph.page_path, - gav.time_on_site, - gav.had_download_event + ph.page_path ) , -- join stub_sessions with ga_sessions using full_visitor_id @@ -190,38 +189,50 @@ downloads_with_ga_sessions AS ( ph.browser_version, ph.language, ph.page_path, - ph.time_on_site, + gav.time_on_site, ph.page_hits, ph.unique_page_hits, ph.landing_page, - ph.had_download_event + gav.had_download_event, FROM stub_download_ids_ga_session_ids sd LEFT JOIN page_hits ph ON ph.full_visitor_id = sd.full_visitor_id AND sd.download_date = ph.submission_date + LEFT JOIN ga_sessions_time_on_site gav + ON gav.visit_identifier = ph.visit_identifier ) -SELECT dltoken, - time_on_site, - ad_content, - 
campaign, - medium, - source, - landing_page, - country, - -- normalized_country_code, - device_category, - os, - -- normalized_os, - browser, +SELECT + dgs.dltoken, + dgs.time_on_site, + dgs.ad_content, + dgs.campaign, + dgs.medium, + dgs.source, + dgs.landing_page, + dgs.country, + cn.code AS normalized_country_code, + dgs.device_category, + dgs.os, + CASE + WHEN dgs.os IS NULL + THEN NULL + WHEN dgs.os LIKE 'Macintosh%' + THEN 'Mac' -- these values are coming from GA. + ELSE mozfun.norm.os(os) + END AS normalized_os, + dgs.browser, -- normalized_browser, - browser_version, + dgs.browser_version, -- browser_major_version, - language, - page_hits AS pageviews, - unique_page_hits AS unique_pageviews, - had_download_event AS has_ga_download_event, - count_dltoken_duplicates, - additional_download_occurred, - download_date -FROM downloads_with_ga_sessions + dgs.language, + dgs.page_hits AS pageviews, + dgs.unique_page_hits AS unique_pageviews, + dgs.had_download_event AS has_ga_download_event, + dgs.count_dltoken_duplicates, + dgs.additional_download_occurred, + dgs.download_date +FROM downloads_with_ga_sessions dgs +LEFT JOIN + `moz-fx-data-shared-prod.static.country_names_v1` AS cn + ON cn.name = country From 69f7327aefbc02aac45a2e45404eff4003ff948b Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Mon, 29 Apr 2024 07:36:52 -1000 Subject: [PATCH 7/9] add in normalized_os, normalized_browser, normalized_browser_major_version, normalized_country_code --- .../downloads_with_attribution_v3/query.sql | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 1c93115af64..6920eeb6daa 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ 
-1,35 +1,22 @@ --- CREATE TEMP FUNCTION normalize_browser(browser STRING) AS ( --- CASE --- WHEN `moz-fx-data-shared-prod.udf.ga_is_mozilla_browser`(browser) --- THEN 'Firefox' --- WHEN browser IN ('Internet Explorer') --- THEN 'MSIE' --- WHEN browser IN ('Edge') --- THEN 'Edge' --- WHEN browser IN ('Chrome') --- THEN 'Chrome' --- WHEN browser IN ('Safari') --- THEN 'Safari' --- WHEN browser IN ('(not set)') --- THEN NULL --- WHEN browser IS NULL --- THEN NULL --- ELSE 'Other' --- END --- ); - --- CREATE TEMP FUNCTION normalize_ga_os(os STRING, nrows INTEGER) AS ( --- CASE --- WHEN nrows > 1 --- THEN NULL --- WHEN os IS NULL --- THEN NULL --- WHEN os LIKE 'Macintosh%' --- THEN 'Mac' -- these values are coming from GA. --- ELSE mozfun.norm.os(os) --- END --- ); - +CREATE TEMP FUNCTION normalize_browser(browser STRING) AS ( + CASE + WHEN `moz-fx-data-shared-prod.udf.ga_is_mozilla_browser`(browser) + THEN 'Firefox' + WHEN browser IN ('Internet Explorer') + THEN 'MSIE' + WHEN browser IN ('Edge') + THEN 'Edge' + WHEN browser IN ('Chrome') + THEN 'Chrome' + WHEN browser IN ('Safari') + THEN 'Safari' + WHEN browser IN ('(not set)') + THEN NULL + WHEN browser IS NULL + THEN NULL + ELSE 'Other' + END +); WITH -- Extract all the download rows, de-duping and tracking number of duplicates per download token. @@ -222,9 +209,9 @@ SELECT ELSE mozfun.norm.os(os) END AS normalized_os, dgs.browser, - -- normalized_browser, + normalize(dgs.browser) AS normalized_browser, dgs.browser_version, - -- browser_major_version, + CAST(mozfun.norm.extract_version(browser_version, 'major') AS INTEGER) AS browser_major_version, dgs.language, dgs.page_hits AS pageviews, dgs.unique_page_hits AS unique_pageviews, From 01bb88a59adbdb2277d420ea7d8229ab05e5e269 Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Fri, 31 May 2024 07:16:03 -1000 Subject: [PATCH 8/9] refactor query.sql to match downloads_with_attribution_v2, but use GA4 sources. Note:date is set to '2024-02-14' for testing purposes. 
This will get changed when code is put into production --- .../ga_derived/downloads_with_attribution_v3/query.sql | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 6920eeb6daa..9ec6cee083c 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -36,10 +36,6 @@ stub_downloads AS ( DATE(stub.timestamp) = "2024-02-15" -- AND DATE(stub.timestamp) <= "2024-02-20" AND stub.jsonPayload.fields.log_type = 'download_started' - AND stub.jsonPayload.fields.visit_id NOT LIKE ("(not set)") - AND stub.jsonPayload.fields.session_id NOT LIKE ("(not set)") - AND NULLIF(stub.jsonPayload.fields.visit_id, "") IS NOT NULL - AND NULLIF(stub.jsonPayload.fields.session_id, "") IS NOT NULL GROUP BY stub_visit_id, stub_download_session_id, From 872b29703009d6103c13729cc9c52820e1bad04e Mon Sep 17 00:00:00 2001 From: Marlene-M-Hirose Date: Fri, 31 May 2024 07:41:09 -1000 Subject: [PATCH 9/9] refactor query.sql to match downloads_with_attribution_v2, but use GA4 sources. Note:date is set to '2024-05-24' for testing purposes. 
This will get changed when code is put into production --- .../downloads_with_attribution_v3/query.sql | 206 +++++++++--------- 1 file changed, 107 insertions(+), 99 deletions(-) diff --git a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql index 9ec6cee083c..b50370ddb6a 100644 --- a/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql +++ b/sql/moz-fx-data-marketing-prod/ga_derived/downloads_with_attribution_v3/query.sql @@ -41,7 +41,6 @@ stub_downloads AS ( stub_download_session_id, dltoken ), - multiple_downloads_in_session AS ( SELECT stub_visit_id, @@ -53,7 +52,6 @@ multiple_downloads_in_session AS ( stub_visit_id, stub_download_session_id ), - stub_downloads_with_download_tracking AS ( SELECT s1.stub_visit_id, @@ -70,121 +68,130 @@ stub_downloads_with_download_tracking AS ( s1.stub_visit_id = s2.stub_visit_id AND IFNULL(s1.stub_download_session_id, "null") = IFNULL(s2.stub_download_session_id, "null") ) - ), - -- Extract all the stub_session_ids from GA stub_session_ids AS ( SELECT DISTINCT user_pseudo_id AS full_visitor_id, - CAST(((SELECT `value` FROM UNNEST(event_params) WHERE key = 'id' LIMIT 1).int_value) AS STRING) AS stub_session_id - FROM `moz-fx-data-marketing-prod.analytics_313696158.events_*` - WHERE event_name = 'stub_session_set' + CAST( + ((SELECT `value` FROM UNNEST(event_params) WHERE key = 'id' LIMIT 1).int_value) AS STRING + ) AS stub_session_id + FROM + `moz-fx-data-marketing-prod.analytics_313696158.events_*` + WHERE + event_name = 'stub_session_set' -- will need to update this AND _TABLE_SUFFIX - BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB("2024-02-15", INTERVAL 2 DAY)) + BETWEEN FORMAT_DATE('%Y%m%d', DATE_SUB("2024-02-15", INTERVAL 2 DAY)) AND FORMAT_DATE('%Y%m%d', DATE_ADD("2024-02-15", INTERVAL 1 DAY)) - ), - +), -- join stub_download_session_ids with ga_stub_session_ids 
stub_download_ids_ga_session_ids AS ( -SELECT sd.stub_visit_id, -sd.stub_download_session_id, -sd.dltoken, -sd.count_dltoken_duplicates, -sd.additional_download_occurred, -sd.download_date, -ssi.full_visitor_id -FROM stub_downloads_with_download_tracking sd -LEFT JOIN stub_session_ids ssi -ON sd.stub_download_session_id = ssi.stub_session_id -) -, - + SELECT + sd.stub_visit_id, + sd.stub_download_session_id, + sd.dltoken, + sd.count_dltoken_duplicates, + sd.additional_download_occurred, + sd.download_date, + ssi.full_visitor_id + FROM + stub_downloads_with_download_tracking sd + LEFT JOIN + stub_session_ids ssi + ON sd.stub_download_session_id = ssi.stub_session_id +), ga_sessions_time_on_site AS ( - SELECT CONCAT(ga_client_id, "-", ga_session_id) AS visit_identifier, + SELECT + CONCAT(ga_client_id, "-", ga_session_id) AS visit_identifier, time_on_site, had_download_event - FROM `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v2` - WHERE session_date = '2024-02-14' - AND had_download_event IS TRUE + FROM + `moz-fx-data-shared-prod.mozilla_org_derived.ga_sessions_v2` + WHERE + session_date = '2024-02-14' + AND had_download_event IS TRUE ), - page_hits AS ( -SELECT - ph.full_visitor_id, - ph.visit_identifier, - ph.date AS submission_date, - ph.country, - ph.ad_content, - ph.campaign, - ph.medium, - ph.source, - ph.device_category, - ph.operating_system AS os, - ph.browser, - ph.browser_version, - ph.language, - ph.page_path, - SUM(CASE WHEN ph.hit_type = 'PAGE' then 1 else 0 end) AS page_hits, - COUNT(distinct(CASE WHEN ph.hit_type = 'PAGE' THEN ph.page_path ELSE NULL END)) AS unique_page_hits, - MAX(CASE WHEN ph.is_entrance is true then ph.page_path ELSE NULL END) as landing_page -FROM `moz-fx-data-marketing-prod.ga_derived.www_site_hits_v2` ph -WHERE date = "2024-02-14" + SELECT + ph.full_visitor_id, + ph.visit_identifier, + ph.date AS submission_date, + ph.country, + ph.ad_content, + ph.campaign, + ph.medium, + ph.source, + ph.device_category, + 
ph.operating_system AS os, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path, + SUM(CASE WHEN ph.hit_type = 'PAGE' THEN 1 ELSE 0 END) AS page_hits, + COUNT( + DISTINCT(CASE WHEN ph.hit_type = 'PAGE' THEN ph.page_path ELSE NULL END) + ) AS unique_page_hits, + MAX(CASE WHEN ph.is_entrance IS TRUE THEN ph.page_path ELSE NULL END) AS landing_page + FROM + `moz-fx-data-marketing-prod.ga_derived.www_site_hits_v2` ph + WHERE + date = "2024-02-14" GROUP BY - ph.full_visitor_id, - ph.visit_identifier, - ph.date, - ph.country, - ph.ad_content, - ph.campaign, - ph.medium, - ph.source, - ph.device_category, - ph.operating_system, - ph.browser, - ph.browser_version, - ph.language, - ph.page_path -) -, + ph.full_visitor_id, + ph.visit_identifier, + ph.date, + ph.country, + ph.ad_content, + ph.campaign, + ph.medium, + ph.source, + ph.device_category, + ph.operating_system, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path +), -- join stub_sessions with ga_sessions using full_visitor_id downloads_with_ga_sessions AS ( SELECT - sd.stub_visit_id, - sd.stub_download_session_id, - sd.dltoken, - sd.count_dltoken_duplicates, - sd.additional_download_occurred, - sd.download_date, - sd.full_visitor_id, - ph.full_visitor_id as full_visitor_id_check, - ph.visit_identifier, - ph.submission_date, - ph.country, - ph.ad_content, - ph.campaign, - ph.medium, - ph.source, - ph.device_category, - ph.os, - ph.browser, - ph.browser_version, - ph.language, - ph.page_path, - gav.time_on_site, - ph.page_hits, - ph.unique_page_hits, - ph.landing_page, - gav.had_download_event, - FROM stub_download_ids_ga_session_ids sd - LEFT JOIN page_hits ph ON - ph.full_visitor_id = sd.full_visitor_id - AND sd.download_date = ph.submission_date - LEFT JOIN ga_sessions_time_on_site gav - ON gav.visit_identifier = ph.visit_identifier + sd.stub_visit_id, + sd.stub_download_session_id, + sd.dltoken, + sd.count_dltoken_duplicates, + sd.additional_download_occurred, + sd.download_date, + 
sd.full_visitor_id, + ph.full_visitor_id AS full_visitor_id_check, + ph.visit_identifier, + ph.submission_date, + ph.country, + ph.ad_content, + ph.campaign, + ph.medium, + ph.source, + ph.device_category, + ph.os, + ph.browser, + ph.browser_version, + ph.language, + ph.page_path, + gav.time_on_site, + ph.page_hits, + ph.unique_page_hits, + ph.landing_page, + gav.had_download_event, + FROM + stub_download_ids_ga_session_ids sd + LEFT JOIN + page_hits ph + ON ph.full_visitor_id = sd.full_visitor_id + AND sd.download_date = ph.submission_date + LEFT JOIN + ga_sessions_time_on_site gav + ON gav.visit_identifier = ph.visit_identifier ) - SELECT dgs.dltoken, dgs.time_on_site, @@ -205,7 +212,7 @@ SELECT ELSE mozfun.norm.os(os) END AS normalized_os, dgs.browser, - normalize(dgs.browser) AS normalized_browser, + normalize_browser(dgs.browser) AS normalized_browser, dgs.browser_version, CAST(mozfun.norm.extract_version(browser_version, 'major') AS INTEGER) AS browser_major_version, dgs.language, @@ -215,7 +222,8 @@ SELECT dgs.count_dltoken_duplicates, dgs.additional_download_occurred, dgs.download_date -FROM downloads_with_ga_sessions dgs +FROM + downloads_with_ga_sessions dgs LEFT JOIN `moz-fx-data-shared-prod.static.country_names_v1` AS cn ON cn.name = country