"""Firefox Desktop Search Count Datasets
This job produces derived datasets that make it easy to explore search count
data.
The search_aggregates job is used to populate an executive search dashboard.
For more information, see Bug 1381140.
The search_clients_daily job produces a dataset keyed by
`(client_id, submission_date, search_counts.engine, search_counts.source)`.
This allows for deeper analysis into user-level search behavior.
"""
import click
import logging
import datetime
from pyspark_hyperloglog import hll
from pyspark.sql.functions import (
expr,
explode,
col,
when,
udf,
sum,
first,
count,
datediff,
from_unixtime,
mean,
size,
max,
lit,
)
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
from mozetl.constants import SEARCH_SOURCE_WHITELIST
from mozetl.utils import stop_session_safely
DEFAULT_INPUT_BUCKET = "telemetry-parquet"
DEFAULT_INPUT_PREFIX = "main_summary/v4"
DEFAULT_SAVE_MODE = "error"
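# Single search_counts entries at or above this value are treated as outliers
# and dropped in explode_search_counts().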
MAX_CLIENT_SEARCH_COUNT = 10000
SEARCH_AGGREGATES_VERSION = 4
SEARCH_CLIENTS_DAILY_VERSION = 4
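# Aggregation helper: keep the first observed value of a column, aliased back
# to the original column name.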
def agg_first(col):
return first(col).alias(col)
def search_clients_daily(main_summary):
return agg_search_data(
main_summary,
["client_id", "submission_date", "engine", "source"],
list(
map(
agg_first,
[
"country",
"app_version",
"distribution_id",
"locale",
"search_cohort",
"addon_version",
"os",
"channel",
"profile_creation_date",
"default_search_engine",
"default_search_engine_data_load_path",
"default_search_engine_data_submission_url",
"sample_id",
],
)
)
+ [
# Count of 'first' subsessions seen for this client_day
(
count(when(col("subsession_counter") == 1, 1)).alias(
"sessions_started_on_this_day"
)
),
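            # profile_creation_date is reported in days since the Unix epoch,
            # so convert it to seconds before passing it to from_unixtime.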
first(
datediff(
"subsession_start_date",
from_unixtime(col("profile_creation_date") * 24 * 60 * 60),
)
).alias("profile_age_in_days"),
sum(col("subsession_length") / 3600.0).alias("subsession_hours_sum"),
mean(size("active_addons")).alias("active_addons_count_mean"),
(
max("scalar_parent_browser_engagement_max_concurrent_tab_count").alias(
"max_concurrent_tab_count_max"
)
),
(
sum("scalar_parent_browser_engagement_tab_open_event_count").alias(
"tab_open_event_count_sum"
)
),
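            # Each active tick represents 5 seconds of activity, hence
            # * 5 / 3600 to express the total as hours.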
(sum(col("active_ticks") * 5 / 3600.0).alias("active_hours_sum")),
],
)
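# Illustrative sketch of building the clients_daily dataset by hand (the input
# path and date are examples; assumes an active SparkSession named `spark`):
#
#   main_summary = spark.read.parquet(
#       "s3://telemetry-parquet/main_summary/v4/submission_date_s3=20190101"
#   )
#   clients_daily = search_clients_daily(main_summary)
#   clients_daily.select("client_id", "engine", "source", "sap", "tagged_sap").show(5)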
def search_aggregates(main_summary):
hll.register()
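    # hll_create builds a per-row HyperLogLog sketch of client_id at precision 12;
    # hll_merge and hll_cardinality below then estimate the distinct client count
    # per aggregation group.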
return agg_search_data(
main_summary.withColumn("hll", expr("hll_create(client_id, 12)")),
[
"addon_version",
"app_version",
"country",
"distribution_id",
"engine",
"locale",
"search_cohort",
"source",
"submission_date",
"default_search_engine",
],
[expr("hll_cardinality(hll_merge(hll)) as client_count")],
).where(col("engine").isNotNull())
def agg_search_data(main_summary, grouping_cols, agg_functions):
"""Augment, Explode, and Aggregate search data
The augmented and exploded dataset has the same columns as main_summary
with the addition of the following:
engine: A key in the search_counts field representing a search engine.
e.g. 'hoolie'
source: A key in the search_counts field representing a search source
e.g. 'urlbar'
tagged-sap: Sum of all searches with partner codes from an SAP
tagged-follow-on: Sum of all searches with partner codes from a downstream query
sap: Sum of all searches originating from a direct user interaction with the Firefox UI
        addon_version: The version of the followonsearch@mozilla.com addon
"""
exploded = explode_search_counts(main_summary)
augmented = add_derived_columns(exploded)
# Do all aggregations
aggregated = augmented.groupBy(grouping_cols + ["type"]).agg(
*(agg_functions + [sum("count").alias("count")])
)
# Pivot on search type
pivoted = (
aggregated.groupBy(
[column for column in aggregated.columns if column not in ["type", "count"]]
)
.pivot("type", ["organic", "tagged-sap", "tagged-follow-on", "sap", "unknown"])
.sum("count")
# Add convenience columns with underscores instead of hyphens.
# This makes the table easier to query from Presto.
.withColumn("tagged_sap", col("tagged-sap"))
.withColumn("tagged_follow_on", col("tagged-follow-on"))
)
return pivoted
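# In the main_summary active_addons structs, a[0] is expected to be the addon id
# and a[5] the addon version (hence the positional indexing below).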
def get_search_addon_version(active_addons):
if not active_addons:
return None
return next(
(a[5] for a in active_addons if a[0] == "followonsearch@mozilla.com"), None
)
def explode_search_counts(main_summary):
exploded_col_name = "single_search_count"
search_fields = [
exploded_col_name + "." + field for field in ["engine", "source", "count"]
]
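    # Produce one row per (engine, source, count) entry in the search_counts
    # array, dropping rows whose count is implausibly large
    # (>= MAX_CLIENT_SEARCH_COUNT).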
exploded_search_counts = (
main_summary.withColumn(exploded_col_name, explode(col("search_counts")))
.select(["*"] + search_fields)
.filter("single_search_count.count < %s" % MAX_CLIENT_SEARCH_COUNT)
.drop(exploded_col_name)
.drop("search_counts")
)
zero_search_users = (
main_summary.where(col("search_counts").isNull())
.withColumn("engine", lit(None))
.withColumn("source", lit(None))
# Using 0 instead of None for search_count makes many queries easier
        # (e.g. average searches per user)
.withColumn("count", lit(0))
.drop("search_counts")
)
return exploded_search_counts.union(zero_search_users)
def add_derived_columns(exploded_search_counts):
"""Adds the following columns to the provided dataset:
    type: One of 'organic', 'tagged-sap', 'tagged-follow-on', 'sap', or 'unknown'.
    addon_version: The version of the followonsearch@mozilla.com addon, or None
"""
udf_get_search_addon_version = udf(get_search_addon_version, StringType())
def _generate_when_expr(source_mappings):
if not source_mappings:
return "unknown"
source_mapping = source_mappings[0]
return when(
col("source").startswith(source_mapping[0]), source_mapping[1]
).otherwise(_generate_when_expr(source_mappings[1:]))
when_expr = when(col("source").isin(SEARCH_SOURCE_WHITELIST), "sap").otherwise(
when(col("source").isNull(), "sap").otherwise(
_generate_when_expr(
[
("in-content:sap:", "tagged-sap"),
("in-content:sap-follow-on:", "tagged-follow-on"),
("in-content:organic:", "organic"),
("sap:", "tagged-sap"),
("follow-on:", "tagged-follow-on"),
]
)
)
)
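    # Examples of the resulting source -> type mapping (partner-code suffixes are
    # illustrative, and 'urlbar' is assumed to be in SEARCH_SOURCE_WHITELIST):
    #   'urlbar' or a NULL source                                -> 'sap'
    #   'in-content:sap:<code>' or 'sap:<code>'                  -> 'tagged-sap'
    #   'in-content:sap-follow-on:<code>' or 'follow-on:<code>'  -> 'tagged-follow-on'
    #   'in-content:organic:<code>'                              -> 'organic'
    #   anything else                                            -> 'unknown'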
return exploded_search_counts.withColumn("type", when_expr).withColumn(
"addon_version", udf_get_search_addon_version("active_addons")
)
def generate_rollups(
submission_date,
output_bucket,
output_prefix,
output_version,
transform_func,
input_bucket=DEFAULT_INPUT_BUCKET,
input_prefix=DEFAULT_INPUT_PREFIX,
save_mode=DEFAULT_SAVE_MODE,
orderBy=[],
):
"""Load main_summary, apply transform_func, and write to S3"""
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Running the {0} ETL job...".format(transform_func.__name__))
start = datetime.datetime.now()
spark = SparkSession.builder.appName("search_dashboard_etl").getOrCreate()
source_path = "s3://{}/{}/submission_date_s3={}".format(
input_bucket, input_prefix, submission_date
)
output_path = "s3://{}/{}/v{}/submission_date_s3={}".format(
output_bucket, output_prefix, output_version, submission_date
)
logger.info("Loading main_summary...")
main_summary = spark.read.parquet(source_path)
logger.info("Applying transformation function...")
search_dashboard_data = transform_func(main_summary)
if orderBy:
search_dashboard_data = search_dashboard_data.orderBy(*orderBy)
logger.info("Saving rollups to: {}".format(output_path))
(search_dashboard_data.write.mode(save_mode).save(output_path))
stop_session_safely(spark)
logger.info("... done (took: %s)", str(datetime.datetime.now() - start))
# Generate ETL jobs - these are useful if you want to run a job from ATMO
def search_aggregates_etl(submission_date, bucket, prefix, **kwargs):
generate_rollups(
submission_date,
bucket,
prefix,
SEARCH_AGGREGATES_VERSION,
search_aggregates,
**kwargs
)
def search_clients_daily_etl(submission_date, bucket, prefix, **kwargs):
generate_rollups(
submission_date,
bucket,
prefix,
SEARCH_CLIENTS_DAILY_VERSION,
search_clients_daily,
orderBy=["sample_id"],
**kwargs
)
# Generate click commands - wrap ETL jobs to accept click arguments
def gen_click_command(etl_job):
"""Wrap an ETL job with click arguements"""
@click.command()
@click.option("--submission_date", required=True)
@click.option("--bucket", required=True)
@click.option("--prefix", required=True)
@click.option(
"--input_bucket",
default=DEFAULT_INPUT_BUCKET,
help="Bucket of the input dataset",
)
@click.option(
"--input_prefix",
default=DEFAULT_INPUT_PREFIX,
help="Prefix of the input dataset",
)
@click.option(
"--save_mode", default=DEFAULT_SAVE_MODE, help="Save mode for writing data"
)
    def wrapper(*args, **kwargs):
        return etl_job(*args, **kwargs)
return wrapper
search_aggregates_click = gen_click_command(search_aggregates_etl)
search_clients_daily_click = gen_click_command(search_clients_daily_etl)
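# Example invocation of a generated command from Python (illustrative; the flag
# values are made up, and standalone_mode=False simply keeps click from calling
# sys.exit()):
#
#   search_aggregates_click(
#       [
#           "--submission_date", "20190101",
#           "--bucket", "output-bucket",
#           "--prefix", "search_aggregates",
#       ],
#       standalone_mode=False,
#   )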