# Example - Suggest Online
An example of using moz_preprocess and difference_finder to segment clients by Suggest Online vs Suggest Offline, and look for differences between those 2 segments.

In [7]:
# Libraries you may need to install before using the difference-finder library.
# They will be added to the package requirements later.

#%pip install google-cloud-bigquery
#%pip install db_dtypes

Collecting db_dtypes
  Using cached db_dtypes-1.0.5-py2.py3-none-any.whl (14 kB)
Collecting pyarrow>=3.0.0
  Using cached pyarrow-11.0.0-cp310-cp310-macosx_10_14_x86_64.whl (24.4 MB)
Installing collected packages: pyarrow, db_dtypes
Successfully installed db_dtypes-1.0.5 pyarrow-11.0.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from moz_preprocess.bq_utils import fetch_weekly_aggregate
from moz_preprocess.preprocess import preprocess

# Fetch data from BigQuery

In [2]:
# Log in to Google Cloud from the notebook; --update-adc also refreshes the
# Application Default Credentials (ADC) on this machine.
# NOTE: the command's output shows the OAuth URL and the signed-in account;
# consider clearing this cell's output before sharing the notebook.
!gcloud auth login --update-adc

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=jHZRkQK1FZpVasjraEtdC6qirAMrGs&access_type=offline&code_challenge=c5aocpEEAsHB6owgjdubYwe0qdYELjqcAhDuqoIqcQU&code_challenge_method=S256


Application default credentials (ADC) were updated.

You are now logged in as [ysmith@mozilla.com].
Your current project is [mozdata].  You can change this setting by running:
  $ gcloud config set project PROJECT_ID


To take a quick anonymous survey, run:
  $ gcloud survey



In [3]:
# Sample size handed to fetch_weekly_aggregate.
# The dataset is 15 million rows without sampling.
sample = 30_000

# Heads-up: this call takes a few minutes to run.
df_from_bq = fetch_weekly_aggregate(
    # Choose a different start date to avoid lunar new year.
    week_start_date="2023-01-15",
    # SQL expression that labels each row as 'online' or 'offline'.
    segment="""
    CASE WHEN user_pref_browser_urlbar_quicksuggest_data_collection_enabled = 'true'
         THEN 'online' 
         ELSE 'offline' 
         END
    """,
    # SQL condition restricting which rows are fetched.
    target="country = 'US' and normalized_channel = 'release' and locale like 'en%'",
    sample=sample,
    # Set True to see the SQL that is run.
    verbose=False,
)

In [3]:
# Row count per segment label in the fetched frame.
df_from_bq["segment"].value_counts()

offline    10087
online     10050
Name: segment, dtype: int64

# Preprocess data for difference-finder

In [None]:
# Columns removed from the frame before testing.
# TODO: preprocess these fields instead of dropping them.
cols_to_drop = [
    "attribution",
    "browser_version_info",
    "active_addons",
    "a11y_theme",
    "experiments",
    "scalar_parent_browser_ui_interaction_content_context_sum",
    "scalar_parent_browser_ui_interaction_preferences_pane_home_sum",
    "scalar_parent_devtools_accessibility_select_accessible_for_node_sum",
]

# Columns treated as discrete: each one is passed to preprocess() individually
# and removed from the frame in its original form.
# The order of this list is preserved exactly as originally written.
discrete_cols = [
    "addon_compatibility_check_enabled",
    "app_display_version",
    "blocklist_enabled",
    # cpu_* fields
    "cpu_cores",
    "cpu_count",
    "cpu_family",
    "cpu_l2_cache_kb",
    "cpu_l3_cache_kb",
    "cpu_model",
    "cpu_speed_mhz",
    "cpu_stepping",
    "cpu_vendor",
    "default_search_engine_data_name",
    "distribution_id",
    "e10s_enabled",
    "env_build_arch",
    "flash_version",
    # geography and ISP fields
    "country",
    "city",
    "geo_subdivision1",
    "geo_subdivision2",
    "isp_name",
    "isp_organization",
    # gfx_features_* fields
    "gfx_features_advanced_layers_status",
    "gfx_features_d2d_status",
    "gfx_features_d3d11_status",
    "gfx_features_gpu_process_status",
    "install_year",
    "is_default_browser",
    "is_wow64",
    "locale",
    "memory_mb",
    "normalized_channel",
    "normalized_os_version",
    "os",
    "os_version",
    "sandbox_effective_content_process_level",
    "sync_configured",
    "telemetry_enabled",
    "timezone_offset",
    "update_auto_download",
    "update_channel",
    "update_enabled",
    "vendor",
    "windows_build_number",
    "windows_ubr",
    "fxa_configured",
    # scalar_parent_os_environment_* fields
    "scalar_parent_os_environment_is_taskbar_pinned",
    "scalar_parent_os_environment_launched_via_desktop",
    "scalar_parent_os_environment_launched_via_taskbar",
    "scalar_parent_os_environment_launched_via_other",
    "scalar_parent_os_environment_launched_via_start_menu",
    "scalar_parent_os_environment_launched_via_other_shortcut",
    "default_private_search_engine",
    "user_pref_browser_search_region",
    "update_background",
    "user_pref_browser_urlbar_suggest_searches",
    "user_pref_browser_newtabpage_enabled",
    "user_pref_app_shield_optoutstudies_enabled",
    "scalar_parent_os_environment_launched_via_taskbar_private",
    "dom_parentprocess_private_window_used",
    "os_environment_is_taskbar_pinned_any",
    "os_environment_is_taskbar_pinned_private_any",
    "os_environment_is_taskbar_pinned_private",
    "search_cohort",
    # NOTE(review): this pref is also what the fetch query's `segment` CASE
    # expression is built on; confirm it is meant to be tested as a feature.
    "user_pref_browser_urlbar_quicksuggest_data_collection_enabled",
]

In [None]:
# Build the frame handed to difference-finder: the untouched columns of
# df_from_bq plus whatever preprocess() returns for each discrete column.

# Third argument to preprocess(): 1% of the requested sample size, computed
# once instead of on every loop iteration.
# NOTE(review): presumably a minimum-count threshold; confirm against preprocess().
preprocess_threshold = int(0.01 * sample)

dummies = [
    preprocess(df_from_bq[col], col, preprocess_threshold)
    for col in discrete_cols
]

# Keep only non-empty pandas results. The previous `if x` filter evaluated the
# truth value of a DataFrame/Series, which raises ValueError in pandas.
usable_dummies = [
    result
    for result in dummies
    if isinstance(result, (pd.DataFrame, pd.Series)) and not result.empty
]

# Concatenate column-wise (axis=1). The default axis=0 would stack the dummy
# frames underneath the original rows instead of adding them as new columns.
df = pd.concat(
    [df_from_bq.drop(columns=cols_to_drop + discrete_cols), *usable_dummies],
    axis=1,
)

# Column-wise concat aligns on the index, so a changed row count means the
# preprocess() outputs did not share df_from_bq's index.
assert len(df) == len(df_from_bq), "row count changed while adding dummy columns"

df.shape

# Use difference-finder
* Test each dummy (binary) column using a binomial test.
* Test each continuous column using a KS test.

To Do: KS test is not great. Implement a test of means and a test of medians.