# Example - Suggest Online
An example of using `moz_preprocess` and `difference_finder` to segment clients by Suggest Online vs. Suggest Offline and to look for differences between the two segments.

In [None]:
# Libraries you may need to install before using the difference-finder library.
# They will be added to the package requirements later.

#%pip install google-cloud-bigquery
#%pip install db_dtypes

In [1]:
import pandas as pd
from moz_preprocess.bq_utils import fetch_weekly_aggregate
from moz_preprocess.preprocess import preprocess

# Fetch data from BigQuery

In [None]:
# Interactive Google Cloud login; --update-adc also writes the resulting credentials
# to Application Default Credentials.
# NOTE(review): presumably these are the credentials the BigQuery fetch below relies on — confirm.
!gcloud auth login --update-adc

In [None]:
# Heads-up: this cell takes a few minutes to run.
sample = 30000

# SQL expression labelling each client 'online' or 'offline' from the quicksuggest
# data-collection pref. Written with explicit escapes so the exact whitespace
# (including the trailing spaces) cannot be altered by an editor.
segment_sql = (
    "\n"
    "    CASE WHEN user_pref_browser_urlbar_quicksuggest_data_collection_enabled = 'true'\n"
    "         THEN 'online' \n"
    "         ELSE 'offline' \n"
    "         END\n"
    "    "
)

# SQL predicate restricting which clients are fetched.
target_sql = "country = 'US' and normalized_channel = 'release' and locale like 'en%'"

df_from_bq = fetch_weekly_aggregate(
    # Choose a different start date to avoid lunar new year.
    week_start_date="2023-01-15",
    segment=segment_sql,
    target=target_sql,
    # The dataset is 15 million rows without sampling.
    sample=sample,
    # Set True to see the SQL that is run.
    verbose=False,
)

In [7]:
# Number of fetched rows per segment label.
df_from_bq["segment"].value_counts()

offline    29864
online     23183
Name: segment, dtype: int64

In [None]:
# Preview the first five rows of the fetched frame.
df_from_bq.head(5)

# Preprocess data for difference-finder

In [None]:
# Columns removed outright before the analysis.
# TODO: don't drop these fields; preprocess them instead.
cols_to_drop = [
    "attribution",
    "browser_version_info",
    "active_addons",
    "a11y_theme",
    "experiments",
    "scalar_parent_browser_ui_interaction_content_context_sum",
    "scalar_parent_browser_ui_interaction_preferences_pane_home_sum",
    "scalar_parent_devtools_accessibility_select_accessible_for_node_sum",
]

# Columns treated as discrete: each one is passed through preprocess() and then
# removed from the frame in its raw form. Order is kept as originally written.
discrete_cols = [
    "addon_compatibility_check_enabled",
    "app_display_version",
    "blocklist_enabled",
    "cpu_cores",
    "cpu_count",
    "cpu_family",
    "cpu_l2_cache_kb",
    "cpu_l3_cache_kb",
    "cpu_model",
    "cpu_speed_mhz",
    "cpu_stepping",
    "cpu_vendor",
    "default_search_engine_data_name",
    "distribution_id",
    "e10s_enabled",
    "env_build_arch",
    "flash_version",
    "country",
    "city",
    "geo_subdivision1",
    "geo_subdivision2",
    "isp_name",
    "isp_organization",
    "gfx_features_advanced_layers_status",
    "gfx_features_d2d_status",
    "gfx_features_d3d11_status",
    "gfx_features_gpu_process_status",
    "install_year",
    "is_default_browser",
    "is_wow64",
    "locale",
    "memory_mb",
    "normalized_channel",
    "normalized_os_version",
    "os",
    "os_version",
    "sandbox_effective_content_process_level",
    "sync_configured",
    "telemetry_enabled",
    "timezone_offset",
    "update_auto_download",
    "update_channel",
    "update_enabled",
    "vendor",
    "windows_build_number",
    "windows_ubr",
    "fxa_configured",
    "scalar_parent_os_environment_is_taskbar_pinned",
    "scalar_parent_os_environment_launched_via_desktop",
    "scalar_parent_os_environment_launched_via_taskbar",
    "scalar_parent_os_environment_launched_via_other",
    "scalar_parent_os_environment_launched_via_start_menu",
    "scalar_parent_os_environment_launched_via_other_shortcut",
    "default_private_search_engine",
    "user_pref_browser_search_region",
    "update_background",
    "user_pref_browser_urlbar_suggest_searches",
    "user_pref_browser_newtabpage_enabled",
    "user_pref_app_shield_optoutstudies_enabled",
    "scalar_parent_os_environment_launched_via_taskbar_private",
    "dom_parentprocess_private_window_used",
    "os_environment_is_taskbar_pinned_any",
    "os_environment_is_taskbar_pinned_private_any",
    "os_environment_is_taskbar_pinned_private",
    "search_cohort",
    # NOTE(review): this pref is also what the `segment` expression in the fetch cell
    # is built from, so it will differ between segments by construction — confirm
    # that keeping it here is intended.
    "user_pref_browser_urlbar_quicksuggest_data_collection_enabled",
]

In [None]:
# Run every discrete column through preprocess() to obtain its dummy columns.
# The third argument is 1% of the requested sample size.
# NOTE(review): presumably a minimum-frequency threshold for keeping a category —
# confirm against moz_preprocess.preprocess.
threshold = int(0.01 * sample)
dummies = [preprocess(df_from_bq[col], col, threshold) for col in discrete_cols]

# Keep only usable results. A bare `if x` on a pandas DataFrame raises
# "The truth value of a DataFrame is ambiguous", so test for None/empty explicitly.
# NOTE(review): assumes preprocess returns a DataFrame/Series or None — confirm.
dummy_frames = [x for x in dummies if x is not None and not x.empty]

# axis=1 joins the dummy columns onto the remaining columns, aligned on the row index.
# The default (axis=0) would instead append each dummy frame as extra rows, yielding
# a mostly-NaN frame with repeated index values.
df = pd.concat(
    [df_from_bq.drop(columns=cols_to_drop + discrete_cols), *dummy_frames],
    axis=1,
)

# Column-wise concatenation must not create rows; extra rows mean the dummy frames
# carry index labels that are absent from df_from_bq.
assert len(df) == len(df_from_bq), "dummy frames are not aligned with df_from_bq's index"
df.shape

# Use difference-finder
* Test each dummy (binary) column using a binomial test.
* Test each continuous column using a KS test.

**To Do:** The KS test is not ideal here. Implement a test of means and a test of medians.