# Overview

In a separate file we generate a DataFrame that can be used for comparing Segment vs Kinesis event counts.

This notebook requires the output of that process and provides a simpler place to investigate it (and is hopefully less likely to crash).

In [62]:
# Safe imports
from datetime import datetime, timedelta, date

In [63]:
import pandas as pd
import numpy as np

In [64]:
import ipywidgets as widgets

In [65]:
# Imports on files that might have dependencies that need installing
import data_pier_querying
from athena_querying import AthenaQuery
from athena_common_queries import *
import user_agents # this converts user agent from browser to mobile / desktop etc.

# Settings

In [66]:
# Path of the merged results file produced by the separate generation process.
# It is read with pd.read_parquet below, despite the ".gzip" suffix.
load_merged_results_file_path = "merged_df_with_meta_01-10_01-12.gzip"

In [67]:
# Load the pre-computed merged frame; every later cell reads from this name.
merged_df_with_meta = pd.read_parquet(load_merged_results_file_path)

In [68]:
# Peek at the first rows to confirm the expected columns were loaded.
merged_df_with_meta.head()

Unnamed: 0_level_0,anonymous_id,event_name,page_url,date,s_count,k_count,page_type,slug,slug_root,ab_test,country_code,user_agent,device_family,os_family,os_version,browser_family,browser_version,is_bot
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,00000b54-600a-4de2-8700-fd9885252dca,PageView,https://blog.moneysmart.sg/career/5-easy-side-...,2020-01-12,2,2,blog,/career/5-easy-side-businesses-you-can-run-whi...,/career,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.0.4,False
1,00000b54-600a-4de2-8700-fd9885252dca,PageView,https://www.moneysmart.sg/embed/98e61305602380...,2020-01-12,1,1,shop,/embed/98e61305602380971d9c5e68c4a75647,/embed,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.0.4,False
2,00000b54-600a-4de2-8700-fd9885252dca,Reading,https://blog.moneysmart.sg/career/5-easy-side-...,2020-01-12,3,3,blog,/career/5-easy-side-businesses-you-can-run-whi...,/career,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.0.4,False
3,00000b54-600a-4de2-8700-fd9885252dca,UserView.WidgetLoad,https://www.moneysmart.sg/embed/98e61305602380...,2020-01-12,1,1,shop,/embed/98e61305602380971d9c5e68c4a75647,/embed,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.0.4,False
4,000034a2-e973-4108-b920-0681877d4fc0,PageView,https://blog.moneysmart.sg/budgeting/cheapest-...,2020-01-10,1,1,blog,/budgeting/cheapest-sim-only-plans,/budgeting,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.3,False


In [69]:
# Row count of the loaded frame.
len(merged_df_with_meta)

1019496

In [70]:
# Column dtypes; several columns are stored as `category`, which matters when grouping on them.
merged_df_with_meta.dtypes

anonymous_id         object
event_name           object
page_url             object
date               category
s_count               int64
k_count               int64
page_type          category
slug               category
slug_root            object
ab_test            category
country_code       category
user_agent           object
device_family      category
os_family          category
os_version         category
browser_family     category
browser_version    category
is_bot                 bool
dtype: object

In [71]:
def colour_grouped_table(df):

        

    def color_how_good(value):
        if isinstance(value, str):
            return
        av = abs(value)
        if av<2:
            c = "green"
        elif value <0:
            c =  "red"

        else:
            c= "blue"

        if av>20:
            return "background-color:rgb(250,200,200)"
        return "color:%s"%c # it's just CSS, so you can do background as well.
    return df.style.applymap(color_how_good , subset=["k_vs_s_%"])

In [72]:
def group_by_and_show_count_difference(df, group_by_cols, with_styling=True):
    """
    Sum ``s_count`` and ``k_count`` per group and add their relative difference.

    This expects counts in s_count and k_count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``s_count`` and ``k_count`` columns plus the grouping columns.
    group_by_cols : list of str (a single column name is also accepted)
        Columns to group on.
    with_styling : bool, default True
        Currently unused; kept so existing callers keep working. A plain
        DataFrame is always returned.

    Returns
    -------
    pandas.DataFrame
        One row per group that has at least one event in either source, with
        the grouping columns, the summed counts and ``k_vs_s_%``:
        ``(k_count - s_count) / s_count * 100`` rounded to one decimal.
        When ``s_count`` is 0 the value is 999 if ``k_count`` is non-zero,
        otherwise 0.
    """
    # Sentinel used when there is no s_count baseline to divide by.
    no_baseline_sentinel = 999

    if isinstance(group_by_cols, str):
        group_by_cols = [group_by_cols]

    # Only aggregate the two count columns: summing every column either
    # concatenates strings or raises on category columns, depending on the
    # pandas version. A count column that is itself a grouping key comes back
    # through reset_index instead.
    count_cols = [col for col in ("s_count", "k_count") if col not in group_by_cols]

    # observed=True avoids materialising every combination of categorical
    # levels (most of which never occur) when grouping on category columns.
    grouped = (
        df.groupby(group_by_cols, observed=True)[count_cols]
        .sum()
        .reset_index()
    )

    s_total = grouped["s_count"]
    k_total = grouped["k_count"]
    has_baseline = s_total != 0

    # Vectorised instead of a row-wise apply: faster, and it also works when
    # the input (and therefore `grouped`) is empty.
    pct_diff = ((k_total - s_total) / s_total.where(has_baseline) * 100).round(1)
    fallback = (k_total != 0).astype(float) * no_baseline_sentinel
    grouped["k_vs_s_%"] = pct_diff.where(has_baseline, fallback)

    # Drop groups that have no events in either source.
    grouped = grouped[(grouped.k_count > 0) | (grouped.s_count > 0)]

    return grouped

# Search Feature

In [73]:
# Distinct country codes present in the data; used as the options of the Country selector.
country_codes = list(merged_df_with_meta["country_code"].unique())
country_codes

['sg', 'hk', '??', 'tw', 'id', 'ph']

In [74]:
# Sorted distinct slug roots of every non-blog page.
top_level_slugs = sorted(
    merged_df_with_meta.loc[merged_df_with_meta["page_type"] != "blog", "slug_root"].unique()
)
# Sorted distinct event names across the whole frame.
event_types = sorted(merged_df_with_meta["event_name"].unique())

In [75]:

# Most recent search result. Rebound by on_search_button_click so that later
# cells can inspect or export it.
search_results_df = pd.DataFrame()


def on_search_button_click(b):
    """Filter ``merged_df_with_meta`` by the current widget values and show the result.

    Each filter is applied only when its widget has a value:

    * ``anonymous_user_input``  - exact match on ``anonymous_id``
    * ``event_type_select``     - ``event_name`` is one of the selected values
    * ``slug_search_input``     - ``slug`` contains the text (pandas treats it
      as a regular expression by default)
    * ``country_code_dropdown`` - ``country_code`` is one of the selected values
    * ``top_level_slug_select`` - ``slug_root`` is one of the selected values

    If any columns are selected in ``group_by_select`` the filtered rows are
    aggregated with ``group_by_and_show_count_difference``.

    The result is displayed and stored in the module-level
    ``search_results_df``.

    Parameters
    ----------
    b : the clicked button (unused; required by the ipywidgets callback signature).
    """
    # Without this declaration the assignment at the end would only create a
    # local variable and the module-level frame would stay empty.
    global search_results_df

    print("searching")
    df = merged_df_with_meta

    anonymous_id = anonymous_user_input.value.strip()
    slug_search_string = slug_search_input.value.strip()

    # pandas likes lists, not tuples (at least for group by).
    # Locals are named `selected_*` so they do not shadow the module-level
    # option lists that feed the widgets.
    selected_event_types = list(event_type_select.value)
    selected_country_codes = list(country_code_dropdown.value)
    selected_slug_roots = list(top_level_slug_select.value)
    group_by_cols = list(group_by_select.value)

    print("Events search")
    # Start from "keep everything" and narrow down only by the active filters,
    # so inactive filters cost nothing.
    mask = pd.Series(True, index=df.index)
    if anonymous_id:
        mask &= df.anonymous_id == anonymous_id
    if selected_event_types:
        mask &= df.event_name.isin(selected_event_types)
    if slug_search_string:
        # na=False keeps missing slugs out of the result instead of putting NaN in the mask.
        mask &= df.slug.str.contains(slug_search_string, na=False)
    if selected_country_codes:
        mask &= df.country_code.isin(selected_country_codes)
    if selected_slug_roots:
        mask &= df.slug_root.isin(selected_slug_roots)

    d = df[mask]

    if group_by_cols:
        d = group_by_and_show_count_difference(d, group_by_cols)

    search_results_df = d
    display(search_results_df)

    print("done searching")
    
def on_reset_button_click(b):
    """Placeholder Reset handler: announces the click and leaves every widget untouched.

    Parameters
    ----------
    b : the clicked button (unused).
    """
    placeholder_message = "I would be resetting"
    print(placeholder_message)

In [76]:
def button_click_placeholder(b):
    """Stand-in click handler that only prints a message; not attached to any button in this cell."""
    print("just chilling")


# --- Filter inputs ---------------------------------------------------------
anonymous_user_input = widgets.Text(description="Anonymous_id")

# Pre-select sg and hk, but only if they exist in the loaded data:
# SelectMultiple rejects a value that is not among its options.
default_country_codes = [code for code in ("sg", "hk") if code in country_codes]
country_code_dropdown = widgets.SelectMultiple(
    options=country_codes,
    value=default_country_codes,
    # rows=10,
    description='Country',
    disabled=False
)

top_level_slug_select = widgets.SelectMultiple(options=top_level_slugs, description="slug")

slug_search_input = widgets.Text(description="Slug like")

event_type_select = widgets.SelectMultiple(options=event_types, description="Event")

# --- Buttons ---------------------------------------------------------------
# Button takes no `on_click` constructor argument; handlers have to be
# registered through the on_click() method, otherwise clicks do nothing.
search_button = widgets.Button(description='Search')
reset_button = widgets.Button(description='Reset')

search_button.on_click(on_search_button_click)
reset_button.on_click(on_reset_button_click)

#search_modes = ["Summary", "Summary Deduped", "Events",] # summary - > grouped with difference, events -> grouped by anon_id etc, 
#search_mode_dropdown = widgets.Dropdown(description="Search Mode", options=search_modes, value=search_modes[0])

# --- Layout ----------------------------------------------------------------
search_options = widgets.Box(
    [anonymous_user_input, country_code_dropdown, top_level_slug_select, slug_search_input, event_type_select],
    layout=widgets.Layout(width='100%', display='inline-flex', flex_flow='row wrap'),  # auto wrap
)

# Any column of the loaded frame can be used as a grouping key.
group_by_select = widgets.SelectMultiple(description="Group By", options=merged_df_with_meta.columns.to_list())

search_bar = widgets.VBox([search_options, group_by_select, widgets.Box([search_button, reset_button])])
display(search_bar)



VBox(children=(Box(children=(Text(value='', description='Anonymous_id'), SelectMultiple(description='Country',…

In [77]:
# Shows the latest search result. NOTE: this stays an empty frame unless
# on_search_button_click rebinds the module-level name (it needs
# `global search_results_df`); a plain assignment inside the callback only
# creates a local variable.
search_results_df.head()

In [58]:
# Export the current search result to CSV.
# NOTE(review): this cell's execution count (58) is lower than the cells above, so it
# was last run in an earlier pass. Re-run it after a search before trusting the file.
search_results_df.to_csv("breakdown_by_browser.csv")