# Overview

A separate notebook generates a DataFrame that can be used to compare Segment event counts against Kinesis event counts.

This notebook requires the output of that process and provides a simpler place to investigate it (one that is hopefully less likely to crash).

# Prerequisites

* This notebook uses ipywidgets, which might not render if the matching JupyterLab extension is not installed


In [1]:
!pip install pyarrow

[33mYou are using pip version 10.0.1, however version 20.0.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


# Imports

In [2]:
# Safe imports
from datetime import datetime, timedelta, date

In [3]:
import pandas as pd
import numpy as np
import math

In [4]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [5]:
# Imports on files that might have dependencies that need installing
import data_pier_querying
from athena_querying import AthenaQuery
from athena_common_queries import *
import user_agents # this converts user agent from browser to mobile / desktop etc.

  """)


ModuleNotFoundError: No module named 'user_agents'

# Settings

In [6]:
# Path of the merged results file produced by the separate generation step.
# Despite the .gzip extension it is read below with pd.read_parquet.
load_merged_results_file_path = "merged_df_with_meta_20200209_0000_to_20200216_0000.gzip"

In [7]:
# Load the whole merged results file into memory as a DataFrame
# (read as Parquet; presumably the reason pyarrow is installed above).
merged_df_with_meta = pd.read_parquet(load_merged_results_file_path)

In [8]:
# Peek at the first two rows to sanity-check the columns.
merged_df_with_meta.head(2)

Unnamed: 0_level_0,anonymous_id,event_name,page_url,date,s_count,k_count,page_type,slug,slug_root,ab_test,country_code,user_agent,device_family,os_family,os_version,browser_family,browser_version,is_bot
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,000034a2-e973-4108-b920-0681877d4fc0,PageView,https://blog.moneysmart.sg/budgeting/mattress-...,2020-02-15,1,1,blog,/budgeting/mattress-singapore-guide,/budgeting,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,False
1,000034a2-e973-4108-b920-0681877d4fc0,PageView,https://blog.moneysmart.sg/property/3-things-l...,2020-02-10,1,1,blog,/property/3-things-look-buying-condo-2017,/property,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,False


In [9]:
# Number of rows loaded.
len(merged_df_with_meta)

2021883

In [10]:
# Column dtypes as loaded (note that date and is_bot come back as object).
merged_df_with_meta.dtypes

anonymous_id       object
event_name         object
page_url           object
date               object
s_count             int64
k_count             int64
page_type          object
slug               object
slug_root          object
ab_test            object
country_code       object
user_agent         object
device_family      object
os_family          object
os_version         object
browser_family     object
browser_version    object
is_bot             object
dtype: object

In [11]:

def highlight_cols(cell):
    """
    Return the CSS style string for one cell of the "k_vs_s_%" column.

    Parameters
    ----------
    cell : number or ""
        Percentage difference to colour. An empty string gets no styling.

    Returns
    -------
    str
        * ""                      for an empty-string cell
        * "color:green;"          when the absolute value is at most 2
        * a red-tinted background for negative values
        * a blue-tinted background for positive values
        * "background-color:red;" for NaN

        The tint deepens with the magnitude and saturates once the absolute
        value reaches 10.
    """
    #use hex colours, or named ones to ensure excel compatibility on export
    if cell == "":
        return ""
    if abs(cell) <= 2:
        return "color:green;"
    if cell < 0 or cell > 0:
        # The intensity is computed only here, after NaN has been ruled out
        # (int(nan) raises ValueError). Clamping before int() lets +/-inf
        # saturate instead of raising OverflowError.
        ci = int(min(100, abs(cell * 10)))
        faded = 255 - ci
        if cell < 0:
            return "background-color:#%02x%02x%02x;" % (255, faded, faded)
        return "background-color:#%02x%02x%02x;" % (faded, faded, 255)

    # Only NaN gets this far: every comparison above is False for NaN.
    return "background-color:red;"

def colour_grouped_table(df):
    """
    Build a pandas Styler for ``df`` in which every cell of the "k_vs_s_%"
    column is styled by ``highlight_cols``.

    Returns the Styler; ``df`` itself is not modified.
    """
    styler = df.style
    return styler.applymap(highlight_cols, subset=["k_vs_s_%"])

In [12]:
def group_by_and_show_count_difference(df, group_by_cols, with_styling=True):
    """
    Sum s_count and k_count per group and add the percentage difference.

    This expects counts in s_count and k_count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the grouping columns plus s_count and k_count.
    group_by_cols : list of str (a single column name is also accepted)
        Columns to group by.
    with_styling : bool
        Accepted for backward compatibility; currently has no effect.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping columns, the summed s_count and
        k_count, and "k_vs_s_%" = (k_count - s_count) / s_count * 100 rounded
        to one decimal. When s_count is 0 the percentage is undefined, so 999
        is used if k_count is non-zero and 0 otherwise. Groups where neither
        count is positive are dropped.
    """
    keys = [group_by_cols] if isinstance(group_by_cols, str) else list(group_by_cols)

    # Aggregate only the count columns: summing every column would also try to
    # "sum" the object columns (strings, dates) on pandas versions that do not
    # drop them automatically. A count column used as a grouping key is left out
    # because reset_index() brings it back as a key.
    count_cols = [c for c in df.columns if c in ("s_count", "k_count") and c not in keys]
    grouped = df.groupby(keys)[count_cols].sum().reset_index()

    s_total = grouped["s_count"]
    k_total = grouped["k_count"]

    # Vectorised instead of a row-wise apply: much faster on large results and,
    # unlike apply(axis=1), well defined when there are no groups at all.
    pct_diff = ((k_total - s_total) / s_total * 100).round(1)
    grouped["k_vs_s_%"] = np.where(s_total == 0, np.where(k_total != 0, 999, 0), pct_diff)

    grouped = grouped[(grouped.k_count > 0) | (grouped.s_count > 0)]  # filters out NaNs after grouping

    return grouped

# Search Feature

In [14]:
# Distinct country codes in order of first appearance; used as the options of the Country selector.
country_codes = merged_df_with_meta["country_code"].unique().tolist()
country_codes

['sg', 'hk', '??', 'tw', 'ph', 'id']

In [15]:
# Option lists for the search widgets, derived from the loaded data.

# Root slugs of every non-blog page, alphabetically.
top_level_slugs = sorted(
    merged_df_with_meta.loc[merged_df_with_meta["page_type"] != "blog", "slug_root"].unique()
)
# Event names, alphabetically.
event_types = sorted(merged_df_with_meta["event_name"].unique())

# Page types, in order of first appearance.
page_types = merged_df_with_meta["page_type"].unique().tolist()

# Total row count; sets the upper bound of the event-count slider.
num_events = len(merged_df_with_meta)

In [16]:

# Empty placeholder; on_search_button_click below assigns its result to a variable of this name.
search_results_df = pd.DataFrame()

def on_search_button_click(b):
    """
    Filter ``merged_df_with_meta`` with the current widget values and display the result.

    Filters that are left empty are skipped. The active ones are AND-ed together:

    * anonymous id  - exact match on ``anonymous_id``
    * event types   - ``event_name`` is one of the selected values
    * slug search   - ``slug`` contains the text (pandas ``str.contains``, so the
                      text is treated as a regular expression)
    * country codes - ``country_code`` is one of the selected values
    * top-level slug- ``slug_root`` is one of the selected values
    * page types    - ``page_type`` is one of the selected values

    When "Group By" columns are selected, the matches are aggregated with
    ``group_by_and_show_count_difference``, groups in which neither count reaches
    the "At least num events" slider value are dropped, and the table is shown
    coloured via ``colour_grouped_table``.

    The resulting DataFrame is stored in the module-level ``search_results_df``.

    Parameters
    ----------
    b : the widget that triggered the callback (unused).
    """
    # Without this declaration the assignment below would only bind a local name
    # and the module-level search_results_df would never be updated.
    global search_results_df

    # Styler output renders every row as HTML, so very large grouped results are
    # shown as a plain (truncated) DataFrame instead.
    max_styled_rows = 1000

    #pandas likes lists, not tuples (at least for group by)
    print("searching")
    print("erm...")
    df = merged_df_with_meta

    anonymous_id = anonymous_user_input.value.strip()
    slug_search_string = slug_search_input.value.strip()
    event_types = list(event_type_select.value)
    country_codes = list(country_code_dropdown.value)
    top_level_slugs = list(top_level_slug_select.value)
    page_types = list(page_type_select.value)
    group_by_cols = list(group_by_select.value)
    at_least_events = int(event_count_slider.value)

    search_by_page_type = len(page_types) > 0
    print("%s %s"%(search_by_page_type, page_types))

    print("Events search")
    # Build the row mask from the active filters only, so unused filters cost nothing.
    mask = pd.Series(True, index=df.index)
    if anonymous_id:
        mask &= df.anonymous_id == anonymous_id
    if event_types:
        mask &= df.event_name.isin(event_types)
    if slug_search_string:
        # na=False: rows with a missing slug simply do not match (otherwise the
        # mask would contain NaN and could not be used for indexing).
        mask &= df.slug.str.contains(slug_search_string, na=False)
    if country_codes:
        mask &= df.country_code.isin(country_codes)
    if top_level_slugs:
        mask &= df.slug_root.isin(top_level_slugs)
    if page_types:
        mask &= df.page_type.isin(page_types)
    d = df[mask]

    if group_by_cols:
        d = group_by_and_show_count_difference(d, group_by_cols)
        # ">=" so that the slider really means "at least this many events".
        d = d[(d.s_count >= at_least_events) | (d.k_count >= at_least_events)]

    search_results_df = d
    if group_by_cols and len(d) <= max_styled_rows:
        display(colour_grouped_table(d))
    else:
        display(d)

    print("done searching")
    
def on_reset_button_click(b):
    """Stub click handler for a reset button: prints a message and ignores ``b``."""
    message = "I would be resetting"
    print(message)

In [17]:
def button_click_placeholder(b):
    """No-op click handler used as a placeholder: prints a message and ignores ``b``."""
    message = "just chilling"
    print(message)

# --- Search widgets -----------------------------------------------------------
# Each widget below is read by on_search_button_click when Search is pressed.

anonymous_user_input = widgets.Text(description = "Anonymous_id")

country_code_dropdown = widgets.SelectMultiple(
    options= country_codes,
    # Pre-select sg/hk, skipping any code that is absent from the loaded data:
    # SelectMultiple rejects a value that is not among its options.
    value=[code for code in ("sg", "hk") if code in country_codes],
    description='Country',
    disabled=False
)

# Button takes no on_click constructor argument; handlers are attached with the
# on_click() method.
search_button = widgets.Button(description='Search')
search_button.on_click(on_search_button_click)

top_level_slug_select = widgets.SelectMultiple(options = top_level_slugs, description="slug")

slug_search_input = widgets.Text(description = "Slug like")

event_type_select = widgets.SelectMultiple(options=event_types, description="Event")

page_type_select = widgets.SelectMultiple(options=page_types, description="Page Type")

# Log-scale slider: min/max are exponents of 10, so the selectable range is
# 1 .. num_events. max(..., 1) keeps log10 defined for an empty frame.
event_count_slider = widgets.FloatLogSlider(
    value=1,
    min=0,
    max=math.log10(max(num_events, 1)),
    description="At least num events",
)

# TODO: search modes ("Summary", "Summary Deduped", "Events") -
# summary -> grouped with difference, events -> grouped by anon_id etc.

search_options = widgets.Box([
    anonymous_user_input,
    country_code_dropdown,
    top_level_slug_select,
    slug_search_input,
    event_type_select,
    page_type_select,
    event_count_slider,
])

search_options.layout=widgets.Layout(width='100%',display='inline-flex',flex_flow='row wrap') #auto wrap

group_by_select = widgets.SelectMultiple(description="Group By", options = merged_df_with_meta.columns.to_list())

search_bar = widgets.VBox([search_options, group_by_select, widgets.Box([search_button])])
display(search_bar)



VBox(children=(Box(children=(Text(value='', description='Anonymous_id'), SelectMultiple(description='Country',…