# Overview

Standard checks / results presentation.

It loads a parquet data frame that has been created in a different script.  You can also load that data into the viewer script, which has a search interface.

This is more about having a standard set of tables / charts that are top level checks to look through.

Originally it was at the end of the main script.

# Imports

In [1]:
# Safe imports
from datetime import datetime, timedelta, date

In [2]:
import pandas as pd
import numpy as np

In [3]:
import ipywidgets as widgets

In [5]:
# Imports on files that might have dependencies that need installing
import data_pier_querying
from athena_querying import AthenaQuery
from athena_common_queries import *
#import user_agents # this converts user agent from browser to mobile / desktop etc.

# Settings & Loading Data

In [6]:
# load_merged_resutls_file_path = "merged_df_with_meta_01-10_01-12.gzip"
load_merged_results_file_path = "merged_df_with_meta_20200209_0000_to_20200216_0000.gzip"
just_sg_hk = True # a little bit dangerous as you'll also miss out ones without country set.

In [7]:
merged_df_with_meta = pd.read_parquet(load_merged_results_file_path)

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
pyarrow or fastparquet is required for parquet support

In [8]:
if just_sg_hk:
    merged_df_with_meta = merged_df_with_meta[merged_df_with_meta.country_code.isin(["sg", "hk"])]

NameError: name 'merged_df_with_meta' is not defined

In [9]:
merged_df_with_meta.head()

NameError: name 'merged_df_with_meta' is not defined

In [9]:
len(merged_df_with_meta)

1794946

In [10]:
merged_df_with_meta.dtypes

anonymous_id         object
event_name           object
page_url             object
date               category
s_count               int64
k_count               int64
page_type          category
slug               category
slug_root            object
ab_test            category
country_code       category
user_agent           object
device_family      category
os_family          category
os_version         category
browser_family     category
browser_version    category
is_bot              float16
dtype: object

# Common Code

In [11]:
def colour_grouped_table(df):

    def color_how_good(value):
        if isinstance(value, str):
            return
        av = abs(value)
        if av<2:
            c = "green"
        elif value <0:
            c =  "red"

        else:
            c= "blue"

        if av>20:
            return "background-color:rgb(250,200,200)"
        return "color:%s"%c # it's just CSS, so you can do background as well.
    return df.style.applymap(color_how_good , subset=["k_vs_s_%"])

In [12]:
def group_by_and_show_count_difference(df, group_by_cols, with_styling=True):
    """
    This expects counts in s_count and k_count
    """
    
    grouped = df.groupby(group_by_cols, observed=True).sum().reset_index() #observed prevents stupid inefficiency with category columns
    
    grouped["k_vs_s_%"] = grouped.apply(lambda row:(999 if row.k_count else 0) if row.s_count==0 else round(((row.k_count - row.s_count)/row.s_count)*100, 1), axis=1 )
    grouped = grouped[(grouped.k_count>0) | (grouped.s_count>0)] # filters out NaNs after grouping
    
    

    return grouped

# Top Level Checks

In [13]:
def group_by_and_show_count_difference(df, group_by_cols, with_styling=True):
    """
    This expects counts in s_count and k_count
    """
    
    grouped = df.groupby(group_by_cols).sum().reset_index()
    
    grouped["k_vs_s_%"] = grouped.apply(lambda row:(999 if row.k_count else 0) if row.s_count==0 else round(((row.k_count - row.s_count)/row.s_count)*100, 1), axis=1 ) #TODO: likely a lot faster using pandas functions not lambda or numpy compile
    grouped = grouped[(grouped.k_count>0) | (grouped.s_count>0)] # filters out NaNs after grouping
    
    

    return grouped

In [14]:

def highlight_cols(cell):
    #use hex colours, or named ones to ensure excel compatibility on export
    if cell=="":
        return ""
    ci = min(100, int(abs(cell*10)))
    if abs(cell)<=2:
        return "color:green;"
    if cell <0:

        return "background-color:#%02x%02x%02x;" % (255,255-ci,255-ci)
    if cell>0:
        #return "background-color:cyan;"
        return "background-color:#%02x%02x%02x;" % (255-ci, 255-ci, 255)

    return "background-color:red;"


def color_how_good(value): # old one, might be junk
    if isinstance(value, str):
        return
    av = abs(value)
    if av<2:
        c = "green"
    elif value <0:
        c =  "red"

    else:
        c= "blue"

    if av>20:
        return "background-color:rgb(250,200,200)"
    return "color:%s"%c # it's just CSS, so you can do background as well.

In [15]:
def colour_grouped_table(df):

    return df.style.applymap(highlight_cols, subset=["k_vs_s_%"])


## By Country


In [16]:
g = group_by_and_show_count_difference(merged_df_with_meta, ["country_code", "date"])

In [17]:
colour_grouped_table(g)

Unnamed: 0,country_code,date,s_count,k_count,is_bot,k_vs_s_%
7,hk,2020-02-09,44568,44669,133,0.2
8,hk,2020-02-10,48563,48796,70,0.5
9,hk,2020-02-11,47089,47508,90,0.9
10,hk,2020-02-12,48352,47896,87,-0.9
11,hk,2020-02-13,54533,54656,145,0.2
12,hk,2020-02-14,47404,48153,96,1.6
13,hk,2020-02-15,52472,53597,127,2.1
28,sg,2020-02-09,294505,302552,348,2.7
29,sg,2020-02-10,309912,317167,415,2.3
30,sg,2020-02-11,312694,325880,363,4.2


## By Event Type

In [18]:
g = group_by_and_show_count_difference(merged_df_with_meta, ["country_code", "event_name"])
g.sort_values(["country_code", "event_name"], inplace=True)
colour_grouped_table(g)

Unnamed: 0,country_code,event_name,s_count,k_count,is_bot,k_vs_s_%
38,hk,Display user feedback form,5,0,0,-100.0
40,hk,LeadGeneration.ClickConversion,790,837,0,5.9
41,hk,LeadGeneration.ClickedApplyButton,238,258,0,8.4
42,hk,LeadGeneration.ClickedCTA,2708,2866,0,5.8
44,hk,LeadGeneration.Conversion,4011,4076,0,1.6
46,hk,LeadGeneration.FormSubmitted,243,934,0,284.4
48,hk,LeadGeneration.RedirectCompleted,4243,4237,0,-0.1
49,hk,LeadGeneration.ShowedMoreDetails,6200,6227,0,0.4
51,hk,LeadGeneration.ViewedMoreDetails,2861,2980,0,4.2
54,hk,PageView,155565,158348,708,1.8


In [19]:
g = group_by_and_show_count_difference(merged_df_with_meta[merged_df_with_meta.event_name=="LeadGeneration.ClickedApplyButton"], ["country_code", "slug_root", "event_name"])

In [20]:
g

Unnamed: 0,country_code,slug_root,event_name,s_count,k_count,is_bot,k_vs_s_%
3,hk,/credit-cards,LeadGeneration.ClickedApplyButton,238.0,258.0,0.0,8.4
12,sg,/credit-cards,LeadGeneration.ClickedApplyButton,118.0,123.0,0.0,4.2
13,sg,/debt-consolidation-plan,LeadGeneration.ClickedApplyButton,109.0,117.0,0.0,7.3
14,sg,/personal-loan,LeadGeneration.ClickedApplyButton,30.0,31.0,0.0,3.3


## By Top Level Slug

In [21]:
g = group_by_and_show_count_difference(merged_df_with_meta[merged_df_with_meta.page_type!="blog"], ["country_code", "slug_root", "event_name"])
#g.sort_values(["country_code", "slug_root", "event_name"])

#filtering where s_count or k_count is >1000
pv = pd.pivot_table(g[(g.s_count>1000) | (g.k_count>1000)], index=["country_code", "slug_root"], values=["k_count","s_count","k_vs_s_%"], columns=["event_name"], fill_value="")

#colour_grouped_table(pv)
#TODO: not showing up the s_count and k_count :(
pv = pv.swaplevel(0, 1, axis=1).sort_index(axis=1)

In [22]:
pv.to_excel("kinesis_vs_segment.xlsx")

In [23]:




#pv[:20].style.background_gradient(cmap=cm, subset=pd.IndexSlice[:, 's-count'])
styled = pv.style.applymap(highlight_cols, subset=pv.columns.get_loc_level('k_vs_s_%', level=1)[0]) #special multi-index on column for colour
#can't get they styling to be happy with colors
#styled.to_excel("kinesis_vs_segment.xlsx", engine='openpyxl') #use special engine for formatting https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html

styled

Unnamed: 0_level_0,event_name,EmailCapture,EmailCapture,EmailCapture,LeadGeneration.ClickConversion,LeadGeneration.ClickConversion,LeadGeneration.ClickConversion,LeadGeneration.ClickedCTA,LeadGeneration.ClickedCTA,LeadGeneration.ClickedCTA,LeadGeneration.Conversion,LeadGeneration.Conversion,LeadGeneration.Conversion,LeadGeneration.FormStepCompleted,LeadGeneration.FormStepCompleted,LeadGeneration.FormStepCompleted,LeadGeneration.FormSubmitted,LeadGeneration.FormSubmitted,LeadGeneration.FormSubmitted,LeadGeneration.RedirectCompleted,LeadGeneration.RedirectCompleted,LeadGeneration.RedirectCompleted,LeadGeneration.ShowedMoreDetails,LeadGeneration.ShowedMoreDetails,LeadGeneration.ShowedMoreDetails,LeadGeneration.ViewedMoreDetails,LeadGeneration.ViewedMoreDetails,LeadGeneration.ViewedMoreDetails,PageView,PageView,PageView,UserEngagement.ButtonClick,UserEngagement.ButtonClick,UserEngagement.ButtonClick,UserEngagement.ClickedFilter,UserEngagement.ClickedFilter,UserEngagement.ClickedFilter,UserEngagement.FilterSelection,UserEngagement.FilterSelection,UserEngagement.FilterSelection,UserEngagement.QuestionAnswered,UserEngagement.QuestionAnswered,UserEngagement.QuestionAnswered,UserEngagement.ShowedMoreDetails,UserEngagement.ShowedMoreDetails,UserEngagement.ShowedMoreDetails,UserEngagement.SortedList,UserEngagement.SortedList,UserEngagement.SortedList,UserView.WidgetLoad,UserView.WidgetLoad,UserView.WidgetLoad
Unnamed: 0_level_1,Unnamed: 1_level_1,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count,k_count,k_vs_s_%,s_count
country_code,slug_root,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2
hk,/,,,,,,,,,,,,,,,,,,,,,,,,,,,,2169,-6.5,2319,,,,,,,,,,,,,,,,,,,,,
hk,/banks-loan,,,,,,,,,,,,,,,,,,,,,,,,,,,,1380,-1.6,1402,,,,,,,,,,,,,,,,,,,,,
hk,/credit-cards,,,,,,,,,,,,,,,,,,,,,,3518.0,0.4,3503.0,1037.0,4.4,993.0,22297,1.4,21985,,,,,,,1051.0,-8.8,1153.0,,,,,,,,,,,,
hk,/debt-consolidation-loan,,,,,,,,,,,,,,,,,,,,,,,,,,,,1396,0.0,1396,,,,,,,,,,,,,,,,,,,,,
hk,/embed,,,,,,,,,,,,,,,,,,,,,,,,,,,,15769,-0.4,15840,,,,,,,,,,,,,,,,,,,15749.0,-0.4,15805.0
hk,/lending-companies-loan,,,,,,,,,,,,,,,,,,,,,,,,,,,,1671,-0.6,1681,,,,,,,,,,,,,,,,,,,,,
hk,/mortgage,,,,,,,,,,,,,,,,,,,,,,,,,,,,2693,332.3,623,,,,,,,,,,,,,,,,,,,,,
hk,/personal-loan,,,,,,,1684.0,4.9,1605.0,2171.0,1.1,2148.0,,,,,,,2298.0,2.0,2254.0,1930.0,0.3,1924.0,,,,26748,-0.1,26776,,,,1914.0,-34.4,2916.0,,,,,,,,,,,,,,,
hk,/tax-loan,,,,,,,,,,,,,,,,,,,,,,,,,,,,4193,0.0,4194,,,,,,,,,,,,,,,,,,,,,
hk,/travel-insurance,,,,,,,,,,,,,,,,,,,,,,,,,1014.0,4.3,972.0,6493,-4.7,6812,,,,,,,,,,,,,,,,,,,,,


In [24]:
#pv.to_html("kinesis_vs_segment.html")

## By Type of Page

In [25]:
g = group_by_and_show_count_difference(merged_df_with_meta, ["country_code", "page_type"])
colour_grouped_table(g)

Unnamed: 0,country_code,page_type,s_count,k_count,is_bot,k_vs_s_%
5,hk,blog,214085.0,215184.0,607,0.5
6,hk,iss,16452.0,16538.0,0,0.5
7,hk,lps,4271.0,4409.0,6,3.2
8,hk,shop,108173.0,109144.0,135,0.9
20,sg,blog,1308960.0,1339450.0,979,2.3
21,sg,iss,50882.0,50168.0,6,-1.4
22,sg,lps,4092.0,4306.0,8,5.2
23,sg,shop,722200.0,769458.0,1548,6.5
24,sg,unbounce,6619.0,6843.0,0,3.4


### Type of Page, Just Pageviews

In [26]:
g = group_by_and_show_count_difference(merged_df_with_meta[(merged_df_with_meta.event_name=="PageView") & (merged_df_with_meta.country_code.isin(["sg", "hk"]))], ["country_code", "page_type"])
colour_grouped_table(g)

Unnamed: 0,country_code,page_type,s_count,k_count,is_bot,k_vs_s_%
5,hk,blog,72072,72951,586,1.2
6,hk,iss,8198,8225,0,0.3
7,hk,lps,3656,3720,6,1.8
8,hk,shop,71639,73452,116,2.5
20,sg,blog,480957,497781,946,3.5
21,sg,iss,24351,24798,2,1.8
22,sg,lps,3729,3889,8,4.3
23,sg,shop,407195,433442,1420,6.4
24,sg,unbounce,6619,6843,0,3.4


### By Device Type and Country for pageviews in HK and SG

In [27]:
merged_df_with_meta.head()

Unnamed: 0_level_0,anonymous_id,event_name,page_url,date,s_count,k_count,page_type,slug,slug_root,ab_test,country_code,user_agent,device_family,os_family,os_version,browser_family,browser_version,is_bot
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,000034a2-e973-4108-b920-0681877d4fc0,PageView,https://blog.moneysmart.sg/budgeting/mattress-...,2020-02-15,1,1,blog,/budgeting/mattress-singapore-guide,/budgeting,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,0.0
1,000034a2-e973-4108-b920-0681877d4fc0,PageView,https://blog.moneysmart.sg/property/3-things-l...,2020-02-10,1,1,blog,/property/3-things-look-buying-condo-2017,/property,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,0.0
2,000034a2-e973-4108-b920-0681877d4fc0,Reading,https://blog.moneysmart.sg/budgeting/mattress-...,2020-02-15,3,3,blog,/budgeting/mattress-singapore-guide,/budgeting,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,0.0
3,000034a2-e973-4108-b920-0681877d4fc0,Reading,https://blog.moneysmart.sg/property/3-things-l...,2020-02-10,4,4,blog,/property/3-things-look-buying-condo-2017,/property,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Google,91.1.292041477,0.0
4,0000628f-db5d-4554-96eb-66454e203e92,PageView,https://www.moneysmart.sg/embed/dc96c1e58d2f68...,2020-02-09,1,1,shop,/embed/dc96c1e58d2f6855228962060a1a8b77,/embed,control,sg,Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...,mobile,iOS,13.3,Mobile Safari,13.0.4,0.0


# By Device Type

## Pageviews by Device Type and OS

In [28]:
g = group_by_and_show_count_difference(merged_df_with_meta[(merged_df_with_meta.event_name=="PageView") \
                                                           & (merged_df_with_meta.country_code.isin(["sg", "hk"]))\
                                                          # & ((merged_df_with_meta.s_count>100) | (merged_df_with_meta.k_count>100))
                                                          ]\
                                       , ["device_family", "os_family"])
colour_grouped_table(g)

Unnamed: 0,device_family,os_family,s_count,k_count,is_bot,k_vs_s_%
2,desktop,Chrome OS,799,855,0,7.0
3,desktop,Chromecast,30,30,0,0.0
4,desktop,Fedora,11,14,0,27.3
6,desktop,Linux,4197,4566,28,8.8
7,desktop,Mac OS X,98905,105003,3,6.2
10,desktop,Ubuntu,1624,1760,2080,8.4
11,desktop,Windows,255142,275822,29,8.1
14,mobile,Android,334873,345945,19,3.3
15,mobile,BlackBerry OS,7,5,0,-28.6
22,mobile,Other,22,21,0,-4.5


## All Events Desktop and mobile by browser, popular ones

In [29]:
browser_popularity_threshold = len(merged_df_with_meta)/100 #must have 1% of traffic to be interesting.
g = group_by_and_show_count_difference(merged_df_with_meta[(merged_df_with_meta.device_family.isin(["mobile", "desktop"])) \
                                                           & (merged_df_with_meta.country_code.isin(["sg", "hk"]))\
                                                          # & ((merged_df_with_meta.s_count>100) | (merged_df_with_meta.k_count>100))
                                                          ]\
                                       , ["device_family", "os_family", "browser_family"])
g = g[(g.s_count > browser_popularity_threshold) |(g.k_count>browser_popularity_threshold)]
#remainder = g[(g.s_count <= browser_popularity_threshold) |(g.k_count<=browser_popularity_threshold)]
#print("%i events (%.3f percent of total) were in smaller browsers"%(len(remainder), len(remainder)/len(merged_df_with_meta)))
colour_grouped_table(g)

Unnamed: 0,device_family,os_family,browser_family,s_count,k_count,is_bot,k_vs_s_%
548,desktop,Mac OS X,Chrome,125853,131958,0,4.9
599,desktop,Mac OS X,Safari,89475,91091,0,1.8
856,desktop,Windows,Chrome,515373,538399,0,4.5
866,desktop,Windows,Edge,25907,27160,0,4.8
872,desktop,Windows,Firefox,28377,36846,0,29.8
880,desktop,Windows,IE,19275,19538,0,1.4
1088,mobile,Android,Chrome Mobile,566769,588622,0,3.9
1100,mobile,Android,Facebook,41119,41075,0,-0.1
1139,mobile,Android,Samsung Internet,101026,103386,0,2.3
2091,mobile,iOS,Chrome Mobile iOS,101177,102264,0,1.1


## Anonymous_ids with big differences 
(might be bots or duplicates or whatever)

In [30]:
# This has non-standard code because the standard code was taking too long to run and running out of memory.
cols_to_group_by = ["anonymous_id", "device_family", "os_family", "browser_family"]
# g = group_by_and_show_count_difference(merged_df_with_meta[ (merged_df_with_meta.country_code.isin(["sg", "hk"]))], cols_to_group_by)
g = merged_df_with_meta[merged_df_with_meta.country_code.isin(["sg", "hk"])].groupby(cols_to_group_by, observed=True).sum().reset_index() #need to use observed or it creates loads of empty rows and runs out of memory
# g = g[(g["k_vs_s_%"].abs()>50) & ((g.k_count>20) | (g.s_count>20))]

#colour_grouped_table(g)

In [31]:
unique_pages_per_anon_id = merged_df_with_meta.groupby('anonymous_id').page_url.nunique()

In [32]:
len(g)

656247

In [33]:
len(unique_pages_per_anon_id)

656251

In [34]:
g=g.merge(unique_pages_per_anon_id, left_on="anonymous_id", right_on="anonymous_id")
g.rename(columns={"page_url":"num_pages"}, inplace=True)

In [35]:
g.head(5)

Unnamed: 0,anonymous_id,device_family,os_family,browser_family,s_count,k_count,is_bot,num_pages
0,000034a2-e973-4108-b920-0681877d4fc0,mobile,iOS,Google,9,9,0.0,2
1,0000628f-db5d-4554-96eb-66454e203e92,mobile,iOS,Mobile Safari,2,2,0.0,1
2,00007ce2-f710-4f78-bf44-93fcc7e68c24,mobile,iOS,Mobile Safari,1,1,0.0,1
3,0000b18c-192c-4f9b-b3c4-4b8f65a9e197,mobile,iOS,Google,3,3,0.0,1
4,0000e821-1060-4146-9374-2e32bea14f00,mobile,iOS,Mobile Safari,4,3,0.0,3


In [36]:
len(merged_df_with_meta)

1794946

In [37]:
big_one_threshold = 50
big_ones = g[((g.k_count>big_one_threshold) | (g.s_count>big_one_threshold)) & (((g.k_count - g.s_count)/g.s_count).abs()>0.5)]  
#big_ones.sort_values(big_ones.s_count)

#g.sort_values("k_vs_s_%")
#colour_grouped_table(g)
big_ones_sorted = big_ones.sort_values("k_count", ascending = False)
def highlight_big_num_pages(x):
    threshold_num = 20
    if x<threshold_num:
        return ""
    else:
        c_value = int(min(255, (x-threshold_num)/80*255))
        return "background-color:rgb(%i, %i, %i)"%(255, 255-c_value, 255-c_value)
    
big_ones_sorted.style.applymap(highlight_big_num_pages, subset=["num_pages"])


Unnamed: 0,anonymous_id,device_family,os_family,browser_family,s_count,k_count,is_bot,num_pages
360915,8cce5789-b10f-4fda-934b-935b8356dd29,mobile,Android,Chrome Mobile,150,9348,0,765
622397,f2d4f27d-077a-4086-8cce-5789b10fbfda,mobile,Android,Chrome Mobile,127,1141,0,248
176465,44fd60fc-1d34-470a-8dce-31e97b184f54,mobile,Android,Samsung Internet,87,728,0,77
115547,2d343113-cde5-44ad-aebf-9473ee153d35,mobile,Android,Chrome Mobile,0,654,0,38
336880,8356dd29-44fd-40fc-9d34-870acdce31e9,mobile,Android,Chrome Mobile,16,475,0,127
49366,134b935b-8356-4d29-84fd-60fc1d34870a,mobile,Android,Chrome Mobile,50,288,0,55
464719,b55424f5-53eb-44b5-b2d4-f27d077a3086,mobile,Android,Chrome Mobile,39,234,0,139
297401,73edbfe3-6a6e-43c2-ab29-db04584c830c,mobile,Android,Chrome Mobile,58,148,0,45
412975,a12ff1cf-595c-40e3-a9db-f00e5eba1883,desktop,Mac OS X,Chrome,14,118,0,23
144335,38699da5-1c93-4452-85d5-e08acc686b77,desktop,Windows,Firefox,0,106,0,32


In [38]:
# dig into the top one
top_anon_id = big_ones.sort_values("k_count", ascending = False)["anonymous_id"].values[0]
top_anon_df = merged_df_with_meta[merged_df_with_meta.anonymous_id==top_anon_id]
print("looking into %s \n %s"% (top_anon_id, top_anon_df["user_agent"].values[0]))


looking into 8cce5789-b10f-4fda-934b-935b8356dd29 
 Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko; googleweblight) Chrome/38.0.1025.166 Mobile Safari/535.19


In [39]:
top_anon_df.groupby("user_agent").sum()

Unnamed: 0_level_0,s_count,k_count,is_bot
user_agent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko; googleweblight) Chrome/38.0.1025.166 Mobile Safari/535.19",150,9348,0.0


In [40]:
g = group_by_and_show_count_difference(top_anon_df, ["event_name"])
colour_grouped_table(g)

Unnamed: 0,event_name,s_count,k_count,is_bot,k_vs_s_%
0,PageView,20,2423,0,12015.0
1,Reading,120,6824,0,5586.7
2,UserView.WidgetLoad,10,101,0,910.0


In [41]:
g = group_by_and_show_count_difference(top_anon_df, ["page_url"])
colour_grouped_table(g)

Unnamed: 0,page_url,s_count,k_count,is_bot,k_vs_s_%
0,https://blog.moneysmart.hk/zh-hk/credit-cards/%E9%8A%80%E8%A1%8C%E7%86%B1%E7%B7%9A-%E7%9C%9F%E4%BA%BA%E5%B0%8D%E8%A9%B1-%E6%92%B3%E6%8E%A3%E6%8D%B7%E5%BE%91%E9%9B%86%E5%90%88/,4,0,0,-100
1,https://blog.moneysmart.hk/zh-hk/insurance/%E7%A9%A9%E5%AE%9A-%E5%9B%9E%E5%A0%B1-%E8%B2%A1%E5%AF%8C-%E5%A2%9E%E5%80%BC-%E6%AF%94%E8%BC%83-%E5%82%B5%E5%88%B8-%E5%AE%9A%E6%9C%9F%E5%AD%98%E6%AC%BE-%E5%84%B2%E8%93%84%E4%BF%9D%E9%9A%AA/,1,1,0,0
2,https://blog.moneysmart.sg/budgeting/3-ways-manage-finances-new-dbs-ibanking-platform/,0,4,0,999
3,https://blog.moneysmart.sg/budgeting/applecare-singapore-extended-warranty/,0,3,0,999
4,https://blog.moneysmart.sg/budgeting/bankruptcy-singapore/,0,15,0,999
5,https://blog.moneysmart.sg/budgeting/beat-heat-lower-aircon-bill/,0,12,0,999
6,https://blog.moneysmart.sg/budgeting/best-data-plans-singapore-telcos/,0,4,0,999
7,https://blog.moneysmart.sg/budgeting/best-electric-scooter-lta-singapore/,0,25,0,999
8,https://blog.moneysmart.sg/budgeting/best-electric-scooter-lta-singapore/#lg-mobile-mmenu,0,5,0,999
9,https://blog.moneysmart.sg/budgeting/best-fibre-broadband-singapore/,0,31,0,999


# Possible bots

In [42]:
anonymous_ids_counts = group_by_and_show_count_difference(merged_df_with_meta, ["anonymous_id"])>>>>>

SyntaxError: invalid syntax (<ipython-input-42-f3a92fa9fd21>, line 1)

# Other Issues to Check For
* duplicates
* skipping "https" from the url (observed as a current issue)
* certain browsers having issues
* users creating a lot of duplicate events (and doing the above analysis using sum vs count)

# Play Space

In [None]:
blog_pageviews = merged_df_with_meta[(merged_df_with_meta.event_name=="PageView") & (merged_df_with_meta.country_code.isin(["sg", "hk"])) & (merged_df_with_meta.page_type=="blog")]

In [None]:
blog_pageviews.groupby(["country_code", "date"]).size().reset_index()