In [23]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
import csv

import athena_querying  #doing this style as there are connection details within that I want to scope

In [24]:
# Widen the notebook container to 95% of the browser window so wide tables fit.
# `IPython.display` is the public import path for these names; importing them
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [42]:
# --- Input files ---------------------------------------------------------
event_definition_csv = "./Event Definitions/MS Event Categories.csv"
consumer_allowed_fields_file = "./Event Definitions/schema.py"

# --- Query window --------------------------------------------------------
# The window ends yesterday and reaches back `num_days_to_query` days.
# Despite their names, both bounds are `date` objects (note the `.date()`).
# To pin a fixed end instead, assign e.g. datetime(year=2020, month=3, day=1).
num_days_to_query = 7
to_datetime = datetime.now().date() + timedelta(days=-1)
from_datetime = to_datetime + timedelta(days=-num_days_to_query)

In [26]:
# Column names are taken from the file's second line (header=1); the first
# row after that header is then dropped positionally.
# NOTE(review): presumably that row is a spacer/sub-header in the sheet - confirm.
event_defs_raw = pd.read_csv(event_definition_csv, header=1).iloc[1:]

In [27]:
event_defs_raw.head()

Unnamed: 0.1,Unnamed: 0,Event Name,Description,Sample button /URL,Property Name,isMandated,Property Value Type,Property Description,Suggested values (Red colored values are fixed allowed values),Standardisation,...,Home Loan - Mortgage Calculator,Home Loan - Progressive Payment Calculator,Home Loan - Property Valuation Calculator,Home Loan HK - Mortgage Calculator,Home Loan HK - Property Valuation Calculator,Mortgage Home Page,New Purchase Home Page,Refinance Home Page,New Listing Service HK TI,Product Widgets
1,1.0,LeadGeneration.ClickConversion,Tracks events that lead directly to a conversi...,- Apply Now\n- Buy Now\n- GoToSite\n- Learn Mo...,channel,True,string,Product channel,"home-loan,refinancing,home-equity-loan,renovat...",Listing and details page,...,,,,,,,,,travel-insurance,"credit-cards,personal-loan,travel-insurance"
2,,,,,pageType,True,string,Type of page user landed,"home-page,product-listing,product-details,blog...",,...,,,,,,,,,,
3,,,,,language,True,string,Locale of the page,"en,zh-hk",,...,,,,,,,,,buy-now,left blanks
4,,,,,country,True,string,which market this events happened,"sg,hk,tw,ph,id",,...,,,,,,,,,eg 331,integer value greater than or equal to 1
5,,,,,provider,True,string,Unique Identifier of the provider,citibank,,...,,,,,,,,,eg 4,integer value greater than or equal to 1


In [28]:
cols_of_interest = ["Event Name", "Property Name", "isMandated", "Property Value Type"]


In [29]:
event_defs_raw.columns

Index(['Unnamed: 0', 'Event Name', 'Description', 'Sample button\n/URL',
       'Property Name', 'isMandated', 'Property Value Type',
       'Property Description',
       'Suggested values (Red colored values are fixed allowed values)',
       'Standardisation', 'DBS/POSB SG PL Full API', 'SCB SG CC Full API',
       'SCB SG PL Full API', 'OCBC Bot Service', 'CI FWD ', 'CI Etiqa',
       'CI Wizard', 'Home Loan - TDSR Calculator',
       'Home Loan - Property Tax Calculator',
       'Home Loan - Stamp Duty Calculator', 'Home Loan - Refinance Calculator',
       'Home Loan - MSR Calculator', 'Home Loan - Mortgage Calculator',
       'Home Loan - Progressive Payment Calculator',
       'Home Loan - Property Valuation Calculator',
       'Home Loan HK - Mortgage Calculator',
       'Home Loan HK - Property Valuation Calculator', 'Mortgage Home Page',
       'New Purchase Home Page', 'Refinance Home Page',
       'New Listing Service\nHK TI', 'Product Widgets'],
      dtype='object')

In [30]:
# The sheet only states the event name on the first row of each event's group;
# forward-fill it so every property row carries its event name.
# `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
event_defs_raw["Event Name"] = event_defs_raw["Event Name"].ffill()

# Keep only rows that actually define a property (the source has blank spacer
# rows) and only the columns of interest. `.loc[...]` followed by `.copy()`
# gives an independent frame, so adding columns to `event_defs` later does not
# raise SettingWithCopyWarning.
event_defs = event_defs_raw.loc[event_defs_raw["Property Name"].notna(), cols_of_interest].copy()

In [31]:
# Create a snake_case version of the camelCase property names.
def camelcaseify(event_name):
    """Convert a camelCase identifier to snake_case.

    Despite its name, this function produces snake_case
    (``"pageType"`` -> ``"page_type"``); the name is kept so existing
    callers keep working.

    Each character that differs from its lowercase form is lower-cased and
    prefixed with ``_``, except at the very start of the string, where no
    underscore is added (``"PageType"`` -> ``"page_type"``, not
    ``"_page_type"``). Runs of capitals are split letter by letter
    (``"pageURL"`` -> ``"page_u_r_l"``). All other characters are copied
    unchanged.

    Args:
        event_name: The string to convert.

    Returns:
        The snake_case form of ``event_name``.
    """
    pieces = []
    for position, char in enumerate(event_name):
        lowered = char.lower()
        if char == lowered:
            pieces.append(char)
        elif position == 0:
            # A leading capital starts the first word: no separator needed.
            pieces.append(lowered)
        else:
            pieces.append("_" + lowered)
    # Join once at the end instead of growing a string character by character.
    return "".join(pieces)
    
# Derive the snake_case name straight from the "Property Name" column.
# `Series.map` applies the function element-wise and always returns a Series,
# unlike the slower row-wise `DataFrame.apply(..., axis=1)`.
event_defs["property_name"] = event_defs["Property Name"].map(camelcaseify)

In [32]:
# Per-event count of non-null values in each column. `count()` skips NaN, so a
# column with missing values shows a lower number than the others in its row.
event_defs.groupby(["Event Name"]).count()

Unnamed: 0_level_0,Property Name,isMandated,Property Value Type,property_name
Event Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABTest.Conversion,2,0,2,2
EmailCapture,7,7,7,7
LeadGeneration.ClickConversion,22,22,22,22
LeadGeneration.FormStepCompleted,12,12,12,12
LeadGeneration.FormSubmitted,9,9,9,9
LeadGeneration.PaymentCompleted,8,8,8,8
LeadGeneration.ThankYou,4,4,4,4
Reading,6,6,3,6
UserAuth.LoggedIn,5,5,5,5
UserAuth.LoggedOut,4,4,4,4


In [33]:
# Events missing a property value being set
event_defs[event_defs["Property Value Type"].isna()]

Unnamed: 0,Event Name,Property Name,isMandated,Property Value Type,property_name
225,Reading,status,True,,status
229,Reading,page_url,True,,page_url
230,Reading,title,True,,title


In [34]:
# Events with spaces in them
event_defs[event_defs["Event Name"].str.contains(" ")]

Unnamed: 0,Event Name,Property Name,isMandated,Property Value Type,property_name


In [40]:
event_defs.groupby(["Property Value Type"]).count()

Unnamed: 0_level_0,Event Name,Property Name,isMandated,property_name
Property Value Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
string,216,216,211,216
string/JSON,2,2,2,2
string/json,4,4,4,4


In [17]:
# Interpret the expected values (not currently possible due to the spreadsheet format)

In [36]:
# Check that there aren't duplicates
event_nps = event_defs.groupby(["Event Name", "Property Name"]).count()

In [38]:
event_nps[event_nps.property_name>1]

Unnamed: 0_level_0,Unnamed: 1_level_0,isMandated,Property Value Type,property_name
Event Name,Property Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LeadGeneration.ClickConversion,productCategory,2,2,2
LeadGeneration.ClickConversion,provider,2,2,2


In [43]:
# Very crude loading of the allowed structure from the consumer: read the whole
# schema file into a single string. Later cells cut the field list out of it
# by plain text matching rather than by importing or parsing the file.
with open(consumer_allowed_fields_file) as f:
    consumer_raw_defs = f.read()
    

In [45]:
# Keep only the text between the opening of the 'data' struct and the
# 'sent_at' field. If the opening marker is missing, `[1]` raises IndexError;
# if the closing marker is missing, everything after the opening marker is kept.
consumer_raw_defs_data = consumer_raw_defs.split("StructField('data', StructType([", 1)[1]\
    .split("StructField('sent_at', StringType(), True),", 1)[0]

In [65]:
# Drop the last three lines of the extracted 'data' section, then remove blank
# lines. The section is re-extracted from `consumer_raw_defs` here: reading it
# back from `consumer_raw_defs_data` would strip three more lines every time
# this cell is re-run, so the result depended on how often the cell had run.
# NOTE(review): the three dropped lines are presumably the closing brackets of
# the 'data' struct - confirm against the schema file.
consumer_raw_defs_data = "\n".join(
    line
    for line in (
        consumer_raw_defs
        .split("StructField('data', StructType([", 1)[1]
        .split("StructField('sent_at', StringType(), True),", 1)[0]
        .split("\n")[:-3]
    )
    if line.strip() != ""
)

In [66]:
print(consumer_raw_defs_data)

            StructField('title', StringType(), True),
            StructField('status', StringType(), True),
            StructField('page_url', StringType(), True),
            StructField('email_id', StringType(), True),
            StructField('form_type', StringType(), True),
            StructField('username', StringType(), True),
            StructField('sort_order',StringType(),True),
            StructField('sort_field_name',StringType(),True),
            StructField('page_path',StringType(),True),
            StructField('auth_status',StringType(),True),
            StructField('type',StringType(),True),
            StructField('cta_type',StringType(),True),
            StructField('product_id',StringType(),True),
            StructField('product_name',StringType(),True),
            StructField('provider_id',StringType(),True),
            StructField('list_position',StringType(),True),
            StructField('is_sponsored',StringType(),True),
            StructField('addit

In [67]:
import string

In [78]:
consumer_raw_defs_data_lines = consumer_raw_defs_data.split("\n")
print(len(consumer_raw_defs_data_lines))

85


In [72]:
# Column of the first ASCII letter on the first line; used below as the indent
# that marks a top-level field. Raises IndexError if that line has no letters.
top_level_indent = [i for i, z in enumerate(consumer_raw_defs_data_lines[0]) if z in string.ascii_letters][0]

In [73]:
top_level_indent

12

In [75]:
# Collect the names of the fields declared at the top level of the 'data'
# struct: keep only lines whose first ASCII letter sits at `top_level_indent`
# and take the text between the first pair of single quotes.
allowed_values = []
for line in consumer_raw_defs_data_lines:
    # Position of the first ASCII letter, or None when the line has none
    # (blank or bracket-only lines). Such lines cannot declare a field, so they
    # are skipped instead of failing with IndexError.
    first_letter = next(
        (i for i, ch in enumerate(line) if ch in string.ascii_letters),
        None,
    )
    if first_letter != top_level_indent:
        continue
    allowed_values.append(line.split("'")[1])

In [79]:
print(len(allowed_values))

62


In [76]:
allowed_values

['title',
 'status',
 'page_url',
 'email_id',
 'form_type',
 'username',
 'sort_order',
 'sort_field_name',
 'page_path',
 'auth_status',
 'type',
 'cta_type',
 'product_id',
 'product_name',
 'provider_id',
 'list_position',
 'is_sponsored',
 'additional_parameters',
 'event_type',
 'object_clicked',
 'borrow_amount',
 'loan_duration_in_years',
 'citizenship',
 'is_recommended',
 'question_number',
 'question_answer',
 'loan_type',
 'property_type',
 'remaining_loan',
 'current_rate',
 'remaining_tenure',
 'current_bank',
 'rate_type',
 'loan_tenure',
 'is_paid',
 'action',
 'form_step',
 'form_name',
 'car_make',
 'ncd',
 'gender',
 'birthdate',
 'policy_expiring',
 'age',
 'filters',
 'language',
 'provider',
 'product_category',
 'page_type',
 'country',
 'channel',
 'product',
 'search',
 'affiliate_widget_type',
 'affiliate_category',
 'affiliate_page_type',
 'affiliate_location',
 'source',
 'source_id',
 'login_method',
 'product_category_id',
 'sequence']

In [77]:
# Event defs with properties that will be rejected (at the high level only)

event_defs[~event_defs["property_name"].isin(allowed_values)]

Unnamed: 0,Event Name,Property Name,isMandated,Property Value Type,property_name
11,LeadGeneration.ClickConversion,pageReferrer,True,string,page_referrer
31,LeadGeneration.FormStepCompleted,error,False,string,error
32,LeadGeneration.FormStepCompleted,formDetails,False,string/json,form_details
41,LeadGeneration.FormSubmitted,formDetails,False,string/json,form_details
68,UserEngagement.ShowedMoreDetails,pageReferrer,True,string,page_referrer
89,UserEngagement.ViewedMoreDetails,pageReferrer,True,string,page_referrer
103,UserEngagement.UsedHelpHints,hintType,True,string,hint_type
104,UserEngagement.UsedHelpHints,hintName,True,string,hint_name
109,UserEngagement.UsedHelpHints,interactionType,True,string,interaction_type
110,UserEngagement.ClickedMenuItem,menuName,True,string,menu_name
