In [None]:
import pandas as pd
import json

import dabl

from google.cloud import bigquery

In [None]:
# Module-level BigQuery client; the query helper functions in this notebook read it as a global.
# NOTE(review): no project or credentials are passed, so these presumably come from the environment — confirm.
client = bigquery.Client()

In [None]:
def obtain_monthly_save_aggregates():
    """Fetch saving-event aggregates grouped by month number and unit.

    Runs a query against ``ops.user_behaviour`` through the module-level
    ``client``, restricted to rows whose ``transaction_type`` is
    ``'SAVING_EVENT'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``save_month``, ``unit``, ``sum``, ``average`` and ``count``,
        ordered by ``save_month`` descending.

    Notes
    -----
    ``save_month`` comes from ``EXTRACT(MONTH ...)``, i.e. it carries the
    month number only, so rows from the same calendar month of different
    years land in the same group.
    NOTE(review): confirm that pooling across years is intended.
    """
    monthly_saves_query = """
        select EXTRACT(MONTH from TIMESTAMP_MILLIS(time_transaction_occurred)) as save_month, unit, 
            sum(amount) as sum, avg(amount) as average, count(*) as count from ops.user_behaviour 
            where transaction_type = 'SAVING_EVENT' group by save_month, unit order by save_month desc;
    """

    query_job = client.query(monthly_saves_query)
    return query_job.to_dataframe()

In [None]:
def obtain_boosts_with_saves():
    """Fetch every boost-creation event paired with the user's later saves.

    Runs a query against ``ops.all_user_events`` through the module-level
    ``client``. Boost events (``event_type like 'BOOST_CREATED%'``) are
    left-joined to the same user's ``SAVING_PAYMENT_SUCCESSFUL`` events that
    occurred more than zero whole hours after the boost.

    The "save is after the boost" predicate lives in the join's ON clause
    rather than in a WHERE clause. With a WHERE filter, a boost whose user
    had saves, none of which passed the predicate, lost all of its joined rows
    and vanished from the result. Such boosts must stay in the result with a
    null save, because downstream they are negative examples.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``event_type``, ``context``,
        ``boost_creation_time``, ``save_completion_time`` and
        ``time_from_boost_to_save`` (hours). There is one row per
        (boost, qualifying save) pair, and exactly one row with null save
        columns for a boost with no qualifying save.
    """
    # NOTE(review): TIMESTAMP_DIFF(..., HOUR) counts whole hours, so a save made
    # less than an hour after the boost has a diff of 0 and is not matched —
    # confirm that excluding those saves is intended.
    sql = """
        with boost_offers as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp 
            from ops.all_user_events 
            where event_type like 'BOOST_CREATED%'

    ), save_events as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp 
            from ops.all_user_events 
            where event_type = 'SAVING_PAYMENT_SUCCESSFUL'
    )
    select boost_offers.user_id, boost_offers.event_type, boost_offers.context, 
        boost_offers.creation_timestamp as boost_creation_time, save_events.creation_timestamp as save_completion_time,  
        TIMESTAMP_DIFF(save_events.creation_timestamp, boost_offers.creation_timestamp, HOUR) as time_from_boost_to_save
    from boost_offers left join save_events 
        on boost_offers.user_id = save_events.user_id
        and TIMESTAMP_DIFF(save_events.creation_timestamp, boost_offers.creation_timestamp, HOUR) > 0
    """

    df = client.query(sql).to_dataframe()
    return df

In [None]:
def clean_up_and_construct_labels(boosts_with_saves):
    """Turn raw boost/save pairs into one labelled row per (boost, user).

    Parameters
    ----------
    boosts_with_saves : pandas.DataFrame
        Must contain ``user_id``, ``context`` (a JSON string with the keys
        ``boostId``, ``boostAmount``, ``boostUnit``, ``boostType`` and
        ``boostCategory``), ``boost_creation_time`` (datetime-like),
        ``save_completion_time`` and ``time_from_boost_to_save``.
        The frame is left untouched; all work happens on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per ``boost_user_id`` (``"<boostId>::<user_id>"``), holding
        the first non-null value of every other column after ordering by
        ``save_completion_time``. Added columns: ``user_id_count``,
        ``is_save_within_day``, ``parsed_context``, ``boost_id``,
        ``boost_amount_whole_currency``, ``boost_type``, ``boost_category``,
        ``day_of_month`` and ``hour_of_day``.

    Raises
    ------
    KeyError
        If a context lacks one of the expected keys, or its ``boostUnit`` is
        not in the conversion table.
    """
    unit_convertors = { 'WHOLE_CURRENCY': 1, 'WHOLE_CENT': 100, 'HUNDREDTH_CENT': 10000 }

    # Work on a copy: adding columns to the caller's frame would make the
    # notebook cell non-idempotent and silently change `boosts_with_saves`.
    df = boosts_with_saves.copy()
    df['user_id_count'] = df.groupby(['user_id'])['boost_creation_time'].transform('count')

    # we remove the top 2, because they are team members often testing, so distort
    outlier_user_ids = df['user_id'].value_counts().index[:2]
    # explicit copy so the assignments below write to an independent frame
    # instead of a slice of the unfiltered one (avoids SettingWithCopyWarning)
    df = df[~df['user_id'].isin(outlier_user_ids)].copy()

    # and here we have our real label; a missing save (NaN hours) compares False
    df["is_save_within_day"] = df["time_from_boost_to_save"] < 24

    # extract a bunch of context from the boosts
    df["parsed_context"] = df["context"].apply(json.loads)
    df["boost_id"] = df["parsed_context"].apply(lambda context: context["boostId"])
    # normalise the amount to whole currency units using the boost's own unit
    df["boost_amount_whole_currency"] = df["parsed_context"].apply(
        lambda context: context["boostAmount"] / unit_convertors[context["boostUnit"]])

    df["boost_type"] = df["parsed_context"].apply(lambda context: context["boostType"])
    df["boost_category"] = df["parsed_context"].apply(lambda context: context["boostCategory"])

    # taken from boost_creation_time as-is; no timezone conversion is applied here
    df["day_of_month"] = df["boost_creation_time"].dt.day
    df["hour_of_day"] = df["boost_creation_time"].dt.hour

    # and this functions as our index
    df["boost_user_id"] = df["boost_id"] + "::" + df["user_id"]

    # and finally we strip out the surplus boost-save pairs (by retaining only the opening);
    # sort_values places missing save times last, so the earliest save wins
    slimmed_df = df.sort_values("save_completion_time").groupby("boost_user_id", as_index=False).first()

    return slimmed_df

In [None]:
def feature_extraction(data):
    """Select the model inputs and the label column from a cleaned frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named below.

    Returns
    -------
    pandas.DataFrame
        The five feature columns followed by ``is_save_within_day``, in
        that order.

    Raises
    ------
    KeyError
        If any of the requested columns is absent from ``data``.
    """
    predictor_columns = ["boost_amount_whole_currency", "day_of_month", "hour_of_day"]
    predictor_columns += ["boost_type", "boost_category"]
    label_column = "is_save_within_day"

    return data[predictor_columns + [label_column]]

In [None]:
# Pull the raw boost/save pairs; this issues the BigQuery query every time the cell runs.
boosts_with_saves = obtain_boosts_with_saves()

In [None]:
# Drop the two most frequent users, derive the label and boost features,
# and collapse to one row per boost_user_id.
data = clean_up_and_construct_labels(boosts_with_saves)

In [None]:
# Class balance of the label: how many boosts were / were not followed by a save in under 24 hours.
data.is_save_within_day.value_counts()

In [None]:
# Eyeball the first rows of the cleaned frame.
data.head()

In [None]:
# Narrow the cleaned frame to the model features plus the is_save_within_day label.
feature_frame = feature_extraction(data)

In [None]:
# Hand the feature frame to dabl.clean before plotting and modelling.
# NOTE(review): which columns/dtypes dabl.clean alters depends on the installed dabl version — inspect dabl_data to confirm.
dabl_data = dabl.clean(feature_frame)

In [None]:
# Plot the cleaned features against the label; the label's column name is passed as the second positional argument.
# NOTE(review): newer dabl releases may expect the column name via a target_col keyword — confirm against the installed version.
dabl.plot(dabl_data, 'is_save_within_day')

In [None]:
# dabl's SimpleClassifier with random_state pinned to 0.
fc = dabl.SimpleClassifier(random_state=0)

In [None]:
# Feature matrix: every cleaned column except the label.
X = dabl_data.drop("is_save_within_day", axis=1)

In [None]:
# Target vector: the is_save_within_day label column.
Y = dabl_data.is_save_within_day

In [None]:
# Fit the classifier on the features X against the label Y.
fc.fit(X, Y)