In [None]:
import pandas as pd
import json

from google.cloud import bigquery

In [None]:
# BigQuery client shared by the query helper functions defined in the next cells.
client = bigquery.Client()

In [None]:
def obtain_monthly_save_aggregates():
    """Fetch per-month, per-unit aggregates of saving events from BigQuery.

    Runs a fixed query against ``ops.user_behaviour`` restricted to rows whose
    ``transaction_type`` is ``'SAVING_EVENT'``, and computes the sum, average
    and count of ``amount`` grouped by ``save_month`` and ``unit``, ordered by
    ``save_month`` descending.

    NOTE(review): the grouping key is ``EXTRACT(MONTH ...)`` only, so the same
    month number from different years lands in one row -- confirm that this is
    intended.

    Returns:
        The query result as produced by ``to_dataframe()`` on the module-level
        ``client``.
    """
    monthly_aggregates_query = """
        select EXTRACT(MONTH from TIMESTAMP_MILLIS(time_transaction_occurred)) as save_month, unit, 
            sum(amount) as sum, avg(amount) as average, count(*) as count from ops.user_behaviour 
            where transaction_type = 'SAVING_EVENT' group by save_month, unit order by save_month desc;
    """

    return client.query(monthly_aggregates_query).to_dataframe()

In [None]:
def obtain_boosts_with_labels():
    """Fetch boost-creation events joined to the saves that followed them.

    Two sets of rows are read from ``ops.all_user_events``: events whose
    ``event_type`` starts with ``BOOST_CREATED`` and events whose
    ``event_type`` is ``SAVING_PAYMENT_SUCCESSFUL``. Each boost row is
    left-joined to every save by the same ``user_id`` that happened strictly
    after the boost was created.

    The "save is after the boost" condition lives in the ``ON`` clause rather
    than in a ``WHERE`` on the whole-hour difference, for two reasons:

    * ``TIMESTAMP_DIFF(..., HOUR)`` yields whole hours, so a save made less
      than an hour after the boost has a difference of 0 and was discarded by
      the former ``> 0`` filter.
    * Filtering the right-hand side of a left join in ``WHERE`` removes the
      boost row altogether when the user only has saves that precede the
      boost; with the condition in ``ON`` such boosts are kept with null save
      columns.

    Returns:
        The query result as produced by ``to_dataframe()`` on the module-level
        ``client``, with columns ``user_id``, ``event_type``, ``context``,
        ``boost_creation_time``, ``save_completion_time`` and
        ``time_from_boost_to_save`` (whole hours; null when no later save
        exists for the boost).
    """
    sql = """
        with boost_offers as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp
            from ops.all_user_events
            where event_type like 'BOOST_CREATED%'
        ), save_events as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp
            from ops.all_user_events
            where event_type = 'SAVING_PAYMENT_SUCCESSFUL'
        )
        select boost_offers.user_id, boost_offers.event_type, boost_offers.context,
            boost_offers.creation_timestamp as boost_creation_time,
            save_events.creation_timestamp as save_completion_time,
            TIMESTAMP_DIFF(save_events.creation_timestamp, boost_offers.creation_timestamp, HOUR) as time_from_boost_to_save
        from boost_offers
        left join save_events
            on boost_offers.user_id = save_events.user_id
            and save_events.creation_timestamp > boost_offers.creation_timestamp
    """

    df = client.query(sql).to_dataframe()
    return df

In [None]:
def clean_up_and_construct_labels(boosts_with_labels):
    """Add a per-user row count column to a boost/save frame.

    Args:
        boosts_with_labels: DataFrame with at least the columns ``user_id``
            and ``boost_creation_time``.

    Returns:
        The same DataFrame object that was passed in, with a
        ``user_id_count`` column holding, for every row, the number of
        non-null ``boost_creation_time`` values of that row's ``user_id``.
        The argument is modified in place, as before; the frame is now also
        returned so the result can be used directly.
    """
    df = boosts_with_labels
    # Group the frame that was passed in, not the notebook-level `labels`
    # variable, so the function works for any input frame.
    df['user_id_count'] = df.groupby(['user_id'])['boost_creation_time'].transform('count')
    return df

In [None]:
# Run the boost/save query and keep its result as the starting frame.
labels = obtain_boosts_with_labels()

In [None]:
# Preview the first rows of the query result.
labels.head()

In [None]:
# Number of rows per user, and the same counts as a two-column frame
# (user_id, event_count) for a quick look at the heaviest users.
user_counts_series = labels["user_id"].value_counts(sort=True)
df_val_counts = user_counts_series.to_frame()
user_counts_frame = df_val_counts.reset_index().set_axis(["user_id", "event_count"], axis=1)
user_counts_frame.head()

In [None]:
# Build the working frame as a new object: `data = labels` would only alias
# the query result, so adding the column would silently modify `labels` too.
# user_id_count is the number of non-null boost_creation_time values per user.
data = labels.assign(
    user_id_count=labels.groupby(['user_id'])['boost_creation_time'].transform('count')
)

In [None]:
# The two user ids with the most rows are team members who test often, so
# they distort the data; collect them here for removal.
outlier_user_ids = data["user_id"].value_counts().iloc[:2].index.tolist()

In [None]:
# Drop every row of the outlier users with a single boolean mask instead of
# re-filtering the frame once per id. The explicit copy makes `data` an
# independent frame, so the column assignments in the following cells do not
# operate on a slice of the unfiltered frame (SettingWithCopyWarning).
data = data[~data.user_id.isin(outlier_user_ids)].copy()

In [None]:
# Rows per user after the outlier filter.
data.user_id.value_counts()

In [None]:
# Target label: True when the boost-to-save gap (in hours) is below 24.
hours_to_save = data["time_from_boost_to_save"]
data["is_save_within_day"] = hours_to_save.lt(24)
data["is_save_within_day"].value_counts()

In [None]:
# Divisor that turns an amount expressed in each unit into whole currency.
# It has to exist at notebook scope: the lambda below looks the name up
# globally, and it was previously only a local variable inside
# clean_up_and_construct_labels, so this cell raised NameError on a fresh kernel.
unit_convertors = { 'WHOLE_CURRENCY': 1, 'WHOLE_CENT': 100, 'HUNDREDTH_CENT': 10000 }

# The event context is stored as a JSON string; parse it once and pull the
# boost fields out of the parsed dict.
data["parsed_context"] = data.context.apply(json.loads)
data["boost_id"] = data["parsed_context"].apply(lambda context: context["boostId"])
data["boost_amount_raw"] = data["parsed_context"].apply(lambda context: context["boostAmount"])
data["boost_unit"] = data["parsed_context"].apply(lambda context: context["boostUnit"])
data["boost_amount_whole_currency"] = data["parsed_context"].apply(
    lambda context: context["boostAmount"] / unit_convertors[context["boostUnit"]])
data["day_of_month"] = data["boost_creation_time"].dt.day
# Key identifying one boost offered to one user; used to de-duplicate later.
data["boost_user_id"] = data["boost_id"] + "::" + data["user_id"]

In [None]:
# Spot check: show every row for one hard-coded boost/user key.
data[data.boost_user_id == "db2708a9-a244-433b-973d-726b0b933b3e::008aff50-7073-4e0a-9e17-6f754c1a2c0b"]

In [None]:
# Collapse to one row per boost/user key: order the rows by save completion
# time, then take the first entry of each key.
# NOTE(review): GroupBy.first() picks the first non-null value column by
# column -- confirm that matches the intended "earliest save row".
slimmed_data = (
    data
    .sort_values(by="save_completion_time")
    .groupby("boost_user_id", as_index=False)
    .first()
)

In [None]:
# Display the one-row-per-boost/user frame for inspection.
slimmed_data

In [None]:
# Keep only the key, the two candidate features and the target label.
stripped_df = slimmed_data.loc[
    :,
    ["boost_user_id", "boost_amount_whole_currency", "day_of_month", "is_save_within_day"],
]

In [None]:
# Preview the first rows of the modelling frame.
stripped_df.head()

In [None]:
### Okay start some exploring here
import dabl

In [None]:
# Class balance of the target column before modelling.
stripped_df.is_save_within_day.value_counts()

In [None]:
# Run dabl's cleaning step on the modelling frame.
# NOTE(review): data_clean is not referenced by the plotting and fitting cells
# that follow (they use stripped_df) -- confirm which frame is intended.
data_clean = dabl.clean(stripped_df)

In [None]:
# dabl's plots for the modelling frame, with 'is_save_within_day' passed as
# the second argument (presumably the target column -- confirm against dabl docs).
dabl.plot(stripped_df, 'is_save_within_day')

In [None]:
# dabl classifier used for the fit below; random_state is pinned to 0.
fc = dabl.SimpleClassifier(random_state=0)

In [None]:
# Feature frame: every column of the modelling frame except the target.
X = stripped_df.drop(columns=["is_save_within_day"])

In [None]:
# Target vector: whether a save followed the boost within a day.
Y = stripped_df["is_save_within_day"]

In [None]:
# Fit the classifier on the feature frame X against the target Y.
fc.fit(X, Y)