In [None]:
import pandas as pd
import json

from google.cloud import bigquery

In [None]:
import dabl

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Single BigQuery client shared by the query helper functions in this notebook.
# NOTE(review): constructed without arguments, so project and credentials are
# presumably resolved from the environment by the client library — confirm.
client = bigquery.Client()

In [None]:
def obtain_monthly_save_aggregates():
    """Fetch per-month, per-unit aggregates of saving events from BigQuery.

    Runs one query against ``ops.user_behaviour`` through the module-level
    ``client`` and returns the result as a DataFrame with the columns
    ``save_month``, ``unit``, ``sum``, ``average`` and ``count``, ordered by
    ``save_month`` descending.

    ``save_month`` is extracted with ``EXTRACT(MONTH ...)``, i.e. it is the
    month component only, so rows are not separated by year.
    """
    monthly_aggregates_query = """
        select EXTRACT(MONTH from TIMESTAMP_MILLIS(time_transaction_occurred)) as save_month, unit, 
            sum(amount) as sum, avg(amount) as average, count(*) as count from ops.user_behaviour 
            where transaction_type = 'SAVING_EVENT' group by save_month, unit order by save_month desc;
    """

    query_job = client.query(monthly_aggregates_query)
    return query_job.to_dataframe()

In [None]:
# actually just use a pandas mask to split forward and back
def obtain_boosts_with_saves():
    """Fetch every boost-creation event paired with the same user's save events.

    The query left-joins ``BOOST_CREATED%`` events to
    ``SAVING_PAYMENT_SUCCESSFUL`` events of ``ops.all_user_events`` on
    ``user_id``, so the result has one row per (boost, save) pair and a boost
    whose user has no save events still appears, with null save columns.

    Returns a DataFrame with the columns ``user_id``, ``event_type``,
    ``context``, ``boost_creation_time``, ``save_completion_time`` and
    ``time_from_boost_to_save`` (save time minus boost time, in hours as
    computed by ``TIMESTAMP_DIFF(..., HOUR)``). Splitting pairs into saves
    before and after the boost is left to the caller.
    """
    boost_save_pairs_query = """
        with boost_offers as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp 
            from ops.all_user_events 
            where event_type like 'BOOST_CREATED%'

    ), save_events as (
            select *, TIMESTAMP_MILLIS(created_at) as creation_timestamp 
            from ops.all_user_events 
            where event_type = 'SAVING_PAYMENT_SUCCESSFUL'
    )
    select boost_offers.user_id, boost_offers.event_type, boost_offers.context, 
        boost_offers.creation_timestamp as boost_creation_time, save_events.creation_timestamp as save_completion_time,  
        TIMESTAMP_DIFF(save_events.creation_timestamp, boost_offers.creation_timestamp, HOUR) as time_from_boost_to_save
    from boost_offers left join save_events on boost_offers.user_id = save_events.user_id
    """

    query_job = client.query(boost_save_pairs_query)
    return query_job.to_dataframe()

In [None]:
def extract_prior_save_counts(prior_save_counts):
    """Count the save events recorded for each boost offer in the given frame.

    Parameters
    ----------
    prior_save_counts : pandas.DataFrame
        Boost/save pairs; must contain the columns ``boost_user_id`` and
        ``save_completion_time``. The caller is expected to have filtered it
        down to saves that happened before the boost. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``boost_user_id`` with a single column
        ``boost_prior_saves`` holding the number of non-null
        ``save_completion_time`` values in each group.
    """
    print('Past rows: ', prior_save_counts.shape)
    # count only the column we need instead of transforming every column and
    # writing the result back onto the caller's frame
    saves_per_boost = prior_save_counts.groupby("boost_user_id")["save_completion_time"].count()
    return saves_per_boost.rename("boost_prior_saves").to_frame()

In [None]:
def extract_time_since_latest_save(prior_save_df):
    """Compute, per boost offer, the days between the boost and the latest save in the frame.

    ``prior_save_df`` must contain ``boost_user_id``, ``save_completion_time``
    and ``time_from_boost_to_save`` (hours). Rows are ordered by
    ``save_completion_time`` and the last entry of each ``boost_user_id``
    group is kept; its hour offset is divided by 24 and made non-negative.

    Returns a DataFrame with the columns ``boost_user_id`` and
    ``days_since_latest_save``.
    """
    chronological = prior_save_df.sort_values("save_completion_time")
    latest_per_boost = chronological.groupby("boost_user_id", as_index = False).last()

    hours_to_latest = latest_per_boost["time_from_boost_to_save"]
    latest_per_boost["days_since_latest_save"] = (hours_to_latest / 24).abs()

    return latest_per_boost[["boost_user_id", "days_since_latest_save"]]

In [None]:
def extract_time_since_first_save(prior_save_df):
    """Compute, per boost offer, the days between the boost and the earliest save in the frame.

    ``prior_save_df`` must contain ``boost_user_id``, ``save_completion_time``
    and ``time_from_boost_to_save`` (hours). Rows are ordered by
    ``save_completion_time`` and the first entry of each ``boost_user_id``
    group is kept; its hour offset is divided by 24 and made non-negative.

    Returns a DataFrame with the columns ``boost_user_id`` and
    ``days_since_first_save``.
    """
    chronological = prior_save_df.sort_values("save_completion_time")
    # as_index=False keeps boost_user_id as an ordinary column; having it as the
    # index was noted to cause problems at this step
    earliest_per_boost = chronological.groupby("boost_user_id", as_index = False).first()

    hours_to_earliest = earliest_per_boost["time_from_boost_to_save"]
    earliest_per_boost["days_since_first_save"] = (hours_to_earliest / 24).abs()

    return earliest_per_boost[["boost_user_id", "days_since_first_save"]]

In [None]:
def clean_up_and_construct_labels(boosts_with_saves, n_outlier_users=2):
    """Turn raw boost/save pairs into one labelled row per boost offer.

    Parameters
    ----------
    boosts_with_saves : pandas.DataFrame
        One row per (boost event, save event) pair of a user, as returned by
        ``obtain_boosts_with_saves``. Required columns: ``user_id``,
        ``context`` (JSON string with the keys ``boostId``, ``boostAmount``,
        ``boostUnit``, ``boostType`` and ``boostCategory``),
        ``boost_creation_time``, ``save_completion_time`` and
        ``time_from_boost_to_save`` (hours). The frame is not modified.
    n_outlier_users : int, default 2
        Number of users with the most rows to drop before building features.

    Returns
    -------
    pandas.DataFrame
        One row per ``boost_user_id`` (boost id + user id) that has at least
        one save before and one save after the boost (the merges are inner
        joins). Each row describes the first save after the boost, carries
        the label ``is_save_within_day`` and the history features
        ``boost_prior_saves``, ``days_since_latest_save`` and
        ``days_since_first_save``.
    """
    unit_convertors = { 'WHOLE_CURRENCY': 1, 'WHOLE_CENT': 100, 'HUNDREDTH_CENT': 10000 }

    print('Starting count: ', boosts_with_saves.shape)

    # work on a copy: every column added below would otherwise leak into the caller's frame
    df = boosts_with_saves.copy()
    df['user_id_count'] = df.groupby(['user_id'])['boost_creation_time'].transform('count')

    # we remove the top users, because they are team members often testing, so distort
    outlier_user_ids = df['user_id'].value_counts().head(n_outlier_users).index
    # explicit copy so the column assignments below act on an independent frame
    df = df[~df['user_id'].isin(outlier_user_ids)].copy()

    print('With outlier top users stripped: ', df.shape)

    # here we have our label
    df["is_save_within_day"] = df["time_from_boost_to_save"] < 24

    # extract a bunch of context from the boosts
    df["parsed_context"] = df["context"].apply(json.loads)
    df["boost_id"] = df["parsed_context"].apply(lambda context: context["boostId"])
    df["boost_amount_whole_currency"] = df["parsed_context"].apply(
        lambda context: context["boostAmount"] / unit_convertors[context["boostUnit"]])

    df["boost_type"] = df["parsed_context"].apply(lambda context: context["boostType"])
    df["boost_category"] = df["parsed_context"].apply(lambda context: context["boostCategory"])
    df["boost_type_category"] = df["boost_type"] + "::" + df["boost_category"]

    df["day_of_month"] = df["boost_creation_time"].dt.day
    df["hour_of_day"] = df["boost_creation_time"].dt.hour

    # and this functions as our index
    df["boost_user_id"] = df["boost_id"] + "::" + df["user_id"]

    # then we construct our future and past masks, calculate prior saves, and find next save.
    # The masks compare the timestamps themselves: time_from_boost_to_save is a whole-hour
    # difference, so it is 0 for any save less than an hour before OR after the boost, and
    # testing it with < 0 / > 0 would silently drop exactly those quickest responses.
    # Rows without a save (null save_completion_time) fall into neither mask.
    prior_save_mask = df["save_completion_time"] < df["boost_creation_time"]
    future_save_mask = df["save_completion_time"] > df["boost_creation_time"]

    # each helper gets its own copy so none of them can affect the others' input
    prior_save_counts = extract_prior_save_counts(df[prior_save_mask].copy())
    days_since_latest_save = extract_time_since_latest_save(df[prior_save_mask].copy())
    days_since_first_save = extract_time_since_first_save(df[prior_save_mask].copy())

    # then we discard the past and keep, per boost, the earliest save that followed it
    with_future_saves = df[future_save_mask].copy()
    with_next_save = with_future_saves.sort_values("save_completion_time").groupby("boost_user_id").first()

    print('Now with just future saves crossed: ', with_future_saves.shape, ' and next save only: ', with_next_save.shape)

    # and finally we strip out the surplus boost-save pairs (by retaining only the opening)
    # at the moment an inner join, but we may want to turn this into joining from those with saves
    final_df = pd.merge(with_next_save, prior_save_counts, on='boost_user_id')
    final_df = pd.merge(final_df, days_since_latest_save, on='boost_user_id')
    final_df = pd.merge(final_df, days_since_first_save, on='boost_user_id')
    print("And finally, stripped to just one per: ", final_df.shape)

    return final_df

In [None]:
def feature_extraction(data):
    """Select the modelling columns (six features plus the label) from ``data``.

    The returned frame keeps the columns in the order listed below, with
    ``is_save_within_day`` (the label) last. A missing column raises
    ``KeyError``.
    """
    model_columns = (
        "boost_amount_whole_currency",
        "day_of_month",
        "boost_prior_saves",
        "boost_type_category",
        "days_since_latest_save",
        "days_since_first_save",
        "is_save_within_day",
    )
    return data[list(model_columns)]

In [None]:
# Pull the raw boost/save pairs from BigQuery (one query per execution of this cell).
boosts_with_saves = obtain_boosts_with_saves()

In [None]:
# Reduce the pairs to one labelled row per boost offer, with prior-save features attached.
data = clean_up_and_construct_labels(boosts_with_saves)

In [None]:
data.head()

In [None]:
# Class balance of the label.
data.is_save_within_day.value_counts()

In [None]:
# Distribution of the number of saves made before each boost.
data.boost_prior_saves.value_counts()

In [None]:
# Keep only the modelling columns (features + label) and check their dtypes.
feature_frame = feature_extraction(data)
feature_frame.dtypes
# feature_frame.head()

In [None]:
# Pass the frame through dabl.clean and compare the resulting dtypes with the ones above.
dabl_data = dabl.clean(feature_frame)
dabl_data.dtypes

In [None]:
# dabl plots of the data with the label as the target column.
dabl.plot(dabl_data, target_col='is_save_within_day')

In [None]:
# Split into feature matrix X and label vector Y.
X = dabl_data.drop("is_save_within_day", axis=1)
Y = dabl_data.is_save_within_day

In [None]:
# at the moment this is making things worse, so
# preprocessor = dabl.EasyPreprocessor()
# X_trans = preprocessor.fit_transform(X)

In [None]:
# Fit dabl's SimpleClassifier on the full data set (seeded with random_state=0).
fc = dabl.SimpleClassifier(random_state=0)
fc.fit(X, Y)

In [None]:
# One-hot encode the only column listed here, boost_type_category, for the SVM cells.
X_encoded = pd.get_dummies(X, prefix_sep="_", columns=["boost_type_category"])
X_encoded.head()

In [None]:
# Fixed seed (same value as the SimpleClassifier cell) so the train/test partition,
# and therefore the metrics computed from it, are reproducible on Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X_encoded, Y, random_state=0)

In [None]:
# NOTE(review): cross_val_score is only referenced by the commented-out line below.
from sklearn.model_selection import cross_val_score
# Support vector classifier with scikit-learn's default hyperparameters.
clf = svm.SVC()
# cross_val_score(clf, X_encoded, Y, cv=5, scoring='recall_macro')

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Class balance of the training labels.
Y_train.value_counts()
# Y_test.value_counts()

In [None]:
# Train the SVM on the training split.
clf.fit(X_train, Y_train)

In [None]:
# Precision, recall, F-score and support of the SVM on the held-out split.
precision_recall_fscore_support(Y_test, clf.predict(X_test))

In [None]:
# Raw predictions for the held-out split, displayed for inspection.
clf.predict(X_test)