# Notebook for exploring a local copy of all user events

## Load in the master DF

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
master_df = pd.read_csv('all_user_events_2020_08_07.csv')

# Extract primary boost info

In [None]:
# Keep only the boost-creation events.
# `na=False` counts rows with a missing event_type as non-matches instead of
# leaving NaN in the boolean mask (which pandas refuses to index with), and
# `.copy()` makes bdf an independent frame, so the columns added to it in the
# following cells are written to bdf itself rather than to a slice of master_df.
bdf = master_df[master_df['event_type'].str.contains('BOOST_CREATED', na=False)].copy()

In [None]:
from json import loads

# Decode each row's `context` string with json.loads and keep the resulting
# object in a new `parsed_context` column.
bdf["parsed_context"] = bdf["context"].apply(loads)

In [None]:
# Lift the boost identifier, type and category out of every parsed context
# into their own columns. Plain indexing is used on purpose: a context that
# lacks one of these keys raises KeyError.
bdf["boost_id"] = bdf["parsed_context"].map(lambda ctx: ctx["boostId"])
bdf["boost_type"] = bdf["parsed_context"].map(lambda ctx: ctx["boostType"])
bdf["boost_category"] = bdf["parsed_context"].map(lambda ctx: ctx["boostCategory"])

In [None]:
# Interpret time_transaction_occurred as a count of milliseconds (unit='ms')
# and store the converted pandas datetimes as boost_time.
bdf["boost_time"] = pd.to_datetime(bdf["time_transaction_occurred"], unit='ms')

In [None]:
bdf.head()

In [None]:
# Divisor applied to an amount expressed in the given unit (see the final
# division in extract_save_requirement).
unit_convertors = { 'WHOLE_CURRENCY': 1, 'WHOLE_CENT': 100, 'HUNDREDTH_CENT': 10000 }

# Condition clauses starting with one of these prefixes describe a save requirement.
_SAVE_CONDITION_PREFIXES = (
    'save_greater_than',
    'first_save_above',
    'balance_crossed_major_digit',
    'balance_crossed_abs_target',
)


def extract_save_requirement(parsed_context):
    """Extract the save condition type and threshold from a boost context.

    ``parsed_context['statusConditions']`` is read as a mapping whose values
    are lists of condition clauses. A clause is expected to look like
    ``"<type> #{<amount>::<unit>...}"``; the amount is divided by the
    ``unit_convertors`` entry for its unit.

    Within one status the first matching clause is used. When several
    statuses carry a matching clause, the status iterated last wins.
    NOTE(review): the original comment read "we look for the first" -- confirm
    whether stopping at the first matching status was intended.

    Args:
        parsed_context: Decoded boost context (a dict).

    Returns:
        A ``(save_type, save_threshold)`` tuple, or ``(None, None)`` when the
        context has no (or empty / null) ``statusConditions`` or none of its
        clauses is a save condition. ``save_threshold`` is a float.

    Raises:
        ValueError: If the amount inside the braces is not an integer.
        IndexError: If the parameter block carries no unit.
        KeyError: If the unit is not listed in ``unit_convertors``.
    """
    if 'statusConditions' not in parsed_context:
        return None, None

    conditions = parsed_context['statusConditions']
    if not conditions:
        # Present but null or empty: there is nothing to scan.
        return None, None

    save_type = None
    save_threshold = None

    for clauses in conditions.values():
        # A status may map to null instead of a list; treat that as "no clauses".
        matches = [
            clause for clause in (clauses or ())
            if clause.startswith(_SAVE_CONDITION_PREFIXES)
        ]
        if not matches:
            continue

        condition_clause = matches[0]

        # The type is the first word ahead of the "#{...}" parameter block.
        # Splitting this way (instead of slicing up to find(' ')) keeps the
        # full name even when the clause contains no space.
        name_part = condition_clause.partition('{')[0]
        save_type = name_part.rstrip('#').split()[0]

        param_start = condition_clause.find('{') + 1
        param_end = condition_clause.find('}')
        save_parameter = condition_clause[param_start:param_end].split('::')

        save_threshold = int(save_parameter[0]) / unit_convertors[save_parameter[1]]

    return save_type, save_threshold

In [None]:
example_context = bdf.iloc[0]['parsed_context']
example_context

In [None]:
extract_save_requirement(example_context)

In [None]:
# Run extract_save_requirement on every parsed context; each result is a
# (save_type, save_threshold) pair stored in the save_requirements column.
bdf['save_requirements'] = bdf['parsed_context'].apply(extract_save_requirement)
# Spread those pairs over two separate columns. The helper frame reuses
# bdf.index so each pair lands on the row it was computed from.
bdf[['save_type', 'save_amount']] = pd.DataFrame(bdf['save_requirements'].tolist(), index=bdf.index)

In [None]:
# Working frame for the analysis: every row of bdf, restricted to the boost
# identity, timing, classification and save-requirement columns (plus the
# parsed context itself).
df = bdf.loc[:, [
    "boost_id", "user_id", "boost_time",
    "boost_type", "boost_category",
    "save_type", "save_amount",
    "parsed_context",
]]

In [None]:
df.head()

In [None]:
example_context

In [None]:
# Length of the example boost's open window in days: the difference between
# boostEndTime and boostStartTime divided by 24 * 60 * 60 * 1000, the number
# of milliseconds in a day.
# NOTE(review): assumes both fields are millisecond timestamps -- confirm.
days_open = (example_context['boostEndTime'] - example_context['boostStartTime']) / (24 * 60 * 60 * 1000)

In [None]:
days_open

# Obtain prior saves

In [None]:
# df.groupby('ID').flag.cumsum().astype(int)
# Out[362]: 
# 0    1
# 1    1
# 2    2
# 3    2
# 4    3

In [None]:
# Keep only the successful saving-payment events.
# `na=False` counts rows with a missing event_type as non-matches instead of
# leaving NaN in the boolean mask (which pandas refuses to index with), and
# `.copy()` makes sdf an independent frame, so the save_time column added in a
# later cell is written to sdf itself rather than to a slice of master_df.
sdf = master_df[master_df['event_type'].str.contains('SAVING_PAYMENT_SUCCESSFUL', na=False)].copy()

In [None]:
sdf.shape

In [None]:
sdf.head()

In [None]:
# there is going to be a clever way to do this vectorized, but it is escaping me (+ my SO searches) at present
def count_prior_saves(boost_time, user_id, saves_df=None):
    """Count the saves a user made strictly before a given boost time.

    Args:
        boost_time: Timestamp of the boost; only rows whose ``save_time`` is
            strictly earlier are counted.
        user_id: Identifier compared against the ``user_id`` column.
        saves_df: Frame with ``save_time`` and ``user_id`` columns. Defaults
            to the notebook-level ``sdf``.

    Returns:
        The number of matching rows, as an int.
    """
    if saves_df is None:
        saves_df = sdf

    # Build each comparison separately: `&` binds tighter than `<` and `==`,
    # so an unparenthesized `a < b & c == d` does not mean what it reads as.
    is_earlier = saves_df['save_time'] < boost_time
    is_same_user = saves_df['user_id'] == user_id

    # Summing the boolean mask counts the True rows without building a filtered copy.
    return int((is_earlier & is_same_user).sum())

In [None]:
# Interpret time_transaction_occurred as a count of milliseconds (unit='ms')
# and store the converted pandas datetimes as save_time.
sdf["save_time"] = pd.to_datetime(sdf["time_transaction_occurred"], unit='ms')

In [None]:
example_boost = df.iloc[2]
len(sdf[sdf["user_id"] == example_boost["user_id"]])

In [None]:
def count_prior_saves(boost_row):
    """Return how many rows of sdf precede the boost and belong to its user.

    boost_row must provide "boost_time" and "user_id"; a row of sdf counts
    when its save_time is strictly earlier and its user_id is equal.
    """
    saved_earlier = sdf["save_time"] < boost_row["boost_time"]
    same_user = sdf["user_id"] == boost_row["user_id"]
    return len(sdf[saved_earlier & same_user])

In [None]:
# count_prior_saves(df.iloc[423])
# df.apply(count_prior_saves, axis=1)
len(df)

In [None]:
# Scratch run of the prior-saves filter: rows of sdf earlier than a boost time
# and belonging to a given user.
# NOTE(review): boost_time is read from df row 0 but user_id from df row 100 --
# confirm that mixing two different boosts here is intentional.
new_df = sdf[(sdf["save_time"] < df.iloc[0]["boost_time"]) & (sdf["user_id"] == df.iloc[100]["user_id"])]

In [None]:
df.iloc[0]["boost_time"]

In [None]:
# Call count_prior_saves once per row of df (axis=1 passes each row in turn)
# and store the counts on bdf; df was selected from bdf, so the assignment
# lines up on their common index.
# NOTE(review): the column is added to bdf only -- df does not gain prior_save_count.
bdf["prior_save_count"] = df.apply(count_prior_saves, axis = 1)

In [None]:
bdf.prior_save_count.value_counts()

In [None]:
df.iloc[0]