# Notebook for exploring a local copy of all user events

## Load in the master DF

In [None]:
import pandas as pd
# Turn off pandas' chained-assignment check; later cells add columns to frames
# that were selected out of master_df.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Load the full event export; the path is relative, so the CSV must sit in the
# notebook's working directory.
master_df = pd.read_csv('all_user_events_2020_08_07.csv')

# Extract primary boost info

In [None]:
# Keep only events whose type contains 'BOOST_CREATED'.
# ``na=False`` treats rows with a missing event_type as non-matching instead of
# yielding an NA mask, and ``.copy()`` gives bdf its own data so the column
# assignments in later cells do not act on a selection of master_df.
bdf = master_df[master_df['event_type'].str.contains('BOOST_CREATED', na=False)].copy()

In [None]:
from json import loads

# Decode the JSON text held in the ``context`` column, one value per row.
bdf["parsed_context"] = bdf["context"].map(loads)

In [None]:
# Lift the identifying boost attributes out of each parsed context.
# A context missing any of these keys raises KeyError.
bdf["boost_id"] = bdf["parsed_context"].map(lambda ctx: ctx["boostId"])
bdf["boost_type"] = bdf["parsed_context"].map(lambda ctx: ctx["boostType"])
bdf["boost_category"] = bdf["parsed_context"].map(lambda ctx: ctx["boostCategory"])

In [None]:
# Convert the raw event timestamp to a datetime; unit='ms' means the values are
# read as milliseconds since the epoch.
bdf["boost_time"] = pd.to_datetime(bdf["time_transaction_occurred"], unit='ms')

In [None]:
# Preview the boost frame with the columns added so far.
bdf.head()

In [None]:
# Divisor applied to an amount expressed in the given unit to obtain whole
# currency units.
unit_convertors = { 'WHOLE_CURRENCY': 1, 'WHOLE_CENT': 100, 'HUNDREDTH_CENT': 10000 }

def extract_save_requirement(parsed_context):
    """Extract the save condition type and threshold from a boost context.

    The context's ``statusConditions`` maps a status to a list of condition
    clauses shaped like ``"<condition> #{<amount>::<unit>::...}"``. For each
    status, the first clause starting with one of the sought save conditions
    is parsed.

    Args:
        parsed_context: Dict decoded from a boost event's JSON context.

    Returns:
        A ``(save_type, save_threshold)`` tuple, where ``save_threshold`` is
        the amount divided by its ``unit_convertors`` entry. Returns
        ``(None, None)`` when ``statusConditions`` is absent, null or empty,
        or when no clause matches.

    Raises:
        KeyError: If a matching clause names a unit not in ``unit_convertors``.
        ValueError: If the amount of a matching clause is not an integer.
    """
    conditions = parsed_context.get('statusConditions')
    if not conditions:
        return None, None

    # str.startswith accepts a tuple, so one call checks every prefix.
    sought_conditions = (
        'save_greater_than',
        'first_save_above',
        'balance_crossed_major_digit',
        'balance_crossed_abs_target',
    )

    save_type = None
    save_threshold = None

    # NOTE(review): when several statuses carry a save condition, the last one
    # iterated wins. The original comment said "we look for the first" --
    # confirm whether the loop should stop at the first match instead.
    for status_clauses in conditions.values():
        # A status may carry a null list; treat it as having no clauses.
        matches = [cond for cond in (status_clauses or ()) if cond.startswith(sought_conditions)]
        if not matches:
            continue

        condition_clause = matches[0]
        # partition keeps the whole clause when it contains no space, whereas
        # slicing up to find(' ') == -1 would silently drop the last character.
        save_type = condition_clause.partition(' ')[0]

        # The parameters sit between the braces, separated by '::'.
        param_start = condition_clause.find('{') + 1
        param_end = condition_clause.find('}')
        save_parameter = condition_clause[param_start:param_end].split('::')

        save_threshold = int(save_parameter[0]) / unit_convertors[save_parameter[1]]

    return save_type, save_threshold

In [None]:
# Keep the first boost's parsed context as a worked example for the cells below.
example_context = bdf.iloc[0]['parsed_context']
example_context

In [None]:
# Try the extractor on the example context before applying it to every row.
extract_save_requirement(example_context)

In [None]:
# Each row yields the (save_type, save_threshold) tuple returned by the extractor.
bdf['save_requirements'] = bdf['parsed_context'].apply(extract_save_requirement)
# Split the tuples into two columns; reusing bdf.index keeps the rows aligned on assignment.
bdf[['save_type', 'save_amount']] = pd.DataFrame(bdf['save_requirements'].tolist(), index=bdf.index)

In [None]:
# Narrow the boost frame to the columns the remaining analysis works with.
df = bdf.loc[:, [
    "boost_id", "user_id", "boost_time",
    "boost_type", "boost_category",
    "save_type", "save_amount",
    "parsed_context",
]]

In [None]:
# Preview the narrowed frame.
df.head()

In [None]:
# Show the example context again to read off its start and end time fields.
example_context

In [None]:
# Length of the example boost's open window in days: the end/start difference is
# divided by the number of milliseconds in a day
# (assumes both fields are epoch milliseconds -- TODO confirm).
days_open = (example_context['boostEndTime'] - example_context['boostStartTime']) / (24 * 60 * 60 * 1000)

In [None]:
# Display the computed window length.
days_open

# Obtain prior saves

In [None]:
# Keep only events whose type contains 'SAVING_PAYMENT_SUCCESSFUL'.
# ``na=False`` treats rows with a missing event_type as non-matching instead of
# yielding an NA mask, and ``.copy()`` gives sdf its own data so adding
# ``save_time`` below does not act on a selection of master_df.
sdf = master_df[master_df['event_type'].str.contains('SAVING_PAYMENT_SUCCESSFUL', na=False)].copy()
# The raw timestamp is read as milliseconds since the epoch (unit='ms').
sdf["save_time"] = pd.to_datetime(sdf["time_transaction_occurred"], unit='ms')

In [None]:
# Number of save events (rows) and columns.
sdf.shape

In [None]:
# Preview the save events.
sdf.head()

In [None]:
def count_prior_saves(boost_row, *, saves=None):
    """Count the saves a user made strictly before a boost was created.

    Args:
        boost_row: Row-like object exposing ``user_id`` and ``boost_time``.
        saves: Frame of save events with ``user_id`` and ``save_time``
            columns. Defaults to the notebook-level ``sdf``.

    Returns:
        Number of rows in ``saves`` for the same user whose ``save_time`` is
        earlier than ``boost_row["boost_time"]``.
    """
    if saves is None:
        saves = sdf

    earlier = saves["save_time"] < boost_row["boost_time"]
    same_user = saves["user_id"] == boost_row["user_id"]
    # Summing the boolean mask counts matches without building a filtered frame.
    return int((earlier & same_user).sum())

In [None]:
# Scratch checks kept for reference (single-row and whole-frame calls);
# only the row count of df is displayed.
# count_prior_saves(df.iloc[423])
# df.apply(count_prior_saves, axis=1)
len(df)

In [None]:
# Evaluate count_prior_saves once per boost row (axis=1). The result is computed
# on df but stored on bdf; assignment aligns on the index the two frames share.
bdf["prior_save_count"] = df.apply(count_prior_saves, axis = 1)

In [None]:
# Summary statistics of the prior-save counts.
bdf.prior_save_count.describe()

In [None]:
# Preview the boost frame including the new prior_save_count column.
bdf.head()

## Additional feature extraction (to come)

In [None]:
# Day of the month on which the boost was created.
bdf["day_of_month"] = bdf["boost_time"].dt.day

In [None]:
# Columns carried into the modelling frame. "save_within_48" is the target label
# rather than a predictor: it is listed here so it travels with the features and
# is split off again before fitting. It is only added to bdf in a later cell.
final_feature_list = [
    "boost_type",
    "boost_category",
    "save_type",
    "save_amount",
    "day_of_month",
    "prior_save_count",
    "save_within_48"
]

## Label extraction

In [None]:
def find_next_save(boost_row, time_threshold = 48, *, saves=None):
    """Report whether the boosted user saved within a window after the boost.

    Args:
        boost_row: Row-like object exposing ``user_id`` and ``boost_time``.
        time_threshold: Window length in hours, measured from ``boost_time``.
        saves: Frame of save events with ``user_id`` and ``save_time``
            columns. Defaults to the notebook-level ``sdf``.

    Returns:
        True if at least one save by the same user happened strictly after
        ``boost_time`` and less than ``time_threshold`` hours later.
    """
    if saves is None:
        saves = sdf

    same_user = saves["user_id"] == boost_row["user_id"]
    elapsed = saves["save_time"] - boost_row["boost_time"]
    # Comparing Timedeltas directly honours ``time_threshold`` (the window was
    # previously hard-coded to 48) and avoids casting to 'timedelta64[h]',
    # a resolution pandas 2.x no longer accepts in astype.
    in_window = (elapsed > pd.Timedelta(0)) & (elapsed < pd.Timedelta(hours=time_threshold))
    return bool((same_user & in_window).any())

In [None]:
# Walk forward through bdf until a boost is found whose user has at least one
# save after the boost time; row_counter ends up as that boost's position.
count_of_next = 0
# The counter is incremented before each lookup, so the first row inspected is 9.
row_counter = 8 # just skipping over first one which happens to be withdrawal

# NOTE(review): there is no upper bound -- if no later boost has a following
# save, bdf.iloc[row_counter] eventually raises IndexError.
while count_of_next == 0:
    row_counter += 1
    boost_row = bdf.iloc[row_counter]
    user_mask = sdf["user_id"] == boost_row["user_id"]
    save_time_mask = sdf["save_time"] > boost_row["boost_time"]
    count_of_next = len(sdf[user_mask & save_time_mask])

In [None]:
# Show the saves made by the example boost's user within 48 hours after the
# boost, using the row position held in row_counter.
print('Row: ', row_counter)
boost_row = bdf.iloc[row_counter]
print('Boost time: ', boost_row['boost_time'])
user_mask = sdf["user_id"] == boost_row["user_id"]
save_time_mask = sdf["save_time"] > boost_row["boost_time"]
# Compare Timedeltas directly: casting to 'timedelta64[h]' is a resolution
# pandas 2.x no longer accepts in astype.
duration_mask = (sdf["save_time"] - boost_row["boost_time"]) < pd.Timedelta(hours=48)
sdf[user_mask & save_time_mask & duration_mask].head()

In [None]:
# Label each boost row (axis=1) with whether find_next_save reports a following
# save; the function's default time_threshold applies.
bdf["save_within_48"] = bdf.apply(find_next_save, axis=1)

In [None]:
# Class balance of the label.
bdf.save_within_48.value_counts()

## Extract features and run DABL

In [None]:
import dabl

In [None]:
# Restrict bdf to the columns named in final_feature_list (predictors plus the label).
feature_df = bdf[final_feature_list]

In [None]:
# Pass the feature frame through dabl's cleaning step
# (presumably type detection and column clean-up -- see the dabl docs).
dabl_data = dabl.clean(feature_df)

In [None]:
# Ask dabl to plot the cleaned data against the save_within_48 target.
dabl.plot(dabl_data, target_col='save_within_48')

In [None]:
# Separate the predictors from the target column.
X = dabl_data.drop(columns="save_within_48")
Y = dabl_data["save_within_48"]

In [None]:
# Fit dabl's EasyPreprocessor on the predictors and keep the transformed output.
preprocessor = dabl.EasyPreprocessor()
X_trans = preprocessor.fit_transform(X)

In [None]:
# Fit dabl's SimpleClassifier on the preprocessed predictors; random_state=0 fixes the seed.
# NOTE(review): the model is fitted on every row, so this cell holds out no data for evaluation.
# NOTE(review): SimpleClassifier may apply its own preprocessing -- confirm that passing
# the already-transformed X_trans (rather than X) is intended.
fc = dabl.SimpleClassifier(random_state=0)
fc.fit(X_trans, Y)