In [20]:
import pandas as pd
import glob
import os
from datetime import timedelta
import copy
# Directory the preprocessed input pickles are read from (pd.read_pickle calls below).
path = "./preprocessed/"
# Directory the per-day labelled train/test pickles are written to.
output_path = "./data/"

# Quick check of the date range in each event table, used to decide the train/test interval

In [21]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session,
# not only for this cell.
warnings.filterwarnings('ignore')

# Show summary statistics of the "date" column of each interaction table so the
# covered date range can be read off before choosing the train/test interval.
# (The loop index from enumerate() was never used, so it has been dropped.)
for intrctn in ["df_visit.csv", "df_basket.csv", "df_fav.csv", "df_trx.csv"]:
    # Input files are named "<table>.csv.pkl" inside `path`.
    df_event = pd.read_pickle(f"{path}{intrctn}.pkl")
    print(intrctn)
    display(df_event["date"].describe())

df_visit.csv


count                 8567033
unique                     31
top       2020-10-06 00:00:00
freq                   462160
first     2020-10-01 00:00:00
last      2020-10-31 00:00:00
Name: date, dtype: object

df_basket.csv


count                  283442
unique                     31
top       2020-10-08 00:00:00
freq                    24794
first     2020-10-01 00:00:00
last      2020-10-31 00:00:00
Name: date, dtype: object

df_fav.csv


count                  398720
unique                     31
top       2020-10-06 00:00:00
freq                    23252
first     2020-10-01 00:00:00
last      2020-10-31 00:00:00
Name: date, dtype: object

df_trx.csv


count                  228011
unique                     32
top       2020-10-08 00:00:00
freq                    16963
first     2020-10-01 00:00:00
last      2020-11-01 00:00:00
Name: date, dtype: object

## Let's walk through how we create our train/test instances and features, with an example

- If 29, 30 and 31 October are the test days (the test interval), we want to make predictions for the instances observed on 28 October.

    -- So we filter the train/test data down to the user-currentbugroupname pairs seen on 28 October.
- If a user-currentbugroupname pair seen on 28 October has a purchase during the test days, that pair is a positive sample.
- The other days are used for feature generation, since they belong to the training period.

- With the algorithm above, we assign labels and create features for the user-currentbugroupname pairs of each day in the collected data.
- We train our binary classification ML models on each day's train/test instances and save their classification metrics.
- The mean and standard deviation of the per-day classification results are the final result.

In [22]:
import datetime, calendar

# Calendar month covered by the collected data; each of its days becomes one
# "prediction day" in the labelling loop.
year = 2020
month = 10
# calendar.monthrange returns (weekday of the 1st, number of days in the month).
num_days = calendar.monthrange(year, month)[1]
# Build the day list from `month` and `num_days` instead of repeating the
# literals 10 and 32, so changing year/month above cannot produce a stale or
# invalid date range.
days = [datetime.date(year, month, day) for day in range(1, num_days + 1)]
# Transaction events, loaded once; the labelling loop filters this frame by
# date to find the purchases that fall inside each day's test window.
df_purchased = pd.read_pickle(f"{path}df_trx.csv.pkl")

In [23]:

import warnings
# NOTE: silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Build one labelled train/test snapshot per day in `days`.
# For each day D:
#   1. collect every (userid, currentbugroupname) pair that has any event on D,
#   2. label a pair 1 if it has a transaction in [D+1, D+3], otherwise 0,
#   3. inner-join the labelled pairs onto the train and test pair lists and
#      pickle the result under `output_path`.
#
# NOTE(review): the date check earlier in this notebook shows transactions only
# up to 2020-11-01, so the label windows of the last days of October appear to
# extend past the available data — confirm whether those days should be used.

key_cols = ["userid", "currentbugroupname"]
interaction_files = ["df_visit.csv", "df_basket.csv", "df_fav.csv", "df_trx.csv"]
split_files = ["df_target_train.csv", "df_test.csv"]

# Read every pickle once up front: none of them change between days, and
# re-reading them on each iteration repeated the same disk loads for every day.
interaction_frames = {name: pd.read_pickle(f'{path}{name}.pkl') for name in interaction_files}
split_frames = {name: pd.read_pickle(f'{path}{name}.pkl') for name in split_files}

# to_pickle does not create missing directories.
os.makedirs(output_path, exist_ok=True)

for the_day in days:

    the_day_str = the_day.strftime('%Y-%m-%d')
    # `the_day` is a datetime.date, so the offsets can be applied directly.
    test_start_day = (the_day + timedelta(days=1)).strftime('%Y-%m-%d')
    test_end_day = (the_day + timedelta(days=3)).strftime('%Y-%m-%d')

    print(the_day_str)
    print(test_start_day)
    print(test_end_day)

    # Pairs that purchased inside the test window. This depends only on the
    # day, so it is computed once per day rather than once per interaction.
    # De-duplicating keeps the left merge below from multiplying rows, and
    # .assign returns a new frame instead of writing into a filtered slice.
    in_window = (df_purchased["date"] >= test_start_day) & (df_purchased["date"] <= test_end_day)
    pos = df_purchased.loc[in_window, key_cols].drop_duplicates().assign(label=1)

    # Gather the pairs seen on the_day in each interaction table and attach
    # the purchase label (pairs without a purchase get label 0).
    pd_list = []
    for intrctn in interaction_files:

        intrctn_df = interaction_frames[intrctn]
        intrctn_df_the_day = intrctn_df[intrctn_df["date"] == the_day_str]

        if intrctn_df_the_day.shape[0] == 0:
            continue

        unpickled_df = pd.merge(intrctn_df_the_day[key_cols], pos, how='left', on=key_cols)
        unpickled_df = unpickled_df.fillna(0)
        pd_list.append(unpickled_df)

    # A day with no events at all ends the run (same behavior as before:
    # later days are not processed).
    if len(pd_list) == 0:
        break

    # Union of all interaction types; the same pair can appear in several.
    events = pd.concat(pd_list).drop_duplicates()
    print("pos/neg dist of the day for all events: instance count", events.shape, "pos", events[events["label"]==1].shape, "neg",events[events["label"]==0].shape)

    # Keep only the train/test pairs that were active on the_day and persist them.
    for split in split_files:

        split_df = split_frames[split]
        print("size before the inner join",split, split_df.shape)
        split_df = pd.merge(split_df[key_cols], events[key_cols + ["label"]], how='inner', on=key_cols)
        print("size after the inner join",split, split_df.shape)
        print(f"pos/neg dist of the day for {split}: instance count", split_df.shape, "pos", split_df[split_df["label"]==1].shape, "neg", split_df[split_df["label"]==0].shape)
        split_df.to_pickle(f'{output_path}{split}_{the_day}.pkl')

    print("********************************************************************************************")


2020-10-01
2020-10-02
2020-10-04
pos/neg dist of the day for all events: instance count (52829, 3) pos (3664, 3) neg (49165, 3)
size before the inner join df_target_train.csv (22864, 2)
size after the inner join df_target_train.csv (4397, 3)
pos/neg dist of the day for df_target_train.csv: instance count (4397, 3) pos (549, 3) neg (3848, 3)
size before the inner join df_test.csv (230592, 4)
size after the inner join df_test.csv (23344, 3)
pos/neg dist of the day for df_test.csv: instance count (23344, 3) pos (1843, 3) neg (21501, 3)
********************************************************************************************
2020-10-02
2020-10-03
2020-10-05
pos/neg dist of the day for all events: instance count (50337, 3) pos (4050, 3) neg (46287, 3)
size before the inner join df_target_train.csv (22864, 2)
size after the inner join df_target_train.csv (4302, 3)
pos/neg dist of the day for df_target_train.csv: instance count (4302, 3) pos (624, 3) neg (3678, 3)
size before the inner joi