In [1]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import cudf as cpd
from tqdm import tqdm

from multiprocessing import  Pool

import warnings
warnings.filterwarnings('ignore')

In [2]:
tqdm.pandas()

In [3]:
class config:
    data_path = '../data/'
    local_validation = True
    validation_path = '../data/local_validation/'
    train_file = 'train.parquet'
    test_file = 'test.parquet'
    test_labels_file = 'test_labels.parquet'
    n_session_samples = 100
    n_most_common = 50
    debug = False
    type_labels = {'clicks':0, 'carts':1, 'orders':2}


In [5]:
chunks = pd.read_json(config.data_path + 'train.jsonl', lines=True, chunksize=100_000)
train = pd.DataFrame()
for i, chunk in enumerate(chunks):
    event_dict = {
        'session': [],
        'aid':     [],
        'ts':      [],
        'type':    []
    }

    if i >= 2:
        break
    for session, events in zip(chunk['session'].tolist(), \
                               chunk['events'].tolist()):
        for event in events:
            event_dict['session'].append(session)
            event_dict['aid'].append(event['aid'])
            event_dict['ts'].append(event['ts'])
            event_dict['type'].append(event['type'])
    chunk_session = pd.DataFrame(event_dict)
    train = pd.concat([train, chunk_session])

In [5]:
train.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks


In [108]:
def parallelize_dataframe(df, func, n_cores=12):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [109]:
def create_time_features(df):
    df['date'] = pd.to_datetime(df['ts'], unit='ms', origin='unix')
    df['day'] = df['date'].dt.day
    df['hour'] = df['date'].dt.hour
    df['minute'] = df['date'].dt.minute
    df['second'] = df['date'].dt.second
    df['weekday'] = df['date'].dt.weekday
    return df

def create_session_features(df):
    df['session_length'] = df.groupby('session')['date'].transform('count')
    # create a feature for calculating the number of clicks, carts and orders per session
    df['n_clicks'] = df[df['type'] == 'clicks'].groupby('session')['type'].transform('count')
    df['n_carts'] = df[df['type'] == 'carts'].groupby('session')['type'].transform('count')
    df['n_orders'] = df[df['type'] == 'orders'].groupby('session')['type'].transform('count')

    df['unique_products_per_session'] = df.groupby('session')['aid'].transform('nunique')
    df['unique_events_per_session'] = df.groupby('session')['type'].transform('nunique')
    # create feature for calculating the session duration in seconds
    df['session_duration'] = df.groupby('session')['date'] \
                                    .transform(lambda x: (x.max() - x.min()).total_seconds())
    
    df['avg_time_between_events_per_session'] = df.groupby('session')['date'] \
                                                    .transform(lambda x: x.diff().mean().total_seconds())
    df['avg_time_between_clicks_per_session'] = df[df['type'] == config.type_labels['clicks']]\
                                                    .groupby('session')['date'] \
                                                    .transform(lambda x: x.diff().mean().total_seconds())
    return df

In [110]:
df = parallelize_dataframe(train, create_time_features)

In [111]:
display(df)

Unnamed: 0,session,aid,ts,type,date,day,hour,minute,second,weekday
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,31,22,0,0,6
1,0,1563459,1659304904511,clicks,2022-07-31 22:01:44.511,31,22,1,44,6
2,0,1309446,1659367439426,clicks,2022-08-01 15:23:59.426,1,15,23,59,0
3,0,16246,1659367719997,clicks,2022-08-01 15:28:39.997,1,15,28,39,0
4,0,1781822,1659367871344,clicks,2022-08-01 15:31:11.344,1,15,31,11,0
...,...,...,...,...,...,...,...,...,...,...
5057903,199999,928064,1659336408967,clicks,2022-08-01 06:46:48.967,1,6,46,48,0
5057904,199999,849970,1659336449078,clicks,2022-08-01 06:47:29.078,1,6,47,29,0
5057905,199999,1052480,1659336547035,clicks,2022-08-01 06:49:07.035,1,6,49,7,0
5057906,199999,487255,1659336561116,clicks,2022-08-01 06:49:21.116,1,6,49,21,0


In [112]:
%%time
df = parallelize_dataframe(df, create_session_features)

CPU times: user 8.45 s, sys: 9.57 s, total: 18 s
Wall time: 31.9 s


In [113]:
display(df)

Unnamed: 0,session,aid,ts,type,date,day,hour,minute,second,weekday,session_length,n_clicks,n_carts,n_orders,unique_products_per_session,unique_events_per_session,session_duration,avg_time_between_events_per_session,avg_time_between_clicks_per_session
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,31,22,0,0,6,276,255.0,,,183,3,2380183.682,8655.213389,
1,0,1563459,1659304904511,clicks,2022-07-31 22:01:44.511,31,22,1,44,6,276,255.0,,,183,3,2380183.682,8655.213389,
2,0,1309446,1659367439426,clicks,2022-08-01 15:23:59.426,1,15,23,59,0,276,255.0,,,183,3,2380183.682,8655.213389,
3,0,16246,1659367719997,clicks,2022-08-01 15:28:39.997,1,15,28,39,0,276,255.0,,,183,3,2380183.682,8655.213389,
4,0,1781822,1659367871344,clicks,2022-08-01 15:31:11.344,1,15,31,11,0,276,255.0,,,183,3,2380183.682,8655.213389,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5057903,199999,928064,1659336408967,clicks,2022-08-01 06:46:48.967,1,6,46,48,0,9,9.0,,,9,1,351.035,43.879375,
5057904,199999,849970,1659336449078,clicks,2022-08-01 06:47:29.078,1,6,47,29,0,9,9.0,,,9,1,351.035,43.879375,
5057905,199999,1052480,1659336547035,clicks,2022-08-01 06:49:07.035,1,6,49,7,0,9,9.0,,,9,1,351.035,43.879375,
5057906,199999,487255,1659336561116,clicks,2022-08-01 06:49:21.116,1,6,49,21,0,9,9.0,,,9,1,351.035,43.879375,


In [114]:
# fill na with 0
df.fillna(0, inplace=True)
# df.fillna(pd.Timedelta(seconds=0), inplace=True)

In [115]:
display(df)

Unnamed: 0,session,aid,ts,type,date,day,hour,minute,second,weekday,session_length,n_clicks,n_carts,n_orders,unique_products_per_session,unique_events_per_session,session_duration,avg_time_between_events_per_session,avg_time_between_clicks_per_session
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,31,22,0,0,6,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
1,0,1563459,1659304904511,clicks,2022-07-31 22:01:44.511,31,22,1,44,6,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
2,0,1309446,1659367439426,clicks,2022-08-01 15:23:59.426,1,15,23,59,0,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
3,0,16246,1659367719997,clicks,2022-08-01 15:28:39.997,1,15,28,39,0,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
4,0,1781822,1659367871344,clicks,2022-08-01 15:31:11.344,1,15,31,11,0,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5057903,199999,928064,1659336408967,clicks,2022-08-01 06:46:48.967,1,6,46,48,0,9,9.0,0.0,0.0,9,1,351.035,43.879375,0.0
5057904,199999,849970,1659336449078,clicks,2022-08-01 06:47:29.078,1,6,47,29,0,9,9.0,0.0,0.0,9,1,351.035,43.879375,0.0
5057905,199999,1052480,1659336547035,clicks,2022-08-01 06:49:07.035,1,6,49,7,0,9,9.0,0.0,0.0,9,1,351.035,43.879375,0.0
5057906,199999,487255,1659336561116,clicks,2022-08-01 06:49:21.116,1,6,49,21,0,9,9.0,0.0,0.0,9,1,351.035,43.879375,0.0


In [116]:
# drop duplicates based on session
df.drop_duplicates(subset=['session'], inplace=True)
display(df)

Unnamed: 0,session,aid,ts,type,date,day,hour,minute,second,weekday,session_length,n_clicks,n_carts,n_orders,unique_products_per_session,unique_events_per_session,session_duration,avg_time_between_events_per_session,avg_time_between_clicks_per_session
0,0,1517085,1659304800025,clicks,2022-07-31 22:00:00.025,31,22,0,0,6,276,255.0,0.0,0.0,183,3,2380183.682,8655.213389,0.0
276,1,424964,1659304800025,carts,2022-07-31 22:00:00.025,31,22,0,0,6,32,0.0,8.0,0.0,22,2,2410054.967,77743.708612,0.0
308,2,763743,1659304800038,clicks,2022-07-31 22:00:00.038,31,22,0,0,6,33,32.0,0.0,0.0,29,2,2409415.621,75294.238156,0.0
341,3,1425967,1659304800095,carts,2022-07-31 22:00:00.095,31,22,0,0,6,226,0.0,21.0,0.0,140,3,1804866.676,8021.629671,0.0
567,4,613619,1659304800119,clicks,2022-07-31 22:00:00.119,31,22,0,0,6,19,15.0,0.0,0.0,12,3,2281881.184,126771.176888,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5057701,199995,1384304,1659336223762,clicks,2022-08-01 06:43:43.762,1,6,43,43,0,17,17.0,0.0,0.0,15,1,1461041.013,91315.063312,0.0
5057718,199996,381361,1659336223803,clicks,2022-08-01 06:43:43.803,1,6,43,43,0,25,24.0,0.0,0.0,17,2,2168298.432,90345.768000,0.0
5057743,199997,1401230,1659336223845,clicks,2022-08-01 06:43:43.845,1,6,43,43,0,154,119.0,0.0,0.0,82,3,2292202.263,14981.714137,0.0
5057897,199998,743485,1659336223992,clicks,2022-08-01 06:43:43.992,1,6,43,43,0,2,2.0,0.0,0.0,1,1,7.250,7.250000,0.0
