In [1]:
import os, pickle

import numpy as np
import pandas as pd

from tqdm import tqdm

from collections import defaultdict, Counter

import cudf, itertools

In [2]:
# Read both splits with cudf, then stack them (train rows first) into a single frame.
train, test = (
    cudf.read_parquet(path)
    for path in ('../data/train.parquet', '../data/test.parquet')
)

data = cudf.concat([train, test])

In [3]:
# Event-type name -> integer code, numbered in listing order starting at 0.
type_labels = {name: code for code, name in enumerate(('clicks', 'carts', 'orders'))}
# Pair weight per integer event-type code; keyed through type_labels so each
# weight is visibly tied to its event type.
type_weight = {
    type_labels['clicks']: 1,
    type_labels['carts']: 6,
    type_labels['orders']: 3,
}

In [4]:
# Build a type-weighted item-to-item co-visitation table from `data`, keep the
# 40 heaviest partners of every item, and pickle the result as {aid: [aid, ...]}.
version = 1
# Index by session once. The guard keeps this cell re-runnable: after the first
# run 'session' is already the index and no longer a column, so an unconditional
# set_index would fail.
if data.index.name != 'session':
    data = data.set_index('session')
sessions = data.index.unique()

# Number of sessions processed per iteration (bounds the size of the self-merge).
chunk_size = 100_000

tmp = None
for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    # Label-slice from the first to the last session of this chunk.
    # NOTE(review): label slicing presumes the session index is sorted — confirm.
    df = data.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index()

    # Newest events first within each session.
    df = df.sort_values(['session','ts'],ascending=[True,False])

    # USE TAIL OF SESSION: keep only the 30 most recent events per session.
    df = df.reset_index(drop=True)
    df['n'] = df.groupby('session').cumcount()
    df = df.loc[df.n<30].drop('n',axis=1)

    # CREATE PAIRS: self-join on session, then keep pairs of different items
    # whose timestamps differ by less than 24 * 60 * 60.
    # NOTE(review): that threshold is one day only if ts is in seconds — confirm.
    df = df.merge(df,on='session')
    df = df.loc[ ((df.ts_x - df.ts_y).abs()< 24 * 60 * 60) & (df.aid_x != df.aid_y) ]

    # ASSIGN WEIGHTS: count each (session, aid_x, aid_y) once, weighted by the
    # event type of the aid_y row that survives de-duplication.
    df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y'])
    df['wgt'] = df.type_y.map(type_weight)
    df = df[['aid_x','aid_y','wgt']]
    df.wgt = df.wgt.astype('float32')
    df = df.groupby(['aid_x','aid_y']).wgt.sum()

    # Accumulate across chunks. Series.add returns a NEW Series, so the result
    # must be assigned back; previously it was discarded and only the first
    # chunk ever reached the saved dictionary.
    if tmp is None:
        tmp = df
    else:
        tmp = tmp.add(df, fill_value=0)


# CONVERT MATRIX TO DICTIONARY
tmp = tmp.reset_index()
tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])
# SAVE TOP 40: rank partners within each aid_x by descending weight, keep ranks 0..39.
tmp = tmp.reset_index(drop=True)
tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
tmp = tmp.loc[tmp.n<40].drop('n',axis=1)
# SAVE TO DISK: move to pandas (host) and collect each item's partners into a list.
df = tmp.to_pandas().groupby('aid_x').aid_y.apply(list)
with open(f'../data/top_40_carts_orders_v{version}.pkl', 'wb') as f:
    pickle.dump(df.to_dict(), f)

100%|██████████████████████████████████████████████| 146/146 [01:52<00:00,  1.29it/s]
