In [75]:
from tqdm import tqdm

import numpy as np
import pandas as pd

import pickle

from collections import defaultdict, Counter

In [66]:
train = pd.read_parquet('../data/optimised/train.parquet')
test = pd.read_parquet('../data/optimised/test.parquet')


with open('../data/optimised/id2type.pkl', "rb") as fh:
    id2type = pickle.load(fh)
with open('../data/optimised/type2id.pkl', "rb") as fh:
    type2id = pickle.load(fh)
    
sample_sub = pd.read_csv('../data/sample_submission.csv')

In [67]:
display(train)

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
216716091,12899776,1737908,1661723987,0
216716092,12899777,384045,1661723976,0
216716093,12899777,384045,1661723986,0
216716094,12899778,561560,1661723983,0


In [68]:
display(test)

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0
...,...,...,...,...
6928118,14571577,1141710,1662328774,0
6928119,14571578,519105,1662328775,0
6928120,14571579,739876,1662328775,0
6928121,14571580,202353,1662328781,0


In [69]:
retained_train_sessions = train.drop_duplicates(subset=['session'])['session']
df_train = train[train['session'].isin(retained_train_sessions)]
df_train.index = pd.MultiIndex.from_frame(df_train[['session']])

retained_test_sessions = test.drop_duplicates(subset=['session'])['session']
df_test = test[test['session'].isin(retained_test_sessions)]
df_test.index = pd.MultiIndex.from_frame(df_test[['session']])

In [71]:
print(df_train.shape, df_test.shape)

(216716096, 4) (6928123, 4)


In [82]:
covisitation_matrix = defaultdict(Counter)
sessions = train.session.unique()
chunk_size = 30_000

for i in tqdm(range(0, sessions.shape[0], chunk_size)):
    # get current chunk of sessions
    temp = train.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index(drop=True)

    # get 30 most recent products per session
    temp = temp.groupby('session', as_index=False).tail(30).reset_index(drop=True)

    # merge temp to temp
    temp = temp.merge(temp, on='session', suffixes=('_x', '_y'))

    temp =  temp[temp.aid_x != temp.aid_y]

    # create a column to calculate days elapsed between aid_y and aid_x
    temp['days_elapsed'] = (temp.ts_y - temp.ts_x) / (24 * 60 * 60)

    # filter out rows where days elapsed is between 0 and 1
    temp = temp[(temp.days_elapsed > 0) & (temp.days_elapsed <= 1)]

    # drop duplicates based on session, aid_x, aid_y
    temp.drop_duplicates(subset=['session', 'aid_x', 'aid_y'], inplace=True)

    # zip aid_x and aid_y, and index them in index
    for aid_x, aid_y in zip(temp.aid_x, temp.aid_y):
        covisitation_matrix[aid_x][aid_y] += 1

100%|██████████| 430/430 [00:25<00:00, 16.70it/s]


In [85]:
len(covisitation_matrix)

598757