In [75]:
from tqdm import tqdm

import numpy as np
import pandas as pd

import pickle

from collections import defaultdict, Counter

In [66]:
train = pd.read_parquet('../data/optimised/train.parquet')
test = pd.read_parquet('../data/optimised/test.parquet')


with open('../data/optimised/id2type.pkl', "rb") as fh:
    id2type = pickle.load(fh)
with open('../data/optimised/type2id.pkl', "rb") as fh:
    type2id = pickle.load(fh)
    
sample_sub = pd.read_csv('../data/sample_submission.csv')

In [67]:
display(train)

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0
...,...,...,...,...
216716091,12899776,1737908,1661723987,0
216716092,12899777,384045,1661723976,0
216716093,12899777,384045,1661723986,0
216716094,12899778,561560,1661723983,0


In [68]:
display(test)

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0
...,...,...,...,...
6928118,14571577,1141710,1662328774,0
6928119,14571578,519105,1662328775,0
6928120,14571579,739876,1662328775,0
6928121,14571580,202353,1662328781,0


In [69]:
retained_train_sessions = train.drop_duplicates(subset=['session'])['session']
df_train = train[train['session'].isin(retained_train_sessions)]
df_train.index = pd.MultiIndex.from_frame(df_train[['session']])

retained_test_sessions = test.drop_duplicates(subset=['session'])['session']
df_test = test[test['session'].isin(retained_test_sessions)]
df_test.index = pd.MultiIndex.from_frame(df_test[['session']])

In [71]:
print(df_train.shape, df_test.shape)

(216716096, 4) (6928123, 4)


In [86]:
covisitation_matrix = defaultdict(Counter)

def get_covisitation_matrix(df):
    sessions = df.session.unique()
    chunk_size = 30_000
    for i in tqdm(range(0, sessions.shape[0], chunk_size)):
        # get current chunk of sessions
        temp = df.loc[sessions[i]:sessions[min(sessions.shape[0]-1, i+chunk_size-1)]].reset_index(drop=True)

        # get 30 most recent products per session
        temp = temp.groupby('session', as_index=False).tail(30).reset_index(drop=True)

        # merge temp to temp
        temp = temp.merge(temp, on='session', suffixes=('_x', '_y'))

        temp =  temp[temp.aid_x != temp.aid_y]

        # create a column to calculate days elapsed between aid_y and aid_x
        temp['days_elapsed'] = (temp.ts_y - temp.ts_x) / (24 * 60 * 60)

        # filter out rows where days elapsed is between 0 and 1
        temp = temp[(temp.days_elapsed > 0) & (temp.days_elapsed <= 1)]

        # drop duplicates based on session, aid_x, aid_y
        temp.drop_duplicates(subset=['session', 'aid_x', 'aid_y'], inplace=True)

        # zip aid_x and aid_y, and index them in index
        for aid_x, aid_y in zip(temp.aid_x, temp.aid_y):
            covisitation_matrix[aid_x][aid_y] += 1

In [87]:
get_covisitation_matrix(df_train)
get_covisitation_matrix(df_test)

100%|██████████| 430/430 [10:56<00:00,  1.53s/it]
100%|██████████| 56/56 [00:41<00:00,  1.36it/s]


In [89]:
test_session_products = test.groupby('session')['aid'].apply(list)

In [90]:
test_session_products

session
12899779                                              [59625]
12899780           [1142000, 582732, 973453, 736515, 1142000]
12899781    [141736, 199008, 57315, 194067, 199008, 199008...
12899782    [1669402, 1494780, 1494780, 1494780, 1494780, ...
12899783    [255297, 1114789, 255297, 300127, 198385, 3001...
                                  ...                        
14571577                                            [1141710]
14571578                                             [519105]
14571579                                             [739876]
14571580                                             [202353]
14571581                                            [1100210]
Name: aid, Length: 1671803, dtype: object

In [98]:
predictions = []

for products in tqdm(test_session_products):
    products = list(dict.fromkeys(products[::-1]))

    # get first 20 products if products is more than 20
    if len(products) > 20:

        products = products[:20]
        
        predictions.append(products)
    
    else:
        # get top 20 products
        top_products = [x[0] for x in covisitation_matrix[products[0]].most_common(20)]

        # get products that are not in top 20 products
        products = [x for x in products if x not in top_products]

        # append products to top 20 products
        top_products.extend(products)

        # append top 20 products to predictions
        predictions.append(top_products)

100%|██████████| 1671803/1671803 [12:54<00:00, 2157.42it/s]


In [99]:
# predictions as strings
predictions = [' '.join([str(x) for x in pred]) for pred in predictions]
# predictions as dataframe with session and predictions
predictions = pd.DataFrame({'session_type': test_session_products.index, 'labels': predictions})

In [100]:
display(predictions)

Unnamed: 0,session_type,labels
0,12899779,737445 499621 469285 941596 1246235 731692 140...
1,12899780,1502122 889686 487136 1344758 1515511 1419849 ...
2,12899781,1681537 1628918 374037 1836671 1119163 528496 ...
3,12899782,1007613 595994 1033148 834354 479970 1696036 8...
4,12899783,1811433 58861 73864 408787 1627943 1476106 455...
...,...,...
1671798,14571577,1276792 367734 1666114 1004292 1768884 1349213...
1671799,14571578,977826 815460 1811714 822641 1671592 735459 12...
1671800,14571579,1209992 1750859 1550479 770418 785544 870569 7...
1671801,14571580,1231403 871658 1314576 925638 888228 1627186 4...


In [101]:
submission_df = []

session_types = ['clicks', 'carts', 'orders']

for type in session_types:
    submission_preds = predictions.copy()
    submission_preds['session_type'] = submission_preds['session_type'].apply(lambda x: f'{type}_{str(x)}')
    submission_df.append(submission_preds)

submission_df = pd.concat(submission_df).reset_index(drop=True)

In [102]:
display(submission_df)

Unnamed: 0,session_type,labels
0,clicks_12899779,737445 499621 469285 941596 1246235 731692 140...
1,clicks_12899780,1502122 889686 487136 1344758 1515511 1419849 ...
2,clicks_12899781,1681537 1628918 374037 1836671 1119163 528496 ...
3,clicks_12899782,1007613 595994 1033148 834354 479970 1696036 8...
4,clicks_12899783,1811433 58861 73864 408787 1627943 1476106 455...
...,...,...
5015404,orders_14571577,1276792 367734 1666114 1004292 1768884 1349213...
5015405,orders_14571578,977826 815460 1811714 822641 1671592 735459 12...
5015406,orders_14571579,1209992 1750859 1550479 770418 785544 870569 7...
5015407,orders_14571580,1231403 871658 1314576 925638 888228 1627186 4...


In [103]:
submission_df.to_csv('submission.csv', index=False)