In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up on the next cell run without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Version tag interpolated into the co-visitation parquet file names that are
# read back later in this notebook (e.g. top_20_clicks_v{VER}_{k}.pqt).
VER = 5

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
import cudf
print('We will use RAPIDS version',cudf.__version__)

from coVisitation import CV_carts_orders, CV_B2B, CV_clicks
from eventsSuggester import ClicksSuggester, BuysSuggester

We will use RAPIDS version 22.10.01+2.gca9a422da9


# Step 1 - Candidate Generation with RAPIDS

## Compute Three Co-visitation Matrices with RAPIDS

In [3]:
%%time
# CACHE FUNCTIONS
def read_file_to_cache(f):
    """Read one parquet file into a pandas frame with compact ts/type columns.

    `ts` is divided by 1000 and stored as int32 (presumably milliseconds to
    seconds - confirm against the data source). `type` is mapped through the
    module-level `type_labels` dict and stored as int8.

    Args:
        f: path of the parquet file to read.

    Returns:
        The loaded pandas DataFrame with `ts` and `type` re-encoded.
    """
    frame = pd.read_parquet(f)
    scaled_ts = frame.ts / 1000
    frame.ts = scaled_ts.astype('int32')
    encoded_type = frame['type'].map(type_labels)
    frame['type'] = encoded_type.astype('int8')
    return frame

# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
# Integer codes that read_file_to_cache applies to the `type` column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}
files = glob.glob('../input/parquet/*_parquet/*')
# One preprocessed pandas frame per parquet path, keyed by that path.
data_cache = {path: read_file_to_cache(path) for path in files}

CPU times: user 17.6 s, sys: 5.91 s, total: 23.5 s
Wall time: 16.3 s


## 1) "Carts Orders" Co-visitation Matrix - Type Weighted

In [3]:
%%time
# Build the "carts orders" co-visitation matrix from the file list and the
# CPU-side frame cache.
# NOTE(review): the output recorded for this cell is a NameError on `files`,
# which suggests it was last executed in a kernel where the caching cell had
# not been run; re-run the notebook top to bottom.
# NOTE(review): 15 is presumably the number of neighbours kept per item (the
# pieces read back later are named top_15_carts_orders) - confirm in coVisitation.
cv_carts_orders = CV_carts_orders(files, data_cache)
cv_carts_orders.processDisks(15)

NameError: name 'files' is not defined

## 2) "Buy2Buy" Co-visitation Matrix

In [6]:
%%time
# Build the "Buy2Buy" co-visitation matrix from the file list and the
# CPU-side frame cache.
# NOTE(review): the recorded output stops partway through "DISK PART 2", so
# this run may have been interrupted before finishing - verify before relying
# on the saved pieces.
# NOTE(review): 15 is presumably the number of neighbours kept per item (the
# pieces read back later are named top_15_buy2buy) - confirm in coVisitation.
cv_B2B = CV_B2B(files, data_cache)
cv_B2B.processDisks(15)

We will process 146 files, in groups of 3 and chunks of 25.

### DISK PART 1
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93 , 96 , 99 , 
Processing files 100 thru 124 in groups of 3...
100 , 103 , 106 , 109 , 112 , 115 , 118 , 121 , 124 , 
Processing files 125 thru 145 in groups of 3...
125 , 128 , 131 , 134 , 137 , 140 , 143 , 

### DISK PART 2
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93

## 3) "Clicks" Co-visitation Matrix - Time Weighted

In [12]:
%%time
# Build the "Clicks" co-visitation matrix from the file list and the
# CPU-side frame cache.
# NOTE(review): the recorded output shows only the header line and a wall time
# of about 107 microseconds, so the matrix may not actually have been computed
# in this run - verify.
# NOTE(review): 20 is presumably the number of neighbours kept per item (the
# pieces read back later are named top_20_clicks) - confirm in coVisitation.
cv_clicks = CV_clicks(files, data_cache)
cv_clicks.processDisks(20)

We will process 146 files, in groups of 3 and chunks of 25.
CPU times: user 47 µs, sys: 19 µs, total: 66 µs
Wall time: 107 µs


In [6]:
# FREE MEMORY
# Drop the CPU-side parquet cache; no later cell in this notebook reads
# data_cache. Re-running this cell without rebuilding the cache raises
# NameError, because the name no longer exists.
del data_cache
# As the last expression, the return value of gc.collect() is displayed as the cell output.
gc.collect()

0

# Step 2 - ReRank (choose 20) using handcrafted rules

In [7]:
def load_test():
    """Load every test parquet chunk and return them as one pandas frame.

    Each file matching ../input/parquet/test_parquet/* is read with
    read_file_to_cache, which divides `ts` by 1000 (stored as int32) and maps
    `type` through `type_labels` (stored as int8). The chunks are then
    concatenated under a fresh 0..n-1 index.

    Returns:
        pandas.DataFrame holding all test rows, in the order glob returned
        the chunk files.
    """
    # Reuse the shared reader so the test frame gets exactly the same
    # preprocessing as the cached files, instead of a second copy of that logic.
    chunks = [
        read_file_to_cache(chunk_file)
        for chunk_file in glob.glob('../input/parquet/test_parquet/*')
    ]
    # ignore_index=True is the one-step form of concat(...).reset_index(drop=True).
    return pd.concat(chunks, ignore_index=True)

# Materialise the full test set once; later cells read it as `test_df`.
test_df = load_test()
print('Test data has shape',test_df.shape)
# Last expression: displays the first rows as the cell output.
test_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [14]:
%%time
def pqt_to_dict(df):
    """Collapse an (aid_x, aid_y) pair table into {aid_x: [aid_y, ...]}.

    Rows are grouped by `aid_x`; each group's `aid_y` values are collected
    into a list in their original row order.
    """
    per_source = df.groupby('aid_x')['aid_y']
    neighbour_lists = per_source.apply(list)
    return neighbour_lists.to_dict()
    
# LOAD THREE CO-VISITATION MATRICES
def load_top_pairs(cv, stem):
    """Merge every saved disk piece of one co-visitation matrix into a dict.

    Args:
        cv: co-visitation object exposing `DISK_PIECES` (number of pieces)
            and `saveFolder` (directory the pieces were written to).
        stem: file-name stem such as 'top_20_clicks'; piece k is read from
            `<saveFolder>/<stem>_v<VER>_<k>.pqt`.

    Returns:
        dict mapping each aid_x to its list of aid_y values, accumulated over
        all pieces (a later piece overwrites an earlier one on a shared key).
    """
    merged = {}
    for k in range(cv.DISK_PIECES):
        piece = pd.read_parquet(cv.saveFolder + f'/{stem}_v{VER}_{k}.pqt')
        merged.update(pqt_to_dict(piece))
    return merged

top_20_clicks = load_top_pairs(cv_clicks, 'top_20_clicks')
top_15_buy2buy = load_top_pairs(cv_B2B, 'top_15_buy2buy')
top_15_buys = load_top_pairs(cv_carts_orders, 'top_15_carts_orders')

# TOP CLICKS AND ORDERS IN TEST
# The 20 most frequent aids among type==0 and type==2 rows respectively
# (0 = clicks, 2 = orders in type_labels); value_counts sorts by frequency.
top_clicks = test_df.loc[test_df['type']==0,'aid'].value_counts().index.values[:20]
top_orders = test_df.loc[test_df['type']==2,'aid'].value_counts().index.values[:20]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_15_buy2buy ), len( top_15_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 54.3 s, sys: 2.44 s, total: 56.7 s
Wall time: 54.8 s


In [32]:
# Build the two suggesters from the test-set top-item arrays and the
# co-visitation dicts.
# NOTE(review): how each suggester uses these arguments is defined in
# eventsSuggester and is not visible from this notebook.
clicksSuggester = ClicksSuggester(top_clicks, top_20_clicks)
buysSuggester = BuysSuggester(top_orders, top_15_buy2buy, top_15_buys)

In [33]:
%%time
# Sort the test frame once and share the result: both predictions need the
# same per-session, time-ordered view, and the original sorted the full frame
# twice with identical arguments.
# NOTE(review): assumes suggest() does not mutate the group it receives - confirm
# in eventsSuggester, since both suggesters now read the same sorted frame.
test_sorted = test_df.sort_values(["session", "ts"])

# One result per session; the second argument (20) is presumably the number of
# suggestions requested - confirm against the suggester implementation.
pred_df_clicks = test_sorted.groupby(["session"]).apply(
    lambda x: clicksSuggester.suggest(x, 20)
)

pred_df_buys = test_sorted.groupby(["session"]).apply(
    lambda x: buysSuggester.suggest(x, 20)
)

CPU times: user 9min 11s, sys: 884 ms, total: 9min 12s
Wall time: 9min 12s


# Step 3 - Create Submission CSV

In [34]:
def _as_label_frame(preds, suffix):
    """Return `preds` as a frame with a "labels" column and the suffixed index reset into a column."""
    suffixed = preds.add_suffix(suffix)
    framed = pd.DataFrame(suffixed, columns=["labels"])
    return framed.reset_index()

clicks_pred_df = _as_label_frame(pred_df_clicks, "_clicks")
# Orders and carts are both built from the same buys predictions.
orders_pred_df = _as_label_frame(pred_df_buys, "_orders")
carts_pred_df = _as_label_frame(pred_df_buys, "_carts")

In [35]:
# Stack the clicks, orders and carts prediction frames into one table.
pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
pred_df.columns = ["session_type", "labels"]
# Each labels entry is an iterable of aids; serialise it as a space-separated string.
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
submission_path = "../submissions/submission.csv"
# to_csv does not create missing parent directories, so make sure the target
# folder exists instead of failing after the long prediction step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
pred_df.to_csv(submission_path, index=False)
pred_df.head()

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 438191 731692 1790770 942...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 48...
2,12899781_clicks,918667 199008 194067 57315 141736 1460571 7594...
3,12899782_clicks,834354 595994 740494 889671 987399 779477 1344...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 300127 ...
