In [1]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools
import cudf
print('We will use RAPIDS version',cudf.__version__)
cudf.set_option("default_integer_bitwidth", 32)
cudf.set_option("default_float_bitwidth", 32)

from coVisitation import CV_carts_orders, CV_B2B, CV_clicks
from eventsSuggester import ClicksSuggester, BuysSuggester

We will use RAPIDS version 22.10.01+2.gca9a422da9


# Step 1 - Generate covisitation matrix

In [2]:
data_cache = {}
type_labels = {'clicks':0, 'carts':1, 'orders':2}
files = glob.glob('../input/splited/parquets/*_parquets/*')
saveFolder = '../output/coVisitationPqt_splited'

In [None]:
%%time
# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU

for f in files: 
    data_cache[f] = pd.read_parquet(f)

In [None]:
! mkdir '../output/coVisitationPqt_splited'

In [None]:
%%time
cv_carts_orders = CV_carts_orders(files, data_cache, saveFolder)
cv_carts_orders.processDisks(50)

In [None]:
%%time
cv_B2B = CV_B2B(files, data_cache, saveFolder)
cv_B2B.processDisks(50)

In [None]:
%%time
cv_clicks = CV_clicks(files, data_cache, saveFolder)
cv_clicks.processDisks(50)

In [None]:
# FREE MEMORY
del data_cache
gc.collect()

# Step 2 - Top 50 Candidate Generation

In [17]:
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

In [18]:
cv_carts_orders = CV_carts_orders(files, data_cache, saveFolder)
cv_B2B = CV_B2B(files, data_cache, saveFolder)
cv_clicks = CV_clicks(files, data_cache, saveFolder)

We will process 130 files, in groups of 3 and chunks of 25.
We will process 130 files, in groups of 3 and chunks of 25.
We will process 130 files, in groups of 3 and chunks of 25.


In [19]:
top_50_clicks = {}
for k in range(0, cv_clicks.DISK_PIECES):
    top_50_clicks.update( pqt_to_dict( pd.read_parquet(cv_clicks.saveFolder + f'/top_50_clicks_v{cv_clicks.VER}_{k}.pqt') ) )

In [20]:
top_50_buy2buy = {}
for k in range(0, cv_B2B.DISK_PIECES):
    top_50_buy2buy.update( pqt_to_dict( pd.read_parquet(cv_B2B.saveFolder + f'/top_50_buy2buy_v{cv_B2B.VER}_{k}.pqt') ) )

In [21]:
top_50_buys = {}
for k in range(0, cv_carts_orders.DISK_PIECES): 
    top_50_buys.update( pqt_to_dict( pd.read_parquet(cv_carts_orders.saveFolder + f'/top_50_carts_orders_v{cv_carts_orders.VER}_{k}.pqt') ) )

In [20]:
val_df = pd.read_parquet('../input/splited/test.parquet')
val_df

Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0
...,...,...,...,...
7683572,12899774,33035,1661723968,0
7683573,12899775,1743151,1661723970,0
7683574,12899776,548599,1661723972,0
7683575,12899777,384045,1661723976,0


: 

In [23]:
# TOP CLICKS AND ORDERS IN TEST
top_clicks = val_df.loc[val_df['type']==0,'aid'].value_counts().index.values[:50]
top_orders = val_df.loc[val_df['type']==2,'aid'].value_counts().index.values[:50]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_50_clicks ), len( top_50_buy2buy ), len( top_50_buys ) )

Here are size of our 3 co-visitation matrices:
1812132 1055146 1812132


In [24]:
clicksSuggester = ClicksSuggester(top_clicks, top_50_clicks)
buysSuggester = BuysSuggester(top_orders, top_50_buy2buy, top_50_buys)

In [25]:
%%time
pred_df_clicks = val_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: clicksSuggester.suggest(x, 50)
)

pred_df_buys = val_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: buysSuggester.suggest(x, 50)
)

CPU times: user 10min 34s, sys: 1.6 s, total: 10min 36s
Wall time: 10min 36s


In [28]:
clicks_pred_df = pd.DataFrame(pred_df_clicks, columns=["labels"]).reset_index()
clicks_pred_df.head()

Unnamed: 0,session,labels
0,11098528,"[11830, 588923, 1732105, 571762, 884502, 11578..."
1,11098529,"[1105029, 459126, 1339838, 1544564, 217742, 16..."
2,11098530,"[409236, 264500, 1603001, 963957, 254154, 5830..."
3,11098531,"[396199, 1271998, 452188, 1728212, 1365569, 62..."
4,11098532,"[876469, 7651, 108125, 612920, 1673641, 120261..."


In [29]:
buys_pred_df = pd.DataFrame(pred_df_buys, columns=["labels"]).reset_index()
buys_pred_df.head()

Unnamed: 0,session,labels
0,11098528,"[11830, 1732105, 588923, 884502, 1157882, 5717..."
1,11098529,"[1105029, 1049489, 132016, 459126, 785427, 792..."
2,11098530,"[409236, 264500, 1603001, 254154, 963957, 5830..."
3,11098531,"[396199, 1271998, 452188, 1728212, 1365569, 62..."
4,11098532,"[876469, 7651, 108125, 612920, 1159379, 120261..."


In [41]:
! mkdir ../output/top50Candidates
candidatesSavePath = '../output/top50Candidates/'
clicks_pred_df.to_parquet(candidatesSavePath + 'clicksCandidate.pqt')
buys_pred_df.to_parquet(candidatesSavePath + 'buysCandidate.pqt')

mkdir: cannot create directory ‘../output/top50Candidates’: File exists


# Step 3 - ReRank and select top 20 using XGB

## Create features

In [10]:
candidatesSavePath = '../output/top50Candidates/'
clicks_pred_df = pd.read_parquet(candidatesSavePath + 'clicksCandidate.pqt')
buys_pred_df = pd.read_parquet(candidatesSavePath + 'buysCandidate.pqt')

In [11]:
clicksCandidate = {
        'session': [],
        'aid': [],
    }

for user, items in zip(clicks_pred_df['session'], clicks_pred_df['labels']):
    for item in items:
        clicksCandidate['session'].append(user)
        clicksCandidate['aid'].append(item)

clicksCandidate = pd.DataFrame(clicksCandidate)
clicksCandidate.head()

Unnamed: 0,session,aid
0,11098528,11830
1,11098528,588923
2,11098528,1732105
3,11098528,571762
4,11098528,884502


In [39]:
buysCandidate = {
        'session': [],
        'aid': [],
    }
for user, items in zip(buys_pred_df['session'], buys_pred_df['labels']):
    for item in items:
        buysCandidate['session'].append(user)
        buysCandidate['aid'].append(item)

buysCandidate = pd.DataFrame(buysCandidate)
buysCandidate.head()

Unnamed: 0,session,aid
0,11098528,11830
1,11098528,1732105
2,11098528,588923
3,11098528,884502
4,11098528,1157882


In [12]:
train_df = pd.read_parquet('../input/splited/train.parquet')
train_df.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0


In [16]:
train_val_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

In [42]:
! mkdir ../output/features
featuresPath = '../output/features/'

mkdir: cannot create directory ‘../output/features’: File exists


### Item features
Using train data + val data A

In [17]:
item_features = train_val_df.groupby('aid').agg({'aid':'count', 'session':'nunique', 'type':'mean'})
item_features.columns = ['item_item_count', 'item_user_count', 'item_buy_ratio']
item_features.head()

Unnamed: 0_level_0,item_item_count,item_user_count,item_buy_ratio
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,38,30,0.0
1,32,29,0.03125
2,16,15,0.0
3,1525,852,0.083279
4,148,103,0.033784


In [None]:
item_features.to_parquet(featuresPath + 'item_features.pqt')

### User features

In [18]:
user_features = val_df.groupby('session').agg({'session':'count', 'aid':'nunique', 'type':'mean'})
user_features.columns = ['user_user_count', 'user_item_count', 'user_buy_ratio']
user_features.head()

Unnamed: 0_level_0,user_user_count,user_item_count,user_buy_ratio
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11098528,1,1,0.0
11098529,1,1,0.0
11098530,6,2,0.166667
11098531,24,11,0.333333
11098532,2,2,0.0


In [None]:
user_features.to_parquet(featuresPath + 'user_features.pqt')

## Merge features

In [45]:
item_features = pd.read_parquet(featuresPath + 'item_features.pqt')
user_features = pd.read_parquet(featuresPath + 'user_features.pqt')

In [None]:
clicksCandidate = clicksCandidate.merge(item_features, left_on='aid', right_index=True, how='left').fillna(-1)
clicksCandidate.head()
clicksCandidate = clicksCandidate.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
clicksCandidate.head()

In [46]:
buysCandidate = buysCandidate.merge(item_features, left_on='aid', right_index=True, how='left').fillna(-1)
buysCandidate = buysCandidate.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
buysCandidate.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
0,11098528,11830,33776,19211,0.17397,1,1,0.0
1,11098528,1732105,10452,5770,0.280233,1,1,0.0
2,11098528,588923,24588,14741,0.125346,1,1,0.0
3,11098528,884502,31522,18163,0.135398,1,1,0.0
4,11098528,1157882,28341,16057,0.136975,1,1,0.0


### add label

In [47]:
val_labels_pd = pd.read_parquet('../input/splited/test_labels.parquet')
val_labels_pd.head()

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]


In [48]:
val_labels_clicks = val_labels_pd.loc[val_labels_pd['type'] == 'clicks']
aids = val_labels_clicks.ground_truth.explode().astype('int32').rename('aid')
val_labels_clicks = val_labels_clicks[['session']].astype('int32')
val_labels_clicks = val_labels_clicks.merge(aids, left_index=True, right_index=True, how='left')
val_labels_clicks['click'] = 1
val_labels_clicks.head()

Unnamed: 0,session,aid,click
0,11098528,1679529,1
3,11098529,1105029,1
6,11098532,1596491,1
7,11098533,1417450,1
10,11098534,908024,1


In [49]:
val_labels_carts = val_labels_pd.loc[val_labels_pd['type'] == 'carts']
aids = val_labels_carts.ground_truth.explode().astype('int32').rename('aid')
val_labels_carts = val_labels_carts[['session']].astype('int32')
val_labels_carts = val_labels_carts.merge(aids, left_index=True, right_index=True, how='left')
val_labels_carts['cart'] = 1
val_labels_carts.head()

Unnamed: 0,session,aid,cart
1,11098528,1199737,1
8,11098533,108676,1
8,11098533,1406660,1
8,11098533,988295,1
8,11098533,1118792,1


In [55]:
val_labels_orders = val_labels_pd.loc[val_labels_pd['type'] == 'orders']
aids = val_labels_orders.ground_truth.explode().astype('int32').rename('aid')
val_labels_orders = val_labels_orders[['session']].astype('int32')
val_labels_orders = val_labels_orders.merge(aids, left_index=True, right_index=True, how='left')
val_labels_orders['order'] = 1
val_labels_orders.head()

Unnamed: 0,session,aid,order
2,11098528,990658,1
2,11098528,950341,1
2,11098528,1462506,1
2,11098528,1561739,1
2,11098528,907564,1


In [None]:
clicksCandidate = clicksCandidate.merge(val_labels_clicks, on=['session','aid'],how='left').fillna(0)
clicksCandidate.head()

In [51]:
cartsCandidate = buysCandidate.merge(val_labels_carts, on=['session','aid'],how='left').fillna(0)
cartsCandidate.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio,cart
0,11098528,11830,33776,19211,0.17397,1,1,0.0,0.0
1,11098528,1732105,10452,5770,0.280233,1,1,0.0,0.0
2,11098528,588923,24588,14741,0.125346,1,1,0.0,0.0
3,11098528,884502,31522,18163,0.135398,1,1,0.0,0.0
4,11098528,1157882,28341,16057,0.136975,1,1,0.0,0.0


In [56]:
ordersCandidate = buysCandidate.merge(val_labels_orders, on=['session','aid'],how='left').fillna(0)
ordersCandidate.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio,order
0,11098528,11830,33776,19211,0.17397,1,1,0.0,1.0
1,11098528,1732105,10452,5770,0.280233,1,1,0.0,0.0
2,11098528,588923,24588,14741,0.125346,1,1,0.0,0.0
3,11098528,884502,31522,18163,0.135398,1,1,0.0,0.0
4,11098528,1157882,28341,16057,0.136975,1,1,0.0,0.0


In [53]:
! mkdir ../output/data4xgb
data4xgb_path = '../output/data4xgb/'

mkdir: cannot create directory ‘../output/data4xgb’: File exists


In [None]:
clicksCandidate.to_parquet(data4xgb_path + 'data4xgb_clicks.pqt')
cartsCandidate.to_parquet(data4xgb_path + 'data4xgb_carts.pqt')
ordersCandidate.to_parquet(data4xgb_path + 'data4xgb_orders.pqt')

## Training

In [2]:
import xgboost as xgb
from sklearn.model_selection import GroupKFold

In [19]:
data4xgb_path = '../output/data4xgb/'
predictType = 'clicks'
data4xgb = pd.read_parquet(data4xgb_path + 'data4xgb_' + predictType + '.pqt')

In [20]:
data4xgb.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio,click
0,11098528,11830,33776,19211,0.17397,1,1,0.0,0.0
1,11098528,588923,24588,14741,0.125346,1,1,0.0,0.0
2,11098528,1732105,10452,5770,0.280233,1,1,0.0,0.0
3,11098528,571762,18822,12405,0.124588,1,1,0.0,0.0
4,11098528,884502,31522,18163,0.135398,1,1,0.0,0.0


### Under sample unclick

In [21]:
positives = data4xgb.loc[data4xgb.iloc[:, -1] == 1]
print(len(positives))
negatives = data4xgb.loc[data4xgb.iloc[:, -1] == 0].sample(frac=0.05)
print(len(negatives))
data4xgb_underSample = pd.concat([positives,negatives],axis=0,ignore_index=True)

1055362
4450359


In [49]:
! mkdir -p ../output/savedModel/xgb

In [22]:
skf = GroupKFold(n_splits=5)
topN = 50
candidates = data4xgb_underSample
xgbModelSavePath = '../output/savedModel/xgb/'
modelName = 'xgb_' + predictType

predsVal = np.zeros(len(candidates))

for fold,(train_idx, valid_idx) in enumerate(skf.split(candidates, candidates.iloc[:, -1], groups=candidates['session'] )):
    print('========= fold', fold, '==========')

    train = candidates.iloc[train_idx]
    val = candidates.iloc[valid_idx]


    train = train.sort_values('session')
    groupsTrain = train.groupby('session').aid.agg('count').values
    dtrain = xgb.DMatrix(train.iloc[:, 2:-1], train.iloc[:, -1], group=groupsTrain)

    val = val.sort_values('session')
    groupsVal = val.groupby('session').aid.agg('count').values
    dvalid = xgb.DMatrix(val.iloc[:, 2:-1], val.iloc[:, -1], group=groupsVal)

    xgb_parms = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}
    model = xgb.train(xgb_parms, 
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=100)
    # model.save_model(xgbModelSavePath + modelName + f'_fold{fold}.xgb')

    # Validate on 1 fold
    dvalid = xgb.DMatrix(val.iloc[:, 2:-1])
    pred = model.predict(dvalid)
    predsVal[valid_idx] = pred
    

[0]	train-map:0.77658	valid-map:0.77684
[100]	train-map:0.79595	valid-map:0.79342
[200]	train-map:0.80153	valid-map:0.79812
[300]	train-map:0.80567	valid-map:0.80082
[400]	train-map:0.80852	valid-map:0.80276
[500]	train-map:0.81041	valid-map:0.80398
[600]	train-map:0.81226	valid-map:0.80501
[700]	train-map:0.81366	valid-map:0.80557
[800]	train-map:0.81480	valid-map:0.80620
[900]	train-map:0.81625	valid-map:0.80682
[999]	train-map:0.81727	valid-map:0.80731
[0]	train-map:0.77654	valid-map:0.77726
[100]	train-map:0.79556	valid-map:0.79374
[200]	train-map:0.80188	valid-map:0.79891
[300]	train-map:0.80577	valid-map:0.80169
[400]	train-map:0.80823	valid-map:0.80322
[500]	train-map:0.81061	valid-map:0.80469
[600]	train-map:0.81250	valid-map:0.80587
[700]	train-map:0.81398	valid-map:0.80665
[800]	train-map:0.81516	valid-map:0.80712
[900]	train-map:0.81644	valid-map:0.80756
[999]	train-map:0.81748	valid-map:0.80803
[0]	train-map:0.77672	valid-map:0.77671
[100]	train-map:0.79519	valid-map:0.7930

In [24]:
predictions = candidates[['session','aid']].copy()
predictions['pred'] = predsVal
predictions

Unnamed: 0,session,aid,pred
0,11098529,1105029,0.574947
1,11098534,908024,1.086648
2,11098535,745365,0.856596
3,11098537,1503532,0.217740
4,11098538,1452081,0.340209
...,...,...,...
5505716,12889342,574512,0.464641
5505717,12606360,1700852,0.444745
5505718,11821817,1614080,0.316568
5505719,11201393,698925,0.437166


In [None]:
predictions = predictions.sort_values(['session','pred'], ascending=[True,False]).reset_index(drop=True)
predictions['n'] = predictions.groupby('session').aid.cumcount().astype('int8')
predictions = predictions.loc[predictions.n<20]
predictions

In [25]:
sub = predictions.groupby('session').aid.apply(list)
sub = sub.to_frame().reset_index()
sub['type'] = predictType
sub.columns = ['session', 'labels', 'type']
sub

Unnamed: 0,session,labels,type
0,11098528,"[500334, 532616, 77440, 603583]",clicks
1,11098529,"[217742, 937091, 1105029, 386895]",clicks
2,11098530,"[530261, 210880, 1596897, 841206, 877496]",clicks
3,11098531,"[149790, 1351489, 67839]",clicks
4,11098532,"[1753963, 1212859]",clicks
...,...,...,...
1743851,12899774,"[1443205, 1399483]",clicks
1743852,12899775,"[614363, 1604133, 1760714, 575616]",clicks
1743853,12899776,[1737908],clicks
1743854,12899777,[384045],clicks


In [26]:
subClicks = sub
subClicks

Unnamed: 0,session,labels,type
0,11098528,"[500334, 532616, 77440, 603583]",clicks
1,11098529,"[217742, 937091, 1105029, 386895]",clicks
2,11098530,"[530261, 210880, 1596897, 841206, 877496]",clicks
3,11098531,"[149790, 1351489, 67839]",clicks
4,11098532,"[1753963, 1212859]",clicks
...,...,...,...
1743851,12899774,"[1443205, 1399483]",clicks
1743852,12899775,"[614363, 1604133, 1760714, 575616]",clicks
1743853,12899776,[1737908],clicks
1743854,12899777,[384045],clicks


In [18]:
subCarts = sub
subCarts

Unnamed: 0,session,labels,type
0,11098528,[11830],carts
1,11098529,"[1216613, 481971, 180164]",carts
2,11098530,"[409236, 460553]",carts
3,11098531,[344471],carts
4,11098532,[441064],carts
...,...,...,...
1679455,12899773,"[192769, 115181, 9763, 129615, 132434]",carts
1679456,12899774,[1271158],carts
1679457,12899776,"[861477, 1144446]",carts
1679458,12899777,"[1545815, 1689224]",carts


In [10]:
subOrders = sub
subOrders

Unnamed: 0,session,labels,type
0,11098528,[11830],orders
1,11098529,"[217742, 1544564]",orders
2,11098530,"[1391203, 1748824, 365220, 409236]",orders
3,11098531,"[1415171, 73689, 93062, 1365569, 1271998]",orders
4,11098532,"[669555, 1545861, 496180]",orders
...,...,...,...
1673192,12899774,"[696373, 771913, 218795]",orders
1673193,12899775,"[1194834, 1798464, 329725, 832192]",orders
1673194,12899776,"[798010, 512791, 748226, 251107, 83799, 133030...",orders
1673195,12899777,"[1104713, 1751703, 1688215]",orders


In [27]:
pred_df = pd.concat([subClicks, subCarts, subOrders])
pred_df

Unnamed: 0,session,labels,type
0,11098528,"[500334, 532616, 77440, 603583]",clicks
1,11098529,"[217742, 937091, 1105029, 386895]",clicks
2,11098530,"[530261, 210880, 1596897, 841206, 877496]",clicks
3,11098531,"[149790, 1351489, 67839]",clicks
4,11098532,"[1753963, 1212859]",clicks
...,...,...,...
1673192,12899774,"[696373, 771913, 218795]",orders
1673193,12899775,"[1194834, 1798464, 329725, 832192]",orders
1673194,12899776,"[798010, 512791, 748226, 251107, 83799, 133030...",orders
1673195,12899777,"[1104713, 1751703, 1688215]",orders


## Evaluate

In [32]:
test_labels = pd.read_parquet('../input/splited/test_labels.parquet')
test_labels

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [33]:
test_labels = test_labels.merge(pred_df, how='left', on=['session', 'type'])
test_labels

Unnamed: 0,session,type,ground_truth,labels
0,11098528,clicks,[1679529],"[500334, 532616, 77440, 603583]"
1,11098528,carts,[1199737],[11830]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369...",[11830]
3,11098529,clicks,[1105029],"[217742, 937091, 1105029, 386895]"
4,11098530,orders,[409236],"[1391203, 1748824, 365220, 409236]"
...,...,...,...,...
2212687,12899774,clicks,[1399483],"[1443205, 1399483]"
2212688,12899775,clicks,[1760714],"[614363, 1604133, 1760714, 575616]"
2212689,12899776,clicks,[1737908],[1737908]
2212690,12899777,clicks,[384045],[384045]


In [31]:
def intersect(df):
    if type(df.labels) == float:
        return 0
    else:
        return len(set(df.ground_truth).intersection(set(df.labels)))

In [34]:
test_labels['hits'] = test_labels.apply(lambda df: intersect(df), axis=1)
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
test_labels

Unnamed: 0,session,type,ground_truth,labels,hits,gt_count
0,11098528,clicks,[1679529],"[500334, 532616, 77440, 603583]",0,1
1,11098528,carts,[1199737],[11830],0,1
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369...",[11830],1,11
3,11098529,clicks,[1105029],"[217742, 937091, 1105029, 386895]",1,1
4,11098530,orders,[409236],"[1391203, 1748824, 365220, 409236]",1,1
...,...,...,...,...,...,...
2212687,12899774,clicks,[1399483],"[1443205, 1399483]",1,1
2212688,12899775,clicks,[1760714],"[614363, 1604133, 1760714, 575616]",1,1
2212689,12899776,clicks,[1737908],[1737908],1,1
2212690,12899777,clicks,[384045],[384045],1,1


In [37]:
recall_per_type = test_labels.groupby(['type'])['hits'].sum() / test_labels.groupby(['type'])['gt_count'].sum()
recall_per_type

type
carts     0.463192
clicks    0.601151
orders    0.681723
dtype: float64

In [39]:
score = (recall_per_type * pd.Series({'clicks': 0.10, 'carts': 0.30, 'orders': 0.60})).sum()
score

0.608106842722019

# Testing

### Candidates generation

In [15]:
def read_file_to_cache(f):
    df = pd.read_parquet(f)
    df.ts = (df.ts/1000).astype('int32')
    df['type'] = df['type'].map(type_labels).astype('int8')
    return df

In [16]:
data_cache = {}
type_labels = {'clicks':0, 'carts':1, 'orders':2}
files = glob.glob('../input/parquet/*_parquet/*')
! mkdir '../output/coVisitationPqt_top50'
saveFolder = '../output/coVisitationPqt_top50'

mkdir: cannot create directory ‘../output/coVisitationPqt_top50’: File exists


In [17]:
for f in files: data_cache[f] = read_file_to_cache(f)

In [17]:
%%time
cv_carts_orders = CV_carts_orders(files, data_cache, saveFolder)
cv_carts_orders.processDisks(50)

We will process 146 files, in groups of 3 and chunks of 25.

### DISK PART 1
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93 , 96 , 99 , 
Processing files 100 thru 124 in groups of 3...
100 , 103 , 106 , 109 , 112 , 115 , 118 , 121 , 124 , 
Processing files 125 thru 145 in groups of 3...
125 , 128 , 131 , 134 , 137 , 140 , 143 , 

### DISK PART 2
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93

In [18]:
%%time
cv_B2B = CV_B2B(files, data_cache, saveFolder)
cv_B2B.processDisks(50)

We will process 146 files, in groups of 3 and chunks of 25.

### DISK PART 1
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93 , 96 , 99 , 
Processing files 100 thru 124 in groups of 3...
100 , 103 , 106 , 109 , 112 , 115 , 118 , 121 , 124 , 
Processing files 125 thru 145 in groups of 3...
125 , 128 , 131 , 134 , 137 , 140 , 143 , 

### DISK PART 2
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93

In [19]:
%%time
cv_clicks = CV_clicks(files, data_cache, saveFolder)
cv_clicks.processDisks(50)

We will process 146 files, in groups of 3 and chunks of 25.

### DISK PART 1
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93 , 96 , 99 , 
Processing files 100 thru 124 in groups of 3...
100 , 103 , 106 , 109 , 112 , 115 , 118 , 121 , 124 , 
Processing files 125 thru 145 in groups of 3...
125 , 128 , 131 , 134 , 137 , 140 , 143 , 

### DISK PART 2
Processing files 0 thru 24 in groups of 3...
0 , 3 , 6 , 9 , 12 , 15 , 18 , 21 , 24 , 
Processing files 25 thru 49 in groups of 3...
25 , 28 , 31 , 34 , 37 , 40 , 43 , 46 , 49 , 
Processing files 50 thru 74 in groups of 3...
50 , 53 , 56 , 59 , 62 , 65 , 68 , 71 , 74 , 
Processing files 75 thru 99 in groups of 3...
75 , 78 , 81 , 84 , 87 , 90 , 93

In [20]:
# FREE MEMORY
del data_cache
gc.collect()

0

### Generate 50 candudate

In [18]:
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

In [19]:
cv_carts_orders = CV_carts_orders(files, data_cache, saveFolder)
cv_B2B = CV_B2B(files, data_cache, saveFolder)
cv_clicks = CV_clicks(files, data_cache, saveFolder)

We will process 146 files, in groups of 3 and chunks of 25.
We will process 146 files, in groups of 3 and chunks of 25.
We will process 146 files, in groups of 3 and chunks of 25.


In [21]:
top_50_clicks = {}
for k in range(0, cv_clicks.DISK_PIECES):
    top_50_clicks.update( pqt_to_dict( pd.read_parquet(saveFolder + f'/top_50_clicks_v{cv_clicks.VER}_{k}.pqt') ) )

top_50_buy2buy = {}
for k in range(0, cv_B2B.DISK_PIECES):
    top_50_buy2buy.update( pqt_to_dict( pd.read_parquet(saveFolder + f'/top_50_buy2buy_v{cv_B2B.VER}_{k}.pqt') ) )

top_50_buys = {}
for k in range(0, cv_carts_orders.DISK_PIECES): 
    top_50_buys.update( pqt_to_dict( pd.read_parquet(saveFolder + f'/top_50_carts_orders_v{cv_carts_orders.VER}_{k}.pqt') ) )

In [20]:
def load_test():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob('../input/parquet/test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

test_df = load_test()
print('Test data has shape',test_df.shape)
test_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [22]:
# TOP CLICKS AND ORDERS IN TEST
top_clicks = test_df.loc[test_df['type']==0,'aid'].value_counts().index.values[:50]
top_orders = test_df.loc[test_df['type']==2,'aid'].value_counts().index.values[:50]

print('Here are size of our 3 co-visitation matrices:')
print( len( top_50_clicks ), len( top_50_buy2buy ), len( top_50_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166


In [23]:
clicksSuggester = ClicksSuggester(top_clicks, top_50_clicks)
buysSuggester = BuysSuggester(top_orders, top_50_buy2buy, top_50_buys)

In [24]:
%%time
pred_df_clicks = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: clicksSuggester.suggest(x, 50)
)

pred_df_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: buysSuggester.suggest(x, 50)
)

CPU times: user 8min 11s, sys: 798 ms, total: 8min 12s
Wall time: 8min 12s


In [28]:
clicks_pred_df = pd.DataFrame(pred_df_clicks, columns=["labels"]).reset_index()
clicks_pred_df.head()

Unnamed: 0,session,labels
0,12899779,"[59625, 1253524, 737445, 438191, 731692, 17907..."
1,12899780,"[1142000, 736515, 973453, 582732, 1502122, 487..."
2,12899781,"[918667, 199008, 194067, 57315, 141736, 146057..."
3,12899782,"[1007613, 595994, 1033148, 834354, 479970, 169..."
4,12899783,"[1817895, 607638, 1754419, 1216820, 1729553, 3..."


In [25]:
buys_pred_df = pd.DataFrame(pred_df_buys, columns=["labels"]).reset_index()
buys_pred_df.head()

Unnamed: 0,session,labels
0,12899779,"[59625, 397451, 469285, 1493965, 438191, 73169..."
1,12899780,"[1142000, 736515, 973453, 582732, 1502122, 487..."
2,12899781,"[918667, 199008, 194067, 57315, 141736, 185328..."
3,12899782,"[1007613, 595994, 1033148, 834354, 479970, 169..."
4,12899783,"[1817895, 607638, 1754419, 1216820, 1729553, 3..."


In [27]:
! mkdir ../output/top50CandidatesAllTrain
candidatesSavePath = '../output/top50CandidatesAllTrain/'
clicks_pred_df.to_parquet(candidatesSavePath + 'clicksCandidate.pqt')
buys_pred_df.to_parquet(candidatesSavePath + 'buysCandidate.pqt')

mkdir: cannot create directory ‘../output/top50CandidatesAllTrain’: File exists


### Create feature

In [29]:
clicksCandidate = {
        'session': [],
        'aid': [],
    }

for user, items in zip(clicks_pred_df['session'], clicks_pred_df['labels']):
    for item in items:
        clicksCandidate['session'].append(user)
        clicksCandidate['aid'].append(item)

clicksCandidate = pd.DataFrame(clicksCandidate)
clicksCandidate.head()

Unnamed: 0,session,aid
0,12899779,59625
1,12899779,1253524
2,12899779,737445
3,12899779,438191
4,12899779,731692


In [28]:
buysCandidate = {
        'session': [],
        'aid': [],
    }
for user, items in zip(buys_pred_df['session'], buys_pred_df['labels']):
    for item in items:
        buysCandidate['session'].append(user)
        buysCandidate['aid'].append(item)

buysCandidate = pd.DataFrame(buysCandidate)
buysCandidate.head()

Unnamed: 0,session,aid
0,12899779,59625
1,12899779,397451
2,12899779,469285
3,12899779,1493965
4,12899779,438191


In [31]:
! mkdir ../output/featuresAllTrain
featuresPath = '../output/featuresAllTrain/'

mkdir: cannot create directory ‘../output/featuresAllTrain’: File exists


In [29]:
def load_train():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob('../input/parquet/train_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

train_df = load_train()
print('Test data has shape',test_df.shape)
train_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,8000000,766953,1660420292,0
1,8000000,1586093,1660420385,0
2,8000000,1586093,1660420465,0
3,8000000,682970,1660420634,0
4,8000000,799794,1660420683,0


In [30]:
train_test_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

Item Features
(Using All training + testing data)

In [35]:
item_features = train_test_df.groupby('aid').agg({'aid':'count', 'session':'nunique', 'type':'mean'})
item_features.columns = ['item_item_count', 'item_user_count', 'item_buy_ratio']
# CONVERT COLUMNS TO INT32 and FLOAT32 HERE
item_features.to_parquet(featuresPath + 'item_features.pqt')
item_features.head()

Unnamed: 0_level_0,item_item_count,item_user_count,item_buy_ratio
aid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,48,40,0.0
1,34,30,0.029412
2,17,16,0.0
3,2759,1392,0.104023
4,221,143,0.040724


User features

In [36]:
user_features = test_df.groupby('session').agg({'session':'count', 'aid':'nunique', 'type':'mean'})
user_features.columns = ['user_user_count', 'user_item_count', 'user_buy_ratio']
# CONVERT COLUMNS TO INT32 and FLOAT32 HERE
user_features.to_parquet(featuresPath + 'user_features.pqt')
user_features.head()

Unnamed: 0_level_0,user_user_count,user_item_count,user_buy_ratio
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12899779,1,1,0.0
12899780,5,4,0.0
12899781,11,5,0.090909
12899782,70,38,0.457143
12899783,11,9,0.0


Merge features

In [32]:
featuresPath = '../output/featuresAllTrain/'
item_features = pd.read_parquet(featuresPath + 'item_features.pqt')
user_features = pd.read_parquet(featuresPath + 'user_features.pqt')

In [37]:
clicksCandidate = clicksCandidate.merge(item_features, left_on='aid', right_index=True, how='left').fillna(-1)
clicksCandidate = clicksCandidate.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
clicksCandidate.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio
0,12899779,59625,13,12,0.0
1,12899779,1253524,279,212,0.021505
2,12899779,737445,959,600,0.016684
3,12899779,438191,3651,2305,0.033416
4,12899779,731692,133,61,0.203008


In [33]:
buysCandidate = buysCandidate.merge(item_features, left_on='aid', right_index=True, how='left').fillna(-1)
buysCandidate = buysCandidate.merge(user_features, left_on='session', right_index=True, how='left').fillna(-1)
buysCandidate.head()

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
0,12899779,59625,13,12,0.0,1,1,0.0
1,12899779,397451,138,69,0.23913,1,1,0.0
2,12899779,469285,45,27,0.066667,1,1,0.0
3,12899779,1493965,185,134,0.059459,1,1,0.0
4,12899779,438191,3651,2305,0.033416,1,1,0.0


In [34]:
! mkdir ../output/data4xgbAllTrain
data4xgb_path = '../output/data4xgbAllTrain/'

mkdir: cannot create directory ‘../output/data4xgbAllTrain’: File exists


In [65]:
clicksCandidate.to_parquet(data4xgb_path + 'data4xgb.pqt')

In [35]:
buysCandidate.to_parquet(data4xgb_path + 'data4xgb_buys.pqt')

### Predict

In [2]:
import xgboost as xgb
from sklearn.model_selection import GroupKFold

In [13]:
data4xgb_path = '../output/data4xgbAllTrain/'
predictType = 'clicks'
if predictType == 'clicks':
    data4xgb = pd.read_parquet(data4xgb_path + 'data4xgb.pqt')
else:
    data4xgb = pd.read_parquet(data4xgb_path + 'data4xgb_buys.pqt')
data4xgb

Unnamed: 0,session,aid,item_item_count,item_user_count,item_buy_ratio,user_user_count,user_item_count,user_buy_ratio
0,12899779,59625,13,12,0.000000,1,1,0.0
1,12899779,1253524,279,212,0.021505,1,1,0.0
2,12899779,737445,959,600,0.016684,1,1,0.0
3,12899779,438191,3651,2305,0.033416,1,1,0.0
4,12899779,731692,133,61,0.203008,1,1,0.0
...,...,...,...,...,...,...,...,...
83590145,14571581,594420,7650,4934,0.066928,1,1,0.0
83590146,14571581,1781934,605,393,0.077686,1,1,0.0
83590147,14571581,206561,2821,2180,0.040411,1,1,0.0
83590148,14571581,1196716,6267,3968,0.083613,1,1,0.0


In [14]:
xgbModelSavePath = '../output/savedModel/xgb/'
preds = np.zeros(len(data4xgb))
modelName = 'xgb_' + predictType
for fold in range(5):
    print('======= fold', fold, '===========')
    model = xgb.Booster()
    model.load_model(xgbModelSavePath + modelName + f'_fold{fold}.xgb')
    model.set_param({'predictor': 'gpu_predictor'})
    dtest = xgb.DMatrix(data=data4xgb.iloc[:, 2:-1])
    preds += model.predict(dtest)/5
predictions = data4xgb[['session','aid']].copy()
predictions['pred'] = preds
predictions



Unnamed: 0,session,aid,pred
0,12899779,59625,-2.756937
1,12899779,1253524,-1.884056
2,12899779,737445,-1.483246
3,12899779,438191,-1.091157
4,12899779,731692,1.271955
...,...,...,...
83590145,14571581,594420,-0.486081
83590146,14571581,1781934,-0.456283
83590147,14571581,206561,-1.036466
83590148,14571581,1196716,-1.198395


Arrange predictions

In [15]:
predictions = predictions.sort_values(['session','pred'], ascending=[True,False]).reset_index(drop=True)
predictions['n'] = predictions.groupby('session').aid.cumcount().astype('int8')
predictions = predictions.loc[predictions.n<20]

sub = predictions.groupby('session').aid.apply(list)
sub = sub.to_frame().reset_index()
sub.aid = sub.aid.apply(lambda x: " ".join(map(str,x)))
sub.columns = ['session_type','labels']
sub.session_type = sub.session_type.astype('str')+ '_' + predictType
sub

Unnamed: 0,session_type,labels
0,12899779_clicks,679602 731692 1790770 696438 448688 1700255 47...
1,12899780_clicks,231487 103974 736515 636813 77422 1125638 4487...
2,12899781_clicks,1242608 447645 1192169 918667 1836671 1767530 ...
3,12899782_clicks,476063 562753 834354 829180 1344773 779477 229...
4,12899783_clicks,1257009 198385 255297 1754419 1492009 1390935 ...
...,...,...
1671798,14571577_clicks,446995 447748 1190776 459882 631085 1768884 85...
1671799,14571578_clicks,1512002 231894 1575726 1285327 1662628 1109341...
1671800,14571579_clicks,1189853 702275 1498799 1550479 1796451 118487 ...
1671801,14571580_clicks,1652005 560878 32322 628568 1455166 682237 336...


In [16]:
subClicks = sub
subClicks

Unnamed: 0,session_type,labels
0,12899779_clicks,679602 731692 1790770 696438 448688 1700255 47...
1,12899780_clicks,231487 103974 736515 636813 77422 1125638 4487...
2,12899781_clicks,1242608 447645 1192169 918667 1836671 1767530 ...
3,12899782_clicks,476063 562753 834354 829180 1344773 779477 229...
4,12899783_clicks,1257009 198385 255297 1754419 1492009 1390935 ...
...,...,...
1671798,14571577_clicks,446995 447748 1190776 459882 631085 1768884 85...
1671799,14571578_clicks,1512002 231894 1575726 1285327 1662628 1109341...
1671800,14571579_clicks,1189853 702275 1498799 1550479 1796451 118487 ...
1671801,14571580_clicks,1652005 560878 32322 628568 1455166 682237 336...


In [12]:
subCarts = sub
subCarts

Unnamed: 0,session_type,labels
0,12899779_carts,679602 535414 1790770 273918 696438 689970 397...
1,12899780_carts,103974 932551 87442 1290032 455423 1712906 151...
2,12899781_carts,1242608 129797 447645 1681537 1836671 1248748 ...
3,12899782_carts,987399 1696036 975116 834354 406001 1099390 13...
4,12899783_carts,639338 987038 169050 1553537 1787028 1317341 2...
...,...,...
1671798,14571577_carts,1768884 53114 446995 464835 1190776 1273700 30...
1671799,14571578_carts,231894 178547 131032 1572352 393009 1264204 16...
1671800,14571579_carts,392471 581331 164523 398437 1370061 1750859 15...
1671801,14571580_carts,32322 887920 989688 560878 942479 1675403 1357...


In [6]:
subOrders = sub
subOrders

Unnamed: 0,session_type,labels
0,12899779_orders,679602 475447 696438 1790770 637538 742709 535...
1,12899780_orders,404612 103974 973453 455423 932551 1517468 874...
2,12899781_orders,1200570 1242608 199008 1248748 150294 447645 1...
3,12899782_orders,975116 562753 406001 530899 987399 1674956 476...
4,12899783_orders,1817895 198385 1729553 639338 1257009 1464627 ...
...,...,...
1671798,14571577_orders,30148 63870 1273700 459882 1289343 935830 4648...
1671799,14571578_orders,1403962 131032 664851 178547 1084758 393009 15...
1671800,14571579_orders,304799 832213 630181 739876 374843 581331 5532...
1671801,14571580_orders,391852 1357971 887920 473806 387358 54018 1314...


In [18]:
pred_df = pd.concat([subClicks, subCarts, subOrders])
pred_df.to_csv("../submissions/rerankWithXgb.csv", index=False)
pred_df

Unnamed: 0,session_type,labels
0,12899779_clicks,679602 731692 1790770 696438 448688 1700255 47...
1,12899780_clicks,231487 103974 736515 636813 77422 1125638 4487...
2,12899781_clicks,1242608 447645 1192169 918667 1836671 1767530 ...
3,12899782_clicks,476063 562753 834354 829180 1344773 779477 229...
4,12899783_clicks,1257009 198385 255297 1754419 1492009 1390935 ...
...,...,...
1671798,14571577_orders,30148 63870 1273700 459882 1289343 935830 4648...
1671799,14571578_orders,1403962 131032 664851 178547 1084758 393009 15...
1671800,14571579_orders,304799 832213 630181 739876 374843 581331 5532...
1671801,14571580_orders,391852 1357971 887920 473806 387358 54018 1314...
