In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd, numpy as np
from tqdm.auto import tqdm
import os, sys, pickle, glob, gc, shutil
import math
from pathlib import Path
from collections import Counter
import itertools
from eventsSuggesterNew import ClicksSuggester, BuysSuggester, CartsSuggester
import gensim

In [2]:
inputPath = '../input/newSplited/'
outputPath = '../output/newSplited/'
! mkdir {outputPath}

mkdir: cannot create directory ‘../output/newSplited/’: File exists


In [3]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

# Generate candidates

In [4]:
candidatesNum = 100
SET = 2
VER = 6
note_covisit = 'covisit_20_20_20'
note_candidate = 'suggester_addLast'

In [5]:
def load_test(path):    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(path)):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

def saveChunk(data, chunkSize, path, predType):
    n = len(data)
    i = 0
    while(i*chunkSize < n):
        sub = data[i*chunkSize:(i+1)*chunkSize]
        sub.to_parquet(path + f'{predType}_{i}.pqt')
        i += 1

## Load testA data

In [7]:
if SET == 1:
    testA = load_test('../input/split_2-1_pqt/test_parquets/*')
elif SET == 2:
    testA = pd.read_parquet('../input/splited/test.parquet')
elif SET == 3:
    testA = load_test('../input/parquets/test_parquets/*')
testA

Unnamed: 0,session,aid,ts,type
0,11098528,11830,1661119200,0
1,11098529,1105029,1661119200,0
2,11098530,264500,1661119200,0
3,11098530,264500,1661119288,0
4,11098530,409236,1661119369,0
...,...,...,...,...
7683572,12899774,33035,1661723968,0
7683573,12899775,1743151,1661723970,0
7683574,12899776,548599,1661723972,0
7683575,12899777,384045,1661723976,0


## Load covistation matrix

In [8]:
covisitTypes = ['clicks', 'click2click', 'cartsOrders', 'buy2buy', 'click2cart', 'click2order']
topN = {'clicks': 20, 'click2click': 20, 'cartsOrders': 20, 'buy2buy': 20, 'click2cart': 20, 'click2order': 20, 'buy2buy_q': 20}
diskPart = {'clicks': 8, 'click2click': 8, 'cartsOrders': 8, 'buy2buy': 2, 'click2cart': 8, 'click2order': 8, 'buy2buy_q': 2}
covisitMat = {}

In [9]:
coVisitSaveFolder = outputPath + f'/coVisit/set{SET}/top20_20_20/'

In [None]:
for covisitType in covisitTypes:
    for k in range(0, diskPart[covisitType]):
        covisitMat[covisitType] = {}
        covisitMat[covisitType].update( pqt_to_dict( pd.read_parquet(coVisitSaveFolder + f'top_{topN[covisitType]}_{covisitType}_v{VER}_{k}.pqt') ) )

## find most frequently item

In [11]:
top_clicks = testA.loc[testA['type']==0,'aid'].value_counts().index.values[:candidatesNum].astype(np.int32)
top_carts = testA.loc[testA['type'] == 1,'aid'].value_counts().index.values[:candidatesNum].astype(np.int32)
top_orders = testA.loc[testA['type'] == 2,'aid'].value_counts().index.values[:candidatesNum].astype(np.int32)

## Load trained aid2vec model

In [11]:
aid2vec_model = gensim.models.KeyedVectors.load_word2vec_format(f'../output/newSplited/savedModel/set_{SET}/otto_aid2vec_5d.bin', binary=True)

# suggest

In [None]:
candidatesSavePath = outputPath + f'candidates/set{SET}_top_{candidatesNum}/{note_candidate}/'
try: 
    os.makedirs(candidatesSavePath) 
except OSError as error: 
    print(error)

In [None]:
suggesters = {
    'clicks': ClicksSuggester(top_clicks, covisitMat['click2click'], covisitMat['clicks'], aid2vec_model), 
    'carts': CartsSuggester(top_carts, covisitMat['buy2buy'], covisitMat['cartsOrders'], covisitMat['click2cart'], aid2vec_model), 
    'orders': BuysSuggester(top_orders, covisitMat['buy2buy'], covisitMat['cartsOrders'], covisitMat['click2order'], aid2vec_model),
}

pred_dfs = {}
for predType in suggesters:
    pred_dfs[predType] = testA.sort_values(["session", "ts"]).groupby(["session"]).progress_apply(lambda x: suggesters[predType].suggest(x, candidatesNum)).to_frame().reset_index()
    print(pred_dfs[predType])
    aids = pred_dfs[predType].labels.explode().astype('int32').rename('aid')
    candidate = pred_dfs[predType][['session']].astype('int32')
    candidate = candidate.merge(aids, left_index=True, right_index=True, how='left').reset_index(drop=True)
    saveChunk(candidate, 650000 * 100, candidatesSavePath, predType)


## Check recall rate of candidates (for set == 2 or set ==1)

In [None]:
for predType in pred_dfs:
    pred_dfs[predType]['type'] = predType
    print(pred_dfs[predType])
pred_df = pd.concat(list(pred_dfs.values())).reset_index(drop=True)
print(pred_df)

In [None]:
if SET == 1:
    test_labels = pd.read_parquet('../input/split_2-1_pqt/test_labels.parquet')
elif SET == 2:
    test_labels = pd.read_parquet('../input/splited/test_labels.parquet')

In [None]:
test_labels = test_labels.merge(pred_df, how='left', on=['session', 'type'])
test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels[:20]))), axis=1)
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
test_labels

Unnamed: 0,session,type,ground_truth,labels,hits,gt_count
0,11098528,clicks,[1679529],"[11830, 588923, 876129, 1732105, 307904, 20535...",0,1
1,11098528,carts,[1199737],"[11830, 588923, 876129, 1732105, 307904, 88450...",0,1
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369...","[11830, 588923, 876129, 1732105, 307904, 88450...",1,11
3,11098529,clicks,[1105029],"[1105029, 297916, 459126, 217742, 1717995, 138...",1,1
4,11098530,orders,[409236],"[409236, 264500, 1268065, 963957, 583026, 2108...",1,1
...,...,...,...,...,...,...
2212687,12899774,clicks,[1399483],"[33035, 771913, 31490, 1399483, 1053112, 86143...",1,1
2212688,12899775,clicks,[1760714],"[1743151, 155954, 1550204, 1760714, 1022572, 3...",1,1
2212689,12899776,clicks,[1737908],"[548599, 1150130, 793373, 1401030, 1440959, 51...",0,1
2212690,12899777,clicks,[384045],"[384045, 1308634, 1450928, 1281056, 1688215, 1...",1,1


In [None]:
print('Set:', SET, 'Top:', candidatesNum)
recall_per_type = test_labels.groupby(['type'])['hits'].sum() / test_labels.groupby(['type'])['gt_count'].sum()
print(recall_per_type)
score = (recall_per_type * pd.Series({'clicks': 0.10, 'carts': 0.30, 'orders': 0.60})).sum()
print(score)

Set: 2 Top: 100
type
carts     0.419272
clicks    0.536148
orders    0.652270
dtype: float64
0.5707582388019186


## Submit only covisitation candidates (for SET == 3)

In [None]:
for predType in pred_dfs:
    pred_dfs[predType].session = pred_dfs[predType].session.apply(lambda x: str(x) + '_' + predType)
    pred_dfs[predType].labels = pred_dfs[predType].labels.apply(lambda x: x[:20])
    print(pred_dfs[predType])

In [None]:
pred_df = pd.concat(list(pred_dfs.values()))
pred_df.columns = ["session_type", "labels"]
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df

In [None]:
pred_df.to_csv(f'../submissions/covisitCandidates_{note_candidate}.csv', index=False)

# Validate on small dataset

In [11]:
SET = 2
aid2vec_model = gensim.models.KeyedVectors.load_word2vec_format(f'../output/newSplited/savedModel/set_{SET}/otto_aid2vec_5d.bin', binary=True)

In [322]:
testA = pd.read_parquet('../input/splited/test.parquet')
top_clicks = testA.loc[testA['type']==0,'aid'].value_counts().index.values[:100].astype(np.int32)
top_carts = testA.loc[testA['type'] == 1,'aid'].value_counts().index.values[:100].astype(np.int32)
top_orders = testA.loc[testA['type'] == 2,'aid'].value_counts().index.values[:100].astype(np.int32)

In [341]:
import random
from eventsSuggesterNew import ClicksSuggester, BuysSuggester, CartsSuggester

testA = pd.read_parquet('../input/splited/test.parquet')
random.seed(10)
uniqueUser = list(testA.session.unique())
keepUsers = random.sample(uniqueUser, int(len(uniqueUser)/10))
testA = testA[testA.session.isin(keepUsers)].reset_index(drop=True)

predType = 'orders'
suggesters = {
    'clicks': ClicksSuggester(top_clicks, covisitMat['click2click'], covisitMat['clicks'], aid2vec_model), 
    'carts': CartsSuggester(top_carts, covisitMat['buy2buy'], covisitMat['cartsOrders'], covisitMat['click2cart'], aid2vec_model), 
    'orders': BuysSuggester(top_orders, covisitMat['buy2buy'], covisitMat['cartsOrders'], covisitMat['click2order'], aid2vec_model),
}
suggester = suggesters[predType]

3

In [342]:
tqdm.pandas()
pred_df = testA.sort_values(["session", "ts"]).groupby(["session"]).progress_apply(lambda x: suggester.suggest(x, candidatesNum)).to_frame().reset_index()
pred_df.columns = ['session', 'labels']
pred_df['session'] = pred_df['session'].astype('int32')
pred_df

A Jupyter Widget

Unnamed: 0,session,labels
0,11098529,"[1105029, 1135201, 295362, 441348, 1049489, 51..."
1,11098540,"[1545215, 1277598, 896787, 789082, 1472725, 10..."
2,11098553,"[314297, 1061776, 399992, 1783610, 1125095, 57..."
3,11098557,"[1234327, 719075, 1471333, 1732776, 53060, 527..."
4,11098559,"[907069, 84703, 1003267, 607328, 1381738, 5999..."
...,...,...
180120,12899695,"[206768, 1712543, 1005385, 1572478, 413094, 18..."
180121,12899714,"[559593, 138753, 683268, 472892, 1839548, 5541..."
180122,12899719,"[1217083, 250637, 132109, 226025, 508883, 1156..."
180123,12899763,"[1539032, 750901, 95488, 873385, 1854775, 3695..."


In [343]:
pred_df['type'] = predType
test_labels = pd.read_parquet('../input/splited/test_labels.parquet')
type_labels = test_labels[test_labels['type'] == predType]
type_labels = type_labels.merge(pred_df, how='left', on=['session', 'type']).dropna()
type_labels['hits_all'] = type_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
type_labels['hits_20'] = type_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels[:20]))), axis=1)
type_labels['gt_count'] = type_labels.ground_truth.str.len().clip(0,20)
recall_per_type_all = type_labels['hits_all'].sum() / type_labels['gt_count'].sum()
recall_per_type_20 = type_labels['hits_20'].sum() / type_labels['gt_count'].sum()
print('recall:', recall_per_type_all)
print('recall@20:', recall_per_type_20)

0.6915200924380537