In [1]:
# Version suffix of the co-visitation matrix files read further down
# (it appears as `_v{VER}_` in their file names).
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Adds the tqdm-backed `progress_apply` family to pandas objects.
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

# Let displayed frames show up to 500 rows / columns before being truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

# Start pandarallel with 4 worker processes and progress bars. With
# use_memory_fs=False data is sent to the workers through pipes rather than
# through a memory file system (see the INFO lines printed by this cell).
pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Candidate Generation

In [2]:
# Which data set to build candidates for:
#   "local"  -> the validation split (val.parquet) with the "local" co-visitation matrices
#   "kaggle" -> the test split (test.parquet) with the "kaggle" co-visitation matrices
GENERATE_FOR = "kaggle"
# Number of candidate aids produced per session by each suggest_* function.
# It is also part of the co-visitation file names (`top_{CANDIDATE_COUNT}_...`).
CANDIDATE_COUNT = 100

In [3]:
# Integer code stored in the `type` column for each kind of event.
type_labels = dict(
    clicks=0,
    carts=1,
    orders=2,
)

In [4]:
# Load the sessions to generate candidates for and remember which family of
# co-visitation matrices ("local" or "kaggle") belongs to them.
if GENERATE_FOR == "local":
    target_df = pd.read_parquet("./splitted_raw_data/val.parquet")
    target_covisit = "local"
elif GENERATE_FOR == "kaggle":
    target_df = pd.read_parquet("./splitted_raw_data/test.parquet")
    target_covisit = "kaggle"
else:
    # Without this branch a typo in GENERATE_FOR only surfaces later as a
    # NameError on target_df / target_covisit.
    raise ValueError(
        f'GENERATE_FOR must be "local" or "kaggle", got {GENERATE_FOR!r}'
    )

In [5]:
def pqt_to_dict(df):
    """Turn a long-format co-visitation frame into ``{aid_x: [aid_y, ...]}``.

    Rows are grouped on ``aid_x`` and the ``aid_y`` values of every group are
    collected into a list, in the order the rows have in ``df``.
    """
    neighbours_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()

# Number of parquet pieces the clicks and carts_orders matrices are read from.
DISK_PIECES = 4


def _load_covisit(kind, n_pieces):
    """Read `n_pieces` parquet pieces of one co-visitation matrix into a single dict.

    kind     -- matrix name as it appears in the file name
                ('clicks', 'carts_orders' or 'buy2buy').
    n_pieces -- number of pieces to read; they are numbered 0 .. n_pieces - 1.

    Pieces are merged in index order with dict.update, so a key that occurs in
    a later piece replaces the entry coming from an earlier piece.
    """
    folder = f'../raw_data/{target_covisit}_covisitation'
    merged = {}
    for k in range(n_pieces):
        piece_path = f'{folder}/{target_covisit}_top_{CANDIDATE_COUNT}_{kind}_v{VER}_{k}.pqt'
        merged.update(pqt_to_dict(pd.read_parquet(piece_path)))
    return merged


# LOAD THREE CO-VISITATION MATRICES
# The variables are called top_20_*, but the files they come from are the
# top_{CANDIDATE_COUNT} matrices; the names are kept because later cells use them.
top_20_clicks = _load_covisit('clicks', DISK_PIECES)
top_20_buys = _load_covisit('carts_orders', DISK_PIECES)
# Only piece 0 of buy2buy is read.
# NOTE(review): presumably that matrix is stored as a single piece -- confirm.
top_20_buy2buy = _load_covisit('buy2buy', 1)

print('Here are size of our 3 co-visitation matrices:')
print( len( top_20_clicks ), len( top_20_buy2buy ), len( top_20_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166


In [6]:
def _most_frequent_aids(events, event_type, limit):
    """Return the `limit` most frequent aids among the rows of `events` whose `type` equals `event_type`."""
    aids_of_type = events.loc[events['type'] == event_type, 'aid']
    return aids_of_type.value_counts().index.values[:limit]


# Most frequent aids of the target sessions per event type (0, 1, 2); the
# suggest_* functions use them to fill up candidate lists that are too short.
top_clicks = _most_frequent_aids(target_df, 0, CANDIDATE_COUNT)
top_carts = _most_frequent_aids(target_df, 1, CANDIDATE_COUNT)
top_orders = _most_frequent_aids(target_df, 2, CANDIDATE_COUNT)

In [7]:
# Score multiplier per event type code: 0 (clicks) -> 1, 1 (carts) -> 5, 2 (orders) -> 4.
type_weight_multipliers = {0: 1, 1: 5, 2: 4}

def suggest_clicks(df, candidate_count=None, covisit_clicks=None, popular_aids=None):
    """Build the click candidates for one session.

    Parameters
    ----------
    df : DataFrame
        Events of a single session with `aid` and `type` columns. Rows are
        expected oldest-first (the caller sorts by session and ts).
    candidate_count : int, optional
        Number of candidates wanted; defaults to the global CANDIDATE_COUNT.
    covisit_clicks : dict, optional
        aid -> list of co-visited aids; defaults to the global top_20_clicks.
    popular_aids : sequence, optional
        Fallback aids in priority order; defaults to the global top_clicks.

    Returns
    -------
    list
        At most `candidate_count` distinct aids: the session's own aids first
        (most recent first), then co-visitation neighbours, then popular aids.
        Sessions with at least `candidate_count` distinct aids instead return
        their own aids ranked by recency and event type.
    """
    n = CANDIDATE_COUNT if candidate_count is None else candidate_count
    covisit = top_20_clicks if covisit_clicks is None else covisit_clicks
    popular = top_clicks if popular_aids is None else popular_aids

    # USER HISTORY AIDS AND TYPES
    aids = df.aid.tolist()
    types = df.type.tolist()
    # Distinct aids, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= n:
        # Weights grow from 2**0.1 - 1 for the first row to 1 for the last row.
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid, w, t in zip(aids, weights, types):
            scores[aid] += w * type_weight_multipliers[t]
        return [aid for aid, _ in scores.most_common(n)]

    # USE "CLICKS" CO-VISITATION MATRIX
    seen = set(unique_aids)  # set lookup instead of scanning the list for every candidate
    neighbours = [nb for aid in unique_aids if aid in covisit for nb in covisit[aid]]
    # RERANK CANDIDATES by how often they were proposed, skipping aids the session already has
    ranked = [aid for aid, _ in Counter(neighbours).most_common(n) if aid not in seen]
    result = unique_aids + ranked[:n - len(unique_aids)]

    # FILL UP WITH THE MOST POPULAR CLICKED AIDS.
    # Aids already in the result are skipped: appending them again would yield
    # duplicate candidates (and duplicate rows once the lists are exploded).
    seen.update(result)
    for aid in popular:
        if len(result) >= n:
            break
        if aid not in seen:
            result.append(aid)
            seen.add(aid)
    return result

In [8]:
def suggest_carts(df, candidate_count=None, covisit_clicks=None, covisit_buys=None, popular_aids=None):
    """Build the cart candidates for one session.

    Parameters
    ----------
    df : DataFrame
        Events of a single session with `aid` and `type` columns. Rows are
        expected oldest-first (the caller sorts by session and ts).
    candidate_count : int, optional
        Number of candidates wanted; defaults to the global CANDIDATE_COUNT.
    covisit_clicks : dict, optional
        aid -> list of co-visited aids; defaults to the global top_20_clicks.
    covisit_buys : dict, optional
        aid -> list of co-visited aids; defaults to the global top_20_buys.
    popular_aids : sequence, optional
        Fallback aids in priority order; defaults to the global top_carts.

    Returns
    -------
    list
        At most `candidate_count` distinct aids: the session's own aids first
        (most recent first), then co-visitation neighbours, then popular aids.
        Sessions with at least `candidate_count` distinct aids instead return
        aids ranked by a recency / event-type score.
    """
    n = CANDIDATE_COUNT if candidate_count is None else candidate_count
    clicks_map = top_20_clicks if covisit_clicks is None else covisit_clicks
    buys_map = top_20_buys if covisit_buys is None else covisit_buys
    popular = top_carts if popular_aids is None else popular_aids

    # User history aids and types
    aids = df.aid.tolist()
    types = df.type.tolist()
    # Distinct aids, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # Rerank candidates using weights
    if len(unique_aids) >= n:
        # Weights grow from 2**0.5 - 1 for the first row to 1 for the last row.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()

        # Rerank based on repeat items and types of items
        for aid, w, t in zip(aids, weights, types):
            scores[aid] += w * type_weight_multipliers[t]

        # Distinct aids of the click (0) and cart (1) events, most recent first.
        # NOTE(review): suggest_buys filters on carts/orders (1, 2) instead --
        # confirm that clicks/carts is intended here.
        clicked_or_carted = df.loc[(df['type'] == 0) | (df['type'] == 1)]
        seed_aids = list(dict.fromkeys(clicked_or_carted.aid.tolist()[::-1]))

        # Small bonus for every proposal from the carts-orders co-visitation matrix
        for aid in seed_aids:
            if aid in buys_map:
                for neighbour in buys_map[aid]:
                    scores[neighbour] += 0.1
        return [aid for aid, _ in scores.most_common(n)]

    # Use "clicks" and "cart order" co-visitation matrices
    seen = set(unique_aids)  # set lookup instead of scanning the list for every candidate
    from_clicks = [nb for aid in unique_aids if aid in clicks_map for nb in clicks_map[aid]]
    from_buys = [nb for aid in unique_aids if aid in buys_map for nb in buys_map[aid]]

    # RERANK CANDIDATES by how often they were proposed, skipping aids the session already has
    ranked = [aid for aid, _ in Counter(from_clicks + from_buys).most_common(n) if aid not in seen]
    result = unique_aids + ranked[:n - len(unique_aids)]

    # FILL UP WITH THE MOST POPULAR CARTED AIDS.
    # Aids already in the result are skipped: appending them again would yield
    # duplicate candidates (and duplicate rows once the lists are exploded).
    seen.update(result)
    for aid in popular:
        if len(result) >= n:
            break
        if aid not in seen:
            result.append(aid)
            seen.add(aid)
    return result

In [9]:
def suggest_buys(df, candidate_count=None, covisit_buys=None, covisit_buy2buy=None, popular_aids=None):
    """Build the order candidates for one session.

    Parameters
    ----------
    df : DataFrame
        Events of a single session with `aid` and `type` columns. Rows are
        expected oldest-first (the caller sorts by session and ts).
    candidate_count : int, optional
        Number of candidates wanted; defaults to the global CANDIDATE_COUNT.
    covisit_buys : dict, optional
        aid -> list of co-visited aids; defaults to the global top_20_buys.
    covisit_buy2buy : dict, optional
        aid -> list of co-visited aids; defaults to the global top_20_buy2buy.
    popular_aids : sequence, optional
        Fallback aids in priority order; defaults to the global top_orders.

    Returns
    -------
    list
        At most `candidate_count` distinct aids: the session's own aids first
        (most recent first), then co-visitation neighbours, then popular aids.
        Sessions with at least `candidate_count` distinct aids instead return
        aids ranked by a recency / event-type score.
    """
    n = CANDIDATE_COUNT if candidate_count is None else candidate_count
    buys_map = top_20_buys if covisit_buys is None else covisit_buys
    buy2buy_map = top_20_buy2buy if covisit_buy2buy is None else covisit_buy2buy
    popular = top_orders if popular_aids is None else popular_aids

    # USER HISTORY AIDS AND TYPES
    aids = df.aid.tolist()
    types = df.type.tolist()
    # UNIQUE AIDS AND UNIQUE BUYS (both most recent first); buys are the
    # cart (1) and order (2) events.
    unique_aids = list(dict.fromkeys(aids[::-1]))
    buy_events = df.loc[(df['type'] == 1) | (df['type'] == 2)]
    unique_buys = list(dict.fromkeys(buy_events.aid.tolist()[::-1]))

    # RERANK CANDIDATES USING WEIGHTS
    if len(unique_aids) >= n:
        # Weights grow from 2**0.5 - 1 for the first row to 1 for the last row.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        scores = Counter()
        # RERANK BASED ON REPEAT ITEMS AND TYPE OF ITEMS
        for aid, w, t in zip(aids, weights, types):
            scores[aid] += w * type_weight_multipliers[t]
        # Small bonus for every proposal from the "BUY2BUY" co-visitation matrix
        for aid in unique_buys:
            if aid in buy2buy_map:
                for neighbour in buy2buy_map[aid]:
                    scores[neighbour] += 0.1
        return [aid for aid, _ in scores.most_common(n)]

    seen = set(unique_aids)  # set lookup instead of scanning the list for every candidate
    # USE "CART ORDER" CO-VISITATION MATRIX
    from_buys = [nb for aid in unique_aids if aid in buys_map for nb in buys_map[aid]]
    # USE "BUY2BUY" CO-VISITATION MATRIX
    from_buy2buy = [nb for aid in unique_buys if aid in buy2buy_map for nb in buy2buy_map[aid]]
    # RERANK CANDIDATES by how often they were proposed, skipping aids the session already has
    ranked = [aid for aid, _ in Counter(from_buys + from_buy2buy).most_common(n) if aid not in seen]
    result = unique_aids + ranked[:n - len(unique_aids)]

    # FILL UP WITH THE MOST POPULAR ORDERED AIDS.
    # Aids already in the result are skipped: appending them again would yield
    # duplicate candidates (and duplicate rows once the lists are exploded).
    seen.update(result)
    for aid in popular:
        if len(result) >= n:
            break
        if aid not in seen:
            result.append(aid)
            seen.add(aid)
    return result

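# Generate and Save Candidates
Running the candidate functions over the test sessions with a plain Pandas groupby-apply is slow, so the cells below use pandarallel's `parallel_apply` to spread the sessions across several worker processes. The resulting candidates are written to parquet files; no submission CSV is created in this notebook.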

In [10]:
# Sort once so that the rows of every session are in chronological order, and
# reuse the sorted frame for all three generators instead of re-sorting
# target_df with the same keys for each of them.
sessions_sorted = target_df.sort_values(["session", "ts"])

# One list of candidate aids per session, for clicks, carts and orders.
pred_df_clicks = sessions_sorted.groupby(["session"]).parallel_apply(
    lambda x: suggest_clicks(x)
)

pred_df_carts = sessions_sorted.groupby(["session"]).parallel_apply(
    lambda x: suggest_carts(x)
)

pred_df_buys = sessions_sorted.groupby(["session"]).parallel_apply(
    lambda x: suggest_buys(x)
)

  iterator = iter(dataframe_groupby)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417951), Label(value='0 / 417951')…

  iterator = iter(dataframe_groupby)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417951), Label(value='0 / 417951')…

  iterator = iter(dataframe_groupby)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=417951), Label(value='0 / 417951')…

In [11]:
def _to_candidate_frame(per_session_lists):
    """Wrap a parallel_apply result in a frame with an `aid` column and move its index into columns."""
    return pd.DataFrame(per_session_lists, columns=["aid"]).reset_index()


# The order candidates come from suggest_buys (pred_df_buys).
clicks_pred_df = _to_candidate_frame(pred_df_clicks)
orders_pred_df = _to_candidate_frame(pred_df_buys)
carts_pred_df = _to_candidate_frame(pred_df_carts)

In [12]:
# Create the output folder first, so the results of the long-running cells
# above are not lost to a missing directory.
os.makedirs("./candidate_data", exist_ok=True)

# explode() turns each per-session list into one row per candidate aid before saving.
clicks_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_clicks.parquet")
orders_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_orders.parquet")
carts_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_carts.parquet")