In [None]:
# Version tag embedded in the names of the co-visitation parquet files read later in this notebook.
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Enable tqdm's pandas integration.
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

# Allow up to 500 rows / columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

# Start pandarallel with 4 workers and progress bars; the in-memory file system is switched off.
pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

# Candidate Generation

In [None]:
GENERATE_FOR = "kaggle" # "local" (validation split) or "kaggle" (test split)
CANDIDATE_COUNT = 100 # size of the candidate set generated per session; also part of the co-visitation file names

In [None]:
# Integer code of each event type; the same codes (0, 1, 2) are compared against target_df['type'] further down.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [None]:
# Load the session log that candidates are generated for and remember which
# family of co-visitation matrices ("local" or "kaggle") belongs to it.
if GENERATE_FOR == "local":
    target_df = pd.read_parquet("./splitted_raw_data/val.parquet")
    target_covisit = "local"
elif GENERATE_FOR == "kaggle":
    target_df = pd.read_parquet("./splitted_raw_data/test.parquet")
    target_covisit = "kaggle"
else:
    # Fail fast: without this branch a typo in GENERATE_FOR would leave
    # target_df / target_covisit undefined and only show up later as a NameError.
    raise ValueError(f'GENERATE_FOR must be "local" or "kaggle", got {GENERATE_FOR!r}')

In [None]:
def pqt_to_dict(df):
    '''
    Collapse a co-visitation frame into a lookup dict.

    df: DataFrame with the columns `aid_x` and `aid_y`.
    Returns {aid_x: [aid_y, ...]}, i.e. every `aid_x` mapped to the list of
    the `aid_y` values found in its rows.
    '''
    related_per_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return related_per_aid.to_dict()

# Number of parquet shards the clicks and carts/orders matrices are split into.
DISK_PIECES = 4

# Load the three co-visitation matrices as {aid_x: [aid_y, ...]} dicts.
# NOTE: the variables keep the historical "top_20_" prefix, but the files that
# are read are the top-CANDIDATE_COUNT versions.

# 1. clicks: merge all shards, starting with shard 0
top_20_clicks = {}
for k in range(DISK_PIECES):
    clicks_piece = pd.read_parquet(f'../raw_data/{target_covisit}_covisitation/{target_covisit}_top_{CANDIDATE_COUNT}_clicks_v{VER}_{k}.pqt')
    top_20_clicks.update(pqt_to_dict(clicks_piece))

# 2. buys (carts/orders): merge all shards, starting with shard 0
top_20_buys = {}
for k in range(DISK_PIECES):
    buys_piece = pd.read_parquet(f'../raw_data/{target_covisit}_covisitation/{target_covisit}_top_{CANDIDATE_COUNT}_carts_orders_v{VER}_{k}.pqt')
    top_20_buys.update(pqt_to_dict(buys_piece))

# 3. buy2buy: only shard 0 is read
buy2buy_piece = pd.read_parquet(f'../raw_data/{target_covisit}_covisitation/{target_covisit}_top_{CANDIDATE_COUNT}_buy2buy_v{VER}_0.pqt')
top_20_buy2buy = pqt_to_dict(buy2buy_piece)

print('Here are size of our 3 co-visitation matrices:')
print(len(top_20_clicks), len(top_20_buy2buy), len(top_20_buys))

# Shape of each matrix: {aid_x1: [aid_y1, aid_y2, aid_y3, ...], ...}, for example
# {0: [532042, 643097,  1735605, 1848174, 706401,  1363081, 1546770, 1670582, 1040235, 1114899, 1798779, 341490,  1211854, 1350484, 326027],
#  1: [28092,  1533875, 645003,  1234826, 1815894, 590833,  1835190, 748990,  166683,  466841,  408008,  1838453, 785492,  1323311, 141693],
#  ...}

In [None]:
# For each event type (0 = clicks, 1 = carts, 2 = orders) keep the CANDIDATE_COUNT
# aids with the most events of that type in target_df, most frequent first.
top_clicks, top_carts, top_orders = (
    target_df.loc[target_df['type'] == event_code, 'aid'].value_counts().index.values[:CANDIDATE_COUNT]
    for event_code in (0, 1, 2)
)

In [None]:
# Weight applied to an event according to its type code (0 = clicks, 1 = carts, 2 = orders).
type_weight_multipliers = {0: 1, 1: 5, 2: 4}

def suggest_clicks(df):
    '''
    Build the click candidates for one session.

    df: events of a single session, sorted by ts ascending; the columns
        `aid` and `type` are read.

    Uses the module-level CANDIDATE_COUNT, top_20_clicks (clicks
    co-visitation dict) and top_clicks (globally most clicked aids).

    Returns a list of distinct aids, at most CANDIDATE_COUNT long:
      * session with >= CANDIDATE_COUNT distinct aids: the session's own aids
        re-ranked by recency weight * type weight;
      * otherwise: the session's aids (most recent first), then co-visited
        aids, then globally popular clicks that are not in the list yet.
    '''
    aids = df.aid.tolist()    # session aids, ts ascending
    types = df.type.tolist()  # matching event types, ts ascending
    unique_aids = list(dict.fromkeys(aids[::-1]))  # de-duplicated aids, most recent first

    if len(unique_aids) >= CANDIDATE_COUNT:
        # Recency weights grow from 2**0.1 - 1 to 1: the earlier the event, the smaller its weight.
        weights = np.logspace(0.1, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # Re-rank on repeated items and on event type: recency weight * type weight.
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        return [aid for aid, _ in aids_temp.most_common(CANDIDATE_COUNT)]

    # Set for O(1) membership tests instead of scanning the list for every candidate.
    seen = set(unique_aids)

    # Aids co-visited with the session's aids, ranked by how often they are suggested.
    aids2 = itertools.chain.from_iterable(
        top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks
    )
    top_aids2 = [aid2 for aid2, _ in Counter(aids2).most_common(CANDIDATE_COUNT) if aid2 not in seen]
    result = unique_aids + top_aids2[:CANDIDATE_COUNT - len(unique_aids)]
    seen.update(result)

    # Pad with the globally most clicked aids, skipping aids that are already
    # present so the candidate list never contains the same aid twice.
    for aid in top_clicks:
        if len(result) >= CANDIDATE_COUNT:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)
    return result

In [None]:
def suggest_carts(df):
    '''
    Build the cart candidates for one session.

    df: events of a single session, sorted by ts ascending; the columns
        `aid` and `type` are read.

    Uses the module-level CANDIDATE_COUNT, type_weight_multipliers,
    top_20_clicks, top_20_buys (co-visitation dicts) and top_carts
    (globally most carted aids).

    Returns a list of distinct aids, at most CANDIDATE_COUNT long:
      * session with >= CANDIDATE_COUNT distinct aids: aids ranked by
        recency weight * type weight, where every aid suggested by
        top_20_buys for the session's click/cart aids gets +0.1 per suggestion;
      * otherwise: the session's aids (most recent first), then aids
        co-visited via top_20_clicks / top_20_buys, then globally popular
        carts that are not in the list yet.
    '''
    # User history aids and types
    aids = df.aid.tolist()    # session aids, ts ascending
    types = df.type.tolist()  # matching event types, ts ascending

    # Unique aids of the whole session, most recent first
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Unique aids of the click (0) and cart (1) events only, most recent first.
    # The name `unique_buys` is kept from the original code.
    click_cart_events = df.loc[(df['type'] == 0) | (df['type'] == 1)]
    unique_buys = list(dict.fromkeys(click_cart_events.aid.tolist()[::-1]))

    if len(unique_aids) >= CANDIDATE_COUNT:
        # Recency weights grow from 2**0.5 - 1 to 1 along the session.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()

        # Re-rank on repeated items and on event type: recency weight * type weight.
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]

        # Every aid that top_20_buys relates to a click/cart aid gets +0.1 per occurrence.
        for aid in itertools.chain.from_iterable(
            top_20_buys[aid] for aid in unique_buys if aid in top_20_buys
        ):
            aids_temp[aid] += 0.1
        return [aid for aid, _ in aids_temp.most_common(CANDIDATE_COUNT)]

    # Set for O(1) membership tests instead of scanning the list for every candidate.
    seen = set(unique_aids)

    # Aids related to the session's aids in the clicks matrix ...
    aids1 = list(itertools.chain.from_iterable(
        top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks
    ))
    # ... and in the carts/orders matrix.
    aids2 = list(itertools.chain.from_iterable(
        top_20_buys[aid] for aid in unique_aids if aid in top_20_buys
    ))

    # Most frequently suggested aids that are not part of the session yet.
    top_aids2 = [aid2 for aid2, _ in Counter(aids1 + aids2).most_common(CANDIDATE_COUNT) if aid2 not in seen]
    result = unique_aids + top_aids2[:CANDIDATE_COUNT - len(unique_aids)]
    seen.update(result)

    # Pad with the globally most carted aids, skipping aids that are already
    # present so the candidate list never contains the same aid twice.
    for aid in top_carts:
        if len(result) >= CANDIDATE_COUNT:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)
    return result

In [None]:
def suggest_buys(df):
    '''
    Build the order candidates for one session.

    df: events of a single session, sorted by ts ascending; the columns
        `aid` and `type` are read.

    Uses the module-level CANDIDATE_COUNT, type_weight_multipliers,
    top_20_buys, top_20_buy2buy (co-visitation dicts) and top_orders
    (globally most ordered aids).

    Returns a list of distinct aids, at most CANDIDATE_COUNT long:
      * session with >= CANDIDATE_COUNT distinct aids: aids ranked by
        recency weight * type weight, where every aid suggested by
        top_20_buy2buy for the session's cart/order aids gets +0.1 per suggestion;
      * otherwise: the session's aids (most recent first), then aids
        co-visited via top_20_buys / top_20_buy2buy, then globally popular
        orders that are not in the list yet.
    '''
    # User history aids and types
    aids = df.aid.tolist()    # session aids, ts ascending
    types = df.type.tolist()  # matching event types, ts ascending

    # Unique aids of the whole session, most recent first
    unique_aids = list(dict.fromkeys(aids[::-1]))
    # Unique aids of the cart (1) and order (2) events only, most recent first
    buy_events = df.loc[(df['type'] == 1) | (df['type'] == 2)]
    unique_buys = list(dict.fromkeys(buy_events.aid.tolist()[::-1]))

    if len(unique_aids) >= CANDIDATE_COUNT:
        # Recency weights grow from 2**0.5 - 1 to 1: the more recent, the larger the weight.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        # Recency weight * type weight
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # Every aid that top_20_buy2buy relates to a cart/order aid gets +0.1 per occurrence.
        for aid in itertools.chain.from_iterable(
            top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
        ):
            aids_temp[aid] += 0.1
        # Keep the CANDIDATE_COUNT highest-weighted aids.
        return [aid for aid, _ in aids_temp.most_common(CANDIDATE_COUNT)]

    # Set for O(1) membership tests instead of scanning the list for every candidate.
    seen = set(unique_aids)

    # Aids related to the session's aids in the carts/orders matrix ...
    aids2 = list(itertools.chain.from_iterable(
        top_20_buys[aid] for aid in unique_aids if aid in top_20_buys
    ))
    # ... and aids related to the session's cart/order aids in the buy2buy matrix.
    aids3 = list(itertools.chain.from_iterable(
        top_20_buy2buy[aid] for aid in unique_buys if aid in top_20_buy2buy
    ))

    # Most frequently suggested aids that are not part of the session yet.
    top_aids2 = [aid2 for aid2, _ in Counter(aids2 + aids3).most_common(CANDIDATE_COUNT) if aid2 not in seen]
    result = unique_aids + top_aids2[:CANDIDATE_COUNT - len(unique_aids)]
    seen.update(result)

    # Pad with the globally most ordered aids, skipping aids that are already
    # present so the candidate list never contains the same aid twice.
    for aid in top_orders:
        if len(result) >= CANDIDATE_COUNT:
            break
        if aid not in seen:
            seen.add(aid)
            result.append(aid)
    return result

# Generate and Save Candidates
Running the per-session suggestion functions through a plain pandas `groupby(...).apply` is slow, so the cell below uses pandarallel's `parallel_apply` to process the sessions in parallel.

In [None]:
# Run candidate generation.
# Sort the events by session and timestamp and group them ONCE; the same
# grouping is reused for the three passes instead of re-sorting the whole
# frame for each of them.
sessions_by_ts = target_df.sort_values(["session", "ts"]).groupby(["session"])

# One list of candidate aids per session for each target event type.
pred_df_clicks = sessions_by_ts.parallel_apply(suggest_clicks)
pred_df_carts = sessions_by_ts.parallel_apply(suggest_carts)
pred_df_buys = sessions_by_ts.parallel_apply(suggest_buys)

# Drop the reference to the sorted copy of target_df once the passes are done.
del sessions_by_ts

In [None]:
# Turn each per-session result (index: session, values: list of candidate aids)
# into a two-column frame [session, aid].
# Series.to_frame("aid") names the column explicitly; pd.DataFrame(series,
# columns=["aid"]) only works while the Series is unnamed and would silently
# produce an all-NaN column for a Series that carries a different name.
clicks_pred_df = pred_df_clicks.to_frame("aid").reset_index()
orders_pred_df = pred_df_buys.to_frame("aid").reset_index()
carts_pred_df = pred_df_carts.to_frame("aid").reset_index()

In [None]:
# Save the candidates: one row per (session, candidate aid) after explode.
# Create the output directory first so the long candidate-generation run is
# not lost to a missing folder when the files are written.
os.makedirs("./candidate_data", exist_ok=True)
clicks_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_clicks.parquet")
orders_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_orders.parquet")
carts_pred_df.explode(column="aid").to_parquet(f"./candidate_data/{target_covisit}_{CANDIDATE_COUNT}candidates_carts.parquet")