# Step 1 - Candidate Generation with RAPIDS
For candidate generation, we build three co-visitation matrices. The first computes the popularity of cart/order given a user's previous click/cart/order; we apply type weighting to this matrix. The second computes the popularity of cart/order given a user's previous cart/order; we call this the "buy2buy" matrix. The third computes the popularity of clicks given a user's previous click/cart/order; we apply time weighting to this matrix. We will use RAPIDS cuDF on the GPU to compute these matrices quickly!

In [None]:
# Version tag; it is embedded in the names of the parquet files written by
# generate_cooccurence_df (the `_v{VER}_` part of the file name).
VER = 10

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
# Report which RAPIDS cuDF release this run is using.
print('We will use RAPIDS version',cudf.__version__)

In [None]:
# Selects which input dataset is read; also used as the prefix of the output file names.
MODE = "kaggle" # one of: "kaggle", "local"

# Glob pattern matching the input parquet files for the selected MODE.
if MODE == "kaggle":
    readpath = '../input/otto-chunk-data-inparquet-format/*_parquet/*'

elif MODE == "local":
    readpath = '/kaggle/input/otto-validation/*_parquet/*'

else:
    # Fail here with a clear message rather than with a NameError on `readpath` below.
    raise ValueError(f'Unknown MODE {MODE!r}; expected "kaggle" or "local".')

# glob returns matches in arbitrary order; sort so that files are grouped into
# the same chunks on every run.
files = sorted(glob.glob(readpath))

In [None]:
# File readers
def read_file(f):
    """Return the cached CPU frame stored under key `f` as a new cudf.DataFrame.

    `f` must already be a key of the module-level `data_cache` dict;
    a missing key raises KeyError.
    """
    cached = data_cache[f]
    return cudf.DataFrame(cached)

def read_file_to_cache(f):
    """Load parquet file `f` with pandas and shrink two columns for caching.

    * `ts` is divided by 1000 and stored as int32.
    * `type` is translated through the module-level `type_labels` mapping
      and stored as int8.

    Returns the resulting pandas DataFrame.
    """
    frame = pd.read_parquet(f)
    scaled_ts = frame['ts'] / 1000
    frame['ts'] = scaled_ts.astype('int32')
    encoded_type = frame['type'].map(type_labels)
    frame['type'] = encoded_type.astype('int8')
    return frame

# CACHE THE DATA ON CPU BEFORE PROCESSING ON GPU
# Integer code for each event type name (consumed by read_file_to_cache).
type_labels = {'clicks':0, 'carts':1, 'orders':2}
# One pre-processed pandas frame per input file, keyed by file path.
data_cache = {path: read_file_to_cache(path) for path in files}
# Files per outer chunk: the file count divided by 6, rounded up.
CHUNK = (len(files) + 5) // 6

In [None]:
def generate_cooccurence_df(
        aidx_types = None, # event types kept on the aid_x side; 0=clicks, 1=carts, 2=orders (None -> [1])
        aidy_types = None, # event types kept on the aid_y side; 0=clicks, 1=carts, 2=orders (None -> [1])
        time_bw_aids_threshold = None, # max |ts_x - ts_y| allowed for a pair (falsy = no limit)
        best_k = None, # keep only the best_k highest-weighted aid_y per aid_x (falsy = keep all)
        type_weights = None, # mapping type_y -> weight (falsy = every pair weighs 1)
        file_name = "dummy", # tag embedded in the output file names
        disk_pieces = 4, # number of aid_x ranges processed and saved separately
        read_ct_size = 5 # number of cached files concatenated per inner step
        ):
    """Build a co-visitation table between two event-type groups and save it to disk.

    For every session, each event whose type is in `aidx_types` is paired with
    each event whose type is in `aidy_types`. Pairs with identical aids are
    dropped, and, when `time_bw_aids_threshold` is truthy, so are pairs whose
    timestamps differ by that amount or more. Each distinct
    (session, aid_x, aid_y, type_y) combination contributes one weight
    (`type_weights[type_y]`, or 1 when `type_weights` is falsy), and the
    weights are summed per (aid_x, aid_y).

    To bound memory, the work is repeated for `disk_pieces` consecutive aid_x
    ranges of width 1.86e6/disk_pieces; each range is written to
    f'{MODE}_{file_name}_cooccurences_v{VER}_{PART}.pqt', sorted by aid_x
    ascending and weight descending. The saved table has columns aid_x, aid_y
    and wgt, plus the per-aid_x rank column `n` when `best_k` is falsy (the
    rank column is only dropped together with the top-k filter).

    Relies on the module-level names `files`, `CHUNK`, `read_file`, `MODE`
    and `VER`. Returns None.

    Raises:
        ValueError: if `files` is empty, since there is nothing to aggregate.
    """
    if not files:
        raise ValueError('No input files found; nothing to build a co-occurrence matrix from.')

    # Resolve the defaults here instead of using mutable default arguments.
    if aidx_types is None: aidx_types = [1]
    if aidy_types is None: aidy_types = [1]

    # CHUNK PARAMETERS
    print(f'We will process {len(files)} files, in groups of {read_ct_size} and chunks of {CHUNK}.')

    # USE SMALLEST DISK_PIECES POSSIBLE WITHOUT MEMORY ERROR
    # Width of the aid_x range handled per disk piece. aid_x values >= 1.86e6
    # fall in no piece; presumably 1.86e6 is an upper bound on aid -- TODO confirm.
    SIZE = 1.86e6/disk_pieces

    # Loop over the aid_x ranges ("disk pieces")
    for PART in range(disk_pieces):
        print()
        print('### DISK PART', PART+1)

        # Running total over all outer chunks; None until the first chunk is
        # done. (The original compared against the first index of a fixed
        # range(6), which failed whenever one of the six chunks was empty.)
        tmp = None

        # => OUTER CHUNKS (only chunks that actually contain files)
        for a in range(0, len(files), CHUNK):
            b = min( a+CHUNK, len(files) )
            print(f'Processing files {a} thru {b-1} in groups of {read_ct_size}...')

            # Running total over the inner groups of this outer chunk.
            tmp2 = None

            # => INNER CHUNKS
            for k in range(a,b,read_ct_size):
                # READ up to read_ct_size files, never crossing the chunk end b
                group = files[k:min(k+read_ct_size, b)]
                df = cudf.concat([read_file(f) for f in group],ignore_index=True,axis=0)
                # Order by session ascending, ts descending.
                # NOTE(review): nothing below reads row order explicitly; this looks
                # like a leftover of a per-session tail truncation -- confirm before removing.
                df = df.sort_values(['session','ts'], ascending=[True,False])

                aidx_df = df.loc[df['type'].isin(aidx_types)] # events for the aid_x side
                aidy_df = df.loc[df['type'].isin(aidy_types)] # events for the aid_y side
                del df; gc.collect()

                aidx_df = aidx_df.reset_index(drop=True)
                aidy_df = aidy_df.reset_index(drop=True)

                # CREATE PAIRS: all (x, y) event combinations within the same session
                df = aidx_df.merge(aidy_df, on='session')
                del aidx_df, aidy_df; gc.collect()
                df = df.loc[(df.aid_x != df.aid_y)] # an aid is never paired with itself

                # Optionally keep only pairs whose two events are close in time
                if time_bw_aids_threshold:
                    df = df.loc[ (df.ts_x - df.ts_y).abs() < time_bw_aids_threshold ]

                # Keep only the aid_x range of the current disk piece (memory management)
                df = df.loc[(df.aid_x >= PART*SIZE)&(df.aid_x < (PART+1)*SIZE)]

                # Count each (aid_x, aid_y, type_y) at most once per session
                df = df[['session', 'aid_x', 'aid_y','type_y']].drop_duplicates(['session', 'aid_x', 'aid_y', 'type_y'])
                df['wgt'] = df.type_y.map(type_weights) if type_weights else 1 # weight by type of aid_y
                df = df[['aid_x','aid_y','wgt']]
                # Sum the weights of every (aid_x, aid_y) pair
                df = df.groupby(['aid_x','aid_y']).wgt.sum()

                # COMBINE INNER CHUNKS
                if tmp2 is None: tmp2 = df
                else: tmp2 = tmp2.add(df, fill_value=0)
                print(k,', ',end='')

            print()

            # COMBINE OUTER CHUNKS
            if tmp is None: tmp = tmp2
            else: tmp = tmp.add(tmp2, fill_value=0)
            del tmp2, df
            gc.collect()

        # CONVERT MATRIX TO DICTIONARY
        tmp = tmp.reset_index()
        # Order by aid_x ascending, wgt descending
        tmp = tmp.sort_values(['aid_x','wgt'],ascending=[True,False])

        # Rank the aid_y of each aid_x by weight and optionally keep the top best_k
        tmp = tmp.reset_index(drop=True)
        tmp['n'] = tmp.groupby('aid_x').aid_y.cumcount()
        if best_k:
            tmp = tmp.loc[tmp.n<best_k].drop('n',axis=1)

        # SAVE PART TO DISK (convert to pandas first uses less memory)
        tmp.to_pandas().to_parquet(f'{MODE}_{file_name}_cooccurences_v{VER}_{PART}.pqt')
        del tmp;gc.collect()

In [None]:
# Clicked aids (aid_x) paired with carted aids (aid_y) of the same session,
# limited to events less than 24*60*60 ts units apart (one day if ts is in seconds).
generate_cooccurence_df(
    aidx_types=[0],
    aidy_types=[1],
    time_bw_aids_threshold=24 * 60 * 60,
    best_k=None,
    type_weights=None,
    file_name="(clicks)vs(carts)",
    disk_pieces=4,
    read_ct_size=5,
)

In [None]:
# Clicked aids (aid_x) paired with ordered aids (aid_y) of the same session,
# limited to events less than 24*60*60 ts units apart (one day if ts is in seconds).
generate_cooccurence_df(
    aidx_types=[0],
    aidy_types=[2],
    time_bw_aids_threshold=24 * 60 * 60,
    best_k=None,
    type_weights=None,
    file_name="(clicks)vs(orders)",
    disk_pieces=4,
    read_ct_size=5,
)

In [None]:
# Carted aids (aid_x) paired with other carted aids (aid_y) of the same session,
# with no limit on the time between the two events.
generate_cooccurence_df(
    aidx_types=[1],
    aidy_types=[1],
    time_bw_aids_threshold=None,
    best_k=None,
    type_weights=None,
    file_name="(carts)vs(carts)",
    disk_pieces=4,
    read_ct_size=5,
)

In [None]:
# Carted aids (aid_x) paired with ordered aids (aid_y) of the same session,
# with no limit on the time between the two events.
generate_cooccurence_df(
    aidx_types=[1],
    aidy_types=[2],
    time_bw_aids_threshold=None,
    best_k=None,
    type_weights=None,
    file_name="(carts)vs(orders)",
    disk_pieces=4,
    read_ct_size=5,
)

In [None]:
# Ordered aids (aid_x) paired with carted aids (aid_y) of the same session,
# with no limit on the time between the two events.
generate_cooccurence_df(
    aidx_types=[2],
    aidy_types=[1],
    time_bw_aids_threshold=None,
    best_k=None,
    type_weights=None,
    file_name="(orders)vs(carts)",
    disk_pieces=4,
    read_ct_size=5,
)

In [None]:
# Ordered aids (aid_x) paired with other ordered aids (aid_y) of the same session,
# with no limit on the time between the two events.
generate_cooccurence_df(
    aidx_types=[2],
    aidy_types=[2],
    time_bw_aids_threshold=None,
    best_k=None,
    type_weights=None,
    file_name="(orders)vs(orders)",
    disk_pieces=4,
    read_ct_size=5,
)