In [None]:
# Notebook version tag. NOTE(review): not referenced by any cell visible here.
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Register tqdm's progress-aware pandas methods (e.g. `progress_apply`).
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

# Let notebook output show up to 500 rows / 500 columns of a DataFrame.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

# Start pandarallel with 4 workers and a progress bar; use_memory_fs=False
# disables the memory-file-system transfer path.
pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

# NOTE(review): several of these imports (cudf beyond the version print,
# GroupKFold, CatBoost, pyarrow, pickle, ...) are not used in the cells visible
# here; they may be needed elsewhere in the notebook.
from catboost import CatBoostRanker, Pool

In [None]:
def remove_negative_session(df, target='label'):
    """Drop every session that has no positive sample.

    A session is kept only when the sum of its ``target`` column is
    strictly greater than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``session`` column and the ``target`` column.
    target : str, default 'label'
        Name of the column whose per-session sum decides whether the
        session is kept.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` belonging to the kept sessions, in their original
        relative order, with a fresh integer index (result of the merge).
    """
    label_totals = df.groupby('session')[target].sum()
    has_positive = (label_totals > 0).values
    # One-column frame ('session') listing the sessions to keep.
    kept_sessions = label_totals.index[has_positive].to_frame(index=False)
    return df.merge(kept_sessions, on='session', how='inner')

# Inference

In [None]:
# Event types handled by the pipeline, mapped to integer codes.
# Only the keys are used by the cells visible here.
type_labels = {'clicks':0, 'carts':1, 'orders':2}
# Candidate count; it is embedded in the feature-file and model-file names
# that the inference cell globs for.
CANDIDATE_COUNT = 100
RUN_FOR = "kaggle" # "local" (score against validation labels) or "kaggle" (write submission.csv.gz)
# Sessions stored in val_sessions_for_train.npy. When RUN_FOR == "local" these
# sessions are filtered out of both the candidates and the validation labels.
# NOTE(review): allow_pickle=True can execute arbitrary code when loading an
# untrusted file; this assumes the .npy was produced locally.
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [None]:
# Load ./models/model_iters.json and display it as the cell output.
# Presumably these are iteration counts recorded at training time - TODO confirm.
# NOTE(review): `model_iters` is only displayed; no cell visible here reads it
# afterwards.
import json
with open("./models/model_iters.json", "r") as read_file:
    model_iters = json.load(read_file)
model_iters

In [None]:
# One submission frame per event type; concatenated by the scoring/submission cell.
subs = []

CHUNK_SIZE = 1_500_000  # rows scored per DMatrix, bounds peak memory
TOP_K = 20              # number of aids kept per session

for type_str in tqdm(list(type_labels.keys())):
    # Candidate-feature files for this event type.
    batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
    # Trained XGBoost fold models for this event type.
    model_paths = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.xgb"))

    # Fail loudly instead of producing NaN scores (no models) or an opaque
    # "No objects to concatenate" error (no feature files).
    if not batches:
        raise FileNotFoundError(f"No candidate feature files found for '{type_str}' (RUN_FOR={RUN_FOR!r})")
    if not model_paths:
        raise FileNotFoundError(f"No XGBoost models found for '{type_str}'")

    # Load every fold model once per event type; they do not change between
    # batches or chunks, so reloading them per chunk is wasted work.
    models = []
    for model_path in model_paths:
        model = xgb.Booster()
        model.load_model(model_path)
        # NOTE(review): 'predictor' is deprecated in newer XGBoost releases in
        # favour of the 'device' parameter; kept as-is for the installed version.
        model.set_param({'predictor': 'gpu_predictor'})
        models.append(model)

    all_predictions = []
    for batch in tqdm(batches):
        # Read one file of candidate features.
        whole_df = pd.read_parquet(batch)

        if RUN_FOR == "local":
            # Drop the sessions listed in train_sessions so they are not scored.
            whole_df = whole_df[~whole_df.session.isin(train_sessions)].reset_index(drop=True)

        print(f"Processing {len(whole_df)} rows...")

        # Step by CHUNK_SIZE so no empty trailing chunk is produced when the
        # row count is an exact multiple of CHUNK_SIZE (or zero).
        for start_index in range(0, len(whole_df), CHUNK_SIZE):
            end_index = min(start_index + CHUNK_SIZE, len(whole_df))
            print(start_index, end_index)
            chunk_df = whole_df.iloc[start_index:end_index]

            # Assumes the first two columns are session/aid and the last column
            # is not a model feature - TODO confirm against the feature files.
            FEATURES = chunk_df.columns[2 : -1]
            dtest = xgb.DMatrix(data=chunk_df[FEATURES])

            # Average the fold models' scores.
            preds = np.mean([model.predict(dtest) for model in models], axis=0)

            predictions = chunk_df[['session','aid']].copy()
            predictions['pred'] = preds
            all_predictions.append(predictions)

    all_predictions = pd.concat(all_predictions, ignore_index=True)

    # Order candidates by score within each session (best first).
    all_predictions = all_predictions.sort_values(['session','pred'],
                                                  ascending=[True,False]).reset_index(drop=True)
    # Persist the full scored candidate list.
    all_predictions.to_parquet(f"../raw_data/soft_scores/{RUN_FOR}_{type_str}_soft_scores.parquet")

    # Keep the TOP_K best aids per session. groupby().head() avoids the int8
    # rank column, which wraps around once a session has more than 127 rows.
    all_predictions = all_predictions.groupby('session').head(TOP_K)

    # Collapse to one row per session with the aids as a list. The labels stay
    # lists here: the local scorer intersects them with ground_truth and the
    # kaggle branch joins them into a space-separated string.
    sub = all_predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.columns = ['session_type','labels']
    sub['session_type'] = sub['session_type'].astype('str') + '_' + type_str

    subs.append(sub)

## Local Score

In [None]:
# Merge the per-type submissions into one frame ordered by session_type.
# The sorted result must be assigned back; sort_values does not work in place.
final_sub = pd.concat(subs, ignore_index=True)
final_sub = final_sub.sort_values(by="session_type", ascending=True).reset_index(drop=True)

# Local run: compute the weighted recall against the validation labels.
if RUN_FOR == "local":
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

    # Read and filter the validation labels once; they are identical for every
    # event type. Sessions listed in train_sessions are excluded.
    val_labels = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
    val_labels = val_labels[~val_labels.session.isin(train_sessions)].reset_index(drop=True)

    for t in [
        'clicks',
        'carts',
        'orders'
    ]:
        # session_type has the form "<session>_<type>"; match on the suffix.
        sub = final_sub.loc[final_sub.session_type.str.endswith('_' + t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        test_labels = val_labels.loc[val_labels['type']==t] # labels of this event type only
        test_labels = test_labels.merge(sub, how='left', on=['session']) # attach predictions
        # Sessions without predictions get an empty list.
        test_labels['labels'] = test_labels['labels'].fillna("").apply(list)
        # Number of ground-truth aids that appear among the predicted aids.
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        # Ground-truth size, capped at 20.
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall # accumulate the weighted recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

# Kaggle run: write the submission file.
elif RUN_FOR == "kaggle":
    # Turn each list of aids into a space-separated string.
    final_sub["labels"] = final_sub.labels.apply(lambda x: " ".join([str(elm) for elm in x]))
    final_sub.to_csv("submission.csv.gz", index=False, compression='gzip')