In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

from catboost import CatBoostRanker, Pool

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
CANDIDATE_COUNT = 100

In [4]:
RUN_FOR = "kaggle" # "kaggle"

In [5]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [6]:
import json

with open("./models/model_iters.json", "r") as read_file:
    model_iters = json.load(read_file)

In [7]:
dropcols = [
    'type0_session_aid_aid_existed',
       'type0_session_aid_aid_existed_multiple',
       'type1_session_aid_aid_existed',
       'type1_session_aid_aid_existed_multiple',
       'type2_session_aid_aid_existed',
       'type2_session_aid_aid_existed_multiple',
    'type0_aid_aid_existed', 'type0_aid_aid_existed_multiple',
       'type0_aid_session_existed', 'type0_aid_session_existed_multiple',
       'type1_aid_aid_existed', 'type1_aid_aid_existed_multiple',
       'type1_aid_session_existed', 'type1_aid_session_existed_multiple',
       'type2_aid_aid_existed', 'type2_aid_aid_existed_multiple',
       'type2_aid_session_existed', 'type2_aid_session_existed_multiple',
    'all_carts_orders_wgt_min', 'all_carts_orders_wgt_std',
       'all_carts_orders_wgt_sum', 'all_carts_orders_wgt_mean',
       'all_carts_orders_wgt_count', 'all_buy2buy_wgt_max',
       'all_buy2buy_wgt_min', 'all_buy2buy_wgt_std', 'all_buy2buy_wgt_sum',
       'all_buy2buy_wgt_mean', 'all_buy2buy_wgt_count', 'aid_session_existed',
    'clicks_covisit_max',
       'clicks_covisit_min', 'clicks_covisit_std', 'clicks_covisit_sum',
       'clicks_covisit_mean', 'clicks_covisit_count',
       'carts_orders_covisit_max', 'carts_orders_covisit_min',
       'carts_orders_covisit_std', 'carts_orders_covisit_sum',
       'carts_orders_covisit_mean', 'carts_orders_covisit_count',
       'buy2buy_covisit_max', 'buy2buy_covisit_min', 'buy2buy_covisit_std',
       'buy2buy_covisit_sum', 'buy2buy_covisit_mean', 'buy2buy_covisit_count',
    'all_clicks_wgt_max',
       'all_clicks_wgt_min', 'all_clicks_wgt_std', 'all_clicks_wgt_sum',
       'all_clicks_wgt_mean', 'all_clicks_wgt_count',
       'all_carts_orders_wgt_max'
]

In [8]:
subs = []

for type_str in tqdm(list(type_labels.keys())):
        
    batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))

    model_paths = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.xgb"))
#     model_paths = sorted(glob.glob(f"./models/CB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.cb"))

    all_predictions = []
    
    for batch in tqdm(batches):
        whole_df = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
        #whole_df = whole_df.drop(labels=[col for col in whole_df.columns if col[:4]=="type"], axis=1)

        if RUN_FOR == "local":
            whole_df = whole_df[~whole_df.session.isin(train_sessions)].reset_index(drop=True)
        
        print(f"Processing {len(whole_df)} rows...")

        CHUNK_SIZE = 4_000_000
        
        for chunk_num in range(len(whole_df) // CHUNK_SIZE + 1):
            start_index = chunk_num*CHUNK_SIZE
            end_index = min(chunk_num*CHUNK_SIZE + CHUNK_SIZE, len(whole_df))
            chunk_df = whole_df.iloc[start_index:end_index]#.drop(dropcols, 1)

            FEATURES = chunk_df.columns[2 : -1]
            dtest = xgb.DMatrix(data=chunk_df[FEATURES])
    #         dtest = Pool(
    #             data=whole_df[FEATURES]
    #         )

            preds = np.zeros(len(chunk_df))

            for model_path in model_paths:
    #             target_it = int(model_path.split("_it")[-1].split(".xgb")[0])
    #             model = CatBoostRanker()
    #             model.load_model(model_path)

                model = xgb.Booster()
                model.load_model(model_path)
                model.set_param({'predictor': 'gpu_predictor'})

                preds += model.predict(dtest,
#                                        iteration_range = (0, model_iters[model_path]+1)
                                      )/len(model_paths)

            predictions = chunk_df[['session','aid']].copy()
            predictions['pred'] = preds
            all_predictions.append(predictions)
        
    all_predictions = pd.concat(all_predictions, ignore_index=True)
    
    all_predictions = all_predictions.sort_values(['session','pred'],
                                                  ascending=[True,False]).reset_index(drop=True)
    all_predictions['n'] = all_predictions.groupby('session').aid.cumcount().astype('int8')
    all_predictions = all_predictions.loc[all_predictions.n<20]

    sub = all_predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str') + '_' + type_str

    subs.append(sub)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 7180300 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 7180300 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 7180300 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...
Processing 10000000 rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


## Local Score

In [9]:
final_sub = pd.concat(subs, ignore_index=True)
final_sub.sort_values(by="session_type", ascending=True).reset_index(drop=True)

if RUN_FOR == "local":
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = final_sub.loc[final_sub.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    #     sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
        test_labels = test_labels[~test_labels.session.isin(train_sessions)].reset_index(drop=True)
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

elif RUN_FOR == "kaggle":
    final_sub["labels"] = final_sub.labels.apply(lambda x: " ".join([str(elm) for elm in x]))
#     final_sub.to_csv("submission.csv", index=False)
    final_sub.to_csv("submission.csv.gz", index=False, compression='gzip')

In [10]:
final_sub

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 731692 1790770 438191 339...
1,12899780_clicks,1142000 736515 582732 973453 889686 487136 134...
2,12899781_clicks,199008 918667 141736 1460571 754412 811084 100...
3,12899782_clicks,595994 1007613 834354 740494 1033148 975116 98...
4,12899783_clicks,1817895 607638 1754419 1216820 1729553 198385 ...
...,...,...
5015404,14571577_orders,1141710 1276792 86916 367734 631085 1666114 10...
5015405,14571578_orders,519105 1811714 822641 815460 1084758 476318 97...
5015406,14571579_orders,739876 1750859 785544 51363 857928 436188 1445...
5015407,14571580_orders,202353 433425 1314576 1231403 888228 356096 89...


In [11]:
feat_importances = pd.Series(model.feature_importances_, index=whole_df[FEATURES].columns)

AttributeError: 'Booster' object has no attribute 'feature_importances_'

In [None]:
batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
for batch in tqdm(batches):
    whole_df = pd.read_parquet(batch)
    FEATURES = whole_df.columns[2 : -1]
    dtest = xgb.DMatrix(data=whole_df[FEATURES])
    break

In [None]:
model_path = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_orders.xgb"))[0]
model = xgb.Booster()
model.load_model(model_path)

model.predict(dtest)

imp_df = pd.Series(model.get_score(importance_type='gain'))
imp_df.nlargest(30).plot(kind='barh')

In [None]:
model_path = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_clicks.xgb"))[0]
model = xgb.Booster()
model.load_model(model_path)

model.predict(dtest)

imp_df = pd.Series(model.get_score(importance_type='gain'))
imp_df.nlargest(30).plot(kind='barh')

In [None]:
model_path = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_carts.xgb"))[0]
model = xgb.Booster()
model.load_model(model_path)

model.predict(dtest)

imp_df = pd.Series(model.get_score(importance_type='gain'))
imp_df.nlargest(30).plot(kind='barh')