In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
RUN_FOR = "kaggle" # "kaggle"

In [4]:
subs = []

for type_str in tqdm(list(type_labels.keys())):
    
    pf = ParquetFile(f"./candidated_features/{RUN_FOR}_{type_str}_all_data.pqt") 
    model_paths = sorted(glob.glob(f"./models/XGB_*candidates_fold*_{type_str}.xgb"))

    all_predictions = []
    
    for batch in tqdm(pf.iter_batches(batch_size = 5_000_000)):
        print("Processing 5M rows...")
        whole_df = batch.to_pandas() 
        FEATURES = whole_df.columns[2 : -1]
        dtest = xgb.DMatrix(data=whole_df[FEATURES])

        preds = np.zeros(len(whole_df))
        
        for model_path in model_paths:
#             target_it = int(model_path.split("_it")[-1].split(".xgb")[0])
            model = xgb.Booster()
            model.load_model(model_path)
            model.set_param({'predictor': 'gpu_predictor'})
            preds += model.predict(dtest)/len(model_paths)
#                                    iteration_range=(0, target_it + 1)
#                                   )/len(model_paths)
        
        predictions = whole_df[['session','aid']].copy()
        predictions['pred'] = preds
        all_predictions.append(predictions)
        
    all_predictions = pd.concat(all_predictions, ignore_index=True)
    
    all_predictions = all_predictions.sort_values(['session','pred'],
                                                  ascending=[True,False]).reset_index(drop=True)
    all_predictions['n'] = all_predictions.groupby('session').aid.cumcount().astype('int8')
    all_predictions = all_predictions.loc[all_predictions.n<20]

    sub = all_predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str') + '_' + type_str

    subs.append(sub)

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


0it [00:00, ?it/s]

Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


0it [00:00, ?it/s]

Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...
Processing 5M rows...


  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


## Local Score

In [5]:
final_sub = pd.concat(subs, ignore_index=True)
final_sub.sort_values(by="session_type", ascending=True).reset_index(drop=True)

if RUN_FOR == "local":
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in ['clicks','carts','orders']:
        sub = final_sub.loc[final_sub.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    #     sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

elif RUN_FOR == "kaggle":
    final_sub["labels"] = final_sub.labels.apply(lambda x: " ".join([str(elm) for elm in x]))
#     final_sub.to_csv("submission.csv", index=False)
    final_sub.to_csv("submission.csv.gz", index=False, compression='gzip')

In [6]:
final_sub

Unnamed: 0,session_type,labels
0,12899779_clicks,59625 1253524 737445 731692 1790770 1660529 43...
1,12899780_clicks,1142000 736515 973453 582732 1502122 889686 14...
2,12899781_clicks,199008 918667 194067 141736 57315 1583317 1460...
3,12899782_clicks,595994 1007613 834354 740494 1669402 829180 98...
4,12899783_clicks,1817895 607638 1754419 300127 1216820 255297 1...
...,...,...
5015404,14571577_orders,1141710 1276792 86916 1004292 631085 1666114 7...
5015405,14571578_orders,519105 815460 664851 822641 1811714 290137 524...
5015406,14571579_orders,739876 1750859 785544 51363 832213 876416 8579...
5015407,14571580_orders,202353 433425 1314576 1231403 891417 387358 33...
