In [None]:
# Notebook version tag (not referenced by the cells visible in this notebook).
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Enable tqdm's progress-bar integration for pandas operations.
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

# Allow wide/long frames to be displayed without truncation (up to 500 rows/columns).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

# Start pandarallel with 4 workers and progress bars, without the in-memory file system.
pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

from catboost import CatBoostRanker, Pool

In [None]:
def remove_negative_session(df, target='label'):
    """Keep only the rows of sessions that contain at least one positive target.

    The `target` column is summed per `session`; every session whose sum is
    not greater than zero is dropped as a whole.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `session` column and the `target` column.
    target : str, default 'label'
        Name of the column that is summed per session.

    Returns
    -------
    pandas.DataFrame
        Rows of `df` belonging to the surviving sessions. The filter is an
        inner merge, so the result carries a fresh 0..n-1 index.
    """
    per_session_total = df.groupby('session')[target].sum()
    # One-column frame ('session') listing the sessions with a positive total.
    positive_sessions = per_session_total[per_session_total > 0].index.to_frame(index=False)
    return df.merge(positive_sessions, on='session', how='inner')

# Inference with the trained models

In [None]:
# Event types handled by this notebook. The keys drive the per-type inference
# loop and appear in the feature/model file names; the integer codes are not
# read by the cells visible here.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [None]:
# Candidate count embedded in the candidate-feature and model file names that
# the inference cell globs for.
CANDIDATE_COUNT = 100

In [None]:
# Run mode. "local" filters out `train_sessions` and computes recall against the
# validation labels; "kaggle" writes submission.csv.gz.
RUN_FOR = "kaggle"  # one of: "local", "kaggle"

In [None]:
# Session ids that are excluded from prediction and from the validation labels
# when RUN_FOR == "local" (presumably the validation sessions that were used to
# train the rankers - TODO confirm).
# NOTE(review): allow_pickle=True can execute arbitrary code while loading;
# only load files from a trusted source.
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [None]:
import json

# Load the saved JSON content of ./models/model_iters.json.
# NOTE(review): in the cells visible here `model_iters` is only displayed and is
# never passed to the prediction calls - confirm whether that is intended.
with open("./models/model_iters.json", "r") as read_file:
    model_iters = json.load(read_file)

In [None]:
# Show the loaded value for visual inspection.
model_iters

In [None]:
# Score every candidate (session, aid) row with the fold-averaged XGBoost
# models, save the raw scores per event type, and keep the TOP_K best aids per
# session as the prediction list for that type.
CHUNK_SIZE = 1_500_000  # rows per DMatrix; bounds memory use during prediction
TOP_K = 20              # number of aids kept per session

subs = []

for type_str in tqdm(list(type_labels.keys())):

    batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))

    model_paths = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.xgb"))

    # Fail early with a clear message: without models np.mean([]) would
    # silently yield NaN scores, and without batches pd.concat fails obscurely.
    if not batches:
        raise FileNotFoundError(f"No candidate feature files found for '{type_str}' (RUN_FOR={RUN_FOR!r})")
    if not model_paths:
        raise FileNotFoundError(f"No XGB model files found for '{type_str}'")

    # Load each fold model once per event type; reloading them for every chunk
    # is loop-invariant work.
    models = []
    for model_path in model_paths:
        model = xgb.Booster()
        model.load_model(model_path)
        model.set_param({'predictor': 'gpu_predictor'})
        models.append(model)

    all_predictions = []

    for batch in tqdm(batches):
        whole_df = pd.read_parquet(batch)

        if RUN_FOR == "local":
            # Do not score sessions listed in train_sessions.
            whole_df = whole_df[~whole_df.session.isin(train_sessions)].reset_index(drop=True)

        print(f"Processing {len(whole_df)} rows...")

        # Features are selected by position: everything except the first two
        # and the last column (presumably session, aid and the label - TODO
        # confirm against the feature-generation notebook).
        FEATURES = whole_df.columns[2 : -1]

        # Step through the frame in CHUNK_SIZE slices. Using range(start, stop,
        # step) avoids the empty trailing chunk that `len // CHUNK_SIZE + 1`
        # produced when the row count was 0 or an exact multiple of CHUNK_SIZE.
        for start_index in range(0, len(whole_df), CHUNK_SIZE):
            end_index = min(start_index + CHUNK_SIZE, len(whole_df))
            print(start_index, end_index)
            chunk_df = whole_df.iloc[start_index:end_index]

            dtest = xgb.DMatrix(data=chunk_df[FEATURES])

            # Average the predictions of all fold models.
            preds = np.mean([model.predict(dtest) for model in models], axis=0)

            predictions = chunk_df[['session','aid']].copy()
            predictions['pred'] = preds
            all_predictions.append(predictions)

    all_predictions = pd.concat(all_predictions, ignore_index=True)

    # Best-scored candidates first within every session.
    all_predictions = all_predictions.sort_values(['session','pred'],
                                                  ascending=[True,False]).reset_index(drop=True)

    all_predictions.to_parquet(f"../raw_data/soft_scores/{RUN_FOR}_{type_str}_soft_scores.parquet")

    # Rank within session. Ranks are clipped to TOP_K before the int8 cast so a
    # session with more than 127 candidates cannot wrap around to a negative
    # rank and slip through the `< TOP_K` filter.
    all_predictions['n'] = (all_predictions.groupby('session').aid.cumcount()
                            .clip(upper=TOP_K).astype('int8'))
    all_predictions = all_predictions.loc[all_predictions.n < TOP_K]

    # One row per session holding the ordered list of predicted aids. The list
    # is kept as a Python list here; it is joined into a string only when the
    # submission file is written.
    sub = all_predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.columns = ['session_type','labels']
    sub['session_type'] = sub['session_type'].astype('str') + '_' + type_str

    subs.append(sub)

## Local Score / Kaggle Submission

In [None]:
# Combine the per-type predictions, then either compute the weighted recall
# against the validation labels ("local") or write the submission ("kaggle").
final_sub = pd.concat(subs, ignore_index=True)
# sort_values returns a new frame; the result has to be assigned to take effect.
final_sub = final_sub.sort_values(by="session_type", ascending=True).reset_index(drop=True)

if RUN_FOR == "local":
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}

    # Read and filter the validation labels once instead of once per type.
    val_labels = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
    val_labels = val_labels[~val_labels.session.isin(train_sessions)].reset_index(drop=True)

    for t in [
        'clicks',
        'carts',
        'orders'
    ]:
        # session_type has the form "<session>_<type>"; match the exact suffix.
        sub = final_sub.loc[final_sub.session_type.str.endswith(f'_{t}')].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
        test_labels = val_labels.loc[val_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        # Sessions without any prediction get an empty list.
        test_labels['labels'] = test_labels['labels'].fillna("").apply(list)
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        # At most 20 ground-truth items can be hit by a 20-item prediction list.
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

elif RUN_FOR == "kaggle":
    # Serialise each aid list as a space-separated string for the submission.
    final_sub["labels"] = final_sub.labels.apply(lambda x: " ".join([str(elm) for elm in x]))
    final_sub.to_csv("submission.csv.gz", index=False, compression='gzip')