In [None]:
# Notebook/experiment version tag (not referenced in the cells visible here).
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Register tqdm's progress_* helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

# Show up to 500 rows and 500 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 
from fastai.tabular.core import df_shrink

In [None]:
def remove_negative_session(df,target='label'):
    """Drop every session whose ``target`` values do not sum to more than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``session`` column and the ``target`` column.
    target : str, default 'label'
        Column that is summed per session to decide whether the session is kept.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``df`` with the kept session ids, so the result carries
        the same columns as ``df`` and a fresh default index.
    """
    # Per-session total of the target column (index = session id).
    per_session_total = df.groupby('session')[target].sum()
    # Session ids whose total is strictly positive.
    kept_ids = per_session_total[per_session_total > 0].index
    kept_sessions = pd.DataFrame({'session': kept_ids})
    # Inner merge discards the rows of every session that was not kept.
    return df.merge(kept_sessions, on='session', how='inner')

# Training

In [None]:
# Action types to train a ranker for. The training loop iterates over the keys
# and uses them in file names; the integer codes are not read in the visible cells.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [None]:
# Value interpolated into the candidate-feature file pattern and the saved
# model file names.
# NOTE(review): presumably the number of candidates generated per session - confirm.
CANDIDATE_COUNT = 100

In [None]:
# Session ids used to filter each feature batch (rows from other sessions are dropped).
# allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code - only load this file from a trusted source.
# NOTE(review): if the file holds a plain numeric array, allow_pickle=False
# would be sufficient - confirm how the file was saved.
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [None]:
# Number of boosting rounds per action type (passed as num_boost_round to xgb.train).
type_iters = {
    'clicks': 900,
    'carts': 400,
    'orders': 400
}

# Fraction of label == 0 rows sampled within each session, per action type.
type_fracs = {
    'clicks': 0.15,
    'carts': 0.15,
    'orders': 0.15
}

## Train 5-fold XGBoost rankers for each action type (clicks, carts, orders)

In [None]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

# Train one XGBoost pairwise ranker per (action type, fold).
#
# For every action type the cell:
#   1. loads each candidate-feature batch, keeps sessions with a positive label
#      that are also listed in `train_sessions`, and down-samples the negatives;
#   2. splits the sampled rows into 5 folds grouped by session;
#   3. trains a ranker per fold, saves it under ./models and records the number
#      of trees to use at inference time in `model_iters` (model path -> int).
model_iters = {}

# Create the output directory up front so a missing folder does not surface
# only after the first fold has already been trained.
os.makedirs('./models', exist_ok=True)

# Hyper-parameters shared by every action type and fold.
# The original dict also set "eta": 0.05. `eta` is documented as an alias of
# `learning_rate`, so the two entries conflicted.
# NOTE(review): 0.1 is kept on the assumption that it was the value in effect -
# confirm against earlier training logs and edit this single entry if 0.05
# was the intended rate.
xgb_parms = {'objective':'rank:pairwise',
             'tree_method':'gpu_hist',
             "random_state":42,
             "learning_rate":0.1,
             "colsample_bytree":0.9,
             "max_depth":6,
             "subsample":0.9,
             }

for type_str in tqdm(list(type_labels.keys())):

    batch_pattern = f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"
    batch_paths = sorted(glob.glob(batch_pattern))
    if not batch_paths:
        # Fail with the offending pattern instead of pd.concat's generic
        # "No objects to concatenate".
        raise FileNotFoundError(f"No candidate feature files match {batch_pattern!r}")

    sampled_parts = []
    for batch_path in tqdm(batch_paths):

        batch_df = pd.read_parquet(batch_path)
        batch_df = remove_negative_session(batch_df)
        batch_df = batch_df[batch_df.session.isin(train_sessions)].reset_index(drop=True)
        # Keep every positive row but only a per-session random fraction of the negatives.
        positives = batch_df.loc[batch_df['label']==1].copy()
        negatives = batch_df.loc[batch_df['label']==0].groupby("session").sample(frac=type_fracs[type_str],
                                                                                 random_state=1337)
        sampled_parts.append(positives)
        sampled_parts.append(negatives)
        del batch_df, positives, negatives

    whole_df = pd.concat(sampled_parts, axis=0, ignore_index=True)
    # Release the per-batch pieces now that they are copied into whole_df.
    del sampled_parts

    print(f"sampled: {whole_df.shape}")

    # Feature columns are selected by position: everything between the first
    # two columns and the last one.
    # NOTE(review): assumes the first two columns are identifiers (presumably
    # session and aid) and the last one is 'label' - confirm against the files.
    FEATURES = whole_df.columns[2 : -1]

    # Sort so that each session occupies a contiguous run of rows; the group
    # sizes handed to XGBoost below are interpreted in row order.
    whole_df = whole_df.sort_values('session').reset_index(drop=True)

    group_kfold = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(group_kfold.split(whole_df,
                                                                    whole_df['label'],
                                                                    groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        # Rows per session, in order of first appearance (sort=False).
        # NOTE(review): relies on the fold index arrays being ascending so the
        # selected rows stay sorted by session - confirm for the sklearn version in use.
        train_groups = whole_df.loc[train_idx].groupby('session', sort=False)["session"].agg('count').values
        val_groups = whole_df.loc[valid_idx].groupby('session', sort=False)["session"].agg('count').values

        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups)
        dtest = xgb.DMatrix(X_valid, y_valid, group = val_groups)

        # No early stopping is configured: every model trains for the full
        # type_iters[type_str] rounds and the eval sets are only logged.
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dtest,'valid')],
                          num_boost_round = type_iters[type_str],
                          verbose_eval=100)

        model_path = f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb'
        model.save_model(model_path)

        # `best_ntree_limit` is a legacy Booster attribute that newer XGBoost
        # releases no longer provide. Use it when present so results on older
        # versions are unchanged, otherwise fall back to the number of rounds
        # actually trained.
        tree_limit = getattr(model, 'best_ntree_limit', None)
        if tree_limit is None:
            tree_limit = model.num_boosted_rounds()
        # Plain int so the mapping can be written out with json.dump later.
        model_iters[model_path] = int(tree_limit)

        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest

        for i in range(5):
            gc.collect()

    del whole_df
    for i in range(5):
        gc.collect()

In [None]:
import json

# Persist the {model path: tree limit} mapping collected during training so
# it can be read back alongside the saved models.
with open('./models/model_iters.json', 'w') as iters_file:
    iters_file.write(json.dumps(model_iters))