In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
CANDIDATE_COUNT = 50

## For each action

In [4]:
for type_str in tqdm(list(type_labels.keys())):
    
    whole_df = pd.read_parquet(f"./candidated_features/local_{type_str}_all_data.pqt")
    
    FEATURES = whole_df.columns[2 : -1]
        
    positives = whole_df.loc[whole_df['label']==1]
    negatives = whole_df.loc[whole_df['label']==0].sample(frac=0.275,
                                                          random_state=1337)
    whole_df = pd.concat([positives,negatives], axis=0, ignore_index=True)
    
    whole_df = whole_df.sort_values('session').reset_index(drop=True)
    whole_df["group_count"] = whole_df.groupby('session', sort=False).cumcount('candidates') + 1
    
    skf = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(whole_df,
                                                            whole_df['label'],
                                                            groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = whole_df.loc[train_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        val_groups = whole_df.loc[valid_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        
        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups) 
        dvalid = xgb.DMatrix(X_valid, y_valid, group = val_groups) 

        xgb_parms = {'objective':'rank:pairwise',
                     'tree_method':'gpu_hist',
                    "random_state":42, 
                    "learning_rate":0.1,
                    "colsample_bytree":0.9, 
                    "eta":0.05, 
                    "max_depth":5, 
                    "subsample":0.8 }
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dvalid,'valid')],
                          num_boost_round=500,
#                           early_stopping_rounds=400,
                          verbose_eval=100)
        
        model.save_model(f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb')
        
        del model, dtrain, dvalid, X_train, y_train, X_valid, y_valid
        
        for i in range(5):
            gc.collect()
            
    del whole_df, positives, negatives
    for i in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

[0]	train-map:0.81924	valid-map:0.81961
[100]	train-map:0.82231	valid-map:0.82244
[200]	train-map:0.82282	valid-map:0.82272
[300]	train-map:0.82332	valid-map:0.82305
[400]	train-map:0.82373	valid-map:0.82339
[499]	train-map:0.82404	valid-map:0.82356
[0]	train-map:0.81930	valid-map:0.81915
[100]	train-map:0.82229	valid-map:0.82224
[200]	train-map:0.82285	valid-map:0.82247
[300]	train-map:0.82338	valid-map:0.82296
[400]	train-map:0.82375	valid-map:0.82326
[499]	train-map:0.82405	valid-map:0.82349
[0]	train-map:0.81937	valid-map:0.81927
[100]	train-map:0.82227	valid-map:0.82205
[200]	train-map:0.82280	valid-map:0.82245
[300]	train-map:0.82332	valid-map:0.82287
[400]	train-map:0.82373	valid-map:0.82319
[499]	train-map:0.82407	valid-map:0.82340
[0]	train-map:0.81922	valid-map:0.81984
[100]	train-map:0.82216	valid-map:0.82280
[200]	train-map:0.82268	valid-map:0.82331
[300]	train-map:0.82317	valid-map:0.82370
[400]	train-map:0.82359	valid-map:0.82402
[499]	train-map:0.82386	valid-map:0.82421
