In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

We will use RAPIDS version 22.10.00a+392.g1558403753


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
CANDIDATE_COUNT = 100

In [4]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

## For each action

In [5]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

for type_str in tqdm(list(type_labels.keys())):
    
#     pf = ParquetFile(f"./candidated_features/local_{type_str}_all_data.pqt") 
    
#     whole_df = pd.read_parquet(f"./candidated_features/local_{type_str}_all_data.pqt").reset_index(drop=True)
   
    whole_df = []
    batches = sorted(glob.glob(f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
    
    for batch in tqdm(batches):
        
        batch = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
        batch = batch[batch.session.isin(train_sessions)].reset_index(drop=True)
        positives = batch.loc[batch['label']==1]
        negatives = batch.loc[batch['label']==0].groupby("session").sample(frac=0.15,
                                                                           random_state=1337)
        whole_df.append(positives)
        whole_df.append(negatives)
        
    whole_df = pd.concat(whole_df, axis=0, ignore_index=True)
    del positives, negatives
    
    print("sampled")

    FEATURES = whole_df.columns[2 : -1]   
    
    whole_df = whole_df.sort_values('session').reset_index(drop=True)
    whole_df["group_count"] = whole_df.groupby('session', sort=False).cumcount('candidates') + 1
    
    skf = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(whole_df,
                                                            whole_df['label'],
                                                            groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = whole_df.loc[train_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        val_groups = whole_df.loc[valid_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        
#         train_groups = whole_df.loc[train_idx, "session"].values
#         val_groups = whole_df.loc[valid_idx, "session"].values
        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups) 
        dtest = xgb.DMatrix(X_valid, y_valid, group = val_groups) 

        xgb_parms = {'objective':'rank:pairwise',
                     'tree_method':'gpu_hist',
                    "random_state":42, 
                    "learning_rate":0.1,
                    "colsample_bytree":0.9, 
                    "eta":0.05, 
#                     "gamma": 2,
#                     "min_child_weight": 200,
                    "max_depth":5, 
                    "subsample":0.8
                    }
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dtest,'valid')],
                          num_boost_round=350,
#                           early_stopping_rounds=400,
                          verbose_eval=100)
#         train = Pool(
#             data=X_train,
#             label=y_train,
#             group_id=train_groups
#         )

#         test = Pool(
#             data=X_valid,
#             label=y_valid,
#             group_id=val_groups
#         )
        
#         loss = "YetiRankPairwise"
#         cb_parameters = {
#             'iterations': 10000,
#             'loss_function': loss,
#             'train_dir': loss,
#             'task_type': 'GPU',
#             'depth': 7,
#             'learning_rate': 0.025,
# #             'custom_metric': ['RecallAt:top=20', 'PrecisionAt:top=20'],
# #             'eval_metric': ['MAP'],
#             'early_stopping_rounds': 200,
# #             'metric_period':0,
#             'verbose': 100,
#             'random_seed': 0,
#         }
#         model = CatBoostRanker(**cb_parameters)
#         model.fit(train,
#                   eval_set=test,
# #                   plot=True
#                  )
        
#         print(np.mean(model.eval_metrics(test, ['MAP'])['MAP']))
    

        model.save_model(f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb')
#         model.save_model(f'./models/CB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.cb')
        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest
        
        for i in range(5):
            gc.collect()
            
    del whole_df
    for i in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.81446	valid-map:0.81488
[100]	train-map:0.82056	valid-map:0.82087
[200]	train-map:0.82226	valid-map:0.82189
[300]	train-map:0.82304	valid-map:0.82235
[349]	train-map:0.82337	valid-map:0.82253
[0]	train-map:0.81456	valid-map:0.81483
[100]	train-map:0.82073	valid-map:0.82035
[200]	train-map:0.82235	valid-map:0.82167
[300]	train-map:0.82307	valid-map:0.82214
[349]	train-map:0.82337	valid-map:0.82235
[0]	train-map:0.81452	valid-map:0.81498
[100]	train-map:0.82069	valid-map:0.82111
[200]	train-map:0.82222	valid-map:0.82243
[300]	train-map:0.82291	valid-map:0.82293
[349]	train-map:0.82318	valid-map:0.82300
[0]	train-map:0.81468	valid-map:0.81439
[100]	train-map:0.82073	valid-map:0.82067
[200]	train-map:0.82226	valid-map:0.82208
[300]	train-map:0.82303	valid-map:0.82251
[349]	train-map:0.82333	valid-map:0.82260
[0]	train-map:0.81489	valid-map:0.81370
[100]	train-map:0.82101	valid-map:0.81930
[200]	train-map:0.82256	valid-map:0.82071
[300]	train-map:0.82330	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.97211	valid-map:0.97209
[100]	train-map:0.97451	valid-map:0.97432
[200]	train-map:0.97501	valid-map:0.97465
[300]	train-map:0.97530	valid-map:0.97474
[349]	train-map:0.97542	valid-map:0.97473
[0]	train-map:0.97205	valid-map:0.97202
[100]	train-map:0.97447	valid-map:0.97444
[200]	train-map:0.97502	valid-map:0.97468
[300]	train-map:0.97534	valid-map:0.97475
[349]	train-map:0.97548	valid-map:0.97475
[0]	train-map:0.97205	valid-map:0.97171
[100]	train-map:0.97464	valid-map:0.97403
[200]	train-map:0.97514	valid-map:0.97429
[300]	train-map:0.97544	valid-map:0.97436
[349]	train-map:0.97556	valid-map:0.97435
[0]	train-map:0.97213	valid-map:0.97184
[100]	train-map:0.97459	valid-map:0.97411
[200]	train-map:0.97511	valid-map:0.97444
[300]	train-map:0.97539	valid-map:0.97453
[349]	train-map:0.97553	valid-map:0.97458
[0]	train-map:0.97195	valid-map:0.97233
[100]	train-map:0.97445	valid-map:0.97469
[200]	train-map:0.97500	valid-map:0.97496
[300]	train-map:0.97529	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.99123	valid-map:0.99146
[100]	train-map:0.99282	valid-map:0.99286
[200]	train-map:0.99313	valid-map:0.99296
[300]	train-map:0.99336	valid-map:0.99297
[349]	train-map:0.99347	valid-map:0.99299
[0]	train-map:0.99130	valid-map:0.99104
[100]	train-map:0.99287	valid-map:0.99257
[200]	train-map:0.99319	valid-map:0.99271
[300]	train-map:0.99339	valid-map:0.99273
[349]	train-map:0.99350	valid-map:0.99268
[0]	train-map:0.99130	valid-map:0.99122
[100]	train-map:0.99284	valid-map:0.99273
[200]	train-map:0.99317	valid-map:0.99286
[300]	train-map:0.99337	valid-map:0.99286
[349]	train-map:0.99347	valid-map:0.99285
[0]	train-map:0.99132	valid-map:0.99118
[100]	train-map:0.99286	valid-map:0.99266
[200]	train-map:0.99318	valid-map:0.99273
[300]	train-map:0.99338	valid-map:0.99277
[349]	train-map:0.99349	valid-map:0.99277
[0]	train-map:0.99128	valid-map:0.99137
[100]	train-map:0.99285	valid-map:0.99260
[200]	train-map:0.99317	valid-map:0.99270
[300]	train-map:0.99336	valid-map:0.

In [6]:
feat_importances = pd.Series(model.get_feature_importance(data=train), index=whole_df[FEATURES].columns)

NameError: name 'model' is not defined

In [None]:
feat_importances.nlargest(30).plot(kind='barh')

In [None]:
FEATURES