In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

We will use RAPIDS version 22.10.00a+392.g1558403753


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
CANDIDATE_COUNT = 100

In [4]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

## For each action

In [5]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

for type_str in tqdm(list(type_labels.keys())):
    
#     pf = ParquetFile(f"./candidated_features/local_{type_str}_all_data.pqt") 
    
#     whole_df = pd.read_parquet(f"./candidated_features/local_{type_str}_all_data.pqt").reset_index(drop=True)
   
    whole_df = []
    batches = sorted(glob.glob(f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
    
    for batch in tqdm(batches):
        
        batch = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
        batch = batch[batch.session.isin(train_sessions)].reset_index(drop=True)
        #batch = batch.drop(labels=[col for col in batch.columns if col[:4]=="type"], axis=1)
        positives = batch.loc[batch['label']==1]
        negatives = batch.loc[batch['label']==0].groupby("session").sample(frac=0.15,
                                                                           random_state=1337)
        whole_df.append(positives)
        whole_df.append(negatives)
        
    whole_df = pd.concat(whole_df, axis=0, ignore_index=True)
    del positives, negatives
    
    print("sampled")

    FEATURES = whole_df.columns[2 : -1]   
    
    whole_df = whole_df.sort_values('session').reset_index(drop=True)
    whole_df["group_count"] = whole_df.groupby('session', sort=False).cumcount('candidates') + 1
    
    skf = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(whole_df,
                                                            whole_df['label'],
                                                            groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = whole_df.loc[train_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        val_groups = whole_df.loc[valid_idx, ['session', 'group_count']].groupby('session')['group_count'].last().to_numpy()
        
#         train_groups = whole_df.loc[train_idx, "session"].values
#         val_groups = whole_df.loc[valid_idx, "session"].values
        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups) 
        dtest = xgb.DMatrix(X_valid, y_valid, group = val_groups) 

        xgb_parms = {'objective':'rank:pairwise',
                     'tree_method':'gpu_hist',
                    "random_state":42, 
                    "learning_rate":0.1,
                    "colsample_bytree":0.9, 
                    "eta":0.05, 
#                     "gamma": 2,
#                     "min_child_weight": 200,
                    "max_depth":5, 
                    "subsample":0.8
                    }
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dtest,'valid')],
                          num_boost_round=350,
#                           early_stopping_rounds=400,
                          verbose_eval=100)
#         train = Pool(
#             data=X_train,
#             label=y_train,
#             group_id=train_groups
#         )

#         test = Pool(
#             data=X_valid,
#             label=y_valid,
#             group_id=val_groups
#         )
        
#         loss = "YetiRankPairwise"
#         cb_parameters = {
#             'iterations': 10000,
#             'loss_function': loss,
#             'train_dir': loss,
#             'task_type': 'GPU',
#             'depth': 7,
#             'learning_rate': 0.025,
# #             'custom_metric': ['RecallAt:top=20', 'PrecisionAt:top=20'],
# #             'eval_metric': ['MAP'],
#             'early_stopping_rounds': 200,
# #             'metric_period':0,
#             'verbose': 100,
#             'random_seed': 0,
#         }
#         model = CatBoostRanker(**cb_parameters)
#         model.fit(train,
#                   eval_set=test,
# #                   plot=True
#                  )
        
#         print(np.mean(model.eval_metrics(test, ['MAP'])['MAP']))
    

        model.save_model(f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb')
#         model.save_model(f'./models/CB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.cb')
        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest
        
        for i in range(5):
            gc.collect()
            
    del whole_df
    for i in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.81494	valid-map:0.81333
[100]	train-map:0.82157	valid-map:0.81964
[200]	train-map:0.82334	valid-map:0.82098
[300]	train-map:0.82410	valid-map:0.82155
[349]	train-map:0.82437	valid-map:0.82169
[0]	train-map:0.81441	valid-map:0.81458
[100]	train-map:0.82117	valid-map:0.82128
[200]	train-map:0.82292	valid-map:0.82295
[300]	train-map:0.82376	valid-map:0.82348
[349]	train-map:0.82405	valid-map:0.82362
[0]	train-map:0.81447	valid-map:0.81521
[100]	train-map:0.82107	valid-map:0.82156
[200]	train-map:0.82282	valid-map:0.82303
[300]	train-map:0.82365	valid-map:0.82348
[349]	train-map:0.82398	valid-map:0.82365
[0]	train-map:0.81431	valid-map:0.81475
[100]	train-map:0.82115	valid-map:0.82136
[200]	train-map:0.82295	valid-map:0.82281
[300]	train-map:0.82379	valid-map:0.82323
[349]	train-map:0.82407	valid-map:0.82329
[0]	train-map:0.81468	valid-map:0.81429
[100]	train-map:0.82127	valid-map:0.82072
[200]	train-map:0.82320	valid-map:0.82230
[300]	train-map:0.82392	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.97210	valid-map:0.97249
[100]	train-map:0.97455	valid-map:0.97474
[200]	train-map:0.97516	valid-map:0.97501
[300]	train-map:0.97546	valid-map:0.97512
[349]	train-map:0.97562	valid-map:0.97514
[0]	train-map:0.97221	valid-map:0.97201
[100]	train-map:0.97465	valid-map:0.97431
[200]	train-map:0.97523	valid-map:0.97455
[300]	train-map:0.97556	valid-map:0.97466
[349]	train-map:0.97569	valid-map:0.97466
[0]	train-map:0.97212	valid-map:0.97177
[100]	train-map:0.97468	valid-map:0.97411
[200]	train-map:0.97528	valid-map:0.97436
[300]	train-map:0.97563	valid-map:0.97447
[349]	train-map:0.97578	valid-map:0.97456
[0]	train-map:0.97217	valid-map:0.97207
[100]	train-map:0.97462	valid-map:0.97443
[200]	train-map:0.97518	valid-map:0.97483
[300]	train-map:0.97553	valid-map:0.97496
[349]	train-map:0.97567	valid-map:0.97496
[0]	train-map:0.97210	valid-map:0.97198
[100]	train-map:0.97465	valid-map:0.97427
[200]	train-map:0.97521	valid-map:0.97465
[300]	train-map:0.97556	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.99128	valid-map:0.99116
[100]	train-map:0.99293	valid-map:0.99258
[200]	train-map:0.99326	valid-map:0.99272
[300]	train-map:0.99348	valid-map:0.99275
[349]	train-map:0.99358	valid-map:0.99277
[0]	train-map:0.99126	valid-map:0.99145
[100]	train-map:0.99288	valid-map:0.99280
[200]	train-map:0.99322	valid-map:0.99294
[300]	train-map:0.99345	valid-map:0.99297
[349]	train-map:0.99355	valid-map:0.99299
[0]	train-map:0.99122	valid-map:0.99122
[100]	train-map:0.99285	valid-map:0.99282
[200]	train-map:0.99319	valid-map:0.99299
[300]	train-map:0.99342	valid-map:0.99298
[349]	train-map:0.99352	valid-map:0.99296
[0]	train-map:0.99130	valid-map:0.99113
[100]	train-map:0.99287	valid-map:0.99273
[200]	train-map:0.99320	valid-map:0.99288
[300]	train-map:0.99344	valid-map:0.99290
[349]	train-map:0.99353	valid-map:0.99289
[0]	train-map:0.99128	valid-map:0.99126
[100]	train-map:0.99289	valid-map:0.99262
[200]	train-map:0.99325	valid-map:0.99280
[300]	train-map:0.99348	valid-map:0.

In [6]:
feat_importances = pd.Series(model.get_feature_importance(data=train), index=whole_df[FEATURES].columns)

NameError: name 'model' is not defined

In [None]:
feat_importances.nlargest(30).plot(kind='barh')

In [None]:
FEATURES