In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

We will use RAPIDS version 22.10.00a+392.g1558403753


# Training

In [2]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [3]:
CANDIDATE_COUNT = 100

In [4]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [5]:
type_iters = {
    'clicks': 900,
    'carts': 400,
    'orders': 400
}

In [6]:
# dropcols = [col for col in batch.columns if "user_daywise" in col]
dropcols = ['user_daywise_features_type_0_count_dayno-7',
 'user_daywise_features_type_0_count_dayno-6',
 'user_daywise_features_type_0_count_dayno-5',
 'user_daywise_features_type_0_count_dayno-4',
 'user_daywise_features_type_0_count_dayno-3',
 'user_daywise_features_type_0_count_dayno-2',
 'user_daywise_features_type_0_count_dayno-1',
 'user_daywise_features_type_1_count_dayno-7',
 'user_daywise_features_type_1_count_dayno-6',
 'user_daywise_features_type_1_count_dayno-5',
 'user_daywise_features_type_1_count_dayno-4',
 'user_daywise_features_type_1_count_dayno-3',
 'user_daywise_features_type_1_count_dayno-2',
 'user_daywise_features_type_1_count_dayno-1',
 'user_daywise_features_type_2_count_dayno-7',
 'user_daywise_features_type_2_count_dayno-6',
 'user_daywise_features_type_2_count_dayno-5',
 'user_daywise_features_type_2_count_dayno-4',
 'user_daywise_features_type_2_count_dayno-3',
 'user_daywise_features_type_2_count_dayno-2',
 'user_daywise_features_type_2_count_dayno-1',
 'user_daywise_features_type_0_nunique_dayno-7',
 'user_daywise_features_type_0_nunique_dayno-6',
 'user_daywise_features_type_0_nunique_dayno-5',
 'user_daywise_features_type_0_nunique_dayno-4',
 'user_daywise_features_type_0_nunique_dayno-3',
 'user_daywise_features_type_0_nunique_dayno-2',
 'user_daywise_features_type_0_nunique_dayno-1',
 'user_daywise_features_type_1_nunique_dayno-7',
 'user_daywise_features_type_1_nunique_dayno-6',
 'user_daywise_features_type_1_nunique_dayno-5',
 'user_daywise_features_type_1_nunique_dayno-4',
 'user_daywise_features_type_1_nunique_dayno-3',
 'user_daywise_features_type_1_nunique_dayno-2',
 'user_daywise_features_type_1_nunique_dayno-1',
 'user_daywise_features_type_2_nunique_dayno-7',
 'user_daywise_features_type_2_nunique_dayno-6',
 'user_daywise_features_type_2_nunique_dayno-5',
 'user_daywise_features_type_2_nunique_dayno-4',
 'user_daywise_features_type_2_nunique_dayno-3',
 'user_daywise_features_type_2_nunique_dayno-2',
 'user_daywise_features_type_2_nunique_dayno-1']

## For each action

In [7]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

model_iters = {}

for type_str in tqdm(list(type_labels.keys())):
    
#     pf = ParquetFile(f"./candidated_features/local_{type_str}_all_data.pqt") 
    
#     whole_df = pd.read_parquet(f"./candidated_features/local_{type_str}_all_data.pqt").reset_index(drop=True)
   
    whole_df = []
    batches = sorted(glob.glob(f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
    
    for batch in tqdm(batches):
        
        batch = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
        batch.drop(dropcols, 1, inplace=True)
        batch = batch[batch.session.isin(train_sessions)].reset_index(drop=True)
        #batch = batch.drop(labels=[col for col in batch.columns if col[:4]=="type"], axis=1)
        positives = batch.loc[batch['label']==1].copy()
        negatives = batch.loc[batch['label']==0].groupby("session").sample(frac=0.15,
                                                                           random_state=1337)
        whole_df.append(positives)
        whole_df.append(negatives)
        del batch
        
    whole_df = pd.concat(whole_df, axis=0, ignore_index=True)
    del positives, negatives
    
    print("sampled")

    FEATURES = whole_df.columns[2 : -1]   
    
    whole_df = whole_df.sort_values('session').reset_index(drop=True)
#     group_counts = whole_df.groupby('session', sort=False).session.agg('count').values
    
    skf = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(whole_df,
                                                            whole_df['label'],
                                                            groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = whole_df.loc[train_idx].groupby('session', sort=False)["session"].agg('count').values
        val_groups = whole_df.loc[valid_idx].groupby('session', sort=False)["session"].agg('count').values
        
#         train_groups = whole_df.loc[train_idx, "session"].values
#         val_groups = whole_df.loc[valid_idx, "session"].values
        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups) 
        dtest = xgb.DMatrix(X_valid, y_valid, group = val_groups) 

        xgb_parms = {'objective':'rank:pairwise',
                     'tree_method':'gpu_hist',
                    "random_state":42, 
#                     "learning_rate":0.1,
#                     "colsample_bytree":0.9, 
                    "eta":0.07, 
#                     "gamma": 2,
#                     "min_child_weight": 200,
                    "max_depth":7, 
#                     "subsample":0.9
                    }
#         xgb_parms = type_params[type_str]
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dtest,'valid')],
                          num_boost_round=type_iters[type_str],
#                           early_stopping_rounds=100,
                          verbose_eval=100)
#         train = Pool(
#             data=X_train,
#             label=y_train,
#             group_id=train_groups
#         )

#         test = Pool(
#             data=X_valid,
#             label=y_valid,
#             group_id=val_groups
#         )
        
#         loss = "YetiRankPairwise"
#         cb_parameters = {
#             'iterations': 10000,
#             'loss_function': loss,
#             'train_dir': loss,
#             'task_type': 'GPU',
#             'depth': 7,
#             'learning_rate': 0.025,
# #             'custom_metric': ['RecallAt:top=20', 'PrecisionAt:top=20'],
# #             'eval_metric': ['MAP'],
#             'early_stopping_rounds': 200,
# #             'metric_period':0,
#             'verbose': 100,
#             'random_seed': 0,
#         }
#         model = CatBoostRanker(**cb_parameters)
#         model.fit(train,
#                   eval_set=test,
# #                   plot=True
#                  )
        
#         print(np.mean(model.eval_metrics(test, ['MAP'])['MAP']))
    
        
        model_path = f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb'
        model.save_model(model_path)
        
        model_iters[model_path] = model.best_ntree_limit
        
#         model.save_model(f'./models/CB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.cb')
        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest
        
        for i in range(5):
            gc.collect()
            
    del whole_df
    for i in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)


sampled
[0]	train-map:0.81856	valid-map:0.81861
[100]	train-map:0.82654	valid-map:0.82577
[200]	train-map:0.82930	valid-map:0.82790
[300]	train-map:0.83106	valid-map:0.82875
[400]	train-map:0.83244	valid-map:0.82906
[500]	train-map:0.83371	valid-map:0.82932
[600]	train-map:0.83492	valid-map:0.82951
[700]	train-map:0.83602	valid-map:0.82960
[800]	train-map:0.83718	valid-map:0.82973
[899]	train-map:0.83820	valid-map:0.82975
[0]	train-map:0.81863	valid-map:0.81824
[100]	train-map:0.82666	valid-map:0.82564
[200]	train-map:0.82948	valid-map:0.82738
[300]	train-map:0.83128	valid-map:0.82812
[400]	train-map:0.83269	valid-map:0.82859
[500]	train-map:0.83404	valid-map:0.82875
[600]	train-map:0.83520	valid-map:0.82890
[700]	train-map:0.83628	valid-map:0.82900
[800]	train-map:0.83731	valid-map:0.82900
[899]	train-map:0.83825	valid-map:0.82895
[0]	train-map:0.81861	valid-map:0.81838
[100]	train-map:0.82641	valid-map:0.82597
[200]	train-map:0.82934	valid-map:0.82813
[300]	train-map:0.83107	valid-ma

  0%|          | 0/19 [00:00<?, ?it/s]

  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)


sampled
[0]	train-map:0.97325	valid-map:0.97295
[100]	train-map:0.97580	valid-map:0.97519
[200]	train-map:0.97688	valid-map:0.97551
[300]	train-map:0.97760	valid-map:0.97560
[399]	train-map:0.97843	valid-map:0.97560
[0]	train-map:0.97342	valid-map:0.97301
[100]	train-map:0.97581	valid-map:0.97504
[200]	train-map:0.97688	valid-map:0.97534
[300]	train-map:0.97766	valid-map:0.97542
[399]	train-map:0.97842	valid-map:0.97546
[0]	train-map:0.97328	valid-map:0.97265
[100]	train-map:0.97591	valid-map:0.97476
[200]	train-map:0.97694	valid-map:0.97514
[300]	train-map:0.97768	valid-map:0.97517
[399]	train-map:0.97845	valid-map:0.97525
[0]	train-map:0.97337	valid-map:0.97274
[100]	train-map:0.97590	valid-map:0.97483
[200]	train-map:0.97692	valid-map:0.97523
[300]	train-map:0.97769	valid-map:0.97537
[399]	train-map:0.97838	valid-map:0.97542
[0]	train-map:0.97324	valid-map:0.97332
[100]	train-map:0.97576	valid-map:0.97531
[200]	train-map:0.97681	valid-map:0.97563
[300]	train-map:0.97759	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)
  batch.drop(dropcols, 1, inplace=True)


sampled
[0]	train-map:0.99203	valid-map:0.99201
[100]	train-map:0.99355	valid-map:0.99302
[200]	train-map:0.99423	valid-map:0.99307
[300]	train-map:0.99479	valid-map:0.99314
[399]	train-map:0.99529	valid-map:0.99313
[0]	train-map:0.99207	valid-map:0.99166
[100]	train-map:0.99362	valid-map:0.99275
[200]	train-map:0.99428	valid-map:0.99291
[300]	train-map:0.99483	valid-map:0.99296
[399]	train-map:0.99535	valid-map:0.99296
[0]	train-map:0.99205	valid-map:0.99177
[100]	train-map:0.99363	valid-map:0.99294
[200]	train-map:0.99428	valid-map:0.99305
[300]	train-map:0.99484	valid-map:0.99301
[399]	train-map:0.99535	valid-map:0.99302
[0]	train-map:0.99204	valid-map:0.99174
[100]	train-map:0.99359	valid-map:0.99298
[200]	train-map:0.99428	valid-map:0.99302
[300]	train-map:0.99482	valid-map:0.99300
[399]	train-map:0.99534	valid-map:0.99302
[0]	train-map:0.99202	valid-map:0.99188
[100]	train-map:0.99360	valid-map:0.99291
[200]	train-map:0.99422	valid-map:0.99306
[300]	train-map:0.99479	valid-map:0.

In [8]:
import json
with open('./models/model_iters.json', 'w') as fp:
    json.dump(model_iters, fp)

In [9]:
print(k)

NameError: name 'k' is not defined

In [None]:
import optuna
from optuna.integration import CatBoostPruningCallback
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score, make_scorer

import xgboost as xgb

def xgb_objective(trial,
                                          scorer,
                                          input_train,
                                          input_test,
                                          seed=1337):

    param = {
        "objective": trial.suggest_categorical('objective', ["rank:pairwise"]),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical("tree_method", ["gpu_hist"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        'random_state': trial.suggest_categorical('random_state', [seed]),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 8)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    
    model = xgb.train(param,
                      dtrain=input_train,
                      evals=[(input_train,'train'),(input_test,'valid')],
                      num_boost_round=200,
                      early_stopping_rounds=20,
                      verbose_eval=100)
    
    return model.best_score


trial_func = lambda trial: xgb_objective(trial=trial,
                                          scorer=f1_score,
                                          input_train=dtrain,
                                          input_test=dtest,
                                          seed=1337)
    
optuna.logging.set_verbosity(1)


study = optuna.create_study(study_name="xgb_otto",
                            sampler=optuna.samplers.TPESampler(seed=1337),
                            direction="maximize"
                            )

study.optimize(trial_func,
               n_trials=750)

In [None]:
feat_importances = pd.Series(model.get_feature_importance(data=train), index=whole_df[FEATURES].columns)

In [None]:
feat_importances.nlargest(30).plot(kind='barh')

In [None]:
FEATURES