In [1]:
# Notebook version tag.
# NOTE(review): VER is not referenced by any of the cells visible in this notebook.
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
# Enable tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
# cudf is imported here only to report the installed RAPIDS version.
print('We will use RAPIDS version',cudf.__version__)

# Raise the display limits so frames up to 500 rows / 500 columns are shown untruncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

We will use RAPIDS version 22.10.00a+392.g1558403753


# Training

In [2]:
# Event-type name -> integer code; the training loop iterates over these keys in this order.
type_labels = {name: code for code, name in enumerate(('clicks', 'carts', 'orders'))}

In [3]:
# Candidate-count tag that is interpolated into the feature-batch file pattern
# and into the saved model file names in the training cell.
CANDIDATE_COUNT = 100

In [4]:
# Session ids used to filter every candidate batch in the training cell.
# allow_pickle=True lets NumPy unpickle object arrays, so only load this file
# from a trusted source.
TRAIN_SESSIONS_PATH = "./splitted_raw_data/val_sessions_for_train.npy"
train_sessions = np.load(TRAIN_SESSIONS_PATH, allow_pickle=True)

In [5]:
def _ranker_params(reg_lambda, reg_alpha, subsample, colsample_bytree,
                   max_depth, eta, gamma, grow_policy):
    """Build one XGBoost parameter dict for a pairwise GPU ranker.

    The objective, booster, tree method and random_state are the same for
    every event type; only the values passed in here differ.
    """
    return {
        'objective': 'rank:pairwise',
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'lambda': reg_lambda,
        'alpha': reg_alpha,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'random_state': 1337,
        'max_depth': max_depth,
        'eta': eta,
        'gamma': gamma,
        'grow_policy': grow_policy,
    }


# Per-event-type XGBoost parameters.
# NOTE(review): not used by any active code in the visible cells; the keys match
# the search space of the Optuna objective at the end of the notebook, so these
# are presumably tuned values — confirm.
type_params = {
    'clicks': _ranker_params(
        reg_lambda=0.2000795145852262,
        reg_alpha=0.021947181560734986,
        subsample=0.8180322716827366,
        colsample_bytree=0.7492401197497842,
        max_depth=8,
        eta=0.18176204403241988,
        gamma=0.7268976249046284,
        grow_policy='lossguide',
    ),
    'carts': _ranker_params(
        reg_lambda=0.7196334788308238,
        reg_alpha=0.6904397149795183,
        subsample=0.8544839033653899,
        colsample_bytree=0.6341093892622888,
        max_depth=6,
        eta=0.2023188224070553,
        gamma=0.7169407623979183,
        grow_policy='lossguide',
    ),
    'orders': _ranker_params(
        reg_lambda=0.40954731824096013,
        reg_alpha=0.366692604406995,
        subsample=0.7191582485032328,
        colsample_bytree=0.9086531686000213,
        max_depth=6,
        eta=0.2268622568560344,
        gamma=0.9057817396823519,
        grow_policy='depthwise',
    ),
}

In [6]:
# Aggregate suffixes shared by the covisit / weight feature families.
_AGG_STATS = ('max', 'min', 'std', 'sum', 'mean', 'count')
_EXISTED_SUFFIXES = ('', '_multiple')


def _agg_cols(prefix, stats=_AGG_STATS):
    """Return ``prefix_<stat>`` column names for the given stats, in order."""
    return [f'{prefix}_{stat}' for stat in stats]


# Feature columns that can be dropped from a batch.
# NOTE(review): not used by any active code in the visible cells.
dropcols = (
    # type{0,1,2}_session_aid_aid_existed[_multiple]
    [f'type{t}_session_aid_aid_existed{suffix}'
     for t in range(3)
     for suffix in _EXISTED_SUFFIXES]
    # type{0,1,2}_aid_{aid,session}_existed[_multiple]
    + [f'type{t}_aid_{kind}_existed{suffix}'
       for t in range(3)
       for kind in ('aid', 'session')
       for suffix in _EXISTED_SUFFIXES]
    # all_carts_orders_wgt_* without "max", which is listed last in the original order
    + _agg_cols('all_carts_orders_wgt', _AGG_STATS[1:])
    + _agg_cols('all_buy2buy_wgt')
    + ['aid_session_existed']
    + _agg_cols('clicks_covisit')
    + _agg_cols('carts_orders_covisit')
    + _agg_cols('buy2buy_covisit')
    + _agg_cols('all_clicks_wgt')
    + ['all_carts_orders_wgt_max']
)

## For each action

In [7]:
# NOTE(review): no active code in this cell uses CatBoost; the import is kept
# because the feature-importance cell near the end of the notebook calls a
# CatBoost-style API.
from catboost import CatBoostRanker, Pool, MetricVisualizer


def _load_sampled_candidates(type_str):
    """Load all candidate-feature batches for ``type_str`` and down-sample negatives.

    Every parquet batch matching the ``CANDIDATE_COUNT`` file pattern is
    restricted to ``train_sessions``. All rows with ``label == 1`` are kept and
    15% of the ``label == 0`` rows are sampled per session (random_state=1337).

    Returns:
        One DataFrame with all sampled rows, sorted by ``session`` and carrying
        a fresh RangeIndex, so each session's rows are contiguous.

    Raises:
        FileNotFoundError: if no batch file matches the pattern.
    """
    pattern = f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"
    batch_paths = sorted(glob.glob(pattern))
    if not batch_paths:
        # Without this guard pd.concat([]) fails later with an unhelpful message.
        raise FileNotFoundError(f"No candidate feature batches match {pattern!r}")

    parts = []
    for batch_path in tqdm(batch_paths):
        batch = pd.read_parquet(batch_path)
        batch = batch[batch.session.isin(train_sessions)].reset_index(drop=True)
        parts.append(batch.loc[batch['label'] == 1].copy())
        parts.append(
            batch.loc[batch['label'] == 0]
            .groupby("session")
            .sample(frac=0.15, random_state=1337)
        )
        del batch

    sampled = pd.concat(parts, axis=0, ignore_index=True)
    del parts  # free the per-batch frames before sorting creates another copy
    return sampled.sort_values('session').reset_index(drop=True)


def _group_sizes(frame, row_idx):
    """Return the number of rows per session among ``row_idx``, in order of appearance.

    Only the ``session`` column is selected, so the other feature columns are
    not copied just to count rows. The result is passed to ``xgb.DMatrix`` as
    ``group``, which is why ``frame`` must keep each session's rows contiguous.
    """
    return (
        frame.loc[row_idx, ['session']]
        .groupby('session', sort=False)['session']
        .agg('count')
        .values
    )


# Booster settings shared by every event type and fold.
# `eta` and `learning_rate` are aliases of the same XGBoost parameter; this dict
# previously set both (learning_rate=0.1 and eta=0.05), which is ambiguous.
# Only one is kept.
# NOTE(review): 0.1 is presumed to be the value that was actually in effect —
# confirm against a re-run before comparing with older results.
# The tuned per-type settings in `type_params[type_str]` are not used here.
xgb_parms = {
    'objective': 'rank:pairwise',
    'tree_method': 'gpu_hist',
    "random_state": 42,
    "learning_rate": 0.1,
    "colsample_bytree": 0.9,
    "max_depth": 6,
    "subsample": 0.9,
}

# Maps each saved model path to the number of boosting rounds to use at inference.
model_iters = {}

for type_str in tqdm(list(type_labels.keys())):

    whole_df = _load_sampled_candidates(type_str)
    print("sampled")

    # Feature columns are selected by position.
    # NOTE(review): assumes the first two columns are identifiers and the last
    # one is the label — confirm against the parquet schema.
    FEATURES = whole_df.columns[2 : -1]

    # Sessions never straddle the train/validation split.
    group_kfold = GroupKFold(n_splits=5)
    fold_splits = group_kfold.split(whole_df,
                                    whole_df['label'],
                                    groups=whole_df['session'])
    for fold, (train_idx, valid_idx) in enumerate(fold_splits):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = _group_sizes(whole_df, train_idx)
        val_groups = _group_sizes(whole_df, valid_idx)

        dtrain = xgb.DMatrix(X_train, y_train, group=train_groups)
        dtest = xgb.DMatrix(X_valid, y_valid, group=val_groups)

        # Early stopping is intentionally not enabled: every fold trains the
        # full 350 rounds. A fresh dict per fold keeps the shared settings
        # untouched whatever xgb.train does with its argument.
        model = xgb.train(dict(xgb_parms),
                          dtrain=dtrain,
                          evals=[(dtrain, 'train'), (dtest, 'valid')],
                          num_boost_round=350,
                          verbose_eval=100)

        model_path = f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb'
        # Create the output directory so a fresh checkout does not fail only
        # after the first fold has finished training.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        model.save_model(model_path)

        # `best_ntree_limit` is not available on newer XGBoost releases; fall
        # back to the total number of boosted rounds, which is the same value
        # when early stopping is off.
        n_trees = getattr(model, 'best_ntree_limit', None)
        if n_trees is None:
            n_trees = model.num_boosted_rounds()
        model_iters[model_path] = int(n_trees)

        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest

        for _ in range(5):
            gc.collect()

    del whole_df
    for _ in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.81716	valid-map:0.81754
[100]	train-map:0.82529	valid-map:0.82493
[200]	train-map:0.82771	valid-map:0.82663
[300]	train-map:0.82906	valid-map:0.82722
[349]	train-map:0.82960	valid-map:0.82757
[0]	train-map:0.81726	valid-map:0.81700
[100]	train-map:0.82555	valid-map:0.82463
[200]	train-map:0.82778	valid-map:0.82633
[300]	train-map:0.82912	valid-map:0.82690
[349]	train-map:0.82973	valid-map:0.82714
[0]	train-map:0.81701	valid-map:0.81699
[100]	train-map:0.82533	valid-map:0.82515
[200]	train-map:0.82762	valid-map:0.82709
[300]	train-map:0.82901	valid-map:0.82761
[349]	train-map:0.82951	valid-map:0.82776
[0]	train-map:0.81709	valid-map:0.81706
[100]	train-map:0.82551	valid-map:0.82484
[200]	train-map:0.82777	valid-map:0.82627
[300]	train-map:0.82922	valid-map:0.82690
[349]	train-map:0.82977	valid-map:0.82712
[0]	train-map:0.81739	valid-map:0.81602
[100]	train-map:0.82574	valid-map:0.82392
[200]	train-map:0.82798	valid-map:0.82549
[300]	train-map:0.82934	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.97278	valid-map:0.97260
[100]	train-map:0.97558	valid-map:0.97512
[200]	train-map:0.97640	valid-map:0.97532
[300]	train-map:0.97704	valid-map:0.97537
[349]	train-map:0.97735	valid-map:0.97536
[0]	train-map:0.97288	valid-map:0.97271
[100]	train-map:0.97559	valid-map:0.97503
[200]	train-map:0.97638	valid-map:0.97525
[300]	train-map:0.97703	valid-map:0.97533
[349]	train-map:0.97731	valid-map:0.97537
[0]	train-map:0.97288	valid-map:0.97239
[100]	train-map:0.97570	valid-map:0.97472
[200]	train-map:0.97644	valid-map:0.97494
[300]	train-map:0.97704	valid-map:0.97508
[349]	train-map:0.97733	valid-map:0.97517
[0]	train-map:0.97285	valid-map:0.97238
[100]	train-map:0.97565	valid-map:0.97488
[200]	train-map:0.97641	valid-map:0.97513
[300]	train-map:0.97703	valid-map:0.97532
[349]	train-map:0.97733	valid-map:0.97531
[0]	train-map:0.97283	valid-map:0.97322
[100]	train-map:0.97553	valid-map:0.97536
[200]	train-map:0.97630	valid-map:0.97558
[300]	train-map:0.97694	valid-map:0.

  0%|          | 0/19 [00:00<?, ?it/s]

sampled
[0]	train-map:0.99174	valid-map:0.99179
[100]	train-map:0.99335	valid-map:0.99305
[200]	train-map:0.99387	valid-map:0.99319
[300]	train-map:0.99432	valid-map:0.99316
[349]	train-map:0.99454	valid-map:0.99317
[0]	train-map:0.99185	valid-map:0.99152
[100]	train-map:0.99340	valid-map:0.99284
[200]	train-map:0.99393	valid-map:0.99294
[300]	train-map:0.99435	valid-map:0.99295
[349]	train-map:0.99455	valid-map:0.99295
[0]	train-map:0.99180	valid-map:0.99166
[100]	train-map:0.99339	valid-map:0.99297
[200]	train-map:0.99392	valid-map:0.99301
[300]	train-map:0.99436	valid-map:0.99301
[349]	train-map:0.99458	valid-map:0.99301
[0]	train-map:0.99178	valid-map:0.99161
[100]	train-map:0.99336	valid-map:0.99299
[200]	train-map:0.99391	valid-map:0.99302
[300]	train-map:0.99435	valid-map:0.99296
[349]	train-map:0.99456	valid-map:0.99297
[0]	train-map:0.99177	valid-map:0.99178
[100]	train-map:0.99335	valid-map:0.99293
[200]	train-map:0.99391	valid-map:0.99305
[300]	train-map:0.99434	valid-map:0.

In [8]:
import json

# Persist the model-path -> boosting-round mapping collected by the training cell.
MODEL_ITERS_PATH = './models/model_iters.json'
with open(MODEL_ITERS_PATH, 'w') as iters_file:
    iters_file.write(json.dumps(model_iters))

In [9]:
# NOTE(review): `k` is not defined in any visible cell, so this line raises
# NameError (as the recorded output shows). Presumably a deliberate barrier that
# stops "Run All" before the exploratory cells below — confirm, and if so
# replace it with an explicit, self-describing stop.
print(k)

NameError: name 'k' is not defined

In [None]:
import optuna
from optuna.integration import CatBoostPruningCallback
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score, make_scorer

import xgboost as xgb

def xgb_objective(trial, scorer, input_train, input_test, seed=1337):
    """Optuna objective: train one XGBoost ranker and return its best score.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.
        scorer: accepted for interface compatibility; not used in the body.
        input_train: training data handed to ``xgb.train`` as ``dtrain``.
        input_test: evaluation data, passed as the ``'valid'`` eval set.
        seed: the single value offered for ``random_state``.

    Returns:
        ``best_score`` of the trained booster.
    """
    # The suggest calls stay in this exact order so the sequence of draws the
    # sampler sees is unchanged.
    params = {
        "objective": trial.suggest_categorical('objective', ["rank:pairwise"]),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical("tree_method", ["gpu_hist"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        'random_state': trial.suggest_categorical('random_state', [seed]),
    }

    # Tree-specific knobs are only drawn for tree boosters.
    if params["booster"] in ("gbtree", "dart"):
        params.update({
            "max_depth": trial.suggest_int("max_depth", 1, 8),
            "eta": trial.suggest_float("eta", 1e-8, 1.0),
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0),
            "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        })

    watchlist = [(input_train, 'train'), (input_test, 'valid')]
    booster = xgb.train(
        params,
        dtrain=input_train,
        evals=watchlist,
        num_boost_round=200,
        early_stopping_rounds=20,
        verbose_eval=100,
    )
    return booster.best_score


def trial_func(trial):
    """Bind the notebook-level data to ``xgb_objective`` for Optuna.

    NOTE(review): ``dtrain`` and ``dtest`` are deleted at the end of every fold
    in the training cell, so they must be rebuilt before this cell can run on a
    fresh kernel. ``scorer`` is passed through but ``xgb_objective`` never
    reads it.
    """
    return xgb_objective(trial=trial,
                         scorer=f1_score,
                         input_train=dtrain,
                         input_test=dtest,
                         seed=1337)


optuna.logging.set_verbosity(1)

# Seeded TPE sampler; the study maximizes the value returned by the objective.
tpe_sampler = optuna.samplers.TPESampler(seed=1337)
study = optuna.create_study(study_name="xgb_otto",
                            sampler=tpe_sampler,
                            direction="maximize")

study.optimize(trial_func, n_trials=750)

In [None]:
# NOTE(review): this cell cannot run on a fresh kernel. `model` and `whole_df`
# are deleted by the training cell, and `train` is not assigned by any active
# code in this notebook. `get_feature_importance(data=...)` looks like the
# CatBoost API rather than an XGBoost Booster method — confirm which model this
# was meant for and rebuild the inputs before using it.
feat_importances = pd.Series(model.get_feature_importance(data=train), index=whole_df[FEATURES].columns)

In [None]:
# Horizontal bar chart of the 30 largest importance values.
top_importances = feat_importances.nlargest(30)
top_importances.plot.barh()

In [None]:
# Display the feature columns; FEATURES is reassigned for every event type in the
# training loop, so this shows the value left by its last iteration.
FEATURES