In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# from pandarallel import pandarallel

# pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold
import polars as pl

from pyarrow.parquet import ParquetFile
import pyarrow as pa 
from fastai.tabular.core import df_shrink

We will use RAPIDS version 22.10.00a+392.g1558403753


In [2]:
def remove_negative_session(df,target='label'):
    true_df = df.groupby('session')[target].agg('sum') > 0
    session = pd.DataFrame(true_df[true_df]).reset_index()['session']
    df = df.merge(session, how = 'inner', on = 'session')
    return df

# Training

In [3]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [4]:
CANDIDATE_COUNT = 100

In [5]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [6]:
type_iters = {
    'clicks': 900,
    'carts': 400,
    'orders': 400
}

type_fracs = {
    'clicks': 0.15,
    'carts': 0.15,
    'orders': 0.15
}

## For each action

In [7]:
from catboost import CatBoostRanker, Pool, MetricVisualizer

model_iters = {}

for type_str in tqdm(list(type_labels.keys())):
    
#     pf = ParquetFile(f"./candidated_features/local_{type_str}_all_data.pqt") 
    
#     whole_df = pd.read_parquet(f"./candidated_features/local_{type_str}_all_data.pqt").reset_index(drop=True)
   
    whole_df = []
    batches = sorted(glob.glob(f"./candidated_features/local_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
    
    for batch in tqdm(batches):
        
        batch = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
        batch = remove_negative_session(batch)
#         batch.drop(dropcols, 1, inplace=True)
        batch = batch[batch.session.isin(train_sessions)].reset_index(drop=True)
        #batch = batch.drop(labels=[col for col in batch.columns if col[:4]=="type"], axis=1)
        positives = batch.loc[batch['label']==1].copy()
        negatives = batch.loc[batch['label']==0].groupby("session").sample(frac=type_fracs[type_str],
                                                                           random_state=1337)
        whole_df.append(positives)
        whole_df.append(negatives)
        del batch, positives, negatives
        
    whole_df = pd.concat(whole_df, axis=0, ignore_index=True)
#     whole_df = df_shrink(whole_df)
    
    print(f"sampled: {whole_df.shape}")

    FEATURES = whole_df.columns[2 : -1]   
    
    whole_df = whole_df.sort_values('session').reset_index(drop=True)
#     group_counts = whole_df.groupby('session', sort=False).session.agg('count').values
    
    skf = GroupKFold(n_splits=5)
    for fold, (train_idx, valid_idx) in enumerate(skf.split(whole_df,
                                                            whole_df['label'],
                                                            groups=whole_df['session'])):
        X_train = whole_df.loc[train_idx, FEATURES]
        y_train = whole_df.loc[train_idx, 'label']
        X_valid = whole_df.loc[valid_idx, FEATURES]
        y_valid = whole_df.loc[valid_idx, 'label']

        train_groups = whole_df.loc[train_idx].groupby('session', sort=False)["session"].agg('count').values
        val_groups = whole_df.loc[valid_idx].groupby('session', sort=False)["session"].agg('count').values
        

        dtrain = xgb.DMatrix(X_train, y_train, group = train_groups) 
        dtest = xgb.DMatrix(X_valid, y_valid, group = val_groups) 
#         print(k)
        xgb_parms = {'objective':'rank:pairwise',
                     'tree_method':'gpu_hist',
                    "random_state":42, 
#                      "eval_metric":'map',
                    "learning_rate":0.1,
                    "colsample_bytree":0.9, 
                    "eta":0.05, 
#                     "gamma": 2,
#                     "min_child_weight": 50,
                    "max_depth":6, 
                    "subsample":0.9,
#                      "grow_policy": "lossguide"
                    }
#         xgb_parms = type_params[type_str]
        model = xgb.train(xgb_parms,
                          dtrain=dtrain,
                          evals=[(dtrain,'train'),(dtest,'valid')],
                          num_boost_round = type_iters[type_str],
#                           early_stopping_rounds=25,
                          verbose_eval=100)


#         train_groups = whole_df.loc[train_idx, "session"].values
#         val_groups = whole_df.loc[valid_idx, "session"].values

#         train = Pool(
#             data=X_train,
#             label=y_train,
#             group_id=train_groups
#         )

#         test = Pool(
#             data=X_valid,
#             label=y_valid,
#             group_id=val_groups
#         )
        
#         loss = "YetiRankPairwise"
#         cb_parameters = {
#             'iterations': 10000,
#             'loss_function': loss,
#             'train_dir': loss,
#             'task_type': 'GPU',
#             'depth': 7,
#             'learning_rate': 0.025,
# #             'custom_metric': ['RecallAt:top=20', 'PrecisionAt:top=20'],
# #             'eval_metric': ['MAP'],
#             'early_stopping_rounds': 200,
# #             'metric_period':0,
#             'verbose': 100,
#             'random_seed': 0,
#         }
#         model = CatBoostRanker(**cb_parameters)
#         model.fit(train,
#                   eval_set=test,
# #                   plot=True
#                  )
        
#         print(np.mean(model.eval_metrics(test, ['MAP'])['MAP']))
    
        
        model_path = f'./models/XGB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.xgb'
        model.save_model(model_path)
        
        model_iters[model_path] = model.best_ntree_limit
        
#         model.save_model(f'./models/CB_{CANDIDATE_COUNT}candidates_fold{fold}_{type_str}.cb')
        del model, X_train, y_train, X_valid, y_valid, dtrain, dtest
        
        for i in range(5):
            gc.collect()
            
    del whole_df
    for i in range(5):
        gc.collect()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/19 [00:00<?, ?it/s]

sampled: (7198528, 176)
[0]	train-map:0.67835	valid-map:0.67593
[100]	train-map:0.72498	valid-map:0.72299
[200]	train-map:0.72863	valid-map:0.72559
[300]	train-map:0.73082	valid-map:0.72661
[400]	train-map:0.73258	valid-map:0.72743
[500]	train-map:0.73428	valid-map:0.72794
[600]	train-map:0.73578	valid-map:0.72823
[700]	train-map:0.73703	valid-map:0.72841
[800]	train-map:0.73839	valid-map:0.72860
[899]	train-map:0.73965	valid-map:0.72860
[0]	train-map:0.67801	valid-map:0.67826
[100]	train-map:0.72481	valid-map:0.72417
[200]	train-map:0.72847	valid-map:0.72650
[300]	train-map:0.73070	valid-map:0.72717
[400]	train-map:0.73248	valid-map:0.72782
[500]	train-map:0.73407	valid-map:0.72828
[600]	train-map:0.73553	valid-map:0.72854
[700]	train-map:0.73687	valid-map:0.72886
[800]	train-map:0.73821	valid-map:0.72900
[899]	train-map:0.73934	valid-map:0.72912
[0]	train-map:0.67807	valid-map:0.67813
[100]	train-map:0.72489	valid-map:0.72359
[200]	train-map:0.72864	valid-map:0.72604
[300]	train-map:

  0%|          | 0/19 [00:00<?, ?it/s]

sampled: (1458871, 176)
[0]	train-map:0.77958	valid-map:0.77563
[100]	train-map:0.80783	valid-map:0.80170
[200]	train-map:0.81464	valid-map:0.80396
[300]	train-map:0.81974	valid-map:0.80446
[399]	train-map:0.82462	valid-map:0.80463
[0]	train-map:0.77864	valid-map:0.78076
[100]	train-map:0.80720	valid-map:0.80471
[200]	train-map:0.81352	valid-map:0.80683
[300]	train-map:0.81860	valid-map:0.80795
[399]	train-map:0.82343	valid-map:0.80828
[0]	train-map:0.77855	valid-map:0.77722
[100]	train-map:0.80787	valid-map:0.80271
[200]	train-map:0.81449	valid-map:0.80477
[300]	train-map:0.81961	valid-map:0.80543
[399]	train-map:0.82434	valid-map:0.80569
[0]	train-map:0.77993	valid-map:0.77430
[100]	train-map:0.80798	valid-map:0.80030
[200]	train-map:0.81454	valid-map:0.80290
[300]	train-map:0.81954	valid-map:0.80347
[399]	train-map:0.82477	valid-map:0.80350
[0]	train-map:0.77784	valid-map:0.77944
[100]	train-map:0.80699	valid-map:0.80486
[200]	train-map:0.81365	valid-map:0.80617
[300]	train-map:0.81

  0%|          | 0/19 [00:00<?, ?it/s]

sampled: (866299, 176)
[0]	train-map:0.88552	valid-map:0.87839
[100]	train-map:0.91136	valid-map:0.90296
[200]	train-map:0.91881	valid-map:0.90465
[300]	train-map:0.92499	valid-map:0.90530
[399]	train-map:0.93100	valid-map:0.90552
[0]	train-map:0.88391	valid-map:0.88547
[100]	train-map:0.91111	valid-map:0.90471
[200]	train-map:0.91846	valid-map:0.90586
[300]	train-map:0.92485	valid-map:0.90587
[399]	train-map:0.93077	valid-map:0.90546
[0]	train-map:0.88305	valid-map:0.88102
[100]	train-map:0.91105	valid-map:0.90534
[200]	train-map:0.91866	valid-map:0.90683
[300]	train-map:0.92454	valid-map:0.90746
[399]	train-map:0.93066	valid-map:0.90749
[0]	train-map:0.88407	valid-map:0.88257
[100]	train-map:0.91048	valid-map:0.90723
[200]	train-map:0.91791	valid-map:0.90845
[300]	train-map:0.92420	valid-map:0.90830
[399]	train-map:0.93062	valid-map:0.90801
[0]	train-map:0.88476	valid-map:0.88157
[100]	train-map:0.91112	valid-map:0.90328
[200]	train-map:0.91894	valid-map:0.90520
[300]	train-map:0.925

In [8]:
#         xgb_parms = {'objective':'rank:pairwise',
#                      'tree_method':'gpu_hist',
#                     "random_state":42, 
# #                      "eval_metric":'map@20',
# #                     "learning_rate":0.1,
#                     "colsample_bytree":0.3, 
#                     "eta":0.07, 
# #                     "gamma": 2,
#                     "min_child_weight": 20,
#                     "max_depth":7, 
# #                     "subsample":0.2,
# #                      "grow_policy": "lossguide"
#                     }
# #         xgb_parms = type_params[type_str]
#         model = xgb.train(xgb_parms,
#                           dtrain=dtrain,
#                           evals=[(dtrain,'train'),(dtest,'valid')],
#                           num_boost_round=1000,
#                           early_stopping_rounds=50,
#                           verbose_eval=100)
#         print(model.best_score, model.best_ntree_limit)

In [9]:
# del model
# for i in range(5):
#     gc.collect()

In [10]:
import json
with open('./models/model_iters.json', 'w') as fp:
    json.dump(model_iters, fp)

In [11]:
print(k)

NameError: name 'k' is not defined

In [None]:
import optuna
from optuna.integration import CatBoostPruningCallback
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score, make_scorer

import xgboost as xgb

def xgb_objective(trial,
                                          scorer,
                                          input_train,
                                          input_test,
                                          seed=1337):

    param = {
        "objective": trial.suggest_categorical('objective', ["rank:pairwise"]),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical("tree_method", ["gpu_hist"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        'random_state': trial.suggest_categorical('random_state', [seed]),
    }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 7)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0)
#         param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    
    model = xgb.train(param,
                      dtrain=input_train,
                      evals=[(input_train,'train'),(input_test,'valid')],
                      num_boost_round=1000,
                      early_stopping_rounds=50,
                      verbose_eval=100)
    score = model.best_score
    
    del model
    gc.collect()
    
    return score


trial_func = lambda trial: xgb_objective(trial=trial,
                                          scorer=f1_score,
                                          input_train=dtrain,
                                          input_test=dtest,
                                          seed=1337)
    
optuna.logging.set_verbosity(1)


study = optuna.create_study(study_name="xgb_otto",
                            sampler=optuna.samplers.TPESampler(seed=1337),
                            direction="maximize"
                            )

study.optimize(trial_func,
               n_trials=100)

In [None]:
feat_importances = pd.Series(model.get_feature_importance(data=train), index=whole_df[FEATURES].columns)

In [None]:
feat_importances.nlargest(30).plot(kind='barh')

In [None]:
del model