In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [3]:
import sys

# Make packages installed in this user's local site-packages importable by
# appending that directory to the end of the module search path.
# NOTE(review): absolute, user-specific path; it only affects imports executed
# after this cell.
sys.path += ['/home/kinho.lo/.local/lib/python3.9/site-packages/']

In [4]:
# Folder with the prepared CSV splits. File names are appended to it by plain
# string concatenation, so the trailing slash is required.
dataset_dir = 'storage/dataset/220310_baseline/'

# XGBRanker hyper-parameters merged into the final model config.
# Set this to None to make the hyperopt search cell run instead.
# NOTE(review): if these values were copied from hyperopt's `fmin` output,
# 'max_depth' may be an index into the hp.choice options rather than an
# actual tree depth -- confirm.
best_hyperparams = dict(
    colsample_bytree=0.5190375461291256,
    gamma=5.842869326066733,
    learning_rate=0.21789613935143787,
    max_depth=5,
    min_child_weight=4.0,
    reg_alpha=128.0,
    reg_lambda=0.8583267170752089,
)

In [5]:
%%time
# Load the training and validation splits, ordered by customer so that each
# customer's rows are contiguous.
# reset_index() without drop=True keeps the pre-sort row labels as an 'index'
# column; the feature selection cell excludes that column by name.
trn_df = (
    pd.read_csv(dataset_dir + 'trn_df.csv')
    .sort_values('customer_id')
    .reset_index()
)
val_df = (
    pd.read_csv(dataset_dir + 'val_df.csv')
    .sort_values('customer_id')
    .reset_index()
)

CPU times: user 24.9 s, sys: 2.72 s, total: 27.6 s
Wall time: 27.8 s


In [6]:
%%time
from utils import x_y_group

# Name of the target column handed to x_y_group.
label = 'label'

# Every column of the training frame is a feature except the identifiers, the
# target, and the 'index' column left behind by reset_index().
features = [
    col for col in trn_df.columns
    if col not in {'article_id', 'customer_id', 'label', 'index'}
]

# x_y_group returns a 3-tuple per split, unpacked here as
# (features, labels, group info) for the ranker.
trn_x, trn_y, trn_grp = x_y_group(trn_df, features, label)
val_x, val_y, val_grp = x_y_group(val_df, features, label)

CPU times: user 3.27 s, sys: 628 ms, total: 3.9 s
Wall time: 3.89 s


In [7]:
%%time
# Hyper-parameter search with hyperopt (TPE). The whole cell is skipped when
# `best_hyperparams` has already been set in the config cell.
if best_hyperparams is None:
    from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe

    # Search space: tuned parameters use hp.* distributions, the remaining
    # entries are fixed settings passed to every trial unchanged.
    space = {
        'objective': 'rank:map',
        'max_depth': hp.choice("max_depth",np.arange(3, 10, dtype=int) ),
        'gamma': hp.uniform ('gamma', 1,9),
        'learning_rate': hp.uniform('learning_rate',0.1,1.0),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0,
        'eval_metric':'map@12',
    }

    def objective(space):
        """Train one XGBRanker with the sampled parameters and score it.

        `space` is one concrete sample of the search space. The model is fit
        on the training split with early stopping on the validation split.
        hyperopt minimises, so the loss is 1 - best validation score.
        """
        model = xgb.XGBRanker(**space)
        model.fit(
            trn_x, trn_y, trn_grp, verbose=True,
            eval_set=[(val_x, val_y)], eval_group=[val_grp],
            early_stopping_rounds=5,
        )
        print ("SCORE:", model.best_score)
        return {'loss': 1.-model.best_score,'status': STATUS_OK}

    trials = Trials()
    best_raw = fmin(
        fn = objective,
        space = space,
        algo = tpe.suggest,
        max_evals = 10,
        trials = trials
    )
    # fmin reports an hp.choice parameter as the *index* of the chosen option
    # (0..6 for max_depth here), not the option itself (3..9). space_eval maps
    # the raw result back onto the search space to recover the real values.
    resolved = space_eval(space, best_raw)
    # Keep only the tuned parameters, matching the shape of the hard-coded
    # `best_hyperparams` dict in the config cell.
    best_hyperparams = {name: resolved[name] for name in best_raw}
    print('Best hyperparameters: ',best_hyperparams)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 4.77 µs


In [8]:
%%time
# Final model configuration: fixed ranking setup first, then the tuned
# hyper-parameters, which override any key they share with the defaults.
xgb_config = {
    'objective': 'rank:map',
    'n_estimators': 180,
    'seed': 0,
    'eval_metric':'map@12',
}
xgb_config.update(best_hyperparams)

model = xgb.sklearn.XGBRanker(**xgb_config)

# Stop once the validation metric has not improved for 5 rounds;
# save_best=True asks the callback to keep the best iteration's model.
# Early stopping is configured only through this callback: also passing
# early_stopping_rounds=5 to fit() would set up the same 5-round rule a
# second time, without save_best.
es = xgb.callback.EarlyStopping(
    rounds=5,
    save_best=True,
)
model.fit(
    trn_x, trn_y, trn_grp, verbose=True,
    eval_set=[(val_x, val_y)], eval_group=[val_grp],
    callbacks=[es],
)



[0]	validation_0-map@12:0.03419
[1]	validation_0-map@12:0.04914
[2]	validation_0-map@12:0.04938
[3]	validation_0-map@12:0.04967
[4]	validation_0-map@12:0.05093
[5]	validation_0-map@12:0.05186
[6]	validation_0-map@12:0.06213
[7]	validation_0-map@12:0.06149
[8]	validation_0-map@12:0.06250
[9]	validation_0-map@12:0.06256
[10]	validation_0-map@12:0.06259
[11]	validation_0-map@12:0.06259
[12]	validation_0-map@12:0.06259
[13]	validation_0-map@12:0.06259
[14]	validation_0-map@12:0.06289
[15]	validation_0-map@12:0.06295
[16]	validation_0-map@12:0.06376
[17]	validation_0-map@12:0.06376
[18]	validation_0-map@12:0.06407
[19]	validation_0-map@12:0.06419
[20]	validation_0-map@12:0.06475
[21]	validation_0-map@12:0.06479
[22]	validation_0-map@12:0.06459
[23]	validation_0-map@12:0.06459
[24]	validation_0-map@12:0.06460
[25]	validation_0-map@12:0.06460
[26]	validation_0-map@12:0.06491
[27]	validation_0-map@12:0.06491
[28]	validation_0-map@12:0.06491
[29]	validation_0-map@12:0.06509
[30]	validation_0-ma

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.5190375461291256,
          enable_categorical=False, eval_metric='map@12',
          gamma=5.842869326066733, gpu_id=-1, importance_type=None,
          interaction_constraints='', learning_rate=0.21789613935143787,
          max_delta_step=0, max_depth=5, min_child_weight=4.0, missing=nan,
          monotone_constraints='()', n_estimators=180, n_jobs=4,
          num_parallel_tree=1, objective='rank:map', predictor='auto',
          random_state=0, reg_alpha=128.0, reg_lambda=0.8583267170752089,
          scale_pos_weight=None, seed=0, subsample=1, tree_method='exact',
          validate_parameters=1, ...)

In [9]:
def make_prediction(model,test_df,features,label,k=12):
    """Rank each customer's candidate articles and keep the top ``k``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    test_df : DataFrame containing 'customer_id', 'article_id' and the
        feature columns.
    features : list of feature column names, forwarded to ``x_y_group``.
    label : label column name, forwarded to ``x_y_group``.
    k : maximum number of articles kept per customer.

    Returns
    -------
    DataFrame with columns ['customer_id', 'prediction'], one row per
    customer; 'prediction' is a list of article ids ordered from highest to
    lowest predicted score, truncated to ``k`` entries.
    """
    # Nothing to rank: return an empty frame with the expected columns
    # instead of running the model and a group-by on zero rows.
    if len(test_df) == 0:
        return pd.DataFrame({
            'customer_id': test_df['customer_id'].reset_index(drop=True),
            'prediction': pd.Series([], dtype=object),
        })

    test_x,_,_ = x_y_group(test_df,features,label,only_x=True)
    test_pred = model.predict(test_x)

    # Collect scores and ids in a separate frame so the feature frame returned
    # by x_y_group is left untouched. Ids are attached by index label, exactly
    # as assigning them onto test_x would.
    scored = pd.DataFrame({'prediction': test_pred}, index=test_x.index)
    scored['customer_id'] = test_df['customer_id']
    scored['article_id'] = test_df['article_id']

    # One global, stable sort by score; group-by keeps this row order inside
    # every customer, so the first k ids per group are the k best ones and
    # ties resolve deterministically in input order.
    ranked = scored.sort_values('prediction', ascending=False, kind='stable')
    top_k = ranked.groupby('customer_id')['article_id'] \
                  .apply(lambda ids: ids.head(k).tolist())

    pred_df = top_k.reset_index()
    pred_df.columns = ['customer_id','prediction']
    return pred_df

def evaluate_score(pred_df,gt_df,k=12):
    """Print and return MAP@k of ``pred_df`` against ``gt_df``.

    Parameters
    ----------
    pred_df : DataFrame with 'customer_id' and a list-valued 'prediction'.
    gt_df : DataFrame with 'customer_id' and a list-valued 'ground_truth'.
    k : cutoff used both for the metric and for the printed label.

    Returns
    -------
    The value returned by ``metric.mapk``.

    Only customers present in both frames are scored, because the two frames
    are combined with an inner merge on 'customer_id'.
    """
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on='customer_id')
    # Forward k so the computed cutoff always matches the printed 'map@k'.
    # NOTE(review): assumes mapk takes the cutoff as its third positional
    # argument -- confirm against metric.py.
    score = mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist(),k)
    print('map@'+str(k), score)
    return score

In [None]:
%%time
# Load the test candidates, ordered by customer so that each customer's rows
# are contiguous. reset_index() keeps the pre-sort row labels as an 'index'
# column, the same way the train/validation frames are loaded.
test_df = (
    pd.read_csv(dataset_dir + 'test_df.csv')
    .sort_values('customer_id')
    .reset_index()
)

In [None]:
%%time
# Top-12 ranked article list per customer from the fitted ranker.
pred_df = make_prediction(
    model=model,
    test_df=test_df,
    features=features,
    label=label,
    k=12,
)

In [None]:
%%time
from random import Random, shuffle

def shuffle_list(x, rng=None):
    """Shuffle list ``x`` in place and return it.

    ``rng`` is an optional ``random.Random`` instance. When it is omitted the
    module-level ``random.shuffle`` is used, so the result then depends on
    the global random state.
    """
    if rng is None:
        shuffle(x)
    else:
        rng.shuffle(x)
    return x

# Random baseline: every customer's candidate articles in a random order.
# A dedicated, seeded generator makes the baseline score reproducible across
# kernel restarts without touching the global random state.
baseline_rng = Random(0)
random_df = test_df.groupby('customer_id')['article_id'] \
                    .agg(lambda x: shuffle_list(x.tolist(), baseline_rng)).to_frame().reset_index() \
                    .rename(columns={'article_id':'prediction'})

In [None]:
%%time
from ast import literal_eval

# Ground-truth purchases per customer.
gt_df = pd.read_csv(dataset_dir+'gt_df.csv')
# The 'ground_truth' column is read back from CSV as text. literal_eval parses
# Python literals only, so unlike eval() it cannot execute arbitrary code
# embedded in the file.
# NOTE(review): assumes every cell is a plain list literal such as
# "['a', 'b']" -- confirm against how gt_df.csv was written.
gt_df['ground_truth'] = gt_df['ground_truth'].apply(literal_eval)
print('Score with XGBRanker:')
evaluate_score(pred_df,gt_df)
print('Score with random:')
evaluate_score(random_df,gt_df)

In [None]:
# Feature importance scores of the fitted ranker, using importance type 'gain'.
booster = model.get_booster()
booster.get_score(importance_type='gain')