In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
dataset_dir = 'storage/dataset/220308_baseline/'

In [3]:
%%time
trn_df = pd.read_csv(dataset_dir+'trn_df.csv').sort_values('customer_id').reset_index()
val_df = pd.read_csv(dataset_dir+'val_df.csv').sort_values('customer_id').reset_index()

CPU times: user 17.4 s, sys: 2.33 s, total: 19.7 s
Wall time: 19.7 s


In [4]:
def x_y_group(data,only_x=False):
    features = [c for c in trn_df.columns if '_countvec' in c]
    x = data[features]
    if only_x: return x,None,None
    
    target = 'label'
    y = data[target]
    group = data.groupby('customer_id').size().to_frame('size')['size'].to_numpy()
    print('shape (x,y,group): ',x.shape,y.shape,group.shape)
    return x,y,group

trn_x,trn_y,trn_grp = x_y_group(trn_df)
val_x,val_y,val_grp = x_y_group(val_df)

shape (x,y,group):  (3506384, 7) (3506384,) (329034,)
shape (x,y,group):  (7121279, 7) (7121279,) (68984,)


In [5]:
xgb_config = {
    'eval_metric':'map@12',
    'objective': 'rank:map',
    'colsample_bytree': 0.7959932314624918, 
    'gamma': 3.236981174565596, 
    'learning_rate': 0.8092969260411637, 
    'min_child_weight': 10.0, 
    'reg_alpha': 83.0, 
    'reg_lambda': 0.9226958452956067, 
    'max_depth': 20,
    'n_estimators': 180,
}

fit_config = {
    'early_stopping_rounds': 5,
}

model = xgb.sklearn.XGBRanker(**xgb_config)
es = xgb.callback.EarlyStopping(
    rounds=fit_config.get('early_stopping_rounds',5),
    save_best=True,
)
model.fit(
    trn_x, trn_y, trn_grp, verbose=True,
    eval_set=[(val_x, val_y)], eval_group=[val_grp],
    early_stopping_rounds=fit_config.get('early_stopping_rounds',5),
    xgb_model=fit_config.get('xgb_model',None),
    callbacks=[es],
)



[0]	validation_0-map@12:0.04645
[1]	validation_0-map@12:0.04092
[2]	validation_0-map@12:0.04497
[3]	validation_0-map@12:0.04925
[4]	validation_0-map@12:0.05013
[5]	validation_0-map@12:0.05457
[6]	validation_0-map@12:0.05740
[7]	validation_0-map@12:0.05835
[8]	validation_0-map@12:0.05885
[9]	validation_0-map@12:0.05926
[10]	validation_0-map@12:0.05887
[11]	validation_0-map@12:0.06005
[12]	validation_0-map@12:0.05989
[13]	validation_0-map@12:0.06025
[14]	validation_0-map@12:0.06038
[15]	validation_0-map@12:0.06070
[16]	validation_0-map@12:0.06050
[17]	validation_0-map@12:0.06050
[18]	validation_0-map@12:0.06050
[19]	validation_0-map@12:0.06050
[20]	validation_0-map@12:0.06050


XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.7959932314624918,
          enable_categorical=False, eval_metric='map@12',
          gamma=3.236981174565596, gpu_id=-1, importance_type=None,
          interaction_constraints='', learning_rate=0.8092969260411637,
          max_delta_step=0, max_depth=20, min_child_weight=10.0, missing=nan,
          monotone_constraints='()', n_estimators=180, n_jobs=4,
          num_parallel_tree=1, objective='rank:map', predictor='auto',
          random_state=0, reg_alpha=83.0, reg_lambda=0.9226958452956067,
          scale_pos_weight=None, subsample=1, tree_method='exact',
          validate_parameters=1, verbosity=None)

In [6]:
def make_prediction(model,test_df,k=12):
    test_x,_,_ = x_y_group(test_df,only_x=True)
    test_pred = model.predict(test_x)
    test_x['customer_id'] = test_df['customer_id']
    test_x['article_id'] = test_df['article_id']
    test_x['prediction'] = test_pred
    pred_df = test_x.groupby('customer_id') \
                    .apply(lambda x: x.sort_values('prediction',ascending=False)['article_id'].tolist()[:k]) \
                    .reset_index()
    pred_df.columns = ['customer_id','prediction']
    return pred_df

def evaluate_score(pred_df,gt_df,k=12):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on='customer_id')
    print('map@'+str(k), mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist()))

In [7]:
%%time
test_df = pd.read_csv(dataset_dir+'test_df.csv').sort_values('customer_id').reset_index()

CPU times: user 8.7 s, sys: 1.27 s, total: 9.97 s
Wall time: 9.72 s


In [8]:
%%time
pred_df = make_prediction(model,test_df,k=12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['customer_id'] = test_df['customer_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['article_id'] = test_df['article_id']


CPU times: user 24.4 s, sys: 684 ms, total: 25.1 s
Wall time: 23.3 s


In [11]:
%%time
gt_df = pd.read_csv(dataset_dir+'gt_df.csv')
gt_df['ground_truth'] = gt_df['ground_truth'].apply(lambda x: eval(x))
evaluate_score(pred_df,gt_df)

map@12 0.006291077726839401
CPU times: user 750 ms, sys: 16 ms, total: 766 ms
Wall time: 766 ms
