In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
dataset_dir = 'storage/dataset/220310_baseline/'

In [3]:
%%time
trn_df = pd.read_csv(dataset_dir+'trn_df.csv').sort_values('customer_id').reset_index()
val_df = pd.read_csv(dataset_dir+'val_df.csv').sort_values('customer_id').reset_index()

CPU times: user 17.3 s, sys: 1.66 s, total: 18.9 s
Wall time: 19.1 s


In [4]:
def x_y_group(data,only_x=False):
    features = [c for c in trn_df.columns if c not in ['article_id','customer_id','label','index']]
    x = data[features]
    if only_x: return x,None,None
    
    target = 'label'
    y = data[target]
    group = data.groupby('customer_id').size().to_frame('size')['size'].to_numpy()
    print('shape (x,y,group): ',x.shape,y.shape,group.shape)
    return x,y,group

trn_x,trn_y,trn_grp = x_y_group(trn_df)
val_x,val_y,val_grp = x_y_group(val_df)

shape (x,y,group):  (3506384, 8) (3506384,) (329034,)
shape (x,y,group):  (7121279, 8) (7121279,) (68984,)


In [5]:
xgb_config = {
    'eval_metric':'map@12',
    'objective': 'rank:map',
    'colsample_bytree': 0.7959932314624918, 
    'gamma': 3.236981174565596, 
    'learning_rate': 0.8092969260411637, 
    'min_child_weight': 10.0, 
    'reg_alpha': 83.0, 
    'reg_lambda': 0.9226958452956067, 
    'max_depth': 20,
    'n_estimators': 180,
}

fit_config = {
    'early_stopping_rounds': 5,
}

model = xgb.sklearn.XGBRanker(**xgb_config)
es = xgb.callback.EarlyStopping(
    rounds=fit_config.get('early_stopping_rounds',5),
    save_best=True,
)
model.fit(
    trn_x, trn_y, trn_grp, verbose=True,
    eval_set=[(val_x, val_y)], eval_group=[val_grp],
    early_stopping_rounds=fit_config.get('early_stopping_rounds',5),
    xgb_model=fit_config.get('xgb_model',None),
    callbacks=[es],
)



[0]	validation_0-map@12:0.02982
[1]	validation_0-map@12:0.05764
[2]	validation_0-map@12:0.05770
[3]	validation_0-map@12:0.05777
[4]	validation_0-map@12:0.05777
[5]	validation_0-map@12:0.05777
[6]	validation_0-map@12:0.05777
[7]	validation_0-map@12:0.05806
[8]	validation_0-map@12:0.05806
[9]	validation_0-map@12:0.05806
[10]	validation_0-map@12:0.05806
[11]	validation_0-map@12:0.05806
[12]	validation_0-map@12:0.05806


XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.7959932314624918,
          enable_categorical=False, eval_metric='map@12',
          gamma=3.236981174565596, gpu_id=-1, importance_type=None,
          interaction_constraints='', learning_rate=0.8092969260411637,
          max_delta_step=0, max_depth=20, min_child_weight=10.0, missing=nan,
          monotone_constraints='()', n_estimators=180, n_jobs=4,
          num_parallel_tree=1, objective='rank:map', predictor='auto',
          random_state=0, reg_alpha=83.0, reg_lambda=0.9226958452956067,
          scale_pos_weight=None, subsample=1, tree_method='exact',
          validate_parameters=1, verbosity=None)

In [6]:
def make_prediction(model,test_df,k=12):
    test_x,_,_ = x_y_group(test_df,only_x=True)
    test_pred = model.predict(test_x)
    test_x['customer_id'] = test_df['customer_id']
    test_x['article_id'] = test_df['article_id']
    test_x['prediction'] = test_pred
    pred_df = test_x.groupby('customer_id') \
                    .apply(lambda x: x.sort_values('prediction',ascending=False)['article_id'].tolist()[:k]) \
                    .reset_index()
    pred_df.columns = ['customer_id','prediction']
    return pred_df

def evaluate_score(pred_df,gt_df,k=12):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on='customer_id')
    print('map@'+str(k), mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist()))

In [7]:
%%time
test_df = pd.read_csv(dataset_dir+'test_df.csv').sort_values('customer_id').reset_index()

CPU times: user 40.1 s, sys: 4.62 s, total: 44.7 s
Wall time: 45.3 s


In [8]:
%%time
pred_df = make_prediction(model,test_df,k=12)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['customer_id'] = test_df['customer_id']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_x['article_id'] = test_df['article_id']


CPU times: user 1min 50s, sys: 2.79 s, total: 1min 52s
Wall time: 1min 47s


In [26]:
%%time
from random import shuffle
def shuffle_list(x):
    shuffle(x)
    return x
random_df = test_df.groupby('customer_id')['article_id'] \
                    .agg(lambda x: shuffle_list(x.tolist())).to_frame().reset_index() \
                    .rename(columns={'article_id':'prediction'})

CPU times: user 24.3 s, sys: 1.23 s, total: 25.6 s
Wall time: 25.6 s


In [27]:
%%time
gt_df = pd.read_csv(dataset_dir+'gt_df.csv')
gt_df['ground_truth'] = gt_df['ground_truth'].apply(lambda x: eval(x))
print('Score with XGBRanker:')
evaluate_score(pred_df,gt_df)
print('Score with random:')
evaluate_score(random_df,gt_df)

Score with XGBRanker:
map@12 0.02716574325029833
Score with random:
map@12 0.004175692335097729
CPU times: user 1.04 s, sys: 12.1 ms, total: 1.05 s
Wall time: 1.05 s


In [14]:
model.get_booster().get_score(importance_type='gain')

{'product_type_name_countvec': 1134.258544921875,
 'graphical_appearance_name_countvec': 335.81927490234375,
 'perceived_colour_value_name_countvec': 37.089210510253906,
 'colour_group_code_countvec': 3422.0830078125,
 'index_group_name_countvec': 32.791015625,
 'department_name_countvec': 13352.654296875,
 'repeated_purchase_prob': 86492.796875}