In [11]:
import cudf
import cupy
import dask_cudf
import gc
import glob
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
import xgboost as xgb

In [12]:
# Directory holding this experiment's artefacts. Cells in this notebook read
# the count-vector CSVs, the saved ranker, the candidates and the ground truth
# from here and write the submission back to it. Some of them build paths by
# plain string concatenation, so the trailing slash is significant.
base_dir = 'storage/output/220314_baseline/'

# Article attributes that have a matching `<name>_countvec.csv` file in
# `base_dir`; each one contributes a `<name>_countvec` feature column.
art_cust_features = [
    'product_group_name',
    'product_type_name',
    'graphical_appearance_name',
    'perceived_colour_value_name',
    'colour_group_code',
    'index_group_name',
    'department_name',
]

# NOTE(review): not read by any cell shown in this notebook - presumably a
# switch between local validation and a full submission run; confirm or remove.
local_cv = True

In [13]:
%%time
# Read the three raw tables with cuDF.
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

# Swap the string customer id for an int64 parsed from its last 16 hex
# characters; every other table in this notebook is keyed the same way.
transactions['customer_id'] = (
    transactions['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)

CPU times: user 682 ms, sys: 1.03 s, total: 1.71 s
Wall time: 1.71 s


In [14]:
def merge_customer_article_feature_df(df,articles,art_cust_features,art_cust_dir):
    """Attach the per-customer count-vector feature for each article attribute.

    For every name in ``art_cust_features`` this reads
    ``<art_cust_dir>/<feature>_countvec.csv``, converts its ``customer_id``
    column to the int64 key used elsewhere in the notebook (last 16 hex
    characters parsed as an integer), and left-joins it onto ``df``.

    Parameters
    ----------
    df : DataFrame with at least ``customer_id`` (int64 key) and ``article_id``.
    articles : DataFrame with ``article_id`` and every column named in
        ``art_cust_features``.
    art_cust_features : iterable of article attribute names.
    art_cust_dir : directory containing the ``*_countvec.csv`` files; a
        trailing path separator is optional.

    Returns
    -------
    ``df`` with, per feature, the attribute column itself and a
    ``<feature>_countvec`` column added. Rows without a match keep nulls.
    """
    for feature in art_cust_features:
        # Pass directory and file name as separate components: the previous
        # single pre-concatenated argument only produced a valid path when
        # the directory already ended with a separator.
        countvec_path = os.path.join(art_cust_dir, feature + '_countvec.csv')
        countvec = cudf.read_csv(countvec_path, index_col=None)
        countvec['customer_id'] = countvec['customer_id'].str[-16:].str.hex_to_int().astype('int64')
        countvec = countvec[['customer_id', feature, feature + '_countvec']]
        # First bring in the article's attribute value, then look up how the
        # customer relates to that value.
        df = df.merge(articles[['article_id', feature]], on='article_id', how='left')
        df = df.merge(countvec, on=['customer_id', feature], how='left')
        del countvec  # release the per-feature table before loading the next one
    return df

def load_model(dir_to_load,saved_model_name='model.bin'):
    """Create an ``xgb.XGBRanker`` and restore it from a saved model file.

    Parameters
    ----------
    dir_to_load : directory that contains the saved model.
    saved_model_name : file name inside ``dir_to_load`` (default ``model.bin``).

    Returns
    -------
    The ranker instance after ``load_model`` has been called on it.
    """
    ranker = xgb.XGBRanker()
    model_path = os.path.join(dir_to_load, saved_model_name)
    ranker.load_model(model_path)
    return ranker

def make_feature_df(feature_df,art_cust_features,base_dir,explode=True):
    """Build the model input frame for a set of customer/article candidates.

    Steps:
      1. optionally explode ``article_id`` so each candidate gets its own row;
      2. add the count-vector features via
         ``merge_customer_article_feature_df`` (uses the module-level
         ``articles`` table);
      3. left-join ``repeated_purchase_prob.csv`` from ``base_dir`` on
         ``customer_id`` / ``article_id`` (its ``count`` column is renamed to
         ``repeated_purchase_prob``);
      4. replace every remaining null with ``0.``.

    Parameters
    ----------
    feature_df : DataFrame with ``customer_id`` (int64 key) and ``article_id``.
    art_cust_features : article attribute names passed to the merge helper.
    base_dir : directory with the count-vector and repeated-purchase CSVs.
    explode : when true, ``article_id`` is exploded before any merge.

    Returns
    -------
    The candidate frame with all feature columns attached and nulls filled.
    """
    if explode:
        # The exploded frame used to be stored in an unused local, which made
        # the flag a no-op; keep the result so the merges below see one row
        # per candidate article.
        feature_df = feature_df.explode('article_id')
    feature_df = merge_customer_article_feature_df(feature_df,articles,art_cust_features,base_dir)

    repeated_purchase_path = os.path.join(base_dir,'repeated_purchase_prob.csv')
    repeated_purchase_df = cudf.read_csv(repeated_purchase_path).rename(columns={'count':'repeated_purchase_prob'})
    # Same int64 customer key as everywhere else in the notebook.
    repeated_purchase_df['customer_id'] = repeated_purchase_df['customer_id'].str[-16:].str.hex_to_int().astype('int64')

    feature_df = feature_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)
    return feature_df
    
def predict_rank(pred_df,model):
    """Score every candidate row with the ranker and store it as ``rank``.

    Columns listed in the module-level ``art_cust_features`` are dropped
    first. Of the remaining columns, everything except ``customer_id`` and
    ``article_id`` is handed to ``utils.x_y_group`` to build the model input.

    Parameters
    ----------
    pred_df : feature frame containing ``customer_id``, ``article_id`` and the
        model's feature columns.
    model : fitted ranker exposing ``predict``. ``None`` falls back to loading
        the model stored in the module-level ``base_dir``.

    Returns
    -------
    ``pred_df`` restricted to the kept columns, plus a ``rank`` column holding
    the model output.
    """
    from utils import x_y_group

    # Use the model the caller passed in. It used to be overwritten here by a
    # second load from disk, so the argument was ignored.
    if model is None:
        model = load_model(base_dir)

    features = [c for c in pred_df.columns if c not in art_cust_features]
    model_inputs = [f for f in features if f not in ('customer_id','article_id')]
    pred_df = pred_df[features]
    # NOTE(review): x_y_group is defined outside this notebook; with
    # only_x=True only the first returned value is used here.
    test_x,_,_ = x_y_group(pred_df,model_inputs,None,only_x=True)
    pred_df['rank'] = model.predict(test_x)
    return pred_df

def sort_submission_df(pred_df,to_pandas=True,k=12):
    """Collapse ranked candidates into one row per customer.

    Rows are sorted by ``customer_id`` and ``rank`` (both descending). For
    each customer the ``article_id`` and ``rank`` values of the first ``k``
    rows are joined into space-separated strings. Customers with fewer than
    ``k`` rows get empty entries, so those strings end in spaces.

    Parameters
    ----------
    pred_df : frame with ``customer_id``, ``article_id`` and ``rank``.
    to_pandas : when true, the result is converted with ``to_pandas()`` and
        the two strings are parsed into lists (``int`` for ``prediction``,
        ``float`` for ``rank``); otherwise the strings are returned as they are.
    k : maximum number of articles kept per customer (``k >= 1``). Defaults
        to 12, which was previously hard-coded.

    Returns
    -------
    Frame with one row per customer; the article list is in ``prediction``
    and the matching scores in ``rank``.
    """
    pred_df = pred_df.sort_values(['customer_id','rank'],ascending=False)
    # Group once and reuse; the grouping does not change between positions.
    by_customer = pred_df.groupby('customer_id')

    sub_df = by_customer.nth(0).reset_index()
    sub_df['article_id'] = sub_df['article_id'].astype(str)
    sub_df['rank'] = sub_df['rank'].astype(str)

    for ith in range(1,k):
        # ith-best row of every customer that has one.
        nth_rows = by_customer.nth(ith).reset_index()
        nth_rows = nth_rows[['article_id','customer_id','rank']].rename(columns={'article_id':'article_id2','rank':'rank2'})
        nth_rows['article_id2'] = nth_rows['article_id2'].astype(str)
        nth_rows['rank2'] = nth_rows['rank2'].astype(str)

        sub_df = sub_df.merge(nth_rows,on='customer_id',how='left')
        # Customers without an ith row come back null; append '' for them.
        sub_df['article_id2'] = sub_df['article_id2'].fillna('')
        sub_df['article_id'] = sub_df['article_id'] + ' '+ sub_df['article_id2']
        sub_df['rank2'] = sub_df['rank2'].fillna('')
        sub_df['rank'] = sub_df['rank'] + ' ' + sub_df['rank2']
        del sub_df['rank2']
        del sub_df['article_id2']

    sub_df = sub_df.rename(columns={'article_id':'prediction'})
    if to_pandas:
        sub_df = sub_df.to_pandas()
        # split() without arguments also discards the padding spaces.
        sub_df['prediction'] = sub_df['prediction'].apply(lambda x: [int(s) for s in x.split()])
        sub_df['rank'] = sub_df['rank'].apply(lambda x: [float(s) for s in x.split()])
    return sub_df
    
def sort_truncate_pred_df(pred_df,to_pandas=True,k=12):
    """Keep at most ``k`` best-ranked rows per customer, in long format.

    Only ``customer_id``, ``article_id`` and ``rank`` are kept. The frame is
    sorted by ``customer_id`` and ``rank`` (both descending); then, for each
    position ``0 .. k-1``, the row at that position in every customer group
    is taken. The per-position slices are concatenated, so the result is
    ordered by position first, not by customer.

    ``to_pandas`` is accepted but not used: the result is always whatever
    ``cudf.concat`` returns.
    """
    ranked = pred_df[['customer_id','article_id','rank']]
    ranked = ranked.sort_values(['customer_id','rank'],ascending=False)

    per_position = []
    for position in range(k):
        nth_rows = ranked.groupby('customer_id').nth(position).reset_index()
        per_position.append(nth_rows)
    return cudf.concat(per_position)
    
def make_pred_df(cust_art_df,art_cust_features,base_dir,explode=True):
    """Run the whole scoring pipeline for one frame of candidates.

    Builds the feature frame, loads the ranker saved in ``base_dir``, scores
    every candidate and returns the truncated, sorted predictions produced by
    ``sort_truncate_pred_df`` (called with its defaults).
    """
    features = make_feature_df(cust_art_df,art_cust_features,base_dir,explode=explode)
    ranker = load_model(base_dir)
    scored = predict_rank(features,ranker)
    return sort_truncate_pred_df(scored)

In [21]:
# Score every candidate file and collect, per customer, the articles that
# survive ranking and truncation.
candidate_files = sorted(glob.glob(base_dir+'candidate_df.csv'))
if not candidate_files:
    # Fail here with a clear message instead of leaving `sub_df` as None and
    # crashing in a later cell.
    raise FileNotFoundError('no candidate file matches ' + base_dir+'candidate_df.csv')

per_file_predictions = []
for f in tqdm(candidate_files):
    cust_art_df = cudf.read_csv(f)[['customer_id','article_id']]
    # Same int64 customer key as the feature tables.
    cust_art_df['customer_id'] = cust_art_df['customer_id'].str[-16:].str.hex_to_int().astype('int64')
    pred_df = make_pred_df(cust_art_df,art_cust_features,base_dir)
    per_file_predictions.append(
        pred_df.groupby('customer_id')['article_id'].unique().to_frame().reset_index()
    )

# Concatenate once at the end, not once per iteration.
sub_df = cudf.concat(per_file_predictions)

100%|██████████| 1/1 [00:02<00:00,  2.96s/it]


In [22]:
# Expose the per-customer article list under the name `prediction`, which is
# the column the submission cell reads further down.
sub_df = sub_df.rename(columns={'article_id':'prediction'})

In [23]:
%%time
from utils import evaluate_score
# Load the ground truth and key it by the same int64 customer id
# (last 16 hex characters of the original id) as the predictions.
gt_df = cudf.read_csv(os.path.join(base_dir,'gt_df.csv'))
gt_df['customer_id'] = gt_df['customer_id'].str[-16:].str.hex_to_int().astype('int64')
gt_df = gt_df.to_pandas()
# The CSV stores `ground_truth` as text; it is turned back into Python objects here.
# NOTE(review): eval() executes whatever expression the file contains. If the
# column only ever holds plain list literals, ast.literal_eval would be the
# safe replacement - confirm the stored format before switching.
gt_df['ground_truth'] = gt_df['ground_truth'].apply(lambda x: eval(x))
print('Score with XGBRanker:')
# evaluate_score comes from the project's utils module; it receives the
# predictions and the ground truth as pandas frames.
evaluate_score(sub_df.to_pandas(),gt_df)

Score with XGBRanker:
map@12 0.010025513151058912
CPU times: user 1.44 s, sys: 123 ms, total: 1.57 s
Wall time: 1.56 s


In [None]:
%%time
# Start from the sample submission so the output keeps its original string
# customer ids; add the int64 key only as a temporary join column.
sub = cudf.read_csv('storage/sample_submission.csv')[['customer_id']]
sub['customer_id_2'] = (
    sub['customer_id']
    .str[-16:]
    .str.hex_to_int()
    .astype('int64')
)
sub = sub.to_pandas()

# Predictions on the CPU, keyed by the temporary join column.
tmp = sub_df.to_pandas()
# Each article id is prefixed with '0' and the ids are joined with spaces.
tmp['prediction'] = tmp['prediction'].apply(
    lambda x: ' '.join(['0'+str(i) for i in x])
)
tmp = tmp.rename(columns={'customer_id':'customer_id_2'})

# Customers without any prediction end up with an empty string.
sub = sub.merge(tmp,on='customer_id_2', how='left').fillna('')
del sub['customer_id_2']
sub.to_csv(base_dir+'submission.csv',index=False)