In [11]:
import cudf
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [12]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 688 ms, sys: 1.05 s, total: 1.74 s
Wall time: 1.77 s


In [13]:
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])
customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

In [14]:
transactions = transactions.merge(customers[['customer_id','postal_code']],on='customer_id',how='left')

In [49]:
def construct_candidate_dict(transactions,key='customer_id'):
    purchase_dict = {}
    for i,x in enumerate(zip(transactions[key], transactions['article_id'])):
        key_id, art_id = x
        if key_id not in purchase_dict:
            purchase_dict[key_id] = {}
        if art_id not in purchase_dict[key_id]:
            purchase_dict[key_id][art_id] = 0
        purchase_dict[key_id][art_id] += 1
    dummy_list = list((transactions['article_id'].value_counts()).index)[:12]
    return purchase_dict,dummy_list

def construct_pred_df(test_df,transactions,add_random_samples=False):
    
    bool_1w = transactions.t_dat>transactions.t_dat.max()-pd.Timedelta(7,unit='day')
    bool_2w = (transactions.t_dat>transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-pd.Timedelta(7,unit='day'))
    bool_3w = (transactions.t_dat>transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-2*pd.Timedelta(7,unit='day'))
    bool_4w = (transactions.t_dat>transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-3*pd.Timedelta(7,unit='day'))
    bool_5w = (transactions.t_dat>transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-4*pd.Timedelta(7,unit='day'))
    bool_6w = (transactions.t_dat>transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-5*pd.Timedelta(7,unit='day'))
    bool_7w = (transactions.t_dat>transactions.t_dat.max()-7*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-6*pd.Timedelta(7,unit='day'))
    
    transactions_7w = transactions[bool_7w]
    transactions_6w = transactions[bool_6w]
    transactions_5w = transactions[bool_5w]
    transactions_4w = transactions[bool_4w]
    transactions_3w = transactions[bool_3w]
    transactions_2w = transactions[bool_2w]
    transactions_1w = transactions[bool_1w]
    
    purchase_dict_1w,dummy_list_1w = construct_candidate_dict(transactions_1w)
    purchase_dict_2w,_ = construct_candidate_dict(transactions_2w)
    purchase_dict_3w,_ = construct_candidate_dict(transactions_3w)
    purchase_dict_4w,_ = construct_candidate_dict(transactions_4w)
    purchase_dict_5w,_ = construct_candidate_dict(transactions_5w)
    purchase_dict_6w,_ = construct_candidate_dict(transactions_6w)
    purchase_dict_7w,_ = construct_candidate_dict(transactions_7w)
    
    pred_df = test_df[['customer_id']]
    prediction_list = []
    
    for i, cust_id in enumerate(
            test_df['customer_id'].values.reshape((-1,)),
    ):
        s = []
        if cust_id in purchase_dict_1w:
            l = sorted((purchase_dict_1w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
        if cust_id in purchase_dict_2w:
            l = sorted((purchase_dict_2w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
        if cust_id in purchase_dict_3w:
            l = sorted((purchase_dict_3w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_4w:
            l = sorted((purchase_dict_4w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_5w:
            l = sorted((purchase_dict_5w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_6w:
            l = sorted((purchase_dict_6w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        if cust_id in purchase_dict_7w:
            l = sorted((purchase_dict_7w[cust_id]).items(), key=lambda x: x[1], reverse=True)
            l = [y[0] for y in l]
            if len(l)>12:
                s += l[:12]
            else:
                s += l
                
        s += dummy_list_1w
        s = s[:12]
        prediction_list.append(s)
    pred_df['prediction'] = prediction_list
    return pred_df

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df

def evaluate_score(pred_df,gt_df,k=12,verbose=True,group_name='customer_id'):
    from metric import mapk
    eval_df = gt_df.merge(pred_df,on=group_name,how='left')
    score = mapk(eval_df['ground_truth'].tolist(),eval_df['prediction'].tolist())
    if verbose: print('map@'+str(k),score)
    return score

In [50]:
trn_start_time,trn_end_time='2020-06-01','2020-09-15'
test_start_time,test_end_time='2020-09-15','2020-09-22'
trn_transactions = transactions[(transactions.t_dat >= cudf.to_datetime(trn_start_time)) & (transactions.t_dat < cudf.to_datetime(trn_end_time))]
test_transactions = transactions[(transactions.t_dat >= cudf.to_datetime(test_start_time)) & (transactions.t_dat < cudf.to_datetime(test_end_time))]

In [51]:
gt_df = construct_gt_df(test_transactions)

In [52]:
test_df = pd.DataFrame()
test_df['customer_id'] = test_transactions['customer_id'].unique().to_arrow().to_pylist()
test_df = test_df.merge(customers[['customer_id','postal_code']].to_pandas(),on='customer_id',how='left')
pred_df = construct_pred_df(test_df,trn_transactions.to_pandas())

In [53]:
evaluate_score(
    pred_df,
    gt_df,
)

map@12 0.024198494683208064


0.024198494683208064

****Submission****

In [36]:
out_dir = 'storage/output/220320_candidate_generation/'

In [31]:
submission_df = pd.read_csv('storage/sample_submission.csv')
pred_df = construct_pred_df(submission_df,transactions.to_pandas())

In [33]:
pred_df['prediction'] = pred_df['prediction'].apply(lambda x: ' '.join(['0'+str(i) for i in x]))

In [39]:
os.makedirs(out_dir,exist_ok=True)
pred_df.to_csv(os.path.join(out_dir,'submission.csv'),index=False)

In [38]:
pred_df

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0924243001 0924243002 0918522001 09...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0713997002 0720125039 0740922009 0791587007 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0918522001 0923758001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0762846027 0689365050 0884081001 0794819001 09...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...
