In [1]:
import glob
import os

In [2]:
# NOTE(review): use_gpu is assigned here but is not read by any cell visible in
# this notebook; the train_test_split call further down passes gpu=True explicitly.
use_gpu = False
import cudf
import pandas as pd

In [3]:
# Directory handed to save_csv for every output file of this notebook.
out_dir = 'storage/dataset/220311_baseline/'
# Passed as `k` to make_val_df / make_test_df: how many of the most-purchased
# training articles are added to each customer's candidate list.
topk = 100

## Input dataset

In [4]:
%%time
# Load the three raw CSV files as cuDF dataframes.
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 1.19 s, sys: 1.69 s, total: 2.88 s
Wall time: 5.25 s


In [5]:
%%time
from utils import train_test_split
# train_test_split is a project helper (utils, not shown in this notebook); it is
# unpacked here into a (training, validation) pair of transaction frames.
# NOTE(review): gpu=True presumably tells the helper the input is a cuDF frame - confirm in utils.
trn_transactions,val_transactions = train_test_split(transactions,gpu=True)

CPU times: user 21 ms, sys: 30.1 ms, total: 51.1 ms
Wall time: 64.1 ms


In [6]:
%%time
# Keep only training transactions whose t_dat is after 2020-08-01.
# NOTE(review): the CSV is read above without explicit date parsing, so this assumes
# t_dat is already a datetime column at this point (presumably converted inside
# train_test_split) - confirm it is not a string-vs-timestamp comparison.
trn_transactions = trn_transactions[trn_transactions['t_dat'] > pd.to_datetime('2020-08-01')]

CPU times: user 2.56 ms, sys: 12.5 ms, total: 15.1 ms
Wall time: 18.4 ms


In [7]:
%%time
# Convert every frame from cuDF to pandas; the feature builders defined below
# use pandas APIs (pd.read_csv, pd.concat, DataFrame.merge).
trn_transactions = trn_transactions.to_pandas()
val_transactions = val_transactions.to_pandas()
articles = articles.to_pandas()
customers = customers.to_pandas()

CPU times: user 912 ms, sys: 263 ms, total: 1.18 s
Wall time: 2.23 s


## Construct customer and article feature dataframes

In [8]:
def merge_customer_article_feature_df(df,articles,art_cust_features,art_cust_dir):
    """Attach one ``<feature>_countvec`` column per feature to ``df``.

    For every name in ``art_cust_features`` this function
      1. reads ``<art_cust_dir>/<feature>_countvec.csv``,
      2. left-joins the article attribute ``<feature>`` onto ``df`` via ``article_id``,
      3. left-joins ``<feature>_countvec`` via ``(customer_id, <feature>)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles : pandas.DataFrame
        Must contain ``article_id`` and every column named in ``art_cust_features``.
    art_cust_features : iterable of str
        Article attribute names; each needs a matching ``<feature>_countvec.csv``
        with columns ``customer_id``, ``<feature>`` and ``<feature>_countvec``.
    art_cust_dir : str
        Directory holding the CSV files. A trailing path separator is optional.

    Returns
    -------
    pandas.DataFrame
        ``df`` with, for each feature, the columns ``<feature>`` and
        ``<feature>_countvec`` appended. Rows without a match hold NaN.
    """
    for feature in art_cust_features:
        countvec_col = feature+'_countvec'
        # Join directory and file name as separate components so the path is
        # valid whether or not art_cust_dir ends with a separator.
        countvec_path = os.path.join(art_cust_dir,countvec_col+'.csv')
        countvec = pd.read_csv(countvec_path,index_col=None)
        countvec = countvec[['customer_id',feature,countvec_col]]
        df = df.merge(articles[['article_id',feature]],on='article_id',how='left')
        df = df.merge(countvec,on=['customer_id',feature],how='left')
    return df

def make_pos_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Build the positive examples (label 1) from the observed purchases.

    Every (customer_id, article_id) row of ``trn_transactions`` is kept and
    enriched with one ``<feature>_countvec`` column per entry of
    ``art_cust_features``. Missing countvec values are left as NaN here.
    """
    id_cols = ['customer_id','article_id']
    purchased = trn_transactions[id_cols]
    enriched = merge_customer_article_feature_df(
        purchased,articles,art_cust_features,art_cust_dir
    )
    # Drop the raw article attribute columns added by the merge helper.
    countvec_cols = [name+'_countvec' for name in art_cust_features]
    pos_df = enriched[id_cols+countvec_cols]
    pos_df['label'] = 1
    return pos_df

def make_neg_df(trn_transactions,articles,art_cust_features,art_cust_dir,random_state=None):
    """Build negative examples (label 0) by shuffling customers across purchases.

    The ``customer_id`` column of the purchase pairs is randomly permuted while
    ``article_id`` stays in place, producing as many rows as ``trn_transactions``
    has. The shuffled pairs are enriched with ``<feature>_countvec`` columns and
    every NaN is replaced by 0.

    Parameters
    ----------
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id``.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    random_state : optional
        Forwarded to ``Series.sample`` so the permutation can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id``, the countvec columns and ``label`` (all 0).
    """
    neg_df = trn_transactions[['customer_id','article_id']].reset_index(drop=True)
    shuffled_customers = neg_df['customer_id'].sample(frac=1.,random_state=random_state)
    # to_numpy() discards the shuffled index so the assignment is positional,
    # not re-aligned back into the original order.
    neg_df['customer_id'] = shuffled_customers.to_numpy()
    # NOTE(review): nothing here removes shuffled pairs that coincide with a real
    # purchase; such rows are still labelled 0.
    neg_df = merge_customer_article_feature_df(neg_df,articles,art_cust_features,art_cust_dir)
    keep_cols = ['customer_id','article_id']+[f+'_countvec' for f in art_cust_features]
    # Chained fillna returns a fresh frame (no inplace write on a column slice).
    neg_df = neg_df[keep_cols].fillna(0.)
    neg_df['label'] = 0
    return neg_df

def make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Return the training frame: positive rows followed by negative rows.

    The two parts come from ``make_pos_df`` and ``make_neg_df`` called with the
    same arguments; they are concatenated without resetting the index.
    """
    builder_args = (trn_transactions,articles,art_cust_features,art_cust_dir)
    positives = make_pos_df(*builder_args)
    negatives = make_neg_df(*builder_args)
    return pd.concat([positives,negatives])

def agg_items(x,topk):
    """Combine positive items and negative candidates into one labelled list.

    Parameters
    ----------
    x : iterable
        Positive items (hashable).
    topk : iterable
        Negative candidates (hashable). Despite the name, ``make_val_df`` passes
        its whole ``neg_items`` list here, not just the popular items.

    Returns
    -------
    (items, labels) : tuple of two lists of equal length
        ``items`` holds the distinct positives in first-seen order, followed by
        the distinct candidates that are not positives. ``labels`` is 1.0 for
        each positive and 0.0 for each negative.

    Each article appears at most once in ``items``, so exploding the result
    cannot create duplicate rows for the same candidate.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    pos = list(dict.fromkeys(x))
    pos_set = set(pos)
    neg = [item for item in dict.fromkeys(topk) if item not in pos_set]
    return pos+neg,len(pos)*[1.]+len(neg)*[0.]

def make_val_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the labelled validation candidates, one row per (customer, article).

    For each customer in ``val_transactions``:
      * positives (label 1) are the articles bought in ``val_transactions``;
      * negatives (label 0) are the customer's ``trn_transactions`` articles plus
        the ``k`` most-purchased training articles, minus the positives.

    A customer with no rows in ``trn_transactions`` falls back to the ``k``
    popular articles as candidates, so such customers no longer end up with
    positive rows only.

    Parameters
    ----------
    val_transactions, trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id``.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    k : int
        Number of most-purchased training articles added as candidates.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id``, ``label`` and one
        ``<feature>_countvec`` per feature; NaN replaced by 0.
    """
    topk = trn_transactions['article_id'].value_counts()[:k].index.tolist()

    # History followed by popular items; dict.fromkeys drops repeats, keeps order.
    neg_val_df = (
        trn_transactions[['customer_id','article_id']]
        .groupby('customer_id')['article_id']
        .agg(lambda x: list(dict.fromkeys(x.tolist()+topk)))
        .reset_index()
        .rename(columns={'article_id':'neg_items'})
    )
    pos_val_df = (
        val_transactions.groupby('customer_id')['article_id']
        .agg(lambda x: x.tolist())
        .to_frame()
        .reset_index()
        .rename(columns={'article_id':'pos_items'})
    )
    val_df = pos_val_df.merge(neg_val_df,on=['customer_id'],how='left')

    # Left join leaves NaN for customers without training history:
    # give them the popular items as candidates.
    val_df['neg_items'] = val_df['neg_items'].apply(
        lambda cand: cand if isinstance(cand,list) else topk
    )

    # Plain loop over the two list columns avoids DataFrame.apply(axis=1)
    # result-shape inference and also works for an empty frame.
    items = [agg_items(pos,neg) for pos,neg in zip(val_df['pos_items'],val_df['neg_items'])]
    val_df['article_id'] = [pair[0] for pair in items]
    val_df['label'] = [pair[1] for pair in items]

    val_df = val_df.explode(['article_id','label'])
    val_df = merge_customer_article_feature_df(val_df,articles,art_cust_features,art_cust_dir)
    keep_cols = ['customer_id','article_id','label']+[f+'_countvec' for f in art_cust_features]
    # Chained fillna returns a fresh frame (no inplace write on a column slice).
    val_df = val_df[keep_cols].fillna(0.)
    return val_df

def make_test_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the unlabelled candidates for every customer in ``trn_transactions``.

    Each customer's candidates are their own training articles followed by the
    ``k`` most-purchased training articles, with repeats removed so every
    (customer, article) pair appears once.

    Parameters
    ----------
    val_transactions
        Not used by this function; kept so existing callers keep working.
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id``.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    k : int
        Number of most-purchased training articles added as candidates.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id`` and one ``<feature>_countvec``
        per feature; NaN replaced by 0.
    """
    topk = trn_transactions['article_id'].value_counts()[:k].index.tolist()
    test_df = (
        trn_transactions[['customer_id','article_id']]
        .groupby('customer_id')['article_id']
        # dict.fromkeys drops repeated articles while keeping first-seen order.
        .agg(lambda x: list(dict.fromkeys(x.tolist()+topk)))
        .reset_index()
        .explode('article_id')
    )
    test_df = merge_customer_article_feature_df(test_df,articles,art_cust_features,art_cust_dir)
    keep_cols = ['customer_id','article_id']+[f+'_countvec' for f in art_cust_features]
    # Chained fillna returns a fresh frame (no inplace write on a column slice).
    test_df = test_df[keep_cols].fillna(0.)
    return test_df

def make_gt_df(val_transactions):
    """Collect each customer's purchased articles into a ``ground_truth`` list.

    Returns a frame with one row per ``customer_id`` and a ``ground_truth``
    column holding the list of that customer's ``article_id`` values.
    """
    articles_by_customer = val_transactions.groupby('customer_id')['article_id']
    collected = articles_by_customer.agg(lambda ids: ids.tolist()).reset_index()
    return collected.rename(columns={'article_id':'ground_truth'})

In [9]:
# Directory that merge_customer_article_feature_df reads the
# '<feature>_countvec.csv' files from.
art_cust_dir = 'storage/preprocessing/220308_baseline/'
# Article attribute columns for which a customer-level '<feature>_countvec'
# column is merged into the train / validation / test frames.
art_cust_features = [
    'product_group_name', 'product_type_name', 
    'graphical_appearance_name', 'perceived_colour_value_name', 'colour_group_code', 
    'index_group_name', 
    'department_name',
]

In [10]:
%%time
# Training frame: positive rows from make_pos_df followed by shuffled negative rows from make_neg_df.
trn_df = make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir)

CPU times: user 23 s, sys: 969 ms, total: 24 s
Wall time: 24 s


In [11]:
%%time
# Labelled validation candidates; k=topk sets how many popular articles are added per customer.
val_df = make_val_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=topk)

CPU times: user 31.6 s, sys: 8.73 s, total: 40.4 s
Wall time: 40.4 s


In [12]:
%%time
# Unlabelled candidates for every customer present in trn_transactions
# (make_test_df does not read its val_transactions argument).
test_df = make_test_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=topk)

CPU times: user 2min 2s, sys: 51.6 s, total: 2min 54s
Wall time: 2min 53s


In [13]:
%%time
# One row per validation customer with the list of articles they bought.
gt_df = make_gt_df(val_transactions)

CPU times: user 352 ms, sys: 10.5 ms, total: 362 ms
Wall time: 361 ms


## Construct repeated purchase features

In [14]:
# Same path string as art_cust_dir above; the repeated-purchase CSV is read from here.
input_dir = 'storage/preprocessing/220308_baseline/'

In [15]:
# Read <input_dir>/repeated_purchase_prob.csv and rename its 'count' column to
# 'repeated_purchase_prob'. The merges below join it on (customer_id, article_id).
feature = 'repeated_purchase_prob'
repeated_purchase_df = pd.read_csv(os.path.join(input_dir,feature+'.csv')).rename(columns={'count':'repeated_purchase_prob'})

In [16]:
%%time
# Left-join the repeated-purchase feature onto the training rows.
# fillna(0.) applies to the whole frame, so it also zero-fills the countvec
# NaNs that make_pos_df leaves in the positive rows.
trn_df = trn_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 1.91 s, sys: 436 ms, total: 2.35 s
Wall time: 2.34 s


In [17]:
%%time
# Left-join the repeated-purchase feature onto the validation rows; pairs without a match get 0.
val_df = val_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 1.51 s, sys: 640 ms, total: 2.15 s
Wall time: 2.14 s


In [18]:
%%time
# Left-join the repeated-purchase feature onto the test candidates; pairs without a match get 0.
test_df = test_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 9.18 s, sys: 4.47 s, total: 13.7 s
Wall time: 13.6 s


## Save files

In [19]:
from utils import save_csv

In [20]:
%%time
# save_csv is a project helper (utils, not shown here); presumably it writes the
# frame to out_dir under the given file name - confirm in utils.
save_csv(trn_df,out_dir,'trn_df.csv')

CPU times: user 20.7 s, sys: 333 ms, total: 21.1 s
Wall time: 21.4 s


In [21]:
%%time
# Persist the validation candidates via the project's save_csv helper.
save_csv(val_df,out_dir,'val_df.csv')

CPU times: user 23.5 s, sys: 539 ms, total: 24 s
Wall time: 24 s


In [22]:
%%time
# Persist the test candidates via the project's save_csv helper
# (the slowest save in the recorded run: about 2min 51s wall time).
save_csv(test_df,out_dir,'test_df.csv')

CPU times: user 2min 48s, sys: 2.67 s, total: 2min 50s
Wall time: 2min 51s


In [23]:
%%time
# Persist the per-customer ground-truth lists via the project's save_csv helper.
save_csv(gt_df,out_dir,'gt_df.csv')

CPU times: user 178 ms, sys: 10.3 ms, total: 188 ms
Wall time: 235 ms


In [31]:
# Spot check: show every val_df row of one randomly drawn customer. The sample
# is unseeded, so a different customer is shown on each run.
# NOTE(review): this cell's execution count (31) does not follow the previous
# cell (23); re-run the notebook top to bottom before sharing.
val_df[val_df.customer_id==val_df.sample().customer_id.item()]

Unnamed: 0,customer_id,article_id,label,product_group_name_countvec,product_type_name_countvec,graphical_appearance_name_countvec,perceived_colour_value_name_countvec,colour_group_code_countvec,index_group_name_countvec,department_name_countvec,repeated_purchase_prob
3013340,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,905492001,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3013341,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,890215001,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3013342,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,751471001,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3013343,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,706016001,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3013344,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,918292001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
3013437,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,685816002,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3013438,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,599580038,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3013439,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,893059004,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3013440,aed94a24b182c0178b3540e9ec9d1ef0053ac4b7a357bc...,918522001,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
