In [1]:
import glob
import os

In [2]:
# NOTE(review): use_gpu is not read by any cell visible in this notebook; cudf is
# imported and used unconditionally, and train_val_test_split is called with
# gpu=True. Confirm whether this flag is still needed.
use_gpu = False
import cudf
import pandas as pd

In [3]:
# Directory holding the precomputed feature CSVs read by this notebook; it is
# also the directory argument handed to save_csv for the output frames.
base_dir = 'storage/output/220314_baseline/'
# Number of most frequent training articles used as shared candidates
# (passed as k to make_val_df and make_test_df).
topk = 50
# Article attribute columns. For each name, '<name>_countvec.csv' is read from
# base_dir and joined on ('customer_id', <name>).
art_cust_features = [
    'product_group_name', 'product_type_name', 
    'graphical_appearance_name', 'perceived_colour_value_name', 'colour_group_code', 
    'index_group_name', 
    'department_name',
]

****Input dataset****

In [4]:
%%time
# Load the three raw input tables as cuDF DataFrames.
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 1.14 s, sys: 1.31 s, total: 2.45 s
Wall time: 2.5 s


In [5]:
%%time
# train_val_test_split is a project helper from utils; its split rule is not
# visible in this notebook. It is called on the cuDF frame with gpu=True.
from utils import train_val_test_split
trn_transactions,val_transactions,test_transactions = train_val_test_split(transactions,gpu=True)

CPU times: user 37.1 ms, sys: 47 ms, total: 84.1 ms
Wall time: 84.7 ms


In [6]:
%%time
# Keep only training rows whose t_dat is strictly later than 2020-08-01.
# This rebinds trn_transactions, so the unfiltered training split is no longer
# available after this cell runs.
trn_transactions = trn_transactions[trn_transactions['t_dat'] > pd.to_datetime('2020-08-01')]

CPU times: user 2.55 ms, sys: 15.6 ms, total: 18.2 ms
Wall time: 17.3 ms


In [7]:
%%time
# Convert every frame used from here on from cuDF to pandas; the feature
# builders defined in this notebook use pandas APIs (pd.read_csv, pd.concat).
trn_transactions = trn_transactions.to_pandas()
val_transactions = val_transactions.to_pandas()
test_transactions = test_transactions.to_pandas()
articles = articles.to_pandas()
customers = customers.to_pandas()

CPU times: user 928 ms, sys: 234 ms, total: 1.16 s
Wall time: 1.16 s


****Construct customer and article feature dataframe****

In [8]:
def merge_customer_article_feature_df(df,articles,art_cust_features,art_cust_dir):
    """Attach one ``<feature>_countvec`` column per article feature to ``df``.

    For every name in ``art_cust_features`` the file
    ``<art_cust_dir>/<feature>_countvec.csv`` is read, the article's value for
    that feature is joined onto ``df`` through ``article_id``, and the
    ``<feature>_countvec`` value is joined on ``['customer_id', feature]``.
    Both joins are left joins, so rows of ``df`` are kept and pairs without a
    match receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles : pandas.DataFrame
        Must contain ``article_id`` and every column named in
        ``art_cust_features``.
    art_cust_features : iterable of str
        Article attribute column names to process, in order.
    art_cust_dir : str
        Directory containing the ``*_countvec.csv`` files. A trailing path
        separator is optional.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended, per feature, with the feature column and its
        ``<feature>_countvec`` column. ``df`` itself is returned when
        ``art_cust_features`` is empty.
    """
    for feature in art_cust_features:
        countvec_col = feature+'_countvec'
        # Pass directory and file name as separate components: concatenating
        # them first only produced a valid path when art_cust_dir already
        # ended with a separator.
        countvec_path = os.path.join(art_cust_dir,countvec_col+'.csv')
        countvec = pd.read_csv(countvec_path,index_col=None)
        # Drop any extra columns in the file so they cannot leak into df.
        countvec = countvec[['customer_id',feature,countvec_col]]
        df = df.merge(articles[['article_id',feature]],on='article_id',how='left')
        df = df.merge(countvec,on=['customer_id',feature],how='left')
    return df

def make_pos_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Build the positive training examples (label 1).

    Every (customer_id, article_id) row of ``trn_transactions`` becomes one
    example, enriched with the ``<feature>_countvec`` columns produced by
    ``merge_customer_article_feature_df``. Missing count-vector values are
    left as NaN here.

    Returns a DataFrame with columns ``customer_id``, ``article_id``, one
    ``<feature>_countvec`` per entry of ``art_cust_features`` and ``label``.
    """
    observed_pairs = trn_transactions[['customer_id','article_id']]
    positives = merge_customer_article_feature_df(
        observed_pairs,articles,art_cust_features,art_cust_dir)
    # Keep only the identifiers and the count-vector columns; the raw feature
    # columns added by the merge are dropped.
    output_cols = ['customer_id','article_id']
    for name in art_cust_features:
        output_cols.append(name+'_countvec')
    positives = positives[output_cols]
    positives['label'] = 1
    return positives

def make_neg_df(trn_transactions,articles,art_cust_features,art_cust_dir,random_state=None):
    """Build negative training examples (label 0) by shuffling customers.

    The ``customer_id`` column of the training pairs is randomly permuted
    while ``article_id`` stays in place, so each article is paired with a
    random customer drawn from the same customer distribution. Count-vector
    features are then attached and missing values are filled with 0.

    Parameters
    ----------
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    random_state : optional
        Passed to ``Series.sample`` to make the shuffle reproducible. The
        default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id``, one ``<feature>_countvec``
        per feature and ``label`` (all 0); same number of rows as
        ``trn_transactions``.

    NOTE(review): a shuffled pair can coincide with a real purchase and will
    still be labelled 0; no filtering of such collisions is done here.
    """
    # drop=True discards the old index whatever its name is.
    neg_df = trn_transactions[['customer_id','article_id']].reset_index(drop=True)
    # to_numpy() strips the shuffled index so the assignment is positional
    # rather than re-aligned back to the original order.
    shuffled_customers = neg_df['customer_id'].sample(frac=1.,random_state=random_state).to_numpy()
    neg_df['customer_id'] = shuffled_customers
    neg_df = merge_customer_article_feature_df(neg_df,articles,art_cust_features,art_cust_dir)
    output_cols = ['customer_id','article_id']+[f+'_countvec' for f in art_cust_features]
    # Non-inplace fillna returns a fresh frame, so adding the label column
    # below does not write into a slice of the merged frame.
    neg_df = neg_df[output_cols].fillna(0.)
    neg_df['label'] = 0
    return neg_df

def make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Return the training frame: positive examples followed by negatives.

    Positives come from ``make_pos_df`` and negatives from ``make_neg_df``,
    both built from the same arguments. The two frames are concatenated as
    they are, so each keeps its own index values.
    """
    shared_args = (trn_transactions,articles,art_cust_features,art_cust_dir)
    labelled_parts = [make_pos_df(*shared_args)]
    labelled_parts.append(make_neg_df(*shared_args))
    return pd.concat(labelled_parts)

def agg_items(x,topk):
    """Combine purchased items with unpurchased popular items and label them.

    ``x`` is the list of purchased items and ``topk`` the candidate list.
    Returns ``(items, labels)`` where ``items`` is ``x`` followed by every
    element of ``topk`` that is not in ``x`` (order preserved), and ``labels``
    holds ``1.`` for each element of ``x`` and ``0.`` for each appended one.
    """
    purchased = set(x)
    negatives = []
    for candidate in topk:
        if candidate not in purchased:
            negatives.append(candidate)
    labels = [1.]*len(x) + [0.]*len(negatives)
    return x+negatives,labels

def make_val_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the labelled candidate frame used for validation.

    For each customer in ``val_transactions`` the candidates are the articles
    that customer bought there (label 1.0) followed by those of the ``k`` most
    frequent articles of ``trn_transactions`` that the customer did not buy
    (label 0.0). Count-vector features are attached with
    ``merge_customer_article_feature_df`` and remaining NaNs are set to 0.

    Returns a DataFrame with columns ``customer_id``, ``article_id``,
    ``label`` and one ``<feature>_countvec`` per entry of
    ``art_cust_features``, one row per (customer, candidate article).
    """
    # The k most frequent article ids in the training transactions.
    topk = trn_transactions['article_id'].value_counts()[:k].index.tolist()
    # One row per customer holding the list of articles bought in val_transactions.
    val_df = val_transactions.groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).to_frame().reset_index()
    val_df = val_df.rename(columns={'article_id':'pos_items'})
    # Every row gets the same popular-article list as negative candidates.
    val_df['neg_items'] = [topk]*len(val_df)
    # Replaces missing cells with empty lists, keyed by index label.
    # NOTE(review): neither the groupby/agg above nor the constant neg_items
    # column looks able to produce NaN, so this is presumably a no-op — confirm
    # before removing.
    val_df = val_df.apply(lambda s: s.fillna({i: [] for i in val_df.index}))
    # agg_items returns (items, labels): positives first, then unseen top-k items.
    val_df['items'] = val_df.apply(lambda x: agg_items(x['pos_items'],x['neg_items']),axis=1)
    val_df['article_id'] = val_df['items'].apply(lambda x: x[0])
    val_df['label'] = val_df['items'].apply(lambda x: x[1])
    # Explode both list columns together so each article stays next to its label
    # (needs a pandas version that accepts a list of columns in explode).
    val_df = val_df.explode(['article_id','label'])
    val_df = merge_customer_article_feature_df(val_df,articles,art_cust_features,art_cust_dir)
    # Drop the helper list columns and the raw feature columns added by the merge.
    val_df = val_df[['customer_id','article_id','label']+[f+'_countvec' for f in art_cust_features]]
    # Pairs with no count-vector entry get 0 for that feature.
    val_df.fillna(0.,inplace=True)
    return val_df

def make_test_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the unlabelled candidate frame used for testing.

    For each customer in ``trn_transactions`` the candidates are all articles
    that customer bought there, followed by the ``k`` most frequent training
    articles. Count-vector features are attached with
    ``merge_customer_article_feature_df`` and remaining NaNs are set to 0.

    Returns a DataFrame with columns ``customer_id``, ``article_id`` and one
    ``<feature>_countvec`` per entry of ``art_cust_features``.

    NOTE(review): ``val_transactions`` is accepted but never read, so
    customers who appear only there get no rows — confirm this is intended.
    NOTE(review): candidate lists are not de-duplicated; an article bought
    several times, or bought and also in the top ``k``, yields repeated rows.
    """
    popular = trn_transactions['article_id'].value_counts()[:k].index.tolist()
    # Per-customer purchase history extended with the shared popular items.
    history = trn_transactions[['customer_id','article_id']].groupby('customer_id')['article_id']
    candidates = history.agg(lambda bought: bought.tolist()+popular).reset_index()
    # One row per (customer, candidate article).
    candidates = candidates.explode(['article_id'])
    candidates = merge_customer_article_feature_df(candidates,articles,art_cust_features,art_cust_dir)
    output_cols = ['customer_id','article_id']
    output_cols += [f+'_countvec' for f in art_cust_features]
    candidates = candidates[output_cols]
    # Pairs with no count-vector entry get 0 for that feature.
    candidates.fillna(0.,inplace=True)
    return candidates

def make_gt_df(val_transactions):
    """Collect the articles each customer bought into a ground-truth list.

    Returns a DataFrame with one row per ``customer_id`` and a
    ``ground_truth`` column holding the list of that customer's
    ``article_id`` values, in the order they appear in ``val_transactions``.
    """
    per_customer = val_transactions.groupby('customer_id')['article_id']
    purchased = per_customer.agg(lambda items: items.tolist())
    # Naming the series before reset_index gives the value column its final name.
    return purchased.rename('ground_truth').reset_index()

In [9]:
%%time
# Training frame: positive pairs (label 1) plus customer-shuffled negatives
# (label 0), each with the count-vector features read from base_dir.
trn_df = make_trn_df(trn_transactions,articles,art_cust_features,base_dir)

CPU times: user 23 s, sys: 880 ms, total: 23.9 s
Wall time: 23.9 s


In [10]:
%%time
# Validation frame: per customer, purchased articles plus the topk most
# frequent training articles as labelled candidates.
val_df = make_val_df(val_transactions,trn_transactions,articles,art_cust_features,base_dir,k=topk)

CPU times: user 25.2 s, sys: 4.83 s, total: 30 s
Wall time: 30 s


In [11]:
%%time
# NOTE(review): test_transactions is passed as the first argument, but
# make_test_df never reads that parameter; candidates are built from
# trn_transactions only. Confirm this is the intended candidate set.
test_df = make_test_df(test_transactions,trn_transactions,articles,art_cust_features,base_dir,k=topk)

CPU times: user 1min 1s, sys: 22.6 s, total: 1min 24s
Wall time: 1min 24s


In [12]:
%%time
# Ground truth: list of articles each customer bought in test_transactions.
gt_df = make_gt_df(test_transactions)

CPU times: user 344 ms, sys: 6.05 ms, total: 350 ms
Wall time: 349 ms


****Construct repeated purchase features****

In [13]:
# Load the precomputed repeated-purchase table from base_dir and rename its
# 'count' column to 'repeated_purchase_prob'. The merges that follow join it on
# ['customer_id','article_id'], so the file is expected to carry both keys.
feature = 'repeated_purchase_prob'
repeated_purchase_df = pd.read_csv(os.path.join(base_dir,feature+'.csv')).rename(columns={'count':'repeated_purchase_prob'})

In [14]:
%%time
# Left-join the repeated-purchase feature onto the training frame.
# fillna(0.) zero-fills every remaining NaN in the frame, not only the new
# column; this is also where the NaN count-vector values left by make_pos_df
# become 0.
trn_df = trn_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 1.79 s, sys: 276 ms, total: 2.06 s
Wall time: 2.06 s


In [15]:
%%time
# Left-join the repeated-purchase feature onto the validation frame; pairs
# without an entry get 0.
val_df = val_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 1.66 s, sys: 377 ms, total: 2.03 s
Wall time: 2.03 s


In [16]:
%%time
# Left-join the repeated-purchase feature onto the test frame; pairs without
# an entry get 0.
test_df = test_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 5.78 s, sys: 1.84 s, total: 7.61 s
Wall time: 7.61 s


****Save files****

In [17]:
from utils import save_csv

In [18]:
%%time
# save_csv is a project helper from utils (not shown here); presumably it
# writes the frame as 'trn_df.csv' under base_dir — confirm.
save_csv(trn_df,base_dir,'trn_df.csv')

CPU times: user 17.7 s, sys: 261 ms, total: 17.9 s
Wall time: 18 s


In [19]:
%%time
# Persist the validation frame via the utils helper (presumably to
# base_dir/'val_df.csv' — confirm against save_csv).
save_csv(val_df,base_dir,'val_df.csv')

CPU times: user 18.9 s, sys: 301 ms, total: 19.2 s
Wall time: 19.2 s


In [20]:
%%time
# Persist the test candidate frame via the utils helper (presumably to
# base_dir/'test_df.csv' — confirm against save_csv).
save_csv(test_df,base_dir,'test_df.csv')

CPU times: user 1min 19s, sys: 1.27 s, total: 1min 21s
Wall time: 1min 21s


In [21]:
%%time
# Persist the ground-truth frame via the utils helper. Its 'ground_truth'
# column holds Python lists; how they are serialised depends on save_csv.
save_csv(gt_df,base_dir,'gt_df.csv')

CPU times: user 179 ms, sys: 11.3 ms, total: 190 ms
Wall time: 190 ms
