In [1]:
import glob
import os

In [2]:
# NOTE(review): this flag is not read by any cell visible in this notebook;
# cudf is imported unconditionally and train_test_split is later called with
# gpu=True. Confirm whether it should drive those choices or be removed.
use_gpu = False
import cudf
import pandas as pd

In [3]:
# Destination directory handed to save_csv for every generated file.
out_dir = 'storage/dataset/220311_baseline/'
# Number of most-purchased training articles used as candidates; passed as
# `k` to make_val_df and make_test_df.
topk = 100

****Input dataset****

In [4]:
%%time
# Read the three input csv files with cudf; the frames are converted to
# pandas a few cells further down.
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 1.12 s, sys: 1.32 s, total: 2.43 s
Wall time: 2.52 s


In [5]:
%%time
from utils import train_test_split
# Split the transactions into a training and a validation frame. The split
# rule itself lives in utils.train_test_split and is not visible here.
# NOTE(review): gpu=True is hard-coded although use_gpu is set to False at the
# top of the notebook — confirm which one is intended.
trn_transactions,val_transactions = train_test_split(transactions,gpu=True)

CPU times: user 17.8 ms, sys: 31.4 ms, total: 49.2 ms
Wall time: 63.6 ms


In [6]:
%%time
# Keep only training transactions dated strictly after 2020-08-01.
# NOTE(review): assumes t_dat is already a datetime column at this point
# (e.g. converted inside train_test_split) — confirm.
# This cell overwrites trn_transactions, so the unfiltered split is no longer
# available afterwards.
trn_transactions = trn_transactions[trn_transactions['t_dat'] > pd.to_datetime('2020-08-01')]

CPU times: user 4.45 ms, sys: 9.29 ms, total: 13.7 ms
Wall time: 17.2 ms


In [7]:
%%time
# Convert the cudf frames to pandas: the feature-building functions below
# rely on pandas (pd.read_csv / pd.concat and pandas merges).
trn_transactions = trn_transactions.to_pandas()
val_transactions = val_transactions.to_pandas()
articles = articles.to_pandas()
customers = customers.to_pandas()

CPU times: user 952 ms, sys: 271 ms, total: 1.22 s
Wall time: 2.14 s


****Construct customer and article feature dataframe****

In [8]:
def merge_customer_article_feature_df(df,articles,art_cust_features,art_cust_dir):
    """Attach per-(customer, feature value) count-vector columns to ``df``.

    For every name in ``art_cust_features`` the file
    ``<art_cust_dir>/<feature>_countvec.csv`` is read, the article's value of
    that feature is looked up in ``articles`` and the matching
    ``<feature>_countvec`` value is left-joined onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles : pandas.DataFrame
        Must contain ``article_id`` and one column per feature name.
    art_cust_features : list of str
        Feature (article attribute) names to merge.
    art_cust_dir : str
        Directory holding the ``*_countvec.csv`` files. A trailing path
        separator is optional.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``<feature>`` and ``<feature>_countvec``
        appended for each feature. Rows without a match hold NaN because both
        joins are left joins.
    """
    for feature in art_cust_features:
        countvec_col = feature+'_countvec'
        # Pass directory and file name as separate components: the previous
        # single pre-concatenated string only produced a valid path when
        # art_cust_dir happened to end with a separator.
        csv_path = os.path.join(art_cust_dir,countvec_col+'.csv')
        countvec = pd.read_csv(csv_path,index_col=None)
        countvec = countvec[['customer_id',feature,countvec_col]]
        # First bring in the article's value of this feature, then use
        # (customer_id, feature value) as the key into the count-vector table.
        df = df.merge(articles[['article_id',feature]],on='article_id',how='left')
        df = df.merge(countvec,on=['customer_id',feature],how='left')
    return df

def make_pos_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Build the positive training rows (``label`` = 1).

    Every (customer_id, article_id) pair in ``trn_transactions`` becomes one
    row carrying the ``<feature>_countvec`` columns for ``art_cust_features``.
    Missing count-vector values are left as NaN here.
    """
    countvec_cols = [name+'_countvec' for name in art_cust_features]
    purchases = trn_transactions[['customer_id','article_id']]
    enriched = merge_customer_article_feature_df(purchases,articles,art_cust_features,art_cust_dir)
    positives = enriched[['customer_id','article_id']+countvec_cols]
    return positives.assign(label=1)

def make_neg_df(trn_transactions,articles,art_cust_features,art_cust_dir,random_state=None):
    """Build negative training rows (``label`` = 0) by shuffling customers.

    The ``customer_id`` column of the training pairs is randomly permuted
    while ``article_id`` stays in place, producing as many pseudo-random
    (customer, article) pairs as there are training transactions.

    Parameters
    ----------
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    random_state : int, numpy RandomState/Generator or None, optional
        Seed forwarded to ``Series.sample`` so the shuffle can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``article_id``, one ``<feature>_countvec``
        per feature (NaN replaced by 0.) and ``label`` (always 0).

    Notes
    -----
    A shuffled pair can coincide with a pair that really occurs in
    ``trn_transactions``; such rows are still labelled 0.
    """
    # drop=True discards the old index directly and also works when the
    # incoming index carries a name (the former drop(columns=['index']) did not).
    trn_df = trn_transactions[['customer_id','article_id']].reset_index(drop=True)
    shuffled_customers = trn_df['customer_id'].sample(frac=1.,random_state=random_state)
    # Reset the permuted index so the assignment is positional; assigning the
    # sampled Series as-is would re-align on index and undo the shuffle.
    trn_df['customer_id'] = shuffled_customers.reset_index(drop=True)
    trn_df = merge_customer_article_feature_df(trn_df,articles,art_cust_features,art_cust_dir)
    trn_df = trn_df[['customer_id','article_id']+[f+'_countvec' for f in art_cust_features]]
    # Pairs the customer never interacted with have no count-vector entry.
    trn_df.fillna(0.,inplace=True)
    trn_df['label'] = 0
    return trn_df

def make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Return the training frame: positive rows followed by negative rows.

    The two parts are concatenated as-is, so each part keeps its own index
    values in the result.
    """
    shared_args = (trn_transactions,articles,art_cust_features,art_cust_dir)
    parts = [
        make_pos_df(*shared_args),
        make_neg_df(*shared_args),
    ]
    return pd.concat(parts)

def agg_items(x,topk):
    """Combine a customer's purchased items with unseen popular items.

    ``x`` is the list of purchased items (kept as given, repeats included) and
    ``topk`` the popular items. Returns ``(items, labels)`` where ``items`` is
    ``x`` followed by every entry of ``topk`` not present in ``x`` and
    ``labels`` holds 1. for each purchased item and 0. for each appended one.
    """
    purchased = set(x)
    extras = []
    for candidate in topk:
        if candidate not in purchased:
            extras.append(candidate)
    labels = [1.]*len(x) + [0.]*len(extras)
    return x+extras,labels

def make_val_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the labelled validation frame.

    For each customer in ``val_transactions`` the candidate articles are the
    customer's validation purchases (label 1.) plus those of the ``k`` most
    frequent training articles that the customer did not purchase (label 0.),
    as produced by ``agg_items``. One row per candidate is returned with the
    ``<feature>_countvec`` columns attached and NaN replaced by 0.
    """
    popular = trn_transactions['article_id'].value_counts().iloc[:k].index.tolist()
    countvec_cols = [name+'_countvec' for name in art_cust_features]

    # One row per customer holding the (items, labels) tuple from agg_items.
    per_customer = (
        val_transactions.groupby('customer_id')['article_id']
        .agg(lambda bought: agg_items(bought.tolist(),popular))
        .to_frame()
        .reset_index()
        .rename(columns={'article_id':'items'})
    )
    pairs = per_customer['items']
    per_customer['article_id'] = pairs.apply(lambda pair: pair[0])
    per_customer['label'] = pairs.apply(lambda pair: pair[1])

    # Unpack the parallel lists into one row per (customer, candidate).
    exploded = per_customer.explode(['article_id','label'])
    merged = merge_customer_article_feature_df(exploded,articles,art_cust_features,art_cust_dir)
    result = merged[['customer_id','article_id','label']+countvec_cols]
    result.fillna(0.,inplace=True)
    return result

def make_test_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=500):
    """Build the unlabelled candidate frame used for inference.

    For each customer present in ``trn_transactions`` the candidates are the
    articles the customer bought in training followed by the ``k`` most
    frequent training articles, with every article listed at most once.

    Parameters
    ----------
    val_transactions : pandas.DataFrame
        Not used by this function; kept so the signature mirrors
        ``make_val_df``.
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles, art_cust_features, art_cust_dir
        Forwarded to ``merge_customer_article_feature_df``.
    k : int, optional
        Number of most frequent training articles added as candidates.

    Returns
    -------
    pandas.DataFrame
        One row per (customer, candidate article) with columns
        ``customer_id``, ``article_id`` and one ``<feature>_countvec`` per
        feature; NaN replaced by 0.
    """
    topk = trn_transactions['article_id'].value_counts().iloc[:k].index.tolist()

    def _unique_candidates(bought):
        # dict.fromkeys drops repeats while keeping first-seen order. Without
        # it a repeated purchase, or a purchase that is also a top-k article,
        # produced several identical (customer, article) rows; agg_items
        # already filters top-k items the customer has.
        return list(dict.fromkeys(bought.tolist()+topk))

    val_df = (
        trn_transactions[['customer_id','article_id']]
        .groupby('customer_id')['article_id']
        .agg(_unique_candidates)
        .reset_index()
    )
    # One row per (customer, candidate article).
    val_df = val_df.explode(['article_id'])
    val_df = merge_customer_article_feature_df(val_df,articles,art_cust_features,art_cust_dir)
    val_df = val_df[['customer_id','article_id']+[f+'_countvec' for f in art_cust_features]]
    # Candidates the customer has no count-vector entry for get 0.
    val_df.fillna(0.,inplace=True)
    return val_df

def make_gt_df(val_transactions):
    """Collect each customer's validation purchases into a list.

    Returns a frame with one row per customer and the columns
    ``customer_id`` and ``ground_truth`` (list of that customer's
    ``article_id`` values, repeats included).
    """
    by_customer = val_transactions.groupby('customer_id')['article_id']
    collected = by_customer.agg(lambda ids: ids.tolist()).reset_index()
    return collected.rename(columns={'article_id':'ground_truth'})

In [9]:
# Directory holding the pre-computed `<feature>_countvec.csv` files.
art_cust_dir = 'storage/preprocessing/220308_baseline/'
# Article attributes whose per-customer count-vector value is merged in as a
# `<feature>_countvec` column.
art_cust_features = [
    'product_group_name',
    'product_type_name',
    'graphical_appearance_name',
    'perceived_colour_value_name',
    'colour_group_code',
    'index_group_name',
    'department_name',
]

In [10]:
%%time
# Training frame: positive rows (observed purchases) plus an equal number of
# negative rows obtained by shuffling customer ids.
trn_df = make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir)

CPU times: user 24.4 s, sys: 1.81 s, total: 26.2 s
Wall time: 27 s


In [11]:
%%time
# Labelled validation candidates; k=topk overrides the function default of 500.
val_df = make_val_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=topk)

CPU times: user 34.4 s, sys: 12.6 s, total: 46.9 s
Wall time: 46.9 s


In [12]:
%%time
# Unlabelled inference candidates built from the training transactions;
# k=topk overrides the function default of 500.
test_df = make_test_df(val_transactions,trn_transactions,articles,art_cust_features,art_cust_dir,k=topk)

CPU times: user 2min 2s, sys: 55 s, total: 2min 57s
Wall time: 2min 57s


In [13]:
%%time
# Per-customer list of articles bought in the validation transactions.
gt_df = make_gt_df(val_transactions)

CPU times: user 298 ms, sys: 4.05 ms, total: 302 ms
Wall time: 301 ms


****Construct repeated purchase features****

In [14]:
# Same path string as art_cust_dir above.
input_dir = 'storage/preprocessing/220308_baseline/'

In [15]:
feature = 'repeated_purchase_prob'
# Load <input_dir>/repeated_purchase_prob.csv and expose its 'count' column
# under the name 'repeated_purchase_prob'.
# NOTE(review): the merges below assume the file also has customer_id and
# article_id columns — confirm against the preprocessing notebook.
repeated_purchase_df = pd.read_csv(os.path.join(input_dir,feature+'.csv')).rename(columns={'count':'repeated_purchase_prob'})

In [16]:
%%time
# Left-join the repeated-purchase feature on (customer_id, article_id).
# fillna(0.) covers the whole frame, so unmatched pairs and any other
# remaining NaN (e.g. count-vector gaps in the positive rows) become 0.
trn_df = trn_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 2.17 s, sys: 498 ms, total: 2.66 s
Wall time: 2.66 s


In [17]:
%%time
# Left-join the repeated-purchase feature on (customer_id, article_id);
# pairs without a match get 0.
val_df = val_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 2.82 s, sys: 1.12 s, total: 3.94 s
Wall time: 3.94 s


In [18]:
%%time
# Left-join the repeated-purchase feature on (customer_id, article_id);
# pairs without a match get 0.
test_df = test_df.merge(repeated_purchase_df,on=['customer_id','article_id'],how='left').fillna(0.)

CPU times: user 9.49 s, sys: 4.53 s, total: 14 s
Wall time: 14 s


****Save files****

In [19]:
from utils import save_csv

In [20]:
%%time
# Presumably writes trn_df to <out_dir>/trn_df.csv; save_csv comes from utils
# and its exact behaviour is not visible here.
save_csv(trn_df,out_dir,'trn_df.csv')

CPU times: user 21.2 s, sys: 351 ms, total: 21.6 s
Wall time: 21.6 s


In [21]:
%%time
# Presumably writes val_df to <out_dir>/val_df.csv (utils.save_csv not shown).
save_csv(val_df,out_dir,'val_df.csv')

CPU times: user 36.2 s, sys: 582 ms, total: 36.7 s
Wall time: 36.8 s


In [22]:
%%time
# Presumably writes test_df to <out_dir>/test_df.csv (utils.save_csv not shown).
save_csv(test_df,out_dir,'test_df.csv')

CPU times: user 2min 53s, sys: 2.75 s, total: 2min 56s
Wall time: 2min 56s


In [23]:
%%time
# Presumably writes gt_df to <out_dir>/gt_df.csv (utils.save_csv not shown).
# NOTE(review): ground_truth holds Python lists; check how save_csv
# serialises them and how the consumer parses them back.
save_csv(gt_df,out_dir,'gt_df.csv')

CPU times: user 191 ms, sys: 7.93 ms, total: 199 ms
Wall time: 228 ms
