In [1]:
import glob
import os
import pandas as pd
print(pd.__version__)

1.3.4


In [None]:
# Folder that receives the generated CSV files. The trailing slash matters:
# later cells build file paths with plain string concatenation (out_dir + name).
out_dir = 'storage/dataset/220308_baseline/'

# Article attributes for which a per-customer `<feature>_countvec.csv` file is
# looked up by merge_df; each one yields a `<feature>_countvec` column.
art_cust_features = [
    'product_group_name',
    'product_type_name',
    'graphical_appearance_name',
    'perceived_colour_value_name',
    'colour_group_code',
    'index_group_name',
    'department_name',
]
# NOTE(review): later cells also use `art_cust_dir` and `topk`, but neither is
# assigned in any visible cell -- confirm they are defined before Restart & Run All.

In [3]:
%%time
# Read the three raw tables (transactions, article metadata, customer metadata)
# from the local storage/ folder. No dtype or date parsing options are passed,
# so every column gets the dtype pandas infers.
transactions = pd.read_csv('storage/transactions_train.csv')
articles = pd.read_csv('storage/articles.csv')
customers = pd.read_csv('storage/customers.csv')

CPU times: user 25.5 s, sys: 2.82 s, total: 28.4 s
Wall time: 28.4 s


In [4]:
# train_test_split here is the project-local helper from the `utils` module;
# its splitting rule is not visible in this notebook.
from utils import train_test_split

# Split the raw transactions into a training part and a validation part.
trn_transactions,val_transactions = train_test_split(transactions)

In [5]:
# Keep only the training rows whose t_dat is later than 2020-08-01.
# NOTE(review): the comparison against a Timestamp presumably relies on t_dat
# already being datetime (the read_csv call above does not parse dates) --
# confirm where the conversion happens, e.g. inside train_test_split.
trn_transactions = trn_transactions[trn_transactions['t_dat'] > pd.to_datetime('2020-08-01')]

In [33]:
def merge_df(df,articles,art_cust_features,art_cust_dir):
    """Attach per-customer count-vector features to customer/article pairs.

    For every feature name, the article's value of that feature is looked up
    in ``articles`` and the matching ``<feature>_countvec`` value is then
    looked up in ``<art_cust_dir>/<feature>_countvec.csv`` by the pair
    ``(customer_id, <feature>)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id`` columns.
    articles : pandas.DataFrame
        Must contain ``article_id`` and one column per requested feature.
    art_cust_features : iterable of str
        Feature names to attach.
    art_cust_dir : str
        Directory holding the ``<feature>_countvec.csv`` files. A trailing
        path separator is optional.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended, per feature, with the feature column and its
        ``<feature>_countvec`` column. Both joins are left joins, so pairs
        without a match keep NaN in the new columns.
    """
    for feature in art_cust_features:
        countvec_col = feature + '_countvec'
        # Directory and file name are passed as separate components. The
        # previous single-argument os.path.join was a no-op and only produced
        # a valid path when art_cust_dir already ended with a separator.
        csv_path = os.path.join(art_cust_dir, countvec_col + '.csv')
        countvec = pd.read_csv(csv_path, index_col=None)
        countvec = countvec[['customer_id', feature, countvec_col]]
        # First bring in the article's feature value, then use it together
        # with customer_id as the key into the count-vector table.
        df = df.merge(articles[['article_id', feature]], on='article_id', how='left')
        df = df.merge(countvec, on=['customer_id', feature], how='left')
    return df

def make_pos_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Build the positive examples: every observed purchase, labelled 1.

    Each (customer_id, article_id) row of ``trn_transactions`` is enriched
    through ``merge_df`` and reduced to the id columns plus one
    ``<feature>_countvec`` column per feature. Missing count-vector values
    are left as NaN here.

    Returns
    -------
    pandas.DataFrame
        Columns: customer_id, article_id, the count-vector columns, label.
    """
    purchases = trn_transactions[['customer_id', 'article_id']]
    enriched = merge_df(purchases, articles, art_cust_features, art_cust_dir)
    countvec_cols = [name + '_countvec' for name in art_cust_features]
    keep = ['customer_id', 'article_id'] + countvec_cols
    return enriched[keep].assign(label=1)

def make_neg_df(trn_transactions,articles,art_cust_features,art_cust_dir,random_state=None):
    """Build negative examples by shuffling customers across purchases.

    The article_id column of ``trn_transactions`` is kept in place while the
    customer_id column is randomly permuted, which yields as many
    (customer, article) pairs as there are training rows. The pairs are then
    enriched through ``merge_df``, missing values are replaced by 0.0 and the
    rows are labelled 0.

    Note that a shuffled pair is not checked against the real purchases, so
    a pair that was actually bought can still end up labelled 0.

    Parameters
    ----------
    trn_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id``.
    articles, art_cust_features, art_cust_dir
        Forwarded unchanged to ``merge_df``.
    random_state : int, numpy random state or None, default None
        Seed for the permutation. ``None`` keeps the previous unseeded
        behaviour; pass an int to make the negatives reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns: customer_id, article_id, the count-vector columns, label.
    """
    # drop=True discards the old index whatever its name is; the earlier
    # reset_index().drop(columns=['index']) failed when the index was named.
    pairs = trn_transactions[['customer_id', 'article_id']].reset_index(drop=True)
    # Re-number the permuted customers 0..n-1 so that assignment lines them
    # up by position; without this, index alignment would undo the shuffle.
    shuffled_customers = (
        pairs['customer_id']
        .sample(frac=1., random_state=random_state)
        .reset_index(drop=True)
    )
    pairs['customer_id'] = shuffled_customers

    enriched = merge_df(pairs, articles, art_cust_features, art_cust_dir)
    countvec_cols = [name + '_countvec' for name in art_cust_features]
    # A shuffled pair often has no count-vector entry; treat that as 0.
    neg_df = enriched[['customer_id', 'article_id'] + countvec_cols].fillna(0.)
    neg_df['label'] = 0
    return neg_df

def make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir):
    """Assemble the training table: positives followed by shuffled negatives.

    Both halves are produced from the same ``trn_transactions`` and stacked
    with ``pd.concat``. The index is not reset, so each half keeps its own
    index values in the result.
    """
    builders = (make_pos_df, make_neg_df)
    parts = [
        build(trn_transactions, articles, art_cust_features, art_cust_dir)
        for build in builders
    ]
    return pd.concat(parts)

def agg_items(x,topk):
    """Combine a customer's purchases with popular items they did not buy.

    Parameters
    ----------
    x : list
        Purchased items; kept in order and with duplicates, each labelled 1.0.
    topk : iterable
        Candidate items; those absent from ``x`` are appended, labelled 0.0.

    Returns
    -------
    tuple(list, list)
        The combined item list and the parallel list of float labels.
    """
    bought = set(x)
    fillers = []
    for candidate in topk:
        if candidate not in bought:
            fillers.append(candidate)
    labels = [1.] * len(x) + [0.] * len(fillers)
    return x + fillers, labels

def make_val_df(val_transactions,trn_transactions,art_cust_features,art_cust_dir,k=500,articles=None):
    """Build the labelled validation table.

    For every customer in ``val_transactions`` the candidate list is that
    customer's validation purchases (label 1.0) followed by those of the
    ``k`` most frequent training articles they did not buy (label 0.0), as
    produced by ``agg_items``. Candidates are exploded to one row each,
    enriched through ``merge_df`` and missing values are replaced by 0.0.

    Parameters
    ----------
    val_transactions : pandas.DataFrame
        Must contain ``customer_id`` and ``article_id``.
    trn_transactions : pandas.DataFrame
        Source of the article popularity ranking (``article_id`` counts).
    art_cust_features, art_cust_dir
        Forwarded unchanged to ``merge_df``.
    k : int, default 500
        Number of most frequent training articles used as candidates.
    articles : pandas.DataFrame, optional
        Article metadata passed to ``merge_df``. When omitted, the
        module-level ``articles`` variable is used, which is what this
        function silently relied on before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns: customer_id, article_id, label, the count-vector columns.
    """
    if articles is None:
        # Backward-compatible fallback to the notebook-level table.
        try:
            articles = globals()['articles']
        except KeyError:
            raise NameError(
                "name 'articles' is not defined; pass articles=... explicitly"
            ) from None

    # value_counts() sorts by frequency; .iloc makes the positional cut explicit.
    topk = trn_transactions['article_id'].value_counts().iloc[:k].index.tolist()

    per_customer = val_transactions.groupby('customer_id')['article_id']
    val_df = per_customer.agg(lambda ids: agg_items(ids.tolist(), topk)).to_frame().reset_index()
    val_df = val_df.rename(columns={'article_id': 'items'})
    # Each `items` cell is an (item list, label list) pair; split it into two
    # parallel list columns so they can be exploded together.
    val_df['article_id'] = val_df['items'].apply(lambda pair: pair[0])
    val_df['label'] = val_df['items'].apply(lambda pair: pair[1])
    val_df = val_df.explode(['article_id', 'label'])

    val_df = merge_df(val_df, articles, art_cust_features, art_cust_dir)
    countvec_cols = [name + '_countvec' for name in art_cust_features]
    # Customer/feature pairs without a count-vector entry count as 0.
    return val_df[['customer_id', 'article_id', 'label'] + countvec_cols].fillna(0.)

def make_test_df(val_transactions,trn_transactions,art_cust_features,art_cust_dir,k=500,articles=None):
    """Build the unlabelled candidate table used for prediction.

    Every customer that appears in ``val_transactions`` is paired with each
    of the ``k`` most frequent training articles. The pairs are enriched
    through ``merge_df`` and missing values are replaced by 0.0. No label
    column is produced.

    Parameters
    ----------
    val_transactions : pandas.DataFrame
        Only its ``customer_id`` groups are used to pick the customers.
    trn_transactions : pandas.DataFrame
        Source of the article popularity ranking (``article_id`` counts).
    art_cust_features, art_cust_dir
        Forwarded unchanged to ``merge_df``.
    k : int, default 500
        Number of most frequent training articles used as candidates.
    articles : pandas.DataFrame, optional
        Article metadata passed to ``merge_df``. When omitted, the
        module-level ``articles`` variable is used, which is what this
        function silently relied on before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns: customer_id, article_id, the count-vector columns.
    """
    if articles is None:
        # Backward-compatible fallback to the notebook-level table.
        try:
            articles = globals()['articles']
        except KeyError:
            raise NameError(
                "name 'articles' is not defined; pass articles=... explicitly"
            ) from None

    # value_counts() sorts by frequency; .iloc makes the positional cut explicit.
    topk = trn_transactions['article_id'].value_counts().iloc[:k].index.tolist()

    per_customer = val_transactions.groupby('customer_id')['article_id']
    # The purchases themselves are ignored: every customer gets the same
    # candidate list, which explode() then turns into one row per article.
    test_df = per_customer.agg(lambda ids: topk).to_frame().reset_index()
    test_df = test_df.explode(['article_id'])

    test_df = merge_df(test_df, articles, art_cust_features, art_cust_dir)
    countvec_cols = [name + '_countvec' for name in art_cust_features]
    # Customer/feature pairs without a count-vector entry count as 0.
    return test_df[['customer_id', 'article_id'] + countvec_cols].fillna(0.)

def make_gt_df(val_transactions):
    """Collect each customer's purchased articles into one list.

    Returns
    -------
    pandas.DataFrame
        One row per customer with columns ``customer_id`` and
        ``ground_truth`` (the list of that customer's article_id values, in
        the order they appear in ``val_transactions``).
    """
    article_ids = val_transactions.groupby('customer_id')['article_id']
    gt_df = article_ids.agg(lambda ids: ids.tolist()).reset_index()
    gt_df.columns = ['customer_id', 'ground_truth']
    return gt_df

In [None]:
def save_csv(df,path):
    """Write ``df`` to ``path`` as CSV, leaving the index out of the file."""
    df.to_csv(path_or_buf=path, index=False)

In [None]:
# Create the output folder (and any missing parents) before anything is
# written; exist_ok=True makes re-running this cell harmless.
os.makedirs(out_dir,exist_ok=True)

In [7]:
%%time
# Build the labelled training table (positives plus shuffled negatives) and
# write it to out_dir as trn_df.csv.
# NOTE(review): `art_cust_dir` is not assigned in any visible cell -- confirm
# it is defined before this cell runs on a fresh kernel.
trn_df = make_trn_df(trn_transactions,articles,art_cust_features,art_cust_dir)
save_csv(trn_df,out_dir+'trn_df.csv')

CPU times: user 28.9 s, sys: 1.44 s, total: 30.4 s
Wall time: 30.4 s


In [8]:
%%time
# Build the labelled validation candidates and write them to out_dir as
# val_df.csv.
# NOTE(review): `topk` (passed as k) and `art_cust_dir` are not assigned in
# any visible cell -- confirm both are defined before this cell runs on a
# fresh kernel.
val_df = make_val_df(val_transactions,trn_transactions,art_cust_features,art_cust_dir,k=topk)
save_csv(val_df,out_dir+'val_df.csv')

CPU times: user 44 s, sys: 20.4 s, total: 1min 4s
Wall time: 1min 4s


In [15]:
%%time
# Build the unlabelled candidate table for the validation customers and write
# it to out_dir as test_df.csv.
# NOTE(review): `topk` (passed as k) and `art_cust_dir` are not assigned in
# any visible cell -- confirm both are defined before this cell runs on a
# fresh kernel.
test_df = make_test_df(val_transactions,trn_transactions,art_cust_features,art_cust_dir,k=topk)
save_csv(test_df,out_dir+'test_df.csv')

CPU times: user 1min 2s, sys: 17.7 s, total: 1min 19s
Wall time: 1min 20s


In [31]:
%%time
# Collect the purchased articles per validation customer and write them to
# out_dir as gt_df.csv. The ground_truth column holds Python lists, so
# to_csv stores each one as its string representation.
gt_df = make_gt_df(val_transactions)
save_csv(gt_df,out_dir+'gt_df.csv')

CPU times: user 552 ms, sys: 15.4 ms, total: 567 ms
Wall time: 568 ms


In [32]:
# Display the ground-truth table as a quick sanity check of its shape and content.
gt_df

Unnamed: 0,customer_id,ground_truth
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,[624486001]
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,[827487003]
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"[757926001, 788575004, 640021019]"
3,000525e3fe01600d717da8423643a8303390a055c578ed...,[874110016]
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[903762001, 879189005, 158340001, 867966009, 9..."
...,...,...
68979,fffa67737587e52ff1afa9c7c6490b5eb7acbc439fe82b...,"[874816003, 911870004]"
68980,fffa7d7799eb390a76308454cbdd76e473d65b1497fbe4...,"[861803014, 849886010]"
68981,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,"[396135007, 817472007, 715624050, 817472003, 8..."
68982,fffd870c6324ad3bda24e4d6aeae221c199479086bfdfd...,"[750423010, 761269001]"
