In [1]:
import cudf
import gc
import lightgbm as lgb
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [64]:
%%time 
transactions = cudf.read_csv('storage/transactions_train.csv')
articles = cudf.read_csv('storage/articles.csv')
customers = cudf.read_csv('storage/customers.csv')

CPU times: user 627 ms, sys: 1.26 s, total: 1.89 s
Wall time: 4.33 s


In [65]:
customers['FN'].fillna(0.,inplace=True)
customers['Active'].fillna(0.,inplace=True)
customers['club_member_status'].fillna('None',inplace=True)
customers['age'] = customers['age'] / 10
customers['age'] = customers['age'].astype(int)
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].str.lower().fillna('none')

In [66]:
transactions['t_dat'] = cudf.to_datetime(transactions['t_dat'])

In [67]:
customers = customers.merge(
    transactions.groupby('customer_id')['sales_channel_id'].mean().to_frame().reset_index().rename(columns={'sales_channel_id':'sales_channel_habit'}),
    on='customer_id',
    how='left',
)
customers['sales_channel_habit'] = (customers['sales_channel_habit'] > 1.5).astype(int)
customers['sales_channel_habit'].fillna(0.,inplace=True)

In [80]:
def past_purchase_feature(df,transactions):
    transactions['count'] = 1
    
    time_elapsed_last_purchase = transactions['t_dat'].max()-transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].max()
    time_elapsed_last_purchase = time_elapsed_last_purchase.dt.days
    df = df.merge(time_elapsed_last_purchase,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_last_purchase'})
    df['time_elapsed_last_purchase'].fillna(1e6,inplace=True)
    
    time_elapsed_first_release = transactions[['customer_id','article_id','t_dat']].groupby(['customer_id','article_id'])['t_dat'].min()-cudf.to_datetime('2018-09-01')
    time_elapsed_first_release = time_elapsed_first_release.dt.days
    df = df.merge(time_elapsed_first_release,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'t_dat':'time_elapsed_first_release'})
    df['time_elapsed_first_release'].fillna(1e6,inplace=True)
    
    past_purchase_prob = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index()
    norm = transactions[['customer_id','article_id']].groupby('customer_id').count().reset_index().rename(columns={'article_id':'norm'})
    past_purchase_prob = past_purchase_prob.merge(norm,on='customer_id')
    past_purchase_prob['count'] = past_purchase_prob['count'] / past_purchase_prob['norm']
    past_purchase_prob.drop(columns=['norm'],inplace=True)
    df = df.merge(past_purchase_prob,on=['article_id','customer_id'],how='left')
    df = df.rename(columns={'count':'past_purchase_prob'})
    df['past_purchase_prob'].fillna(0.,inplace=True)
    
    total_purchase = transactions[['article_id','count']].groupby('article_id')['count'].count().reset_index().rename(columns={'count':'total_purchase'})
    norm = transactions['count'].sum()
    total_purchase['total_purchase'] = total_purchase['total_purchase'] / norm
    df = df.merge(total_purchase,on='article_id',how='left')
    df['total_purchase'].fillna(0.,inplace=True)
    
    number_of_purchase = transactions[['customer_id','count']].groupby('customer_id')['count'].count().reset_index().rename(columns={'count':'number_of_purchase'})
    df = df.merge(number_of_purchase,on='customer_id',how='left')
    df['number_of_purchase'].fillna(0.,inplace=True)
    
    repeated_purchase = transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index().rename(columns={'count':'repeated_purchase'})
    df = df.merge(repeated_purchase,on=['customer_id','article_id'],how='left')
    
    min_dat_purchase = transactions.groupby(['article_id'])['t_dat'].min()
    max_dat_purchase = transactions.groupby(['article_id'])['t_dat'].max()
    sale_duration = (max_dat_purchase - min_dat_purchase).to_frame().reset_index().rename(columns={'t_dat':'duration'})
    sale_duration['duration'] = sale_duration['duration'].dt.days
    sale_count = transactions.groupby(['article_id'])['t_dat'].count().to_frame().reset_index().rename(columns={'t_dat':'count'})
    sale_rate = sale_duration.merge(sale_count,on='article_id')
    sale_rate = sale_rate.loc[sale_rate['duration']!=0]
    sale_rate['sale_rate'] = sale_rate['count'] / sale_rate['duration']
    df = df.merge(sale_rate[['article_id','sale_rate']],on='article_id',how='left')

    return df
    
def article_feature_prob_vector(df,transactions,articles,article_features,postfix='_prob',customer_group_name='customer_id'):
    transactions['count'] = 1
    if customer_group_name != 'customer_id':
        df = df.merge(customers[['customer_id',customer_group_name]],on='customer_id',how='left')
        transactions = transactions.merge(customers[['customer_id',customer_group_name]],on='customer_id',how='left')
    for article_feature in article_features:
        transactions = transactions.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        norm = transactions.groupby([customer_group_name])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby([customer_group_name,article_feature])['count'].count().reset_index()
        count = count.merge(norm,on=customer_group_name)
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':article_feature+postfix})
        count = count[[customer_group_name,article_feature,article_feature+postfix]]
        del(norm)
        df = df.merge(articles[['article_id',article_feature]],on='article_id',how='left')
        df = df.merge(count,on=[customer_group_name,article_feature],how='left')
    return df

def customer_feature_prob_vector(df,transactions,customers,customer_features,postfix='_prob'):
    transactions['count'] = 1
    for customer_feature in customer_features:
        transactions = transactions.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        norm = transactions.groupby(['article_id'])['count'].count().reset_index()
        norm.rename(columns={'count':'norm'},inplace=True)
        count = transactions.groupby(['article_id',customer_feature])['count'].count().reset_index()
        count = count.merge(norm,on='article_id')
        count['count'] = count['count'] / count['norm']
        count = count.rename(columns={'count':customer_feature+postfix})
        count = count[['article_id',customer_feature,customer_feature+postfix]]
        del(norm)
        df = df.merge(customers[['customer_id',customer_feature]],on='customer_id',how='left')
        df = df.merge(count,on=['article_id',customer_feature],how='left')
    return df

def construct_feature_df(
        df,transactions,
        article_features,
        articles,
        customer_features,
        customers,
        general_features=['article_id','customer_id'],
    ):
    df = article_feature_prob_vector(df,transactions,articles,article_features)
    df = customer_feature_prob_vector(df,transactions,customers,customer_features)
    df = past_purchase_feature(df,transactions)
    df = df[
            general_features+[f for f in df.columns if '_prob' in f] + 
            ['total_purchase','time_elapsed_last_purchase','past_purchase_prob','number_of_purchase','time_elapsed_first_release','repeated_purchase']
        ]
    return df

def construct_candidate_dict(transactions_3w):
    purchase_dict_3w = {}
    for i,x in enumerate(zip(transactions_3w['customer_id'], transactions_3w['article_id'])):
        cust_id, art_id = x
        if cust_id not in purchase_dict_3w:
            purchase_dict_3w[cust_id] = {}
        if art_id not in purchase_dict_3w[cust_id]:
            purchase_dict_3w[cust_id][art_id] = 0
        purchase_dict_3w[cust_id][art_id] += 1
    return purchase_dict_3w

def construct_candidate_df(
        test_df,transactions,
        nweek=8,
        n_popular_item=90,
        n_total_item=None,
    ):
    
    recent_transactions = {}
    purchase_dict = {}
    for i in range(1,nweek+1):
        recent_transactions[i] = transactions[(transactions.t_dat>transactions.t_dat.max()-i*pd.Timedelta(7,unit='day'))&(transactions.t_dat<=transactions.t_dat.max()-(i-1)*pd.Timedelta(7,unit='day'))].to_pandas()
        purchase_dict[i] = construct_candidate_dict(recent_transactions[i])
    
    if 1 in recent_transactions:
        most_popular_items_1w_all = list((recent_transactions[1]['article_id'].value_counts()).index)[:n_popular_item]
    else:
        most_popular_items_1w_all = list(transactions['article_id'].value_counts().index.to_arrow().to_pylist())[:n_popular_item]
    
    pred_df = pd.DataFrame()
    pred_df['customer_id'] = test_df['customer_id'].unique()
    
    prediction_list = []
    
    for i, cust_id in enumerate(pred_df['customer_id']):
        s = []
        total_purchase_dict = {}
        
        for i,purchase_dict_week in purchase_dict.items():
            if cust_id in purchase_dict_week:
                l = sorted((purchase_dict_week[cust_id]).items(), key=lambda x: x[1], reverse=True)
                l = [y[0] for y in l]
                for aid in l:
                    if aid not in total_purchase_dict:
                        total_purchase_dict[aid] = 1
                    else:
                        total_purchase_dict[aid] += 1

        for aid in most_popular_items_1w_all[:n_popular_item]:
            if aid not in total_purchase_dict:
                total_purchase_dict[aid] = 1
            else:
                total_purchase_dict[aid] += 1

        if n_total_item is not None:
            total_purchase_dict = {k: v for k, v in sorted(total_purchase_dict.items(), key=lambda item: item[1], reverse=True)}
            s = list(total_purchase_dict.keys())[:n_total_item]
        else:
            s = list(total_purchase_dict.keys())
        
        prediction_list.append(s)
        
    pred_df['article_id'] = prediction_list
    
    return pred_df

def construct_test_df(test_df,transactions,article_features,articles,customer_features,customers,how='outer',n_popular_item=90):
    test_df = construct_candidate_df(test_df.to_pandas(),transactions,n_popular_item=n_popular_item).explode(['article_id']).reset_index(drop=True)
    test_df = cudf.from_pandas(test_df)
    test_df = construct_feature_df(test_df,transactions,article_features,articles,customer_features,customers,general_features=['article_id','customer_id'])
    test_df = test_df.fillna(0.)
    test_df['article_id'] = test_df['article_id'].astype(int)
    test_df = test_df.sort_values(['customer_id','article_id']).reset_index(drop=True)
    return test_df

def construct_gt_df(test_transactions):
    gt_df = test_transactions.to_pandas().groupby('customer_id')['article_id'].agg(lambda x: x.tolist()).reset_index()
    gt_df.columns = ['customer_id','ground_truth']
    return gt_df
    
def construct_dataset(
        transactions,
        articles,customers,
        trn_start_time='2020-08-31',trn_end_time='2020-09-08',
        val_start_time='2020-09-08',val_end_time='2020-09-15',
        test_start_time='2020-09-08',test_end_time='2020-09-15',
        article_features=[
            'product_type_name','product_group_name',
            'graphical_appearance_name','colour_group_name',
            'perceived_colour_value_name','perceived_colour_master_name',
            'department_name', 'index_name',
            'index_group_name','section_name',
            'garment_group_name',
        ],
        customer_features=[
            'FN','Active','club_member_status','age','fashion_news_frequency',#'sales_channel_habit',
        ],
    ):
    
    trn_start_time = cudf.to_datetime(trn_start_time)
    trn_end_time = cudf.to_datetime(trn_end_time)
    val_start_time = cudf.to_datetime(val_start_time)
    val_end_time = cudf.to_datetime(val_end_time)
    test_start_time = cudf.to_datetime(test_start_time)
    test_end_time = cudf.to_datetime(test_end_time)
    
    trn_transactions = transactions[(transactions.t_dat > trn_start_time) & (transactions.t_dat <= trn_end_time)]
    val_transactions = transactions[(transactions.t_dat > val_start_time) & (transactions.t_dat <= val_end_time)]
    test_transactions = transactions[(transactions.t_dat > test_start_time) & (transactions.t_dat <= test_end_time)]
    gt_df = construct_gt_df(test_transactions)
    
    trn_df = construct_test_df(val_transactions,trn_transactions,article_features,articles,customer_features,customers,how='left',n_popular_item=80)
    pos_label = val_transactions[['article_id','customer_id']].groupby(['article_id','customer_id']).size().to_frame('label')
    pos_label['label'] = pos_label['label'].apply(lambda x: x if x <=20. else 20.)
    trn_df = trn_df.merge(pos_label,on=['article_id','customer_id'],how='left')
    trn_df['label'].fillna(0.,inplace=True)
    
    trn_df = trn_df.merge(trn_df.groupby('customer_id').size().to_frame().rename(columns={0:'group_size'}),on='customer_id')
    test_df = construct_test_df(test_transactions,val_transactions,article_features,articles,customer_features,customers,how='left',n_popular_item=80)
    
    return trn_df.reset_index(drop=True),test_df.reset_index(drop=True),gt_df.reset_index(drop=True)

In [84]:
%%time
label = 'label'
tag = ''
#t1,t2,t3,t4 = '2020-06-01','2020-08-01','2020-09-15','2020-09-22'
#t1,t2,t3,t4 = '2020-05-24','2020-07-24','2020-09-07','2020-09-15'
t1,t2,t3,t4 = '2020-05-17','2020-07-17','2020-09-01','2020-09-07'

base_dir = 'storage/output/220325_dataset_'+'_'.join([t1,t2,t3,t4])+tag+'/'

CPU times: user 6 µs, sys: 5 µs, total: 11 µs
Wall time: 12.9 µs


In [85]:
%%time
trn_df,test_df,gt_df = construct_dataset(
    transactions,
    articles,customers,
    trn_start_time=t1,trn_end_time=t2,
    val_start_time=t2,val_end_time=t3,
    test_start_time=t3,test_end_time=t4,
)
os.makedirs(base_dir,exist_ok=True)
trn_df.to_csv(os.path.join(base_dir,'trn_df.csv'),index=False,chunksize=5e5)
test_df.to_csv(os.path.join(base_dir,'test_df.csv'),index=False)
gt_df.to_csv(os.path.join(base_dir,'gt_df.csv'),index=False)

CPU times: user 21.7 s, sys: 21.5 s, total: 43.1 s
Wall time: 45.8 s


In [86]:
print(base_dir)

storage/output/220325_dataset_2020-05-17_2020-07-17_2020-09-01_2020-09-07/


In [79]:
trn_df

Unnamed: 0,article_id,customer_id,product_type_name_prob,product_group_name_prob,graphical_appearance_name_prob,colour_group_name_prob,perceived_colour_value_name_prob,perceived_colour_master_name_prob,department_name_prob,index_name_prob,...,fashion_news_frequency_prob,sales_channel_habit_prob,past_purchase_prob,total_purchase,time_elapsed_last_purchase,number_of_purchase,time_elapsed_first_release,repeated_purchase,label,group_size
0,866383006,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.000000,0.000000,0.900000,0.500000,0.500000,0.500000,0.000000,0.000000,...,0.592662,0.893384,0.000000,0.001024,1000000,10,1000000,0,0.0,85
1,866731003,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.000000,0.200000,0.900000,0.100000,0.200000,0.200000,0.000000,0.000000,...,0.528937,0.825034,0.000000,0.000239,1000000,10,1000000,0,0.0,85
2,875350001,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.000000,0.500000,0.900000,0.500000,0.500000,0.500000,0.000000,0.000000,...,0.565156,0.923513,0.000000,0.000227,1000000,10,1000000,0,0.0,85
3,879248001,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.200000,0.200000,0.900000,0.500000,0.500000,0.500000,0.000000,0.400000,...,0.596634,0.767266,0.000000,0.000553,1000000,10,1000000,0,0.0,85
4,889379006,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,0.200000,0.300000,0.900000,0.100000,0.100000,0.200000,0.100000,0.400000,...,0.483539,0.775720,0.100000,0.000312,24,10,676,1,0.0,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27668813,897901002,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0.029412,0.147059,0.500000,0.117647,0.147059,0.058824,0.029412,0.088235,...,0.534591,0.753145,0.029412,0.000204,43,34,657,1,0.0,112
27668814,905803002,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0.176471,0.411765,0.117647,0.029412,0.382353,0.205882,0.029412,0.088235,...,0.471519,0.569620,0.000000,0.000101,1000000,34,1000000,0,0.0,112
27668815,906293002,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0.117647,0.147059,0.500000,0.235294,0.264706,0.176471,0.029412,0.617647,...,0.503623,0.923913,0.000000,0.000089,1000000,34,1000000,0,0.0,112
27668816,883015001,fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde...,0.117647,0.147059,0.088235,0.117647,0.382353,0.147059,0.058824,0.617647,...,0.535032,0.917197,0.029412,0.000050,43,34,657,1,0.0,112
