In [1]:
import os

In [2]:
# Backend switch: True selects cuDF, False selects pandas.
use_gpu = True

# Whichever library is selected is bound to the name `pd`, so every later
# `pd.` call in this notebook may be hitting cuDF rather than pandas.
if use_gpu:
    import cudf as pd
else:
    import pandas as pd

In [3]:
# Output directory handed to save_csv by the feature cells shown below.
preprocess_dir = 'storage/output/220314_baseline/'

In [4]:
from utils import save_csv

## Input dataset

In [5]:
%%time
transactions = pd.read_csv('storage/transactions_train.csv')
articles = pd.read_csv('storage/articles.csv')
customers = pd.read_csv('storage/customers.csv')

CPU times: user 1.14 s, sys: 1.31 s, total: 2.45 s
Wall time: 2.49 s


In [6]:
%%time
from utils import train_val_test_split
trn_transactions,val_transactions,test_transactions = train_val_test_split(transactions,gpu=True)
del(transactions)

CPU times: user 27.5 ms, sys: 60.9 ms, total: 88.4 ms
Wall time: 87.5 ms


In [7]:
# Keep only training rows dated strictly after 2020-08-01 (the comparison is
# `>`, so 2020-08-01 itself is excluded); every feature below is built from
# this filtered window.
trn_transactions = trn_transactions[trn_transactions['t_dat'] > pd.to_datetime('2020-08-01')]

## Feature engineering with past purchase history

In [8]:
def past_purchase_count_vector(df, art_df, selected_feature, postfix='_countvec'):
    """Return each customer's purchase share per value of an article attribute.

    Parameters
    ----------
    df : DataFrame
        Purchase rows; must contain ``customer_id`` and ``article_id``.
        The caller's frame is not modified.
    art_df : DataFrame
        Article table; must contain ``article_id`` and ``selected_feature``.
    selected_feature : str
        Name of the article attribute column to aggregate over.
    postfix : str, optional
        Suffix appended to ``selected_feature`` to name the share column.

    Returns
    -------
    DataFrame
        Columns ``customer_id``, ``selected_feature`` and
        ``selected_feature + postfix``. The last column is the number of the
        customer's purchases with that attribute value divided by the
        customer's total number of purchases that matched an article.
    """
    share_col = selected_feature + postfix

    # Attach the attribute to every purchase row (merge defaults to an inner join,
    # so purchases whose article is absent from art_df are dropped here).
    purchases = df.merge(art_df[['article_id', selected_feature]], on='article_id')
    purchases['count'] = 1

    # Denominator: number of purchase rows per customer.
    per_customer = (
        purchases.groupby(['customer_id'])['count']
        .count()
        .reset_index()
        .rename(columns={'count': 'norm'})
    )

    # Numerator: number of purchase rows per (customer, attribute value).
    per_value = (
        purchases.groupby(['customer_id', selected_feature])['count']
        .count()
        .reset_index()
    )

    shares = per_value.merge(per_customer, on='customer_id')
    shares['count'] = shares['count'] / shares['norm']
    shares = shares.rename(columns={'count': share_col})
    return shares[['customer_id', selected_feature, share_col]]

In [9]:
%%time
selected_features = [
    'product_group_name', 'product_type_name', 
    'graphical_appearance_name', 'perceived_colour_value_name', 'colour_group_code', 
    'index_name', 'index_group_name', 
    'section_name', 'department_name',
]
for selected_feature in selected_features:
    count = past_purchase_count_vector(trn_transactions[['customer_id','article_id']],articles,selected_feature)
    save_csv(count,preprocess_dir,selected_feature+'_countvec.csv')

CPU times: user 610 ms, sys: 827 ms, total: 1.44 s
Wall time: 1.66 s


## Feature engineering on repeated purchase

In [10]:
%%time
trn_transactions['count'] = 1
count = trn_transactions[['customer_id','article_id','count']].groupby(['customer_id','article_id'])['count'].count().reset_index()
norm = trn_transactions[['customer_id','article_id']].groupby('customer_id').count().reset_index().rename(columns={'article_id':'norm'})
count = count.merge(norm,on='customer_id')
count['count'] = count['count'] / count['norm']
count.drop(columns=['norm'],inplace=True)

CPU times: user 15 ms, sys: 23 ms, total: 38 ms
Wall time: 49.5 ms


In [11]:
# Convert to pandas only when the frame actually offers to_pandas() (i.e. it is
# a cuDF frame). With the pandas backend `count` has no such method and the
# unconditional call would raise AttributeError.
repeated_purchase = count.to_pandas() if hasattr(count, 'to_pandas') else count
save_csv(repeated_purchase, preprocess_dir, 'repeated_purchase_prob.csv')

In [12]:
# Show the schema and memory footprint of the filtered training frame; it now
# includes the 'count' marker column added by the repeated-purchase cell.
trn_transactions.info()

<class 'cudf.core.dataframe.DataFrame'>
Int64Index: 1497951 entries, 29794821 to 31292771
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype
---  ------            --------------    -----
 0   t_dat             1497951 non-null  datetime64[ns]
 1   customer_id       1497951 non-null  object
 2   article_id        1497951 non-null  int64
 3   price             1497951 non-null  float64
 4   sales_channel_id  1497951 non-null  int64
 5   count             1497951 non-null  int64
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 165.7+ MB
