In [1]:
import os 
import numpy as np
import pandas as pd
from datetime import date, timedelta


import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## Load Data

In [2]:
path = '../data/processed'
df = pd.read_pickle(os.path.join(path,'transactions.pkl'))
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,2,663713001,0.0508,2
1,2018-09-20,2,541518023,0.0305,2
2,2018-09-20,7,505221004,0.0152,2
3,2018-09-20,7,685687003,0.0169,2
4,2018-09-20,7,685687004,0.0169,2


## Target Matrix
- cus_id
- week_start
- week_end
- purchased_product_list
- total_price

In [3]:
print('First transaction: ', df.t_dat.min())
print('Last transaction: ',df.t_dat.max())

First transaction:  2018-09-20 00:00:00
Last transaction:  2020-09-22 00:00:00


In [4]:
df['week'] = df['t_dat'].dt.isocalendar().week 
df['year'] = df['t_dat'].dt.year
df['week_start'] = df['t_dat'].dt.to_period('W').apply(lambda r: r.start_time)
df['week_end'] = df['t_dat'].dt.to_period('W').apply(lambda r: r.end_time).dt.normalize()
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,year,week_start,week_end
0,2018-09-20,2,663713001,0.0508,2,38,2018,2018-09-17,2018-09-23
1,2018-09-20,2,541518023,0.0305,2,38,2018,2018-09-17,2018-09-23
2,2018-09-20,7,505221004,0.0152,2,38,2018,2018-09-17,2018-09-23
3,2018-09-20,7,685687003,0.0169,2,38,2018,2018-09-17,2018-09-23
4,2018-09-20,7,685687004,0.0169,2,38,2018,2018-09-17,2018-09-23


### purchased_product_list

In [5]:
weekly_purchased = pd.DataFrame(df.groupby(['customer_id','week','year'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'weekly_purchased_products'})
# weekly_purchased['weekly_purchased_products'] = weekly_purchased['weekly_purchased_products'].apply(lambda x: list(set(x)))
weekly_purchased.head()

Unnamed: 0,customer_id,week,year,weekly_purchased_products
0,0,12,2020,"[795440001, 841260003, 887593002, 859416011, 8..."
1,0,18,2019,[697138006]
2,0,21,2019,[568601006]
3,0,30,2019,"[607642008, 745232001]"
4,0,36,2020,[568601043]


### total_price

In [6]:
total_price = df.groupby(['customer_id','week','year'])\
                .agg({'article_id':'count','price':sum})\
                .reset_index()\
                .rename(columns={'article_id':'total_articles','price':'total_amount'})
total_price.head()

Unnamed: 0,customer_id,week,year,total_articles,total_amount
0,0,12,2020,5,0.0936
1,0,18,2019,1,0.0102
2,0,21,2019,2,0.1017
3,0,30,2019,2,0.0339
4,0,36,2020,1,0.0508


### Final DataFrame

In [7]:
final_df = total_price.merge(weekly_purchased,on=['customer_id','week','year'],how='left')
final_df.head()

Unnamed: 0,customer_id,week,year,total_articles,total_amount,weekly_purchased_products
0,0,12,2020,5,0.0936,"[795440001, 841260003, 887593002, 859416011, 8..."
1,0,18,2019,1,0.0102,[697138006]
2,0,21,2019,2,0.1017,[568601006]
3,0,30,2019,2,0.0339,"[607642008, 745232001]"
4,0,36,2020,1,0.0508,[568601043]


In [8]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8090355 entries, 0 to 8090354
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   customer_id                int64  
 1   week                       UInt32 
 2   year                       int64  
 3   total_articles             int64  
 4   total_amount               float32
 5   weekly_purchased_products  object 
dtypes: UInt32(1), float32(1), int64(3), object(1)
memory usage: 378.1+ MB


In [9]:
final_df.to_pickle(os.path.join(path,'weekly_target.pkl'))

## Sampling Data
- Exclude cold-start customers and one-transaction customers
- Exclude customers with outlier transaction


In [10]:
path = '../data/processed'
trans = pd.read_pickle(os.path.join(path,'transactions.pkl'))
customers = pd.read_pickle(os.path.join(path,'customers.pkl'))
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,1,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,2,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,3,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,4,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [11]:
temp = trans.groupby('customer_id').agg({'t_dat':'nunique','article_id':'count'}).reset_index()
temp.describe(percentiles=[0.1,0.25,0.5,0.75,0.9,0.99])

Unnamed: 0,customer_id,t_dat,article_id
count,1362281.0,1362281.0,1362281.0
mean,685950.8047,6.6654,23.3346
std,396054.1843,9.707,39.2423
min,0.0,1.0,1.0
10%,137164.0,1.0,2.0
25%,342966.0,1.0,3.0
50%,685902.0,3.0,9.0
75%,1028907.0,8.0,27.0
90%,1234778.0,17.0,60.0
99%,1358255.2,45.0,187.0


In [12]:
exclude_list = temp['customer_id'][(temp.t_dat<=1)|(temp.article_id>180)]
print('Total customers: ',len(temp))
print('Total excluded customers: ',len(exclude_list))
print('Exclusion ratio: {:.2%}'.format(len(exclude_list)/len(temp)))

print('Remained customers: {}'.format(len(temp)-len(exclude_list)))

Total customers:  1362281
Total excluded customers:  462910
Exclusion ratio: 33.98%
Remained customers: 899371


### Random Sampling

In [13]:
from random import sample
random_list = temp['customer_id'][-temp.customer_id.isin(exclude_list)].sample(100000)
# random_list = sample(eligible_list,100000)
temp[temp.customer_id.isin(random_list)].describe()

Unnamed: 0,customer_id,t_dat,article_id
count,100000.0,100000.0,100000.0
mean,686645.6166,8.8366,29.4255
std,396667.9976,8.8812,31.2517
min,0.0,2.0,2.0
25%,341597.5,3.0,8.0
50%,686247.0,6.0,18.0
75%,1030989.5,11.0,39.0
max,1371970.0,108.0,180.0


In [14]:
print('First t_dat: ', trans[trans.customer_id.isin(random_list)]['t_dat'].min())
print('Last t_dat: ', trans[trans.customer_id.isin(random_list)]['t_dat'].max())
print('Total transactions: ', len(trans[trans.customer_id.isin(random_list)]))
target_cus = trans['customer_id'][(trans.customer_id.isin(random_list))&(trans.t_dat >= '2020-09-15')].unique()
print('Total target customers: {} ({:.2%})'.format(len(target_cus),len(target_cus)/len(random_list)))

First t_dat:  2018-09-20 00:00:00
Last t_dat:  2020-09-22 00:00:00
Total transactions:  2942549
Total target customers: 7208 (7.21%)


### Save Sampling

In [15]:
output = pd.DataFrame(target_cus,columns=['customer_id'])
path = '../data'
output.to_csv(os.path.join(path,'random_customer.csv'),index=False)