In [14]:
import os 
import numpy as np
import pandas as pd
from datetime import date, timedelta


import matplotlib.pyplot as plt
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## Load Data

In [16]:
path = '../data/processed'
df = pd.read_pickle(os.path.join(path,'transactions.pkl'))
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,2,663713001,0.0508,2
1,2018-09-20,2,541518023,0.0305,2
2,2018-09-20,7,505221004,0.0152,2
3,2018-09-20,7,685687003,0.0169,2
4,2018-09-20,7,685687004,0.0169,2


## Target Matrix
- cus_id
- week_start
- week_end
- purchased_product_list
- total_price

In [17]:
print('First transaction: ', df.t_dat.min())
print('Last transaction: ',df.t_dat.max())

First transaction:  2018-09-20 00:00:00
Last transaction:  2020-09-22 00:00:00


In [18]:
df['week'] = df['t_dat'].dt.isocalendar().week 
df['year'] = df['t_dat'].dt.year
df['week_start'] = df['t_dat'].dt.to_period('W').apply(lambda r: r.start_time)
df['week_end'] = df['t_dat'].dt.to_period('W').apply(lambda r: r.end_time).dt.normalize()
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,year,week_start,week_end
0,2018-09-20,2,663713001,0.0508,2,38,2018,2018-09-17,2018-09-23
1,2018-09-20,2,541518023,0.0305,2,38,2018,2018-09-17,2018-09-23
2,2018-09-20,7,505221004,0.0152,2,38,2018,2018-09-17,2018-09-23
3,2018-09-20,7,685687003,0.0169,2,38,2018,2018-09-17,2018-09-23
4,2018-09-20,7,685687004,0.0169,2,38,2018,2018-09-17,2018-09-23


### purchased_product_list

In [36]:
weekly_purchased = pd.DataFrame(df.groupby(['customer_id','week','year'])['article_id'].apply(lambda x: list(set(x))))\
                        .reset_index()\
                        .rename(columns={'article_id':'weekly_purchased_products'})
# weekly_purchased['weekly_purchased_products'] = weekly_purchased['weekly_purchased_products'].apply(lambda x: list(set(x)))
weekly_purchased.head()

Unnamed: 0,customer_id,week,year,weekly_purchased_products
0,0,12,2020,"[0841260003, 0890498002, 0887593002, 085941601..."
1,0,18,2019,[0697138006]
2,0,21,2019,[0568601006]
3,0,30,2019,"[0745232001, 0607642008]"
4,0,36,2020,[0568601043]


### total_price

In [37]:
total_price = df.groupby(['customer_id','week','year'])\
                .agg({'article_id':'count','price':sum})\
                .reset_index()\
                .rename(columns={'article_id':'total_articles','price':'total_amount'})
total_price.head()

Unnamed: 0,customer_id,week,year,total_articles,total_amount
0,0,12,2020,5,0.0936
1,0,18,2019,1,0.0102
2,0,21,2019,2,0.1017
3,0,30,2019,2,0.0339
4,0,36,2020,1,0.0508


### Final DataFrame

In [38]:
final_df = total_price.merge(weekly_purchased,on=['customer_id','week','year'],how='left')
final_df.head()

Unnamed: 0,customer_id,week,year,total_articles,total_amount,weekly_purchased_products
0,0,12,2020,5,0.0936,"[0841260003, 0890498002, 0887593002, 085941601..."
1,0,18,2019,1,0.0102,[0697138006]
2,0,21,2019,2,0.1017,[0568601006]
3,0,30,2019,2,0.0339,"[0745232001, 0607642008]"
4,0,36,2020,1,0.0508,[0568601043]


In [39]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8090355 entries, 0 to 8090354
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   customer_id                int64  
 1   week                       int64  
 2   year                       int64  
 3   total_articles             int64  
 4   total_amount               float32
 5   weekly_purchased_products  object 
dtypes: float32(1), int64(4), object(1)
memory usage: 401.2+ MB


In [40]:
final_df.to_pickle(os.path.join(path,'weekly_target.pkl'))