In [1]:
import pandas as pd
import numpy as np
import seaborn as sns; sns.set(rc={'figure.figsize':(16,9)})


raw_users = pd.read_json('data/users.jsonl', lines=True)
raw_products = pd.read_json('data/products.jsonl', lines=True)
raw_sessions = pd.read_json('data/sessions.jsonl', lines=True)
raw_deliveries = pd.read_json('data/deliveries.jsonl', lines=True)

In [2]:
raw_users.dtypes

user_id     int64
name       object
city       object
street     object
dtype: object

In [3]:
raw_products.dtypes

product_id         int64
product_name      object
category_path     object
price            float64
dtype: object

In [4]:
raw_deliveries.dtypes

purchase_id             int64
purchase_timestamp     object
delivery_timestamp     object
delivery_company      float64
dtype: object

In [5]:
raw_sessions.dtypes

session_id                   int64
timestamp           datetime64[ns]
user_id                    float64
product_id                 float64
event_type                  object
offered_discount             int64
purchase_id                float64
dtype: object

### Clean user data

In [6]:
users = raw_users.astype({
    'user_id': 'float64'
})
users.set_index('user_id')
users.dtypes

user_id    float64
name        object
city        object
street      object
dtype: object

In [7]:
users.head()

Unnamed: 0,user_id,name,city,street
0,102.0,Juliusz Męcik,Warszawa,plac Krakowska 69
1,103.0,Eliza Miętka,Radom,ul. Bursztynowa 56/10
2,104.0,Kacper Kolarz,Warszawa,plac Wiślana 99
3,105.0,Tobiasz Radko,Gdynia,ulica Zwycięstwa 76/98
4,106.0,Roksana Merchel,Kutno,plac Głogowa 53/95


### Clean product data

In [8]:
products = raw_products.astype({
    'product_id': 'float64', 
    'category_path': 'unicode',
})
products.set_index('product_id')
products.dtypes

product_id       float64
product_name      object
category_path     object
price            float64
dtype: object

In [9]:
# Filter invalid prices: out of range (0.01, 100 000)
products = products[(products['price']>0) & (products['price']<100000)]
print('Removed {:.1f}% of rows (before {}, after {}).'.format(
    100-100*len(products)/len(raw_products), len(products), len(raw_products)))

Removed 9.4% of rows (before 289, after 319).


In [10]:
products['categories'] = products.category_path.apply(lambda s: s.split(';'))
products.head()

Unnamed: 0,product_id,product_name,category_path,price,categories
0,1001.0,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,58.97,"[Telefony i akcesoria, Telefony stacjonarne]"
1,1002.0,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,2048.5,"[Komputery, Drukarki i skanery, Biurowe urządz..."
2,1003.0,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7639.0,"[Komputery, Drukarki i skanery, Biurowe urządz..."
3,1004.0,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99,"[Gry i konsole, Gry na konsole, Gry Xbox 360]"
4,1005.0,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99,"[Gry i konsole, Gry na konsole, Gry Xbox 360]"


### Clean delivery data

In [11]:
deliveries = raw_deliveries.astype({
    'purchase_id': 'float64', 
    'delivery_timestamp': 'datetime64',
    'purchase_timestamp': 'datetime64'
})
deliveries.set_index('purchase_id')

deliveries.dtypes

purchase_id                  float64
purchase_timestamp    datetime64[ns]
delivery_timestamp    datetime64[ns]
delivery_company             float64
dtype: object

In [12]:
# Czy brak delivery_timestamp to błąd systemu czy tzn. że klient nie otrzymał zamówionej przesyłki?

In [13]:
deliveries['delivery_time'] = deliveries.apply(lambda row: row['delivery_timestamp'] - row['purchase_timestamp'], axis=1)
deliveries.head()

Unnamed: 0,purchase_id,purchase_timestamp,delivery_timestamp,delivery_company,delivery_time
0,20001.0,2020-01-13 00:29:10,2020-01-17 00:31:53,516.0,4 days 00:02:43
1,20002.0,2020-02-04 19:09:12,2020-02-06 19:13:06,516.0,2 days 00:03:54
2,20003.0,2020-01-16 14:40:39,2020-01-18 14:41:27,620.0,2 days 00:00:48
3,20004.0,2020-02-05 08:08:59,NaT,360.0,NaT
4,20005.0,2020-02-18 09:27:06,2020-02-21 09:31:35,620.0,3 days 00:04:29


### Clean session data

In [14]:
sessions = raw_sessions.astype({
    'session_id': 'float64'
})
sessions.dtypes

session_id                 float64
timestamp           datetime64[ns]
user_id                    float64
product_id                 float64
event_type                  object
offered_discount             int64
purchase_id                float64
dtype: object

In [15]:
# filter sessions with invalid product_id
sessions = sessions[sessions['product_id'].isin(products['product_id'].unique())]
# user_id is not required but needs to be valid if exists
sessions = sessions[sessions['user_id'].isnull() | (sessions['user_id'].isin(users['user_id'].unique()))]

print('Removed {:.1f}% of rows (before {}, after {}).'.format(
    100-100*len(sessions)/len(raw_sessions), len(sessions), len(raw_sessions)))


Removed 10.8% of rows (before 31023, after 34773).


In [16]:
sessions['made_purchase'] = sessions.purchase_id.map(lambda x: False if pd.isnull(x) else True)
sessions.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,made_purchase
0,100001.0,2020-01-17 16:08:57,102.0,1001.0,VIEW_PRODUCT,0,,False
1,100002.0,2020-01-07 05:10:41,102.0,1277.0,VIEW_PRODUCT,20,,False
2,100002.0,2020-01-07 05:14:26,102.0,1276.0,VIEW_PRODUCT,20,,False
3,100003.0,2020-01-16 09:37:42,102.0,1276.0,VIEW_PRODUCT,0,,False
4,100003.0,2020-01-16 09:38:22,102.0,1277.0,VIEW_PRODUCT,0,,False


## Merge datasets

In [17]:
data = sessions.merge(products, how='left', on='product_id')
data = data.merge(users, how='left', on='user_id')
data = data.merge(deliveries, how='left', on='purchase_id')
data = data.drop(['category_path'], axis=1)
data.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,made_purchase,product_name,price,categories,name,city,street,purchase_timestamp,delivery_timestamp,delivery_company,delivery_time
0,100001.0,2020-01-17 16:08:57,102.0,1001.0,VIEW_PRODUCT,0,,False,Telefon Siemens Gigaset DA310,58.97,"[Telefony i akcesoria, Telefony stacjonarne]",Juliusz Męcik,Warszawa,plac Krakowska 69,NaT,NaT,,NaT
1,100002.0,2020-01-07 05:10:41,102.0,1277.0,VIEW_PRODUCT,20,,False,Apple iPad mini 64GB 4G,2317.02,"[Komputery, Tablety i akcesoria, Tablety]",Juliusz Męcik,Warszawa,plac Krakowska 69,NaT,NaT,,NaT
2,100002.0,2020-01-07 05:14:26,102.0,1276.0,VIEW_PRODUCT,20,,False,Apple iPad mini 64GB,1816.97,"[Komputery, Tablety i akcesoria, Tablety]",Juliusz Męcik,Warszawa,plac Krakowska 69,NaT,NaT,,NaT
3,100003.0,2020-01-16 09:37:42,102.0,1276.0,VIEW_PRODUCT,0,,False,Apple iPad mini 64GB,1816.97,"[Komputery, Tablety i akcesoria, Tablety]",Juliusz Męcik,Warszawa,plac Krakowska 69,NaT,NaT,,NaT
4,100003.0,2020-01-16 09:38:22,102.0,1277.0,VIEW_PRODUCT,0,,False,Apple iPad mini 64GB 4G,2317.02,"[Komputery, Tablety i akcesoria, Tablety]",Juliusz Męcik,Warszawa,plac Krakowska 69,NaT,NaT,,NaT


In [18]:
data.to_json('data/merged.jsonl', orient='records', lines=True)



docelowo input: session_id zawierający: listę eventów, dane usera i jego zakupy historyczne jeśli są dostępne, dane produktu