In [1]:
import pandas as pd
import numpy as np

def _read_jsonl(path):
    """Read a JSON-lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# Raw frames exactly as loaded from disk; the cleaning cells derive new
# frames from these rather than modifying them.
raw_users = _read_jsonl('data/users.jsonl')
raw_sessions = _read_jsonl('data/sessions.jsonl')
raw_products = _read_jsonl('data/products.jsonl')
raw_deliveries = _read_jsonl('data/deliveries.jsonl')

In [2]:
# Inspect the column dtypes pandas inferred for the users file.
raw_users.dtypes

city       object
name       object
street     object
user_id     int64
dtype: object

In [3]:
# Inspect the inferred dtypes for the session log. In the saved output,
# product_id, purchase_id and user_id are float64 rather than int64
# (presumably because of missing values — purchase_id is NaN on the
# non-purchase rows previewed later; verify for the other two).
raw_sessions.dtypes

event_type                  object
offered_discount             int64
product_id                 float64
purchase_id                float64
session_id                   int64
timestamp           datetime64[ns]
user_id                    float64
dtype: object

In [4]:
# Inspect the column dtypes pandas inferred for the product catalogue.
raw_products.dtypes

category_path     object
price            float64
product_id         int64
product_name      object
dtype: object

In [5]:
# Inspect the inferred dtypes for deliveries. In the saved output both
# timestamp columns are `object` (not datetime) and delivery_company is float64.
raw_deliveries.dtypes

delivery_company      float64
delivery_timestamp     object
purchase_id             int64
purchase_timestamp     object
dtype: object

### Clean session data

In [6]:
# Working copy of the session log. Only session_id changes dtype
# (int64 -> float64); every other column keeps the dtype pandas inferred.
sessions = raw_sessions.assign(
    session_id=raw_sessions['session_id'].astype('float64')
)
sessions.dtypes

event_type                  object
offered_discount             int64
product_id                 float64
purchase_id                float64
session_id                 float64
timestamp           datetime64[ns]
user_id                    float64
dtype: object

In [7]:
# Flag each event row by whether it carries a purchase_id:
# True when purchase_id is present, False when it is NaN/None.
# `notna()` is the vectorized equivalent of mapping
# `False if pd.isnull(x) else True` over every element.
sessions['made_purchase'] = sessions['purchase_id'].notna()
sessions.head()

Unnamed: 0,event_type,offered_discount,product_id,purchase_id,session_id,timestamp,user_id,made_purchase
0,VIEW_PRODUCT,0,1001.0,,100001.0,2020-01-17 16:08:57,102.0,False
1,VIEW_PRODUCT,20,1277.0,,100002.0,2020-01-07 05:10:41,102.0,False
2,VIEW_PRODUCT,20,1276.0,,100002.0,2020-01-07 05:14:26,102.0,False
3,VIEW_PRODUCT,0,1276.0,,100003.0,2020-01-16 09:37:42,102.0,False
4,VIEW_PRODUCT,0,1277.0,,100003.0,2020-01-16 09:38:22,102.0,False


### Clean product data

In [8]:
# Working copy of the product catalogue: product_id is cast to float64 (the
# dtype of sessions.product_id, which it is merged against) and
# category_path is forced to strings so it can be split on ';'.
products = raw_products.astype({
    'product_id': 'float64',
    'category_path': 'unicode',
})
# product_id stays a regular column on purpose. The former
# `products.set_index('product_id')` call discarded its return value, so it
# never modified `products`; the merge cell joins on the column instead.
products.dtypes

category_path     object
price            float64
product_id       float64
product_name      object
dtype: object

In [9]:
# Split the ';'-separated category path into a list of category names,
# ordered as they appear in the path. The `.str` accessor is the
# vectorized form of `apply(lambda s: s.split(';'))` and propagates
# missing values instead of raising on them.
products['categories'] = products['category_path'].str.split(';')
products.head()

Unnamed: 0,category_path,price,product_id,product_name,categories
0,Telefony i akcesoria;Telefony stacjonarne,58.97,1001.0,Telefon Siemens Gigaset DA310,"[Telefony i akcesoria, Telefony stacjonarne]"
1,Komputery;Drukarki i skanery;Biurowe urządzeni...,2048.5,1002.0,Kyocera FS-1135MFP,"[Komputery, Drukarki i skanery, Biurowe urządz..."
2,Komputery;Drukarki i skanery;Biurowe urządzeni...,7639.0,1003.0,Kyocera FS-3640MFP,"[Komputery, Drukarki i skanery, Biurowe urządz..."
3,Gry i konsole;Gry na konsole;Gry Xbox 360,49.99,1004.0,Fallout 3 (Xbox 360),"[Gry i konsole, Gry na konsole, Gry Xbox 360]"
4,Gry i konsole;Gry na konsole;Gry Xbox 360,49.99,1005.0,Szalone Króliki Na żywo i w kolorze (Xbox 360),"[Gry i konsole, Gry na konsole, Gry Xbox 360]"


### Clean user data

In [10]:
# Working copy of the users table with user_id cast to float64, the dtype of
# sessions.user_id, which it is merged against.
users = raw_users.astype({'user_id': 'float64'})
# user_id stays a regular column on purpose. The former
# `users.set_index('user_id')` call discarded its return value, so it never
# modified `users`; the merge cell joins on the column instead.
users.dtypes

city        object
name        object
street      object
user_id    float64
dtype: object

In [11]:
# Preview the cleaned users frame.
# NOTE(review): the saved output shows names and street addresses — confirm the
# dataset is synthetic (or clear the output) before sharing this notebook.
users.head()

Unnamed: 0,city,name,street,user_id
0,Warszawa,Juliusz Męcik,plac Krakowska 69,102.0
1,Radom,Eliza Miętka,ul. Bursztynowa 56/10,103.0
2,Warszawa,Kacper Kolarz,plac Wiślana 99,104.0
3,Gdynia,Tobiasz Radko,ulica Zwycięstwa 76/98,105.0
4,Kutno,Roksana Merchel,plac Głogowa 53/95,106.0


### Clean delivery data

In [12]:
# Working copy of the deliveries table:
#   * purchase_id -> float64, the dtype of sessions.purchase_id (merge key);
#   * both timestamp columns -> datetime64[ns].
# The unit is spelled out because pandas >= 2.0 rejects the unit-less
# 'datetime64' in astype() with a TypeError.
deliveries = raw_deliveries.astype({
    'purchase_id': 'float64',
    'delivery_timestamp': 'datetime64[ns]',
    'purchase_timestamp': 'datetime64[ns]',
})
# purchase_id stays a regular column on purpose. The former
# `deliveries.set_index('purchase_id')` call discarded its return value, so it
# never modified `deliveries`; the merge cell joins on the column instead.
deliveries.dtypes

delivery_company             float64
delivery_timestamp    datetime64[ns]
purchase_id                  float64
purchase_timestamp    datetime64[ns]
dtype: object

In [13]:
# NOTE(review): this previews `raw_deliveries`, not the cleaned `deliveries`
# frame built in the previous cell, so the output shows the unconverted
# timestamps and the integer purchase_id. The other cleaning sections preview
# their cleaned frame — confirm whether `deliveries.head()` was intended.
raw_deliveries.head()

Unnamed: 0,delivery_company,delivery_timestamp,purchase_id,purchase_timestamp
0,516.0,2020-01-17T00:31:53,20001,2020-01-13T00:29:10
1,516.0,2020-02-06T19:13:06,20002,2020-02-04T19:09:12
2,620.0,2020-01-18T14:41:27,20003,2020-01-16T14:40:39
3,360.0,,20004,2020-02-05T08:08:59
4,620.0,2020-02-21T09:31:35,20005,2020-02-18T09:27:06


### Merge datasets

In [14]:
# Enrich every session event with product, user and delivery attributes.
# All three joins are left joins, so events without a matching key are kept
# and simply get missing values in the joined columns.
data = (
    sessions
    .merge(products, on='product_id', how='left')
    .merge(users, on='user_id', how='left')
    .merge(deliveries, on='purchase_id', how='left')
    .drop(columns=['category_path'])  # the parsed `categories` list is kept instead
)
data.head()

Unnamed: 0,event_type,offered_discount,product_id,purchase_id,session_id,timestamp,user_id,made_purchase,price,product_name,categories,city,name,street,delivery_company,delivery_timestamp,purchase_timestamp
0,VIEW_PRODUCT,0,1001.0,,100001.0,2020-01-17 16:08:57,102.0,False,58.97,Telefon Siemens Gigaset DA310,"[Telefony i akcesoria, Telefony stacjonarne]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT
1,VIEW_PRODUCT,20,1277.0,,100002.0,2020-01-07 05:10:41,102.0,False,2317.02,Apple iPad mini 64GB 4G,"[Komputery, Tablety i akcesoria, Tablety]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT
2,VIEW_PRODUCT,20,1276.0,,100002.0,2020-01-07 05:14:26,102.0,False,1816.97,Apple iPad mini 64GB,"[Komputery, Tablety i akcesoria, Tablety]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT
3,VIEW_PRODUCT,0,1276.0,,100003.0,2020-01-16 09:37:42,102.0,False,1816.97,Apple iPad mini 64GB,"[Komputery, Tablety i akcesoria, Tablety]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT
4,VIEW_PRODUCT,0,1277.0,,100003.0,2020-01-16 09:38:22,102.0,False,2317.02,Apple iPad mini 64GB 4G,"[Komputery, Tablety i akcesoria, Tablety]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT


In [15]:
# Spot-check the merged rows that belong to one session.
data.loc[data['session_id'].eq(100001.0)]

Unnamed: 0,event_type,offered_discount,product_id,purchase_id,session_id,timestamp,user_id,made_purchase,price,product_name,categories,city,name,street,delivery_company,delivery_timestamp,purchase_timestamp
0,VIEW_PRODUCT,0,1001.0,,100001.0,2020-01-17 16:08:57,102.0,False,58.97,Telefon Siemens Gigaset DA310,"[Telefony i akcesoria, Telefony stacjonarne]",Warszawa,Juliusz Męcik,plac Krakowska 69,,NaT,NaT
