In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def loadData(file_name):
    """Read a JSON Lines file from the raw-data directory into a DataFrame.

    Parameters
    ----------
    file_name : str
        File name (with extension) relative to ``../data/raw/``.

    Returns
    -------
    pandas.DataFrame
        One row per JSON line of the file.
    """
    raw_path = "../data/raw/" + file_name
    frame = pd.read_json(raw_path, lines = True)
    return frame

def saveData(data, file_name):
    """Write ``data`` as CSV into the processed-data directory.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to persist; its index is not written.
    file_name : str
        Target file name without extension; ``.csv`` is appended.
    """
    target_path = "../data/processed/" + file_name + ".csv"
    data.to_csv(target_path, index = False)

### Wczytanie danych

In [2]:
# Load the four raw JSON Lines tables in one pass (same order as the file list).
# NOTE(review): `products` is not referenced in the cells visible here.
deliveries, users, sessions, products = (
    loadData(raw_file)
    for raw_file in ("deliveries.jsonl", "users.jsonl", "sessions.jsonl", "products.jsonl")
)

### Usunięcie niepotrzebnych kolumn i wierszy, w których przedmioty były tylko przeglądane

In [3]:
# Keep only purchase events *before* joining, so delivery and user attributes
# are attached solely to the rows that are kept (a left merge preserves the
# left rows and their order, so the result matches merge-then-filter).
# Identifier and free-text columns are then dropped; the chain avoids mutating
# a filtered frame in place.
data = (
    sessions[sessions["event_type"] == "BUY_PRODUCT"]
    .merge(deliveries, on = "purchase_id", how = "left")
    .merge(users, on = "user_id", how = "left")
    .drop(columns = ["session_id", "timestamp", "user_id", "product_id", "event_type", "offered_discount", "purchase_id", "name", "street"])
    .reset_index(drop = True)
)

### Kodowanie danych kategorycznych

In [4]:
def oneHotEncoding(data, col):
    """Replace a categorical column with one 0/1 indicator column per category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``col``. It is left unmodified; a new frame is returned.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``col``, followed by the indicator columns named by
        ``OneHotEncoder.get_feature_names_out([col])``.
    """
    encoder = OneHotEncoder()
    encoded_cols = encoder.fit_transform(data[[col]]).toarray()
    feature_names = encoder.get_feature_names_out([col])
    # Build the indicators on the caller's index: DataFrame.join aligns on index
    # labels, so a default RangeIndex here would produce NaN (or mis-paired) rows
    # whenever ``data`` has been filtered without a reset_index.
    encoded_data = pd.DataFrame(
        data = encoded_cols, columns = feature_names, index = data.index, dtype = bool
    ) * 1
    # Non-mutating drop: the frame passed in by the caller keeps its column.
    return data.drop(columns = col).join(encoded_data)

In [5]:
# One-hot encode each categorical feature in turn.
for categorical_col in ("city", "delivery_company"):
    data = oneHotEncoding(data, categorical_col)

### Rzutowanie czasu zakupu i dostawy na *datetime*

In [6]:
# Parse both timestamp columns into pandas datetimes.
for timestamp_col in ("delivery_timestamp", "purchase_timestamp"):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col])

### Obliczenie czasu między zakupem a dostawą (w godzinach)

In [7]:
# Target variable: delivery duration as a float number of hours
# (a timedelta divided by a one-hour unit).
one_hour = np.timedelta64(1,'h')
delivery_duration = data["delivery_timestamp"] - data["purchase_timestamp"]
data["time"] = delivery_duration / one_hour

### Rozdzielenie czasu zakupu na składowe

In [8]:
# Break the purchase timestamp into its calendar/clock components.
purchase_dt = data["purchase_timestamp"].dt
data["purchase_day_of_week"] = purchase_dt.dayofweek
data["purchase_hour"] = purchase_dt.hour
data["purchase_minute"] = purchase_dt.minute
data["purchase_second"] = purchase_dt.second

### Ustalenie czy zakup odbył się w weekend

In [9]:
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
weekend_days = [5, 6]
is_weekend = data["purchase_day_of_week"].isin(weekend_days)
data["purchase_is_weekend"] = is_weekend * 1  # bool -> 0/1 integer flag

### Kodowanie czasu i daty - dane cykliczne

In [10]:
def circuralEncoding(data, col_name, unique_nums):
    """Add sine and cosine encodings of a cyclic column to ``data`` in place.

    Each value is mapped to the angle ``value * 2*pi / unique_nums`` and the
    sine and cosine of that angle are stored in ``<col_name>_sin`` and
    ``<col_name>_cos``. The source column itself is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; it is modified in place and nothing is returned.
    col_name : str
        Name of the numeric column holding the cyclic value.
    unique_nums : int
        Length of the cycle (the value after which the column wraps around).
    """
    radians = data[col_name] * (2 * np.pi / unique_nums)
    for suffix, trig in (("_sin", np.sin), ("_cos", np.cos)):
        data[col_name + suffix] = trig(radians)

In [11]:
# The day of week cycles with period 7.
circuralEncoding(data, col_name = "purchase_day_of_week", unique_nums = 7)

In [12]:
# Hours wrap after 24, minutes after 60.
for cyclic_col, period in (("purchase_hour", 24), ("purchase_minute", 60)):
    circuralEncoding(data, cyclic_col, period)

In [13]:
# The raw timestamps and integer time parts have served their purpose: the
# duration, weekend flag and sin/cos encodings were derived from them.
# `purchase_second` was never encoded and is discarded as well.
helper_columns = [
    "purchase_timestamp",
    "delivery_timestamp",
    "purchase_day_of_week",
    "purchase_hour",
    "purchase_minute",
    "purchase_second",
]
data = data.drop(columns = helper_columns)

### Zawartość *data*

In [14]:
# Move the `time` column to the last position: pop removes it, and assigning
# a new column appends it at the end.
data["time"] = data.pop("time")

In [15]:
# Print column names, non-null counts and dtypes of the prepared table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7383 entries, 0 to 7382
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   city_Gdynia               7383 non-null   int32  
 1   city_Kraków               7383 non-null   int32  
 2   city_Poznań               7383 non-null   int32  
 3   city_Radom                7383 non-null   int32  
 4   city_Szczecin             7383 non-null   int32  
 5   city_Warszawa             7383 non-null   int32  
 6   city_Wrocław              7383 non-null   int32  
 7   delivery_company_360.0    7383 non-null   int32  
 8   delivery_company_516.0    7383 non-null   int32  
 9   delivery_company_620.0    7383 non-null   int32  
 10  purchase_is_weekend       7383 non-null   int32  
 11  purchase_day_of_week_sin  7383 non-null   float64
 12  purchase_day_of_week_cos  7383 non-null   float64
 13  purchase_hour_sin         7383 non-null   float64
 14  purchase

In [16]:
# Display the prepared table (pandas truncates the middle rows in the output).
data

Unnamed: 0,city_Gdynia,city_Kraków,city_Poznań,city_Radom,city_Szczecin,city_Warszawa,city_Wrocław,delivery_company_360.0,delivery_company_516.0,delivery_company_620.0,purchase_is_weekend,purchase_day_of_week_sin,purchase_day_of_week_cos,purchase_hour_sin,purchase_hour_cos,purchase_minute_sin,purchase_minute_cos,time
0,0,1,0,0,0,0,0,0,0,1,1,-0.781831,0.623490,8.660254e-01,5.000000e-01,5.877853e-01,8.090170e-01,54.442714
1,0,1,0,0,0,0,0,1,0,0,0,0.974928,-0.222521,-9.659258e-01,-2.588190e-01,3.090170e-01,9.510565e-01,65.429676
2,0,1,0,0,0,0,0,0,0,1,0,-0.433884,-0.900969,-7.071068e-01,7.071068e-01,1.224647e-16,-1.000000e+00,58.517116
3,0,1,0,0,0,0,0,0,0,1,0,0.433884,-0.900969,9.659258e-01,2.588190e-01,-1.045285e-01,9.945219e-01,55.222574
4,0,1,0,0,0,0,0,0,0,1,0,0.000000,1.000000,9.659258e-01,-2.588190e-01,3.090170e-01,9.510565e-01,56.719851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7378,0,0,0,0,0,0,1,1,0,0,0,-0.433884,-0.900969,7.071068e-01,7.071068e-01,-3.090170e-01,-9.510565e-01,77.138135
7379,0,0,0,0,0,0,1,1,0,0,1,-0.781831,0.623490,1.224647e-16,-1.000000e+00,9.781476e-01,2.079117e-01,44.158122
7380,0,0,0,0,0,0,1,1,0,0,0,0.433884,-0.900969,-1.000000e+00,-1.836970e-16,1.000000e+00,6.123234e-17,43.149846
7381,0,0,0,0,0,0,1,0,0,1,1,-0.781831,0.623490,-5.000000e-01,-8.660254e-01,-9.510565e-01,3.090170e-01,18.115892


### Zapisanie połączonych i przetworzonych danych

In [17]:
# Persist the full prepared table before it is split.
saveData(data, file_name = "processed_data")

### Podział danych na treningowe i testowe

In [18]:
# Fraction of rows passed as `train_size` to train_test_split; the remaining
# rows form the test set.
TRAIN_SIZE = 0.75

In [19]:
# Fixed seed so the train/test partition (and therefore the saved CSV files)
# is identical on every run; without `random_state` each execution shuffles
# the rows differently.
RANDOM_STATE = 42
train_data, test_data = train_test_split(data, train_size = TRAIN_SIZE, random_state = RANDOM_STATE)

### Zapisanie danych do plików

In [20]:
# Write each split to its own CSV file in the processed-data directory.
for split_name, split_frame in (("train", train_data), ("test", test_data)):
    saveData(split_frame, split_name)