In [1]:
# Fraction of all rows that the first train_test_split call assigns to the training set.
TRAIN_SIZE = 0.75
# Fraction of the remaining (non-training) rows that the second train_test_split call
# assigns to the test set; what is left after that becomes the dev set.
TEST_SIZE = 0.75  # share of the whole dataset: (1.0 - TRAIN_SIZE) * TEST_SIZE

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

def loadData(file_name):
    """Read a JSON Lines file from the raw-data directory into a DataFrame.

    Parameters
    ----------
    file_name : str
        File name (including extension) appended to ``../data/raw/``.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.read_json`` called with ``lines=True`` on that path.
    """
    raw_path = "../data/raw/" + file_name
    return pd.read_json(raw_path, lines=True)

def saveData(data, file_name):
    """Write a DataFrame to ``../data/processed/<file_name>.csv``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to serialize; its index is not written to the file.
    file_name : str
        Base name of the output file, without the ``.csv`` extension.
    """
    csv_path = "../data/processed/" + file_name + ".csv"
    data.to_csv(csv_path, index=False)

### Wczytanie danych

In [3]:
# Load the four raw JSON Lines tables, in the same order as they are named on the left.
deliveries, users, sessions, products = (
    loadData(raw_file)
    for raw_file in ("deliveries.jsonl", "users.jsonl", "sessions.jsonl", "products.jsonl")
)

### Usunięcie niepotrzebnych kolumn oraz wierszy, w których przedmioty były tylko przeglądane

In [4]:
# Attach delivery and user attributes to every session event, keep only the
# purchase events, discard identifier/free-text columns and renumber the rows.
data = (
    sessions
    .merge(deliveries, on="purchase_id", how="left")
    .merge(users, on="user_id", how="left")
    .loc[lambda frame: frame["event_type"] == "BUY_PRODUCT"]
    .drop(
        columns=[
            "session_id",
            "timestamp",
            "user_id",
            "product_id",
            "event_type",
            "offered_discount",
            "purchase_id",
            "name",
            "street",
        ]
    )
    .reset_index(drop=True)
)

### Kodowanie danych kategorycznych

In [5]:
def oneHotEncoding(data, col):
    """Replace a categorical column with one 0/1 indicator column per category.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    col : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``col``, followed by the indicator columns named by
        ``OneHotEncoder.get_feature_names_out([col])``. The indicators hold
        integer 0/1 values and the row index of ``data`` is preserved.
    """
    encoder = OneHotEncoder()
    encoded_cols = encoder.fit_transform(data[[col]]).toarray()
    feature_names = encoder.get_feature_names_out([col])
    # Reuse the caller's index: ``join`` aligns on index labels, so a fresh
    # RangeIndex would misalign rows (and yield NaNs) for any frame whose index
    # is not the default 0..n-1.
    encoded_data = pd.DataFrame(
        data = encoded_cols,
        columns = feature_names,
        index = data.index,
        dtype = bool,
    ) * 1
    # Non-inplace drop so the caller's frame is left untouched.
    return data.drop(columns = col).join(encoded_data)

In [6]:
# One-hot encode each categorical column in turn, city first.
for categorical_column in ("city", "delivery_company"):
    data = oneHotEncoding(data, categorical_column)

### Rzutowanie czasu zakupu i dostawy na *datetime*

In [7]:
# Parse both timestamp columns into pandas datetimes so the .dt accessor can be used on them.
for timestamp_column in ("delivery_timestamp", "purchase_timestamp"):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

### Obliczenie czasu między zakupem a dostawą (w godzinach)

In [8]:
# Time from purchase to delivery as a float number of hours
# (the timedelta is divided by a one-hour unit).
data["time"] = (
    data["delivery_timestamp"]
    .sub(data["purchase_timestamp"])
    .div(np.timedelta64(1, 'h'))
)

### Rozdzielenie czasu zakupu i dostawy na składowe

In [9]:
# For every time component create the purchase column first, then the delivery
# column, so the resulting column order is purchase_X, delivery_X per component.
for component, dt_attribute in (
    ("day_of_week", "dayofweek"),
    ("hour", "hour"),
    ("minute", "minute"),
    ("second", "second"),
):
    for event in ("purchase", "delivery"):
        data[f"{event}_{component}"] = getattr(data[f"{event}_timestamp"].dt, dt_attribute)

# The raw timestamps are no longer needed once their components are extracted.
data.drop(columns=["purchase_timestamp", "delivery_timestamp"], inplace=True)

### Ustalenie, czy zakup i dostawa odbyły się w weekend

In [10]:
# pandas numbers weekdays Monday=0 .. Sunday=6, so 5 and 6 are Saturday and Sunday.
# Multiplying the boolean mask by 1 stores the flag as an integer 0/1.
for event in ("purchase", "delivery"):
    data[f"{event}_is_weekend"] = data[f"{event}_day_of_week"].isin([5, 6]) * 1

### Kodowanie czasu i daty - dane cykliczne

In [11]:
def circuralEncoding(data, col_name, unique_nums):
    """Add sine/cosine encodings of a cyclical column to ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend; gains the columns ``<col_name>_sin`` and ``<col_name>_cos``.
    col_name : str
        Name of the numeric column holding the cyclical value.
    unique_nums : int or float
        Length of the cycle; a value equal to ``unique_nums`` maps to a full turn (2*pi).

    Returns
    -------
    None
        The frame is modified in place.
    """
    angle = data[col_name] * (2 * np.pi / unique_nums)
    data[f"{col_name}_sin"] = np.sin(angle)
    data[f"{col_name}_cos"] = np.cos(angle)

In [12]:
# Day of week repeats every 7 days.
for event in ("purchase", "delivery"):
    circuralEncoding(data, f"{event}_day_of_week", 7)

In [13]:
# Cycle length per unit: 24 hours in a day, 60 minutes in an hour, 60 seconds in a minute.
for unit, cycle_length in (("hour", 24), ("minute", 60), ("second", 60)):
    for event in ("purchase", "delivery"):
        circuralEncoding(data, f"{event}_{unit}", cycle_length)

In [14]:
# Remove the raw integer components now that their sin/cos encodings exist.
data.drop(
    columns=[
        f"{event}_{unit}"
        for unit in ("day_of_week", "hour", "minute", "second")
        for event in ("purchase", "delivery")
    ],
    inplace=True,
)

### Zawartość *data*

In [15]:
# Print the column names, non-null counts and dtypes of the processed frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7383 entries, 0 to 7382
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   city_Gdynia               7383 non-null   int32  
 1   city_Kraków               7383 non-null   int32  
 2   city_Poznań               7383 non-null   int32  
 3   city_Radom                7383 non-null   int32  
 4   city_Szczecin             7383 non-null   int32  
 5   city_Warszawa             7383 non-null   int32  
 6   city_Wrocław              7383 non-null   int32  
 7   delivery_company_360.0    7383 non-null   int32  
 8   delivery_company_516.0    7383 non-null   int32  
 9   delivery_company_620.0    7383 non-null   int32  
 10  time                      7383 non-null   float64
 11  purchase_is_weekend       7383 non-null   int32  
 12  delivery_is_weekend       7383 non-null   int32  
 13  purchase_day_of_week_sin  7383 non-null   float64
 14  purchase

In [16]:
# Display the processed frame as the cell output.
data

Unnamed: 0,city_Gdynia,city_Kraków,city_Poznań,city_Radom,city_Szczecin,city_Warszawa,city_Wrocław,delivery_company_360.0,delivery_company_516.0,delivery_company_620.0,...,delivery_hour_sin,delivery_hour_cos,purchase_minute_sin,purchase_minute_cos,delivery_minute_sin,delivery_minute_cos,purchase_second_sin,purchase_second_cos,delivery_second_sin,delivery_second_cos
0,0,1,0,0,0,0,0,0,0,1,...,0.500000,-0.866025,5.877853e-01,8.090170e-01,-0.309017,-0.951057,-0.951057,0.309017,0.809017,-5.877853e-01
1,0,1,0,0,0,0,0,1,0,0,...,0.500000,-0.866025,3.090170e-01,9.510565e-01,0.104528,-0.994522,-0.978148,-0.207912,0.104528,-9.945219e-01
2,0,1,0,0,0,0,0,0,0,1,...,0.866025,-0.500000,1.224647e-16,-1.000000e+00,0.104528,0.994522,0.587785,-0.809017,0.500000,-8.660254e-01
3,0,1,0,0,0,0,0,0,0,1,...,-0.258819,-0.965926,-1.045285e-01,9.945219e-01,0.951057,0.309017,0.104528,-0.994522,-0.866025,5.000000e-01
4,0,1,0,0,0,0,0,0,0,1,...,-0.707107,-0.707107,3.090170e-01,9.510565e-01,-0.994522,0.104528,0.809017,-0.587785,-0.207912,-9.781476e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7378,0,0,0,0,0,0,1,1,0,0,...,0.866025,-0.500000,-3.090170e-01,-9.510565e-01,-0.913545,-0.406737,-0.406737,-0.913545,-0.809017,5.877853e-01
7379,0,0,0,0,0,0,1,1,0,0,...,0.866025,-0.500000,9.781476e-01,2.079117e-01,0.669131,-0.743145,-0.309017,0.951057,0.406737,-9.135455e-01
7380,0,0,0,0,0,0,1,1,0,0,...,-0.258819,-0.965926,1.000000e+00,6.123234e-17,0.587785,-0.809017,0.207912,0.978148,0.104528,9.945219e-01
7381,0,0,0,0,0,0,1,0,0,1,...,0.866025,-0.500000,-9.510565e-01,3.090170e-01,-0.500000,0.866025,-0.951057,0.309017,-1.000000,-1.836970e-16


### Zapisanie zmergowanych danych

In [17]:
# Persist the full processed table (before splitting) as merged_tables.csv.
saveData(data, "merged_tables")

### Podział danych na treningowe, developerskie i testowe

In [18]:
# Fixed seed so the train/dev/test partition is identical on every
# Restart & Run All; without it each run writes different split files.
SPLIT_SEED = 42

# First split: TRAIN_SIZE of all rows form the training set, the rest is held out.
trainData, rest = train_test_split(data, train_size = TRAIN_SIZE, random_state = SPLIT_SEED)
# Second split: TEST_SIZE of the held-out rows form the test set, the remainder the dev set.
devData, testData = train_test_split(rest, test_size = TEST_SIZE, random_state = SPLIT_SEED)

# Sanity check: the three subsets together cover every row exactly once.
assert len(trainData) + len(devData) + len(testData) == len(data)

### Zapisanie danych do plików

In [19]:
# Write each subset to its own CSV file: train.csv, dev.csv and test.csv.
for split_name, split_frame in (("train", trainData), ("dev", devData), ("test", testData)):
    saveData(split_frame, split_name)