In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import tensorflow as tf
import numpy as np

In [2]:
# Load the three raw JSON-lines dumps (one JSON record per line) in a fixed order.
df_sessions, df_products, df_users = (
    pd.read_json(raw_path, lines=True)
    for raw_path in (
        '../../data/raw/sessions.jsonl',
        '../../data/raw/products.jsonl',
        '../../data/raw/users.jsonl',
    )
)

In [3]:
# Preview the first five user records.
df_users.head(n=5)

Unnamed: 0,user_id,name,city,street
0,102,Aurelia Malon,Poznań,pl. Brzoskwiniowa 11/53
1,103,Mateusz Kobel,Poznań,al. Wrocławska 10
2,104,Radosław Ratka,Szczecin,pl. Nowa 89/04
3,105,Anastazja Oszust,Warszawa,ul. Częstochowska 80
4,106,Sylwia Nurek,Warszawa,al. Wiosenna 72


In [4]:
# Preview the first five product records.
df_products.head(n=5)

Unnamed: 0,product_id,product_name,category_path,price
0,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,58.97
1,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,2048.5
2,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7639.0
3,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
4,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99


In [5]:
# Split each ';'-separated category path into one indicator column per category name.
categories_one_hot = df_products['category_path'].str.get_dummies(sep=';')
categories_one_hot.head(n=5)

Unnamed: 0,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,Drukarki i skanery,Gry PlayStation3,Gry Xbox 360,Gry i konsole,Gry komputerowe,Gry na konsole,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# One indicator column per distinct city value.
cities_one_hot = df_users['city'].str.get_dummies()
cities_one_hot.head(n=5)

Unnamed: 0,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław
0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0


In [7]:
# Replace the textual product columns with the category indicator columns.
df_products = (
    df_products
    .drop(columns=['product_name', 'category_path'])
    .join(categories_one_hot)
)

In [8]:
# Replace the textual city column with the city indicator columns.
df_users = df_users.drop(columns=['city']).join(cities_one_hot)

In [9]:
# Preview the product table after the category columns were joined in.
df_products.head(n=5)

Unnamed: 0,product_id,price,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,Drukarki i skanery,Gry PlayStation3,Gry Xbox 360,Gry i konsole,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
0,1001,58.97,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,1002,2048.5,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1003,7639.0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1004,49.99,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1005,49.99,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Preview the first five session events.
df_sessions.head(n=5)

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-07-28 15:47:21,102,1283,VIEW_PRODUCT,5,
1,124,2021-07-28 15:47:40,102,1283,BUY_PRODUCT,5,20001.0
2,125,2021-09-12 16:43:16,102,1076,VIEW_PRODUCT,0,
3,125,2021-09-12 16:46:24,102,1080,VIEW_PRODUCT,0,
4,125,2021-09-12 16:48:08,102,1079,VIEW_PRODUCT,0,


In [11]:
# Heuristic: a first name (text before the first space) ending in 'a' is
# labelled 'Female', anything else 'Male'.
# NOTE(review): this is a naming-convention guess, not ground truth — confirm
# it is accurate enough for this data set.
first_names = df_users['name'].map(lambda full_name: full_name.split(' ')[0])
is_female = first_names.map(lambda first_name: first_name[-1] == 'a')
sex = is_female.map({True: 'Female', False: 'Male'})

# Encode the label as indicator columns and attach them to the users table.
sex_one_hot = sex.str.get_dummies()
df_users = df_users.join(sex_one_hot)

In [12]:
# Preview the users table after the city and sex indicator columns were added.
df_users.head(n=5)

Unnamed: 0,user_id,name,street,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,Female,Male
0,102,Aurelia Malon,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
1,103,Mateusz Kobel,al. Wrocławska 10,0,0,1,0,0,0,0,0,1
2,104,Radosław Ratka,pl. Nowa 89/04,0,0,0,0,1,0,0,0,1
3,105,Anastazja Oszust,ul. Częstochowska 80,0,0,0,0,0,1,0,1,0
4,106,Sylwia Nurek,al. Wiosenna 72,0,0,0,0,0,1,0,1,0


In [13]:
# Attach product features, then user features, to every session event.
# Both merges use pandas' default inner join, so event rows whose product_id
# or user_id has no match in the lookup table are dropped.
sessions_with_products = df_sessions.merge(df_products, on='product_id')
df = sessions_with_products.merge(df_users, on='user_id')
df.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,price,Akcesoria telefoniczne,Anteny RTV,...,street,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,Female,Male
0,124,2021-07-28 15:47:21,102,1283,VIEW_PRODUCT,5,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
1,124,2021-07-28 15:47:40,102,1283,BUY_PRODUCT,5,20001.0,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
2,139,2021-07-01 11:28:02,102,1283,VIEW_PRODUCT,0,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
3,139,2021-07-01 11:31:18,102,1283,BUY_PRODUCT,0,20013.0,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
4,144,2021-03-19 13:55:04,102,1283,VIEW_PRODUCT,0,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0


In [14]:
def _session_time_span(t):
    """Summarise one session's event timestamps as [length_seconds, start, end].

    The length is taken from ``Timedelta.total_seconds()``. ``Timedelta.seconds``
    is only the sub-day component (0..86399), so a session spanning more than
    24 hours would otherwise be reported as shorter than it really was.
    """
    start, end = t.min(), t.max()
    length = (end - start).total_seconds()
    # Keep whole seconds as an int; leave a missing span (NaT -> NaN) untouched.
    if not pd.isna(length):
        length = int(length)
    return [length, start, end]


def _contains_purchase(e):
    """Return 1 when the session has at least one BUY_PRODUCT event, else 0.

    Testing for the event explicitly (instead of "more than one distinct event
    type") also labels correctly a session in which only BUY_PRODUCT rows are
    present.
    """
    return int((e == 'BUY_PRODUCT').any())


# Per-column reducers that collapse the event-level rows of `df` to one row per session.
aggregation_functions = {
    'timestamp': _session_time_span,
    'user_id': 'first',
    'product_id': 'unique',
    'event_type': _contains_purchase,
    'offered_discount': 'first',
    'price': 'sum',
}

# Category indicators are summed, giving per-session counts of events per category.
aggregation_functions.update({name: 'sum' for name in categories_one_hot.columns})

# City indicator columns are currently not aggregated, so they do not appear in main_df.

# Sex indicators are constant within a user, so the first value is enough.
aggregation_functions.update({name: 'first' for name in sex_one_hot.columns})

# NOTE(review): these reducers run over every event row, BUY_PRODUCT rows
# included. In a purchasing session the bought product therefore contributes to
# `price` and to the category counts once for the view and once for the
# purchase, and the purchase time extends the session length. Because
# `purchased` is later used as the label, consider building the features from
# VIEW_PRODUCT rows only — confirm against the modelling intent.
main_df = (
    df.groupby('session_id')
    .aggregate(aggregation_functions)
    .rename(columns={'event_type': 'purchased'})
)

In [15]:
# Unpack each [length, start, end] triple stored in the 'timestamp' column
# into a record with one named field per element.
timestamp_dict = [
    {'session_length': length, 'session_start': started, 'session_end': ended}
    for length, started, ended in main_df['timestamp']
]
timestamp_df = pd.DataFrame(timestamp_dict)

In [16]:
# Move session_id out of the index so the rows line up positionally with timestamp_df.
main_df = main_df.reset_index()
main_df = pd.concat([main_df, timestamp_df], axis=1, join="inner")
main_df = main_df.drop(columns=['timestamp'])

# Note: this only labels the positional index; the original session ids live in
# the 'session_id' column, which the column selection below leaves out.
main_df.index.name = 'session_id'

# Fixed leading columns, followed by the sex and category indicator columns.
leading_columns = [
    'user_id', 'product_id', 'offered_discount', 'session_length',
    'price', 'session_start', 'session_end', 'purchased',
]
column_order = (
    leading_columns
    + sex_one_hot.columns.tolist()
    + categories_one_hot.columns.tolist()
)
main_df = main_df[column_order]
main_df.head()

Unnamed: 0_level_0,user_id,product_id,offered_discount,session_length,price,session_start,session_end,purchased,Female,Male,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,102,[1283],5,19,199.98,2021-07-28 15:47:21,2021-07-28 15:47:40,1,1,0,...,0,0,0,0,0,0,2,2,0,0
1,102,"[1076, 1080, 1079, 1003, 1002, 1075, 1078, 1077]",0,1436,36664.54,2021-09-12 16:43:16,2021-09-12 17:07:12,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,102,"[1281, 1278]",15,117,222.6,2021-04-09 22:31:08,2021-04-09 22:33:05,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,102,"[1281, 1278]",0,519,208.5,2021-08-15 16:46:06,2021-08-15 16:54:45,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,102,"[1008, 1007, 1013, 1012, 1004, 1010]",0,847,568.94,2021-01-05 00:05:38,2021-01-05 00:19:45,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Length of each session's product_id collection. That column holds the
# session's unique product ids, so this counts distinct products rather than
# individual view events.
main_df['n_views'] = main_df['product_id'].map(len)

In [18]:
# Preview the session table with the new n_views column.
main_df.head(n=5)

Unnamed: 0_level_0,user_id,product_id,offered_discount,session_length,price,session_start,session_end,purchased,Female,Male,...,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe,n_views
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,102,[1283],5,19,199.98,2021-07-28 15:47:21,2021-07-28 15:47:40,1,1,0,...,0,0,0,0,0,2,2,0,0,1
1,102,"[1076, 1080, 1079, 1003, 1002, 1075, 1078, 1077]",0,1436,36664.54,2021-09-12 16:43:16,2021-09-12 17:07:12,1,1,0,...,0,0,0,0,0,0,0,0,0,8
2,102,"[1281, 1278]",15,117,222.6,2021-04-09 22:31:08,2021-04-09 22:33:05,1,1,0,...,0,0,0,0,0,0,0,0,0,2
3,102,"[1281, 1278]",0,519,208.5,2021-08-15 16:46:06,2021-08-15 16:54:45,1,1,0,...,0,0,0,0,0,0,0,0,0,2
4,102,"[1008, 1007, 1013, 1012, 1004, 1010]",0,847,568.94,2021-01-05 00:05:38,2021-01-05 00:19:45,1,1,0,...,0,0,0,0,0,0,0,0,0,6


## Przygotowanie danych do modelu

In [19]:
# Turn the index (named 'session_id') back into an ordinary column.
main_df = main_df.reset_index()

In [20]:
# Identifier columns are removed before the data is handed to a model.
identifier_columns = ['session_id', 'user_id', 'product_id']
main_df = main_df.drop(columns=identifier_columns)

In [21]:
# Break the session start time into calendar parts
# (pandas numbers dayofweek from Monday=0 to Sunday=6).
session_start_parts = main_df['session_start'].dt
dates_encoded = pd.DataFrame({
    part: getattr(session_start_parts, part)
    for part in ("month", "day", "hour", "dayofweek")
})

In [22]:
# Append the calendar-part columns, aligned on the row index.
main_df = main_df.join(dates_encoded, how='left')

In [23]:
# The raw timestamps are dropped now that their calendar parts are separate columns.
raw_timestamp_columns = ['session_start', 'session_end']
main_df = main_df.drop(columns=raw_timestamp_columns)

In [24]:
# Preview the fully numeric frame that feeds the model.
main_df.head(n=5)

Unnamed: 0,offered_discount,session_length,price,purchased,Female,Male,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,...,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe,n_views,month,day,hour,dayofweek
0,5,19,199.98,1,1,0,0,0,0,0,...,0,2,2,0,0,1,7,28,15,2
1,0,1436,36664.54,1,1,0,0,0,0,9,...,0,0,0,0,0,8,9,12,16,6
2,15,117,222.6,1,1,0,0,0,0,0,...,0,0,0,0,0,2,4,9,22,4
3,0,519,208.5,1,1,0,0,0,0,0,...,0,0,0,0,0,2,8,15,16,6
4,0,847,568.94,1,1,0,0,0,0,0,...,0,0,0,0,0,6,1,5,0,1


# Wydzielenie zbioru treningowego, walidacyjnego i testowego

In [65]:
# Label vector: the 'purchased' flag; feature matrix: every remaining column.
y = main_df['purchased'].to_numpy()
feature_frame = main_df.drop(columns=['purchased'])
X = feature_frame.to_numpy()

In [66]:
# Inspect the feature matrix (every column of main_df except 'purchased').
X

array([[5.000000e+00, 1.900000e+01, 1.999800e+02, ..., 2.800000e+01,
        1.500000e+01, 2.000000e+00],
       [0.000000e+00, 1.436000e+03, 3.666454e+04, ..., 1.200000e+01,
        1.600000e+01, 6.000000e+00],
       [1.500000e+01, 1.170000e+02, 2.226000e+02, ..., 9.000000e+00,
        2.200000e+01, 4.000000e+00],
       ...,
       [0.000000e+00, 1.617000e+03, 1.141660e+03, ..., 7.000000e+00,
        7.000000e+00, 6.000000e+00],
       [2.000000e+01, 0.000000e+00, 7.990000e+01, ..., 1.200000e+01,
        0.000000e+00, 1.000000e+00],
       [0.000000e+00, 0.000000e+00, 1.090000e+02, ..., 1.900000e+01,
        2.100000e+01, 3.000000e+00]])

In [67]:
# Inspect the label vector taken from the 'purchased' column.
y

array([1, 1, 1, ..., 0, 0, 0])

In [68]:
# Hold out 30% of the sessions for testing.
# A fixed seed makes the split (and every metric computed from it) reproducible
# across kernel restarts; stratifying on y keeps the purchased / not-purchased
# ratio the same in both parts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

In [69]:
# Rows and columns of the held-out test split.
X_test.shape

(2859, 38)

In [70]:
# Rows and columns of the training split.
X_train.shape

(6668, 38)