In [76]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import tensorflow as tf
import numpy as np

In [77]:
df_sessions = pd.read_json(path_or_buf='../../data/raw/sessions.jsonl', lines=True)
df_products = pd.read_json(path_or_buf='../../data/raw/products.jsonl', lines=True)
df_users = pd.read_json(path_or_buf='../../data/raw/users.jsonl', lines=True)

In [78]:
df_users.head()

Unnamed: 0,user_id,name,city,street
0,102,Aurelia Malon,Poznań,pl. Brzoskwiniowa 11/53
1,103,Mateusz Kobel,Poznań,al. Wrocławska 10
2,104,Radosław Ratka,Szczecin,pl. Nowa 89/04
3,105,Anastazja Oszust,Warszawa,ul. Częstochowska 80
4,106,Sylwia Nurek,Warszawa,al. Wiosenna 72


In [79]:
df_products.head()

Unnamed: 0,product_id,product_name,category_path,price
0,1001,Telefon Siemens Gigaset DA310,Telefony i akcesoria;Telefony stacjonarne,58.97
1,1002,Kyocera FS-1135MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,2048.5
2,1003,Kyocera FS-3640MFP,Komputery;Drukarki i skanery;Biurowe urządzeni...,7639.0
3,1004,Fallout 3 (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99
4,1005,Szalone Króliki Na żywo i w kolorze (Xbox 360),Gry i konsole;Gry na konsole;Gry Xbox 360,49.99


In [80]:
categories_one_hot = df_products.category_path.str.get_dummies(';')
categories_one_hot.head()

Unnamed: 0,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,Drukarki i skanery,Gry PlayStation3,Gry Xbox 360,Gry i konsole,Gry komputerowe,Gry na konsole,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [81]:
cities_one_hot = df_users.city.str.get_dummies()
cities_one_hot.head()

Unnamed: 0,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław
0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0


In [82]:
df_products.drop(columns=['product_name', 'category_path'], inplace=True)
df_products = df_products.join(categories_one_hot)

In [83]:
df_users.drop(columns=['city'], inplace=True)
df_users = df_users.join(cities_one_hot)

In [84]:
df_products.head()

Unnamed: 0,product_id,price,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,Drukarki i skanery,Gry PlayStation3,Gry Xbox 360,Gry i konsole,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
0,1001,58.97,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,1002,2048.5,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1003,7639.0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1004,49.99,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1005,49.99,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [85]:
df_sessions.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-07-28 15:47:21,102,1283,VIEW_PRODUCT,5,
1,124,2021-07-28 15:47:40,102,1283,BUY_PRODUCT,5,20001.0
2,125,2021-09-12 16:43:16,102,1076,VIEW_PRODUCT,0,
3,125,2021-09-12 16:46:24,102,1080,VIEW_PRODUCT,0,
4,125,2021-09-12 16:48:08,102,1079,VIEW_PRODUCT,0,


In [86]:
df_users['sex'] = ['Female' if name.split(' ')[0][-1] == 'a' else 'Male' for name in df_users['name']]
sex_one_hot = df_users.sex.str.get_dummies()
df_users.drop(columns=['sex'], inplace=True)
df_users = df_users.join(sex_one_hot)

In [87]:
df_users.head()

Unnamed: 0,user_id,name,street,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,Female,Male
0,102,Aurelia Malon,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
1,103,Mateusz Kobel,al. Wrocławska 10,0,0,1,0,0,0,0,0,1
2,104,Radosław Ratka,pl. Nowa 89/04,0,0,0,0,1,0,0,0,1
3,105,Anastazja Oszust,ul. Częstochowska 80,0,0,0,0,0,1,0,1,0
4,106,Sylwia Nurek,al. Wiosenna 72,0,0,0,0,0,1,0,1,0


In [88]:
df = df_sessions.merge(df_products, left_on='product_id', right_on='product_id')
df = df.merge(df_users, left_on='user_id', right_on='user_id')
df.head()

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,price,Akcesoria telefoniczne,Anteny RTV,...,street,Gdynia,Kraków,Poznań,Radom,Szczecin,Warszawa,Wrocław,Female,Male
0,124,2021-07-28 15:47:21,102,1283,VIEW_PRODUCT,5,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
1,124,2021-07-28 15:47:40,102,1283,BUY_PRODUCT,5,20001.0,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
2,139,2021-07-01 11:28:02,102,1283,VIEW_PRODUCT,0,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
3,139,2021-07-01 11:31:18,102,1283,BUY_PRODUCT,0,20013.0,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0
4,144,2021-03-19 13:55:04,102,1283,VIEW_PRODUCT,0,,99.99,0,0,...,pl. Brzoskwiniowa 11/53,0,0,1,0,0,0,0,1,0


In [89]:
aggregation_functions = {
    'timestamp' : lambda t: [(t.max() - t.min()).seconds, t.min(), t.max()],
    'user_id' : 'first',
    'product_id' : 'unique',
    'event_type' : lambda e: 1 if len(e.unique()) > 1 else 0,
    'offered_discount' : 'first',
    'price' : lambda p: p.sum()
}

for name in categories_one_hot.columns:
    aggregation_functions[name] = lambda n: n.sum()

# for name in cities_one_hot.columns:
#     aggregation_functions[name] = 'first'

for name in sex_one_hot.columns:
    aggregation_functions[name] = 'first'

main_df = df.groupby(df['session_id']).aggregate(aggregation_functions)
main_df.rename(columns={'event_type' : 'purchased'}, inplace=True)

In [90]:
timestamp_dict = [{'session_length' : x[0], 'session_start' : x[1], 'session_end' : x[2]} for x in main_df['timestamp']]
timestamp_df = pd.DataFrame(timestamp_dict)

In [91]:
main_df.reset_index(inplace=True)
main_df = pd.concat([main_df, timestamp_df], axis=1, join="inner")
main_df.drop(columns=['timestamp'], inplace=True)
main_df.index.name = 'session_id'
main_df = main_df[['user_id', 'product_id', 'offered_discount', 'session_length', 'price', 'session_start', 'session_end', 'purchased'] + list(sex_one_hot.columns.values) + list(categories_one_hot.columns.values)]
main_df.head()

Unnamed: 0_level_0,user_id,product_id,offered_discount,session_length,price,session_start,session_end,purchased,Female,Male,...,Słuchawki,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,102,[1283],5,19,199.98,2021-07-28 15:47:21,2021-07-28 15:47:40,1,1,0,...,0,0,0,0,0,0,2,2,0,0
1,102,"[1076, 1080, 1079, 1003, 1002, 1075, 1078, 1077]",0,1436,36664.54,2021-09-12 16:43:16,2021-09-12 17:07:12,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,102,"[1281, 1278]",15,117,222.6,2021-04-09 22:31:08,2021-04-09 22:33:05,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,102,"[1281, 1278]",0,519,208.5,2021-08-15 16:46:06,2021-08-15 16:54:45,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,102,"[1008, 1007, 1013, 1012, 1004, 1010]",0,847,568.94,2021-01-05 00:05:38,2021-01-05 00:19:45,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
main_df['n_views'] = [len(x) for x in main_df['product_id'].values]

In [93]:
main_df.head()

Unnamed: 0_level_0,user_id,product_id,offered_discount,session_length,price,session_start,session_end,purchased,Female,Male,...,Tablety,Tablety i akcesoria,Telefony i akcesoria,Telefony komórkowe,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe,n_views
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,102,[1283],5,19,199.98,2021-07-28 15:47:21,2021-07-28 15:47:40,1,1,0,...,0,0,0,0,0,2,2,0,0,1
1,102,"[1076, 1080, 1079, 1003, 1002, 1075, 1078, 1077]",0,1436,36664.54,2021-09-12 16:43:16,2021-09-12 17:07:12,1,1,0,...,0,0,0,0,0,0,0,0,0,8
2,102,"[1281, 1278]",15,117,222.6,2021-04-09 22:31:08,2021-04-09 22:33:05,1,1,0,...,0,0,0,0,0,0,0,0,0,2
3,102,"[1281, 1278]",0,519,208.5,2021-08-15 16:46:06,2021-08-15 16:54:45,1,1,0,...,0,0,0,0,0,0,0,0,0,2
4,102,"[1008, 1007, 1013, 1012, 1004, 1010]",0,847,568.94,2021-01-05 00:05:38,2021-01-05 00:19:45,1,1,0,...,0,0,0,0,0,0,0,0,0,6


## Przygotowanie danych do modelu

In [94]:
main_df.reset_index(inplace=True)

In [95]:
main_df.drop(columns=['session_id', 'user_id', 'product_id'], inplace=True)

In [96]:
dates_encoded = pd.DataFrame({
    "month": main_df['session_start'].dt.month,
    "day": main_df['session_start'].dt.day,
    "hour": main_df['session_start'].dt.hour,
    "dayofweek": main_df['session_start'].dt.dayofweek,
})

In [97]:
main_df = main_df.join(dates_encoded)

In [98]:
main_df.drop(columns=['session_start', 'session_end'], inplace=True)

In [99]:
main_df.head()

Unnamed: 0,offered_discount,session_length,price,purchased,Female,Male,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,...,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe,n_views,month,day,hour,dayofweek
0,5,19,199.98,1,1,0,0,0,0,0,...,0,2,2,0,0,1,7,28,15,2
1,0,1436,36664.54,1,1,0,0,0,0,9,...,0,0,0,0,0,8,9,12,16,6
2,15,117,222.6,1,1,0,0,0,0,0,...,0,0,0,0,0,2,4,9,22,4
3,0,519,208.5,1,1,0,0,0,0,0,...,0,0,0,0,0,2,8,15,16,6
4,0,847,568.94,1,1,0,0,0,0,0,...,0,0,0,0,0,6,1,5,0,1


# Wydzielenie zbioru treningowego, walidacyjnego i testowego

In [25]:
y = main_df['purchased'].values
X = main_df.drop(columns=['purchased'])

In [26]:
X

Unnamed: 0,offered_discount,session_length,price,Female,Male,Akcesoria telefoniczne,Anteny RTV,Audio,Biurowe urządzenia wielofunkcyjne,Drukarki i skanery,...,Telefony stacjonarne,Telewizory i akcesoria,Video,Zestawy głośnomówiące,Zestawy słuchawkowe,n_views,month,day,hour,dayofweek
0,5,19,199.98,1,0,0,0,0,0,0,...,0,2,2,0,0,1,7,28,15,2
1,0,1436,36664.54,1,0,0,0,0,9,9,...,0,0,0,0,0,8,9,12,16,6
2,15,117,222.60,1,0,0,0,0,0,0,...,0,0,0,0,0,2,4,9,22,4
3,0,519,208.50,1,0,0,0,0,0,0,...,0,0,0,0,0,2,8,15,16,6
4,0,847,568.94,1,0,0,0,0,0,0,...,0,0,0,0,0,6,1,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9522,0,0,64.80,1,0,0,0,0,0,0,...,0,0,0,0,0,1,1,27,3,2
9523,15,116,199.98,1,0,0,0,0,0,0,...,0,2,2,0,0,1,7,6,16,1
9524,0,1617,1141.66,1,0,0,10,0,0,0,...,0,10,10,0,0,10,3,7,7,6
9525,20,0,79.90,1,0,0,0,0,0,0,...,0,0,0,0,0,1,10,12,0,1


In [27]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [29]:
X_test.shape

(2859, 38)

In [30]:
X_train.shape

(6668, 38)

## Szukanie modelu

## RandomForest

In [33]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [34]:
rand_forest_scores = cross_val_score(random_forest, X_train, y_train, cv=10, scoring="accuracy")
rand_forest_scores.mean()

0.9718075646861253

## KNN

In [35]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [36]:
knn_forest_scores = cross_val_score(knn, X_train, y_train, cv=10, scoring="accuracy")
knn_forest_scores.mean()

0.7897425161293227

## SVM

In [37]:
svm = SVC(kernel="rbf")

In [38]:
svm_scores = cross_val_score(svm, X_train, y_train, cv=10, scoring="accuracy")
svm_scores.mean()

0.5947805376590983

## Gradient Boosting

In [39]:
gradient_boosting = GradientBoostingClassifier()

In [40]:
gradient_boosting_scores = cross_val_score(gradient_boosting, X_train, y_train, cv=3, scoring="accuracy")
gradient_boosting_scores.mean()

0.9308646114274045

## Sieć neuronowa

In [41]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_dim=len(main_df.columns)-1),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid")

])

2022-01-08 01:44:51.790330: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-08 01:44:51.816988: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-01-08 01:44:51.817006: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-01-08 01:44:51.818215: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [237]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 128)               4992      
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 15,361
Trainable params: 15,361
Non-trainable params: 0
_________________________________________________________________


In [238]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.fit(np.array(X_train,dtype=np.float64), np.array(y_train,dtype=np.float64), epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
predictions = model.predict(np.array(X_test,dtype=np.float64)) > 0.5

In [None]:
accuracy_score(y_test, predictions)