# imports and loading DataFrame

In [83]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import pandas as pd
import numpy as np
import datetime

# strptime-style pattern passed to pd.to_datetime for the purchase/delivery timestamps
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
# upper bound on the "price" column; rows above it are rejected as outliers further down
# (the "TRESHOLD" spelling is kept because later cells reference these names)
PRICE_TRESHOLD = 100_000
# upper bound on the "weight_kg" column; rows above it are rejected as outliers further down
WEIGHT_TRESHOLD = 50


# fact table
sessions_df = pd.read_json("data/sessions.jsonl", lines=True)

# dimension tables
deliveries_df = pd.read_json("data/deliveries.jsonl", lines=True)
products_df = pd.read_json("data/products.jsonl", lines=True)
users_df = pd.read_json("data/users.jsonl", lines=True)

# adding a column with time difference in deliveries
Add a column holding the time difference between purchase_timestamp and delivery_timestamp to the deliveries table.

1. Strip the fractional seconds from delivery_timestamp (splitting on "."), so that it has the same format as purchase_timestamp, which contains no fractional seconds.
2. Change columns format to datetime
3. Add time_diff column (as timedelta64 object).
4. Drop rows where time_diff is null (which means that delivery_timestamp was null).
5. Change type of time_diff from timedelta64 to seconds in float.
6. Drop rows where time_diff is below 0.

In [84]:
# 1. Keep only the part before the first "." so delivery_timestamp matches
#    DATE_FORMAT, the same format purchase_timestamp is parsed with.
deliveries_df["delivery_timestamp"] = deliveries_df["delivery_timestamp"].str.split('.', expand=True)[0]

# 2. Parse both timestamp columns into datetimes.
deliveries_df["purchase_timestamp"] = pd.to_datetime(deliveries_df["purchase_timestamp"], format=DATE_FORMAT)
deliveries_df["delivery_timestamp"] = pd.to_datetime(deliveries_df["delivery_timestamp"], format=DATE_FORMAT)

# 3. Delivery duration as a timedelta column.
deliveries_df["time_diff"] = deliveries_df["delivery_timestamp"] - deliveries_df["purchase_timestamp"]

# 4. A missing time_diff means one of the timestamps was missing; drop those rows.
#    The explicit copy makes the assignment in step 5 write to an independent
#    frame instead of a slice of the original (no SettingWithCopyWarning).
deliveries_df = deliveries_df.dropna(subset=["time_diff"]).copy()

# 5. Convert the timedelta to a duration in seconds (float) with the vectorised
#    accessor; unlike a per-element apply it also yields a float column when
#    the frame is empty.
deliveries_df["time_diff"] = deliveries_df["time_diff"].dt.total_seconds()

# 6. A delivery cannot precede its purchase: drop negative durations.
deliveries_df = deliveries_df[deliveries_df["time_diff"] >= 0]


# deliveries_df.info()
# deliveries_df.head()

# join deliveries with sessions

In [85]:
# keep only the purchase events, then attach them to the deliveries by purchase_id
is_purchase_event = sessions_df["event_type"] == "BUY_PRODUCT"
sessions_df = sessions_df.loc[is_purchase_event]
df = pd.merge(deliveries_df, sessions_df, how="left", on="purchase_id")

# df.info()
# df.head()

In [86]:
# sanity check: the session timestamp has to equal purchase_timestamp on every row,
# i.e. filtering on that equality must not remove anything
num_of_rows_before = len(df)
timestamps_agree = df["timestamp"].eq(df["purchase_timestamp"])
df = df.loc[timestamps_agree]
num_of_rows_after = len(df)

assert num_of_rows_after == num_of_rows_before

# timestamp duplicates purchase_timestamp, so it is redundant from here on
df = df.drop(columns=["timestamp"])

# df.info()
# df.head()

# join with other tables

In [87]:
# attach user and product attributes to every row (left joins keep all rows of df)
df = (
    df.merge(users_df, how="left", on="user_id")
    .merge(products_df, how="left", on="product_id")
)

# df.info()
# df.head()

# visualizations

## time_diff histogram

In [88]:
# fig, ax = plt.subplots(1, 2)

# def plot_hist(x, num_bins=50, func=None):
#     if func:
#         ax[x].hist(func(df["time_diff"]), bins=num_bins)
#         ax[x].set_title(f"with {func} function")
#     else:
#         ax[x].hist(df["time_diff"], bins=num_bins)
#         ax[x].set_title(f"without function")
#     ax[x].set_xlabel("time difference [seconds]")
#     ax[x].set_ylabel("# of observations")

# plot_hist(0)
# plot_hist(1, func=np.log)
# # plot_hist(1, 0, func=np.log2)
# # plot_hist(1, 1, func=np.log10)

# fig.set_size_inches([12, 6])
# plt.show()

## histograms of continuous variables

In [89]:
NUM_BINS = 50


def plot_histograms(input_df):
    """Show a 2x2 grid of histograms for the continuous columns of input_df.

    The panels are, row by row: time_diff, offered_discount, price and
    weight_kg. Each uses NUM_BINS bins (matplotlib's default binning when
    NUM_BINS is falsy).
    """
    figure, axes = plt.subplots(2, 2)

    # (row, column, DataFrame column) for each panel of the grid
    panels = (
        (0, 0, "time_diff"),
        (0, 1, "offered_discount"),
        (1, 0, "price"),
        (1, 1, "weight_kg"),
    )

    for row, col, column_name in panels:
        panel = axes[row, col]
        hist_kwargs = {"bins": NUM_BINS} if NUM_BINS else {}
        panel.hist(input_df[column_name], **hist_kwargs)
        panel.set_title(f"histogram of {column_name}")
        panel.set_xlabel(column_name)
        panel.set_ylabel("# of observations")

    figure.set_size_inches([12, 12])
    plt.show()

### with outliers

In [90]:
# plot_histograms(df)

### without outliers

In [91]:
# reject outliers: keep only rows within both PRICE_TRESHOLD and WEIGHT_TRESHOLD
price_within_limit = df["price"] <= PRICE_TRESHOLD
weight_within_limit = df["weight_kg"] <= WEIGHT_TRESHOLD
df = df[price_within_limit & weight_within_limit]

In [92]:
# plot_histograms(df)

### without prices below 0

In [93]:
# keep only rows whose price is non-negative
price_is_non_negative = df["price"].ge(0)
df = df.loc[price_is_non_negative]

In [94]:
# plot_histograms(df)

### without time_diff below 0

In [95]:
# view of df restricted to rows with a non-negative time_diff
df_without_time_diff_below_0 = df.loc[df["time_diff"].ge(0)]

In [96]:
# plot_histograms(df_without_time_diff_below_0)

## heatmap

### with time_diff below zero

In [97]:
def update_list_of_columns(input_df=None):
    """Return the column labels of a frame, minus the identifier columns.

    Args:
        input_df: DataFrame whose columns are listed. When omitted, the
            module-level ``df`` is used, so existing zero-argument calls
            behave as before.

    Returns:
        A list of the frame's column labels in their original order, without
        purchase_id, delivery_company, session_id, user_id and product_id.
    """
    banned_columns = frozenset(
        ["purchase_id", "delivery_company", "session_id", "user_id", "product_id"]
    )
    # fall back to the notebook-global frame only when no frame was passed in
    frame = df if input_df is None else input_df
    return [col for col in frame.columns.tolist() if col not in banned_columns]

columns_list = update_list_of_columns()

In [98]:
# print(df.shape)
# ax = sns.heatmap(df[columns_list].corr(), square=True, cmap='RdYlGn')

### without time_diff below zero

#### pearson

In [99]:
# print(df_without_time_diff_below_0.shape)
# ax = sns.heatmap(df_without_time_diff_below_0[columns_list].corr('pearson'), square=True, cmap='RdYlGn')

#### spearman

In [100]:
# print(df_without_time_diff_below_0.shape)
# ax = sns.heatmap(df_without_time_diff_below_0[columns_list].corr('spearman'), square=True, cmap='RdYlGn')

#### kendall

In [101]:
# print(df_without_time_diff_below_0.shape)
# ax = sns.heatmap(df_without_time_diff_below_0[columns_list].corr('kendall'), square=True, cmap='RdYlGn')

# one-hot encoding

## city

- city
- delivery_company
- product_id

In [102]:
def one_hot_encode_a_col_in_pd(df, col_name, prefix=None):
    """Replace one categorical column with its one-hot (dummy) columns.

    Args:
        df: source DataFrame; it is not modified.
        col_name: label of the column to encode.
        prefix: optional string prepended (with "_") to every dummy column
            label. Encoding several columns without a prefix can produce
            identical labels (the same value in two columns, or a value equal
            to an existing column label), which makes the join fail; a
            per-column prefix avoids that. The default (None) keeps the bare
            category values as labels.

    Returns:
        A new DataFrame without ``col_name`` and with one indicator column
        per distinct non-missing value appended on the right.

    Raises:
        KeyError: if ``col_name`` is not a column of ``df``.
    """
    one_hot = pd.get_dummies(df[col_name], prefix=prefix)
    remaining = df.drop(columns=col_name)
    # join aligns on the index, which both frames inherit from df
    return remaining.join(one_hot)

In [103]:
# columns that are not used as model features
columns_to_drop = ["delivery_timestamp", "session_id", "purchase_id", "event_type", "name", "street", "product_id"]
df = (
    df.drop(columns=columns_to_drop)
    .drop(columns="optional_attributes")  # definitely to be changed
    .drop(columns="purchase_timestamp")  # DEFINITELY TO BE CHANGED
)

# df.info()
# df.head()

In [104]:

# one-hot encode every categorical column, in this order
for categorical_column in (
    "city",
    "delivery_company",
    "product_name",
    "category_path",
    "brand",
    "user_id",
):
    df = one_hot_encode_a_col_in_pd(df, categorical_column)

# df.info()
# df.head()

In [105]:
# (rows, columns) of the frame after one-hot encoding
print(df.shape)
# the set of columns changed, so the list of non-identifier columns is recomputed
columns_list = update_list_of_columns()
# ax = sns.heatmap(df[columns_list].corr(), square=True, cmap='RdYlGn')

(3375, 431)


In [106]:
# drop every row that still contains a missing value
df = df.dropna()
print(df.shape)
# the shape printed here equals the one printed before dropna(), so no row contained NaN.
# NOTE(review): pd.get_dummies (dummy_na=False by default) encodes a missing category as an
# all-zero row rather than NaN - confirm that this is the intended handling of missing data.

(3375, 431)


In [107]:
# TO DO: version in scikit learn

# from sklearn.preprocessing import OneHotEncoder

# onehotencoder = OneHotEncoder()

# test of linear regression model

In [108]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split

# fixed seed so the train/test split is reproducible
SEED = 42

# target: delivery duration in seconds; features: every remaining column
y = df["time_diff"].to_numpy()
X = df.drop(columns="time_diff")

X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2, random_state=SEED)


# reg = LinearRegression()
reg = Ridge(alpha=0.1)
# reg = Lasso(alpha=0.1)

reg.fit(X_train, y_train)

# side-by-side view of the true values, the model predictions and a constant baseline
y_pred_df = pd.DataFrame()
y_pred_df["y_test"] = y_test
y_pred_df["prediction"] = reg.predict(X_test)
# size the baseline from the actual test split instead of a hard-coded row count,
# so the cell keeps working when the data or test_size changes
y_pred_df["mean of time_diff"] = np.full(len(y_test), df["time_diff"].mean())
print(y_pred_df.head())
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print()
y_pred_df.info()
print(y_pred_df.describe())

score = reg.score(X_test, y_test)
print(f"R^2 score = {score}")

     y_test     prediction  mean of time_diff
0  213361.0  121946.800756      174356.337185
1  292992.0  179499.003581      174356.337185
2   69756.0  145516.180887      174356.337185
3  105882.0  162486.034959      174356.337185
4   27516.0  173546.054983      174356.337185
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675 entries, 0 to 674
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   y_test             675 non-null    float64
 1   prediction         675 non-null    float64
 2   mean of time_diff  675 non-null    float64
dtypes: float64(3)
memory usage: 15.9 KB
None
              y_test     prediction  mean of time_diff
count     675.000000     675.000000       6.750000e+02
mean   174414.380741  175387.179613       1.743563e+05
std    123556.467558   44347.855851       2.912541e-11
min       286.000000   25550.029287       1.743563e+05
25%     78062.500000  147488.101855       1.743563e+05
50% 

# mutual information (współczynnik informacji wzajemnej)