# imports and loading DataFrame

In [941]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import pandas as pd
import numpy as np
import datetime


# fact table
sessions_df = pd.read_json("data/sessions.jsonl", lines=True)

# dimension tables
deliveries_df = pd.read_json("data/deliveries.jsonl", lines=True)
products_df = pd.read_json("data/products.jsonl", lines=True)
users_df = pd.read_json("data/users.jsonl", lines=True)

# constant values

In [942]:
MAKE_PLOTS = False
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
PRICE_TRESHOLD = 100_000    # for outliers
WEIGHT_TRESHOLD = 50        # for outliers
COLUMNS_TO_DROP = ["delivery_timestamp", "session_id", "purchase_id", "event_type", "name", "street", "product_id", "offered_discount"]
COLUMNS_TO_ONE_HOT = ["delivery_company", "user_id", "city", "product_name", "category_path", "brand"]
SEED = 42

# merging all data into one Datafram and other transformations

## adding a column with time difference in deliveries
adding a column with time difference between purchase_timestamp and delivery_timestamp in deliveries table

1. Cut microseconds from delivery_timestamp, so it will be the same format as purchase_timestamp, because there are no microseconds in purchase_timestamp (using "." as a separator).
2. Change columns format to datetime
3. Add time_diff column (as timedelta64 object).
4. Drop rows where time_diff is null (which means that delivery_timestamp was null).
5. Change type of time_diff from timedelta64 to seconds in float.
6. Drop rows where time_diff is below 0. THIS STEP IS MADE IN ### without time_diff below 0

In [943]:
# 1.
deliveries_df["delivery_timestamp"] = deliveries_df["delivery_timestamp"].str.split('.', expand=True)[0]

# 2.
deliveries_df["purchase_timestamp"] = pd.to_datetime(deliveries_df["purchase_timestamp"], format=DATE_FORMAT)
deliveries_df["delivery_timestamp"] = pd.to_datetime(deliveries_df["delivery_timestamp"], format=DATE_FORMAT)

# 3.
deliveries_df["time_diff"] = deliveries_df["delivery_timestamp"] - deliveries_df["purchase_timestamp"]

# 4.
deliveries_df = deliveries_df[deliveries_df["time_diff"].notna()]

# 5.
# time diff as duration in seconds
deliveries_df["time_diff"] = deliveries_df["time_diff"].apply(datetime.timedelta.total_seconds)

# 6.
# deliveries_df = deliveries_df[deliveries_df["time_diff"] >= 0]

## join deliveries with sessions

In [944]:
# drop rows where event_type is not equal "BUY_PRODUCT"
sessions_df = sessions_df[sessions_df["event_type"] == "BUY_PRODUCT"]
df = deliveries_df.merge(sessions_df, on="purchase_id", how="left")

In [945]:
# making sure, that timestamp == purchase_timestamp
num_of_rows_before = df.shape[0]
df = df[df["timestamp"] == df["purchase_timestamp"]]
num_of_rows_after = df.shape[0]

assert(num_of_rows_before == num_of_rows_after)

# now we can drop timestamp column, as it is redundant
df = df.drop(columns="timestamp")

## join with other tables

In [946]:
df = df.merge(users_df, on="user_id", how="left")
df = df.merge(products_df, on="product_id", how="left")

# missing data analysis - MCAR, MAR, MNAR

made without outliers but with prices below zero (on copy of df)

In [947]:
missing_data_df = df.copy(deep=False)
missing_data_df["delivery_company_is_missing"] = missing_data_df["delivery_company"].isna()
missing_data_df["user_id_is_missing"] = missing_data_df["user_id"].isna()
missing_data_df["product_id_is_missing"] = missing_data_df["product_id"].isna()

In [948]:
# rejecting outliers for given PRICE_TRESHOLD
missing_data_df = missing_data_df[missing_data_df["price"] <= PRICE_TRESHOLD]

# rejecting outliers for given WEIGHT_TRESHOLD
missing_data_df = missing_data_df[missing_data_df["weight_kg"] <= WEIGHT_TRESHOLD]

In [949]:
NUM_BINS_MISSING = 50

def compare_histograms_for_missing(input_df1, input_df2, end_of_title1="", end_of_title2=""):
    fig, ax = plt.subplots(4, 2)
    
    def plot_histograms_missing(input_df, plot_column, end_of_title=""):

        def plot_hist_missing(x, y, col_name, num_bins=None):
            if num_bins:
                ax[x, y].hist(input_df[col_name], bins=num_bins)
            else:
                ax[x, y].hist(input_df[col_name])
            ax[x, y].set_title(f"histogram of {col_name}" + end_of_title)
            ax[x, y].set_xlabel(col_name)
            ax[x, y].set_ylabel("# of observations")

        plot_hist_missing(0, plot_column, "time_diff", NUM_BINS_MISSING)
        plot_hist_missing(1, plot_column, "offered_discount", NUM_BINS_MISSING)
        plot_hist_missing(2, plot_column, "price", NUM_BINS_MISSING)
        plot_hist_missing(3, plot_column, "weight_kg", NUM_BINS_MISSING)

    plot_histograms_missing(input_df1, 0, end_of_title1)
    plot_histograms_missing(input_df2, 1, end_of_title2)

    fig.set_size_inches([24, 21])
    plt.show()

## delivery_company missing

In [950]:
no_missing_delivery_company = missing_data_df[missing_data_df["delivery_company_is_missing"] == False]
missing_delivery_company = missing_data_df[missing_data_df["delivery_company_is_missing"] == True]

In [951]:
if MAKE_PLOTS:
    compare_histograms_for_missing(no_missing_delivery_company, missing_delivery_company, " without missing data for delivery_company", " with missing delivery_company")

## user_id missing

In [952]:
no_missing_user_id = missing_data_df[missing_data_df["user_id_is_missing"] == False]
missing_user_id = missing_data_df[missing_data_df["user_id_is_missing"] == True]

In [953]:
if MAKE_PLOTS:
    compare_histograms_for_missing(no_missing_user_id, missing_user_id, " without missing data for user_id", " with missing user_id")

## product_id missing
this analysis doesn't make sense

In [954]:
no_missing_product_id = missing_data_df[missing_data_df["product_id_is_missing"] == False]
missing_product_id = missing_data_df[missing_data_df["product_id_is_missing"] == True]

In [955]:
# if MAKE_PLOTS:
#     compare_histograms_for_missing(no_missing_product_id, missing_product_id, " without missing data for product_id", " with missing product_id")

# visualizations

## time_diff histogram and log-normal distribution test

In [956]:
if MAKE_PLOTS:
    fig, ax = plt.subplots(1, 2)

    def plot_hist(x, num_bins=50, func=None):
        if func:
            ax[x].hist(func(df["time_diff"]), bins=num_bins)
            ax[x].set_title(f"with {func} function")
        else:
            ax[x].hist(df["time_diff"], bins=num_bins)
            ax[x].set_title(f"without function")
        ax[x].set_xlabel("time difference [seconds]")
        ax[x].set_ylabel("# of observations")

    plot_hist(0)
    plot_hist(1, func=np.log)
    # plot_hist(1, 0, func=np.log2)
    # plot_hist(1, 1, func=np.log10)

    fig.set_size_inches([12, 6])
    plt.show()

## histograms of continuous variables

In [957]:
NUM_BINS = 50


def plot_histograms(input_df):
    fig, ax = plt.subplots(2, 2)

    def plot_hist(x, y, col_name, num_bins=None):
        if num_bins:
            ax[x, y].hist(input_df[col_name], bins=num_bins)
        else:
            ax[x, y].hist(input_df[col_name])
        ax[x, y].set_title(f"histogram of {col_name}")
        ax[x, y].set_xlabel(col_name)
        ax[x, y].set_ylabel("# of observations")

    plot_hist(0, 0, "time_diff", NUM_BINS)
    plot_hist(0, 1, "offered_discount", NUM_BINS)
    plot_hist(1, 0, "price", NUM_BINS)
    plot_hist(1, 1, "weight_kg", NUM_BINS)

    fig.set_size_inches([12, 12])
    plt.show()

### with outliers

In [958]:
if MAKE_PLOTS:
    plot_histograms(df)

### without outliers

In [959]:
# rejecting outliers for given PRICE_TRESHOLD
df = df[df["price"] <= PRICE_TRESHOLD]

# rejecting outliers for given WEIGHT_TRESHOLD
df = df[df["weight_kg"] <= WEIGHT_TRESHOLD]

In [960]:
if MAKE_PLOTS:
    plot_histograms(df)

### without prices below 0

In [961]:
# deleting rows with prices below 0
df = df[df["price"] >= 0]

In [962]:
if MAKE_PLOTS:
    plot_histograms(df)

### without time_diff below 0

In [963]:
df_with_time_diff_below_0 = df
df = df[df["time_diff"] >= 0]

In [964]:
if MAKE_PLOTS:
    plot_histograms(df)

## heatmap

### with time_diff below zero

In [965]:
def update_list_of_columns():
    banned_list_of_columns = ["purchase_id", "delivery_company", "session_id", "user_id", "product_id"]
    columns_list = [col for col in df.columns.values.tolist() if col not in banned_list_of_columns]
    return columns_list

columns_list = update_list_of_columns()

In [966]:
if MAKE_PLOTS:
    print(df_with_time_diff_below_0.shape)
    ax = sns.heatmap(df_with_time_diff_below_0[columns_list].corr(), square=True, cmap='RdYlGn')

### without time_diff below zero

#### pearson

In [967]:
if MAKE_PLOTS:
    print(df.shape)
    ax = sns.heatmap(df[columns_list].corr('pearson'), square=True, cmap='RdYlGn')

#### spearman

In [968]:
if MAKE_PLOTS:
    print(df.shape)
    ax = sns.heatmap(df[columns_list].corr('spearman'), square=True, cmap='RdYlGn')

#### kendall

In [969]:
# if MAKE_PLOTS:
#     print(df.shape)
#     ax = sns.heatmap(df[columns_list].corr('kendall'), square=True, cmap='RdYlGn')

# dropping columns (choosing attributes)

In [970]:
# drop columns
df = df.drop(columns=COLUMNS_TO_DROP)
df = df.drop(columns="optional_attributes") # chyba do zmiany - wysokosc itp.
df = df.drop(columns="purchase_timestamp") # na pewno do zmiany

In [971]:
df.columns

Index(['delivery_company', 'time_diff', 'user_id', 'city', 'product_name',
       'category_path', 'price', 'brand', 'weight_kg'],
      dtype='object')

# one-hot encoding

In [972]:
# df.to_excel("data_before_one_hot_encoding.xlsx")

In [973]:
def one_hot_encode_a_col_in_pd(df, col_name):
    one_hot = pd.get_dummies(df[col_name])
    df = df.drop(columns=col_name)
    df = df.join(one_hot)
    return df

In [974]:
for col_name in COLUMNS_TO_ONE_HOT:
    df = one_hot_encode_a_col_in_pd(df, col_name)

### testy

In [975]:
# # test only for given attributes
# attributes_names = ["city", "street"]
# df = df[["time_diff", *attributes_names]]
# for name in attributes_names:  
#     df = one_hot_encode_a_col_in_pd(df, name)

In [976]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# SEED = 42

# y = df["time_diff"].to_numpy()
# X = df.drop(columns="time_diff")

# # standardize features
# scaler = StandardScaler()
# X_std = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=SEED)

# model = RandomForestRegressor(random_state=SEED)

# model.fit(X_train, y_train)
# y_pred_df = pd.DataFrame()
# y_pred_df["y_test"] = y_test
# y_pred_df["prediction"] = model.predict(X_test)
# y_pred_df["mean of time_diff"] = np.full(675, df["time_diff"].mean())
# print(y_pred_df.head())
# print(y_pred_df.info())
# print(y_pred_df.describe())

# score = model.score(X_test, y_test)
# print(f"model score = {score}")

In [977]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

# SEED = 42

# y = df["time_diff"].to_numpy()
# X = df.drop(columns="time_diff")

# # standardize features
# scaler = StandardScaler()
# X_std = scaler.fit_transform(X)

# X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=SEED)

# model = DecisionTreeRegressor(random_state=SEED)

# model.fit(X_train, y_train)
# y_pred_df = pd.DataFrame()
# y_pred_df["y_test"] = y_test
# y_pred_df["prediction"] = model.predict(X_test)
# y_pred_df["mean of time_diff"] = np.full(675, df["time_diff"].mean())
# print(y_pred_df.head())
# print(y_pred_df.info())
# print(y_pred_df.describe())

# score = model.score(X_test, y_test)
# print(f"model score = {score}")

In [978]:
# from sklearn.linear_model import LinearRegression, Ridge, Lasso
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import RidgeCV

# SEED = 42

# y = df["time_diff"].to_numpy()
# X = df.drop(columns="time_diff")

# # standardize features
# scaler = StandardScaler()
# X_std = scaler.fit_transform(X)

# # find best alpha value
# alphas_list_to_try = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
# reg_cv = RidgeCV(alphas=alphas_list_to_try)
# model_cv = reg_cv.fit(X_std, y)
# best_found_alpha = model_cv.alpha_

# X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=SEED)


# # reg = LinearRegression()
# reg = Ridge(alpha=best_found_alpha)
# # reg = Lasso(alpha=0.1)

# reg.fit(X_train, y_train)
# y_pred_df = pd.DataFrame()
# y_pred_df["y_test"] = y_test
# y_pred_df["prediction"] = reg.predict(X_test)
# y_pred_df["mean of time_diff"] = np.full(675, df["time_diff"].mean())
# print(y_pred_df.head())
# print(y_pred_df.info())
# print(y_pred_df.describe())

# score = reg.score(X_test, y_test)
# print(f"R^2 score = {score}")

### kontynuacja

#### checking df shape

In [979]:
print(df.shape)
columns_list = update_list_of_columns()
# ax = sns.heatmap(df[columns_list].corr(), square=True, cmap='RdYlGn')

(3375, 430)


In [980]:
df = df.dropna()
print(df.shape)
# one-hot encoding took care of missing data, so shape has not changed

(3375, 430)


# test of linear regression models

In [981]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def split_data_and_standardize(df, target_column="time_diff"):
    y = df["time_diff"].to_numpy()
    X = df.drop(columns="time_diff")
    # standardize features
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)
    return train_test_split(X_std, y, test_size=0.2, random_state=SEED)

In [982]:
def train_models(models_list):
    for model in models_list:
        model.fit(X_train, y_train)
    return models_list

In [983]:
def create_df_with_predictions(models_list, y_test):
    y_pred_df = pd.DataFrame()
    y_pred_df["y_test"] = y_test
    for model in models_list:
        y_pred_df[f"{type(model).__name__} prediction"] = model.predict(X_test)
    return y_pred_df

In [984]:
def display_predictions(y_pred_df):
    display(y_pred_df.head())
    display(y_pred_df.info())
    display(y_pred_df.describe())

In [985]:
def print_scores(models_list):
    for model in models_list:
        score = model.score(X_test, y_test)
        print(f"{type(model).__name__} score = {score}")

In [986]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


X_train, X_test, y_train, y_test = split_data_and_standardize(df)

models_list = [Ridge(alpha=0.1),
                Lasso(alpha=0.1),
                DecisionTreeRegressor(random_state=SEED)]
models_list = train_models(models_list)

y_pred_df = create_df_with_predictions(models_list, y_test)
display_predictions(y_pred_df)

print_scores(models_list)


  model = cd_fast.enet_coordinate_descent(


Unnamed: 0,y_test,Ridge prediction,Lasso prediction,DecisionTreeRegressor prediction
0,213361.0,123079.698983,123081.152876,206795.0
1,292992.0,178863.041549,178876.57102,355226.0
2,69756.0,146516.000928,146521.78645,61822.0
3,105882.0,165487.852799,165497.798867,70518.0
4,27516.0,174459.151645,174460.668145,143221.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 675 entries, 0 to 674
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   y_test                            675 non-null    float64
 1   Ridge prediction                  675 non-null    float64
 2   Lasso prediction                  675 non-null    float64
 3   DecisionTreeRegressor prediction  675 non-null    float64
dtypes: float64(4)
memory usage: 21.2 KB


None

Unnamed: 0,y_test,Ridge prediction,Lasso prediction,DecisionTreeRegressor prediction
count,675.0,675.0,675.0,675.0
mean,174414.380741,175450.003683,175447.625746,171114.642222
std,123556.467558,45037.288072,45038.267265,118266.522832
min,286.0,24484.452485,24497.135111,222.0
25%,78062.5,148008.45777,148014.215556,75923.0
50%,154073.0,171755.537835,171749.94825,154090.0
75%,246080.0,198597.653873,198600.770873,240018.75
max,818364.0,389730.744819,389728.139185,598516.0


Ridge score = -0.15995213529471752
Lasso score = -0.16004478311517234
DecisionTreeRegressor score = -0.925148966035179
