In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import my_funcs as mf

# With these 2 lines, changes made to my_funcs propagate here without re-importing it
%load_ext autoreload
%autoreload 2
# Custom plot layout: white figure background
plt.rcParams["figure.facecolor"] = "white"
# To use black (auto formatter on the notebook): /opt/anaconda3/envs/plus2/bin/pip install nb_black
# NOTE(review): the pip path above is machine-specific; adapt it to your own environment.
%load_ext lab_black

### Load the dataset

In [None]:
# Paths
# Input files read by the loading cell below.
cliks_path = "data_yoochoose/yoochoose-clicks.dat"
buys_path = "data_yoochoose/yoochoose-buys.dat"
# Output files written by the "Save final files" cell.
cliks_path_afterExploration = "data_yoochoose/yoochoose-clicks_aftExp.dat"
buys_path_afterExploration = "data_yoochoose/yoochoose-buys_aftExp.dat"
# Forwarded as `limit=` to mf.load_file; presumably caps the number of rows
# read (None = no cap) -- TODO confirm in my_funcs.
limit = None
# Folder receiving the saved figures. Later cells build paths as
# fig_folder + "/<name>.png", so the trailing slash here yields "figs//<name>.png".
fig_folder = "figs/"

In [None]:
%%time
# strptime pattern used to parse the raw timestamp column of both input files.
timestamp_format = "%Y-%m-%dT%H:%M:%S.%fZ"


def parse_utc_timestamps(raw_values):
    """Parse raw timestamp strings into timezone-aware (UTC) datetimes."""
    return pd.to_datetime(raw_values, utc=True, format=timestamp_format)


# Keyword arguments shared by the two mf.load_file calls below.
# Column 1 (the raw timestamp) is parsed into the "DateTime" column that the
# rest of the notebook uses (assuming mf.load_file forwards these options to
# pandas.read_csv -- TODO confirm in my_funcs).
shared_load_options = {
    "limit": limit,
    "to_be_sorted": False,
    "index_col": None,
    "header": None,
    "parse_dates": {"DateTime": [1]},
    "date_parser": parse_utc_timestamps,
}

# Clicks: the "S" category marker is replaced by -1, every other raw value is
# passed through unchanged (it is cast to an integer type later on).
df_clicks = mf.load_file(
    cliks_path,
    names=["SessionID", "Timestamp", "ItemID", "Category"],
    dtype={"SessionID": np.int32, "ItemID": np.int32},
    converters={"Category": lambda c: -1 if c == "S" else c},
    **shared_load_options,
)

# Buys: all numeric columns are given compact integer dtypes at load time.
df_buys = mf.load_file(
    buys_path,
    names=["SessionID", "Timestamp", "ItemID", "Price", "Quantity"],
    dtype={
        "SessionID": np.int32,
        "ItemID": np.int32,
        "Price": np.int32,
        "Quantity": np.int16,
    },
    **shared_load_options,
)

### Sanity checks

In [None]:
# Run the project's sanity checks on the clicks frame.
# NOTE(review): n=5 is presumably the number of rows previewed -- confirm in my_funcs.
mf.sanity_checks(df_clicks, n=5)

In [None]:
# Run the project's sanity checks on the buys frame.
# NOTE(review): n=5 is presumably the number of rows previewed -- confirm in my_funcs.
mf.sanity_checks(df_buys, n=5)

### Fix: remapping to lower values

In [None]:
# Remapping ItemID: Clicks and Buys share ONE mapping.
# The original ItemIDs are replaced by dense indices 0, 1, 2, ... assigned in
# order of first appearance. Both frames must use the same ItemID -> index
# table: building one table per frame would give the same item a different
# index in clicks and in buys, breaking any later match between the two.
#
# The clicks rows are placed first in the concatenation, so every item seen in
# the clicks gets the index it would get from enumerating
# df_clicks["ItemID"].unique(); items that appear only in the buys are
# numbered after them.
n_click_rows = len(df_clicks)
item_codes, _ = pd.factorize(
    pd.concat([df_clicks["ItemID"], df_buys["ItemID"]], ignore_index=True)
)  # Vectorised: faster than building a dict and calling 'map' / 'replace'
# The codes are positional, so slice them back in the concatenation order.
df_clicks["ItemID"] = item_codes[:n_click_rows]
df_buys["ItemID"] = item_codes[n_click_rows:]

In [None]:
# Remapping Category
# A few Category entries are large (up to 10-digit) numbers (special brand offers).
# To store the column as np.int16, those IDs are remapped to the next available
# small values (13, 14, ...), in order of first appearance. Values <= 12
# (including the -1 that replaced "S" at load time) are kept as they are.
#
# The intermediate cast uses int64: a 10-digit ID can exceed the int32 range.
category_raw = df_clicks["Category"].astype(np.int64)
large_categories = category_raw[category_raw > 12].unique()
dict_replace = {raw_id: 13 + offset for offset, raw_id in enumerate(large_categories)}
category_small = category_raw.map(
    lambda x: dict_replace.get(x, x)
)  # Because not all values are in the dict

# astype(np.int16) wraps silently on overflow, so fail loudly instead.
int16_max = np.iinfo(np.int16).max
if category_small.max() > int16_max:
    raise ValueError(
        f"Too many distinct large categories ({len(dict_replace)}) to fit in int16"
    )
df_clicks["Category"] = category_small.astype(np.int16)

### CLICKS plots

In [None]:
# Histogram columns that share the log-y / show settings -> number of bins.
clicks_hist_bins = {"ItemID": 20, "Category": 100}

# Plot specification per column, handed to mf.create_plots as `params`.
clicks_plot_params = {
    "DateTime": {
        "plot": "date_bar_YM",
        "y": 1,
        "xlabel": "Date",
        "ylabel": "Counts",
        "logy": True,
        "save": True,
        "name": "0clicks_b_DateTime.png",
    },
    "SessionID": {
        "plot": "hist",
        "xlabel": "SessionID",
        "ylabel": "counts",
        "save": True,
        "name": "0clicks_h_SessionID.png",
    },
    # Remaining histograms differ only by column name, bin count and file name.
    **{
        column: {
            "plot": "hist",
            "xlabel": column,
            "ylabel": "counts",
            "logy": True,
            "show": True,
            "bins": n_bins,
            "save": True,
            "name": f"0clicks_h_{column}.png",
        }
        for column, n_bins in clicks_hist_bins.items()
    },
}

mf.create_plots(
    df_clicks,
    out_folder=fig_folder,
    col_to_exclude=[],
    params=clicks_plot_params,
)

### BUYS plots

In [None]:
# Histogram columns that share the log-y / show settings -> number of bins.
buys_hist_bins = {"ItemID": 20, "Price": 50, "Quantity": 25}

# Plot specification per column, handed to mf.create_plots as `params`.
buys_plot_params = {
    "DateTime": {
        "plot": "date_bar_YM",
        "y": 1,
        "xlabel": "Date",
        "ylabel": "Counts",
        "logy": True,
        "save": True,
        "name": "0buy_b_DateTime.png",
    },
    "SessionID": {
        "plot": "hist",
        "xlabel": "SessionID",
        "ylabel": "counts",
        "save": True,
        "name": "0buy_h_SessionID.png",
    },
    # Remaining histograms differ only by column name, bin count and file name.
    **{
        column: {
            "plot": "hist",
            "xlabel": column,
            "ylabel": "counts",
            "logy": True,
            "show": True,
            "bins": n_bins,
            "save": True,
            "name": f"0buy_h_{column}.png",
        }
        for column, n_bins in buys_hist_bins.items()
    },
}

mf.create_plots(
    df_buys,
    out_folder=fig_folder,
    col_to_exclude=[],
    params=buys_plot_params,
)

In [None]:
# Per Day stats
# One row per calendar day (index levels: year, month, day) with:
#   n_sessions   - number of DISTINCT sessions with at least one buy that day.
#                  ("nunique", not "count": a session has one row per purchased
#                  item, so counting rows would count purchases, not sessions.)
#   Tot_price    - sum of the Price column (not weighted by Quantity).
#   Tot_quantity - sum of the Quantity column.
# The group keys are renamed so the index levels have distinct names instead
# of three levels all called "DateTime".
df_tmp = df_buys.groupby(
    [
        df_buys["DateTime"].dt.year.rename("year"),
        df_buys["DateTime"].dt.month.rename("month"),
        df_buys["DateTime"].dt.day.rename("day"),
    ]
).agg(
    n_sessions=("SessionID", "nunique"),
    Tot_price=("Price", "sum"),
    Tot_quantity=("Quantity", "sum"),
)

In [None]:
# One bar chart per daily statistic: (column, y-axis label, output file suffix).
daily_charts = [
    ("n_sessions", "Nsessions", "/0buy_b_Nsessions.png"),
    ("Tot_price", "TotPrice", "/0buy_b_Totprice.png"),
    ("Tot_quantity", "TotQuantity", "/0buy_b_Totquantity.png"),
]
for stat_column, y_label, file_suffix in daily_charts:
    fig, ax = plt.subplots(figsize=(12, 4))
    df_tmp.plot(kind="bar", y=stat_column, ax=ax)
    ax.set(xlabel="Date", ylabel=y_label)
    # Shrink the x tick labels on THIS axes only. Setting
    # plt.rcParams["xtick.labelsize"] after the figure exists would not affect
    # it and would silently change every figure created afterwards.
    ax.tick_params(axis="x", labelsize=6)
    fig.savefig(fig_folder + file_suffix, bbox_inches="tight", dpi=300)
    # Close (not just clear) the figure so it is released and no empty figure
    # is rendered under the cell.
    plt.close(fig)

In [None]:
# Per Month stats
# One row per calendar month (index levels: year, month) with:
#   n_sessions   - number of DISTINCT sessions with at least one buy that month.
#                  ("nunique", not "count": a session has one row per purchased
#                  item, so counting rows would count purchases, not sessions.)
#   Tot_price    - sum of the Price column (not weighted by Quantity).
#   Tot_quantity - sum of the Quantity column.
# The group keys are renamed so the index levels have distinct names instead
# of two levels both called "DateTime".
df_tmp2 = df_buys.groupby(
    [
        df_buys["DateTime"].dt.year.rename("year"),
        df_buys["DateTime"].dt.month.rename("month"),
    ]
).agg(
    n_sessions=("SessionID", "nunique"),
    Tot_price=("Price", "sum"),
    Tot_quantity=("Quantity", "sum"),
)

In [None]:
# One bar chart per monthly statistic: (column, y-axis label, output file suffix).
monthly_charts = [
    ("n_sessions", "Nsessions", "/0buy_b_Nsessions_month.png"),
    ("Tot_price", "TotPrice", "/0buy_b_Totprice_month.png"),
    ("Tot_quantity", "TotQuantity", "/0buy_b_Totquantity_month.png"),
]
for stat_column, y_label, file_suffix in monthly_charts:
    fig, ax = plt.subplots(figsize=(12, 4))
    df_tmp2.plot(kind="bar", y=stat_column, ax=ax)
    ax.set(xlabel="Month", ylabel=y_label)
    # Shrink the x tick labels on THIS axes only. Setting
    # plt.rcParams["xtick.labelsize"] after the figure exists would not affect
    # it and would silently change every figure created afterwards.
    ax.tick_params(axis="x", labelsize=6)
    fig.savefig(fig_folder + file_suffix, bbox_inches="tight", dpi=300)
    # Close (not just clear) the figure so it is released and no empty figure
    # is rendered under the cell.
    plt.close(fig)

In [None]:
# Per Day of week
# One row per weekday (index: 0..6, Monday=0, Sunday=6) with:
#   n_sessions   - number of DISTINCT sessions with at least one buy on that weekday.
#                  ("nunique", not "count": a session has one row per purchased
#                  item, so counting rows would count purchases, not sessions.)
#   Tot_price    - sum of the Price column (not weighted by Quantity).
#   Tot_quantity - sum of the Quantity column.
df_tmp3 = df_buys.groupby(
    df_buys["DateTime"].dt.dayofweek.rename("dayofweek")  #  Monday=0, Sunday=6.
).agg(
    n_sessions=("SessionID", "nunique"),
    Tot_price=("Price", "sum"),
    Tot_quantity=("Quantity", "sum"),
)

In [None]:
# Weekday names indexed by pandas' dayofweek code (Monday=0 ... Sunday=6).
l_name_day = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
# Look the labels up from the index instead of assigning the 7 names
# positionally: if a weekday were missing from the data, positional labels
# would be shifted onto the wrong bars.
weekday_labels = [l_name_day[day_code] for day_code in df_tmp3.index]

# One bar chart per weekday statistic: (column, y-axis label, output file suffix).
weekday_charts = [
    ("n_sessions", "Nsessions", "/0buy_b_Nsessions_dayOFweek.png"),
    ("Tot_price", "TotPrice", "/0buy_b_Totprice_dayOFweek.png"),
    ("Tot_quantity", "TotQuantity", "/0buy_b_Totquantity_dayOFweek.png"),
]
for stat_column, y_label, file_suffix in weekday_charts:
    fig, ax = plt.subplots(figsize=(12, 4))
    df_tmp3.plot(kind="bar", y=stat_column, ax=ax)
    ax.set(xticklabels=weekday_labels, xlabel="Day", ylabel=y_label)
    # Shrink the x tick labels on THIS axes only. Setting
    # plt.rcParams["xtick.labelsize"] after the figure exists would not affect
    # it and would silently change every figure created afterwards.
    ax.tick_params(axis="x", labelsize=6)
    fig.savefig(fig_folder + file_suffix, bbox_inches="tight", dpi=300)
    # Close (not just clear) the figure so it is released and no empty figure
    # is rendered under the cell.
    plt.close(fig)

In [None]:
# Per Hour
# One row per hour of the day (hour taken from the UTC-parsed DateTime column) with:
#   n_sessions   - number of DISTINCT sessions with at least one buy in that hour.
#                  ("nunique", not "count": a session has one row per purchased
#                  item, so counting rows would count purchases, not sessions.)
#   Tot_price    - sum of the Price column (not weighted by Quantity).
#   Tot_quantity - sum of the Quantity column.
df_tmp4 = df_buys.groupby(df_buys["DateTime"].dt.hour.rename("hour")).agg(
    n_sessions=("SessionID", "nunique"),
    Tot_price=("Price", "sum"),
    Tot_quantity=("Quantity", "sum"),
)

In [None]:
# One bar chart per hourly statistic: (column, y-axis label, output file suffix).
hourly_charts = [
    ("n_sessions", "Nsessions", "/0buy_b_Nsessions_hour.png"),
    ("Tot_price", "TotPrice", "/0buy_b_Totprice_hour.png"),
    ("Tot_quantity", "TotQuantity", "/0buy_b_Totquantity_hour.png"),
]
for stat_column, y_label, file_suffix in hourly_charts:
    fig, ax = plt.subplots(figsize=(12, 4))
    df_tmp4.plot(kind="bar", y=stat_column, ax=ax)
    ax.set(xlabel="Hour", ylabel=y_label)
    # Shrink the x tick labels on THIS axes only. Setting
    # plt.rcParams["xtick.labelsize"] after the figure exists would not affect
    # it and would silently change every figure created afterwards.
    ax.tick_params(axis="x", labelsize=6)
    fig.savefig(fig_folder + file_suffix, bbox_inches="tight", dpi=300)
    # Close (not just clear) the figure so it is released and no empty figure
    # is rendered under the cell.
    plt.close(fig)

### Save final files

In [None]:
# Write both processed frames to their "after exploration" CSV files.
# to_csv is called with its defaults, so the row index is written as the
# first column of each file.
for frame, out_path in (
    (df_clicks, cliks_path_afterExploration),
    (df_buys, buys_path_afterExploration),
):
    frame.to_csv(out_path)

### Considerations
* Some BUYS entries have a quantity equal to zero. What do they represent?
  * When the quantity is 0, the price is also zero.
* There is a large gap in the BUYS dataset: for three months nothing was sold.

In [None]:
# Distinct Quantity values found among the buys whose Price is 0.
df_buys.loc[df_buys["Price"] == 0, "Quantity"].unique()