In [None]:
import datetime
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import my_funcs as mf
import my_nn as nn

# With these 2 lines you can modify my_funcs and the changes propagate here
%load_ext autoreload
%autoreload 2
# Custom plot layout
plt.rcParams["figure.facecolor"] = "white"
# To use black (auto formatter on the notebook): /opt/anaconda3/envs/plus2/bin/pip install nb_black
%load_ext lab_black

In [None]:
# Paths
# Input table handed to mf.load_file in the next cell (relative path).
final_dataset = "data_yoochoose/final_df.dat"
# Parameters
# Forwarded as mf.load_file(limit=...); presumably caps how much of the file is
# read, with None meaning "no cap" — confirm in my_funcs.load_file.
limit = None
# Folder that receives the per-feature histogram PNGs saved further down.
fig_folder = "figs/"

In [None]:
# Load the final dataset through the project helper.
# NOTE(review): index_col=0 / header=0 look like pandas.read_csv-style options
# (first column as index, first row as header) and to_be_sorted=False presumably
# skips a sort step — confirm in my_funcs.load_file.
df_final = mf.load_file(
    final_dataset, limit=limit, to_be_sorted=False, index_col=0, header=0
)

### Safety checks

In [None]:
# Class balance of the target: number of rows per distinct `is_buy` value.
df_final["is_buy"].value_counts()

In [None]:
# Run the project's sanity checks on the loaded frame.
# NOTE(review): what is verified (and whether it raises or only reports) is
# defined in my_funcs.sanity_checks and is not visible from this notebook.
mf.sanity_checks(df_final)

In [None]:
# Pairwise correlation between the columns of df_final (pandas default method,
# i.e. Pearson); the result is drawn as a heat map in the next cell.
df_corr = df_final.corr()

In [None]:
# Heat map of the correlation matrix with the coefficient printed in every cell.
# Tick positions and labels are derived from df_corr itself (not df_final), so
# they always line up with the plotted matrix even when corr() returns fewer
# columns than the input frame has.
n_rows, n_cols = df_corr.shape

fig, ax = plt.subplots(figsize=(20, 10))
im = ax.matshow(df_corr)

ax.set_xticks(np.arange(n_cols))
ax.set_yticks(np.arange(n_rows))
ax.set_xticklabels(df_corr.columns)
ax.set_yticklabels(df_corr.index)

# Rotate the x tick labels so long column names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.colorbar(im)
# Annotate every cell with its coefficient rounded to 2 decimals.
# matshow puts matrix row i on the y axis and column j on the x axis,
# hence the (j, i) order of the text coordinates.
for i in range(n_rows):
    for j in range(n_cols):
        ax.text(
            j, i, round(df_corr.iloc[i, j], 2), ha="center", va="center", color="w"
        )

ax.set_title("Correlation matrix")
fig.tight_layout()
plt.show()

### Prepare data

### Preparation for training

In [None]:
# Single source of truth for the features: each key is a feature column and its
# value is the x-range used for that feature's histogram (None is passed through
# to plt.hist as-is). Insertion order defines the order of feature_columns.
hist_ranges = {
    "total_clicks": (0, 200),
    "total_items": (0, 200),
    "total_cats": (0, 60),
    "max_dwell": (0, 60),
    "mean_dwell": (0, 60),
    "total_duration": (0, 1200),
    "click_rate": (0, 35),
    "cat_most_viewed_n_times": (0, 350),
    "cat_most_viewed": (0, 200),
    "item_most_viewed_n_times": None,
    "item_most_viewed": (0, 200),
    "cat_views_freqs": (0, 200),
    "item_views_freqs": None,
    "item_buys_freqs": None,
}

# Columns used both for the histograms and as model inputs.
feature_columns = list(hist_ranges)

# Histogram settings per feature; every feature uses 20 bins.
d_val = {name: {"range": span, "bins": 20} for name, span in hist_ranges.items()}

In [None]:
# For every feature, overlay the normalised distributions of the two classes
# (is_buy == 0 vs is_buy == 1) on a log y-axis and save the figure as a PNG.

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(fig_folder, exist_ok=True)

for col in feature_columns:
    n_uniq = len(df_final[col].unique())
    print(f"{col} has {n_uniq} unique values.")

    # Settings shared by both classes; range/bins come from the d_val table.
    hist_kwargs = {
        "alpha": 0.3,
        "density": True,
        "range": d_val[col]["range"],
        "bins": d_val[col]["bins"],
    }

    # One dedicated figure per feature, closed after saving, so no figure
    # accumulates or is left open once the cell finishes.
    hist_fig, hist_ax = plt.subplots()
    for class_value, class_label in ((0, "not_buy"), (1, "buy")):
        hist_ax.hist(
            df_final.loc[df_final["is_buy"] == class_value, col],
            label=class_label,
            **hist_kwargs,
        )
    hist_ax.set_xlabel(col)
    hist_ax.set_yscale("log")
    hist_ax.legend(loc="best")

    # os.path.join avoids the doubled separator that plain concatenation
    # produced when fig_folder already ends with "/".
    hist_fig.savefig(
        os.path.join(fig_folder, "2fin_h_" + col + ".png"),
        bbox_inches="tight",
        dpi=300,
    )
    plt.close(hist_fig)

In [None]:
#### POSSIBLE IMPROVEMENTS
### Cap vars above 5 sigma
### item_most_viewed_n_times and cat_most_viewed_n_times should be categorical
### Remove the periods of time where there is no buy? (look at step0)
### Add a DateTime start period. A float for period of the year, another for period of the month, another for the day of the week, and the hour.

In [None]:
# Helper object from my_nn that drives the rest of the notebook: dataset loading,
# model building, training, saving, prediction and metrics all go through it.
# NOTE(review): the meaning of the "model" argument is not visible here —
# presumably a name/prefix for the helper's artefacts; confirm in my_nn.MlpHelper.
alf = nn.MlpHelper("model")

In [None]:
# Target column to predict.
label = "is_buy"
# Hand the frame, the input columns and the target column to the helper.
# NOTE(review): scale_dataset=True presumably makes the helper rescale the
# features before training — the exact scaler is defined in my_nn; confirm.
alf.load_dataset(
    df=df_final, feature_columns=feature_columns, label=label, scale_dataset=True
)

In [None]:
# Model definition, gathered in one place so the settings are easy to scan.
# NOTE(review): `layers` is presumably the list of hidden-layer widths and
# l1 / l2 presumably regularisation strengths (0.0 = off) — confirm in
# my_nn.MlpHelper.build_model.
layers = [20, 20, 15]
model_config = {
    "layers": layers,
    # One input per feature column.
    "input_dim": len(feature_columns),
    "activation": "relu",
    "lr": 0.01,
    "dropout_perc": 0.5,
    "model_type": "mlp",
    "l1": 0.0,
    "l2": 0.0,
}
alf.build_model(**model_config)
alf.model.summary()

In [None]:
# Class weights are deliberately cross-assigned: each class is weighted by the
# share of rows belonging to the *other* class, so the rarer class carries the
# larger weight during training.
n_total = df_final.shape[0]
n_buy = int((df_final["is_buy"] == 1).sum())
n_not_buy = int((df_final["is_buy"] == 0).sum())

# Weight of class 0 = n(1)/tot (0.05510388466516154 in the author's run).
w0 = n_buy / n_total
# Weight of class 1 = n(0)/tot (0.9448961153348384 in the author's run).
w1 = n_not_buy / n_total
class_weight = {0: w0, 1: w1}

# NOTE(review): create_split=True with both index arguments left as None
# presumably lets the helper draw its own train/test split using test_perc;
# the meaning of lauc_fct is defined in my_nn — confirm there.
alf.train_and_validate(
    epochs=10,
    batch_size=50,
    class_weight=class_weight,
    test_perc=10,
    create_split=True,
    train_index=None,
    test_index=None,
    lauc_fct=0.2,
)

In [None]:
# Persist the trained model through the helper.
# NOTE(review): destination and file format are decided inside my_nn — confirm.
alf.save_model()

In [None]:
# Run the model on the helper's test features (alf.test_X).
# NOTE(review): the next cell reads alf.predictions, so predict() presumably
# stores its output on the helper — confirm in my_nn.
alf.predict(alf.test_X, batch_size=50, verbose=2)

In [None]:
# Compare the test labels with the predictions; the cells below read the
# outcome from alf.metrics (presumably filled by this call — confirm in my_nn).
alf.compute_metrics(alf.test_Y, alf.predictions)

In [None]:
# Report of the metrics produced by the helper (format defined in my_nn).
alf.print_metrics()

In [None]:
# Display the raw metrics container; it is indexed with the "laucs" key in the
# plotting cell below.
alf.metrics

In [None]:
# Plot the curve stored under metrics["laucs"] (a mapping x -> y) against the
# diagonal reference line labelled "random".
laucs = alf.metrics["laucs"]
# Materialise both dict views as plain lists: matplotlib expects sequences of
# equal length, and a raw dict_keys view is not reliably converted to one.
plt.plot(list(laucs.keys()), list(laucs.values()), label="model")
plt.plot([0, 1], [0, 1], label="random")
plt.legend(loc="best")
# Render explicitly so the cell does not end on the Legend object's repr.
plt.show()