In [None]:
import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import my_funcs as mf

# With these 2 lines you can modify my_funcs and have the changes propagate here
%load_ext autoreload
%autoreload 2
# Custom plot layout
plt.rcParams["figure.facecolor"] = "white"
# To use black (auto formatter on the notebook): /opt/anaconda3/envs/plus2/bin/pip install nb_black
%load_ext lab_black

In [None]:
# --- File locations, grouped by dataset ---
# Clicks: explored -> sorted -> preprocessed
cliks_path_afterExploration = "data_yoochoose/yoochoose-clicks_aftExp.dat"
cliks_path_sorted = "data_yoochoose/yoochoose-clicks_sorted.dat"
cliks_path_sorted_prep1 = "data_yoochoose/yoochoose-clicks_sorted_prep1.dat"
# Buys: explored -> sorted -> preprocessed
buys_path_afterExploration = "data_yoochoose/yoochoose-buys_aftExp.dat"
buys_path_sorted = "data_yoochoose/yoochoose-buys_sorted.dat"
buys_path_sorted_prep1 = "data_yoochoose/yoochoose-buys_sorted_prep1.dat"
# Merged dataset written at the end of the notebook
final_dataset = "data_yoochoose/final_df.dat"

# --- Run parameters ---
# True: rebuild everything from the "aftExp" files; False: reload the "prep1" files
load_from_beginning = True
# Forwarded as `limit=` to the my_funcs loaders
limit = None

### Workflow: Load the Data

In [None]:
%%time
# Read the explored clicks and buys files (only when rebuilding from scratch)
if load_from_beginning:
    # Column dtypes requested from the loader for each file
    clicks_dtypes = {
        "SessionID": np.int32,
        "ItemID": np.int32,
        "Category": np.int16,
    }
    buys_dtypes = {
        "SessionID": np.int32,
        "ItemID": np.int32,
        "Price": np.int32,
        "Quantity": np.int16,
    }

    df_clicks = mf.load_file(
        cliks_path_afterExploration,
        limit=limit,
        to_be_sorted=False,
        index_col=0,
        header=0,
        dtype=clicks_dtypes,
        parse_dates=[1],
    )

    df_buys = mf.load_file(
        buys_path_afterExploration,
        limit=limit,
        to_be_sorted=False,
        index_col=0,
        header=0,
        dtype=buys_dtypes,
        parse_dates=[1],
    )

### Workflow: Preprocess

In [None]:
# Preprocessed buys: reload the saved CSV unless a full rebuild was requested
if not load_from_beginning:
    df_buys_prep = mf.load_file(
        buys_path_sorted_prep1,
        limit=limit,
        to_be_sorted=True,
        header=0,
        index_col=0,
    )
else:
    # Rebuild from the raw buys and save the result for later runs
    df_buys_prep = mf.process_buys(df_buys)
    df_buys_prep.to_csv(buys_path_sorted_prep1)

In [None]:
%%time
# Session table: reload the saved CSV unless a full rebuild was requested
if not load_from_beginning:
    sessions = mf.load_file(
        cliks_path_sorted_prep1,
        limit=limit,
        to_be_sorted=True,
        header=0,
        index_col=0,
        parse_dates=[6, 7],
    )
else:
    # Rebuild: preprocess the clicks, derive the sessions, then save them
    df_clicks_prep = mf.process_clicks(df_clicks)
    sessions = mf.process_sessions(
        df_clicks_prep,
        buys_path_afterExploration,
        limit=limit,
    )
    sessions.to_csv(cliks_path_sorted_prep1)

### Check your dataset after processing

In [None]:
# Run the my_funcs sanity checks on the preprocessed buys frame
mf.sanity_checks(df_buys_prep)

In [None]:
# df_clicks_prep is only created when load_from_beginning is True; the other
# branch of the clicks-preprocessing cell loads `sessions` straight from disk.
# Guard the check so the notebook also runs top to bottom on a fresh kernel
# with load_from_beginning = False instead of raising NameError.
if load_from_beginning:
    mf.sanity_checks(df_clicks_prep)

In [None]:
# Run the my_funcs sanity checks on the `sessions` frame
mf.sanity_checks(sessions)

### Fix the Dataset

In [None]:
# Clean the dwell columns: +inf becomes NaN, then every NaN becomes the -1 sentinel
for dwell_col in ("max_dwell", "mean_dwell"):
    sessions[dwell_col] = sessions[dwell_col].replace(np.inf, np.nan).fillna(-1)

### Create Train and Test Datasets

In [None]:
# Build the final dataset: left-join the buys onto the sessions by index, order
# the rows by session start, default the buy columns for sessions with no
# matching buys row, then drop the timestamp columns.
Final_df = (
    pd.merge(sessions, df_buys_prep, how="left", left_index=True, right_index=True)
    .sort_values(by=["start_ts"])
    .assign(
        items_bought=lambda df: df["items_bought"].fillna(0),
        is_buy=lambda df: df["is_buy"].fillna(0),
        cat_most_viewed=lambda df: df["cat_most_viewed"].astype("float64"),
    )
    .drop(columns=["start_ts", "end_ts"])
)
# Save the merged dataset to disk
Final_df.to_csv(final_dataset)

In [None]:
# Preview the first rows of the merged dataset
Final_df.head()