In [51]:
import matplotlib.pyplot as plt
import plotly.express as px


from rich import inspect
from tqdm.notebook import tqdm

import datetime
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)


In [52]:
days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday"]

dfs = []

for day in days_of_week:
    df = pd.read_csv(
        f"../data/{day}.csv",
        parse_dates=["timestamp"],
        delimiter=";",
    )
    df["dow"] = day
    df["time"] = df.timestamp.dt.time
    df["customer_no"] = df["customer_no"].astype(str) + "_" + df["dow"]
    dfs.append(df)

df_all_days = pd.concat(dfs, ignore_index=True)

df_all_days = df_all_days.set_index("timestamp")

df_all_days.sample(5)


Unnamed: 0_level_0,customer_no,location,dow,time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-09-03 16:14:00,802_tuesday,checkout,tuesday,16:14:00
2019-09-06 15:53:00,846_friday,spices,friday,15:53:00
2019-09-05 11:24:00,473_thursday,checkout,thursday,11:24:00
2019-09-04 19:17:00,1275_wednesday,spices,wednesday,19:17:00
2019-09-06 15:43:00,819_friday,spices,friday,15:43:00


In [53]:
df_all_days.time.min(), df_all_days.time.max()


(datetime.time(7, 0), datetime.time(21, 50))

## Adding the period of day column, for analyzing data in an aggregate manner:

In [54]:
def get_day_period(time):
    period = None
    if 7 <= time.hour < 12:
        period = "morning"
    elif 12 <= time.hour < 16:
        period = "afternoon"
    elif 16 <= time.hour:
        period = "evening"
    return period


In [55]:
df_all_days["day_period"] = df_all_days.time.apply(get_day_period)


## Resampling the Df to have a sample every minute:
Method used to fill unrecorded timestamps is "forward fill"

In [56]:
df_all_days = (
    df_all_days.groupby("customer_no")
    .resample("1Min")
    .ffill()
    .drop(columns="customer_no")
    .reset_index()
    .sort_values(by=["customer_no", "timestamp"])
)
df_all_days.sample(5)


Unnamed: 0,customer_no,timestamp,location,dow,time,day_period
39223,638_wednesday,2019-09-04 13:11:00,dairy,wednesday,13:10:00,afternoon
35005,528_tuesday,2019-09-03 13:19:00,fruit,tuesday,13:19:00,afternoon
46964,825_tuesday,2019-09-03 16:26:00,dairy,tuesday,16:22:00,evening
46453,811_monday,2019-09-02 15:53:00,spices,monday,15:53:00,afternoon
3001,1071_friday,2019-09-06 18:06:00,fruit,friday,18:05:00,evening


## Adding a "checkout" entry of some customers whose end registered location was not checkout:

In [57]:
missing_checkout_entries = (
    []
)  # -- to be filled with the dicts of missing entries:
for entries, grp_data in tqdm(df_all_days.groupby("customer_no")):
    if "checkout" not in grp_data.location.unique():
        # extract last row as a dict:
        checkout_entries = grp_data.iloc[-1].to_dict()

        # the checkout timestamp is the last timestamp + 1 minute:
        checkout_timestamp = grp_data.iloc[-1].timestamp + datetime.timedelta(
            minutes=1
        )
        checkout_entries["timestamp"] = checkout_timestamp

        # update the location accordingly:
        checkout_entries["location"] = "checkout"
        missing_checkout_entries.append(checkout_entries)


  0%|          | 0/7445 [00:00<?, ?it/s]

In [58]:
# convert entries into df --> add it to the initial df --> re-sort the values per customer, per timestamp:
missing_entries_df = pd.DataFrame(missing_checkout_entries)

df_all_days = pd.concat([df_all_days, missing_entries_df], axis=0)

df_all_days = df_all_days.sort_values(by=["customer_no", "timestamp"])


## Adding a column for the previous timestamp's location:

In [59]:
df_all_days["from_location"] = (
    df_all_days["location"]
    .shift(1)
    .fillna("entrance")
    .replace("checkout", "entrance")
)
df_all_days = df_all_days.rename(columns={"location": "to_location"})
df_all_days.head()


Unnamed: 0,customer_no,timestamp,to_location,dow,time,day_period,from_location
0,1000_friday,2019-09-06 17:19:00,fruit,friday,17:19:00,evening,entrance
1,1000_friday,2019-09-06 17:20:00,checkout,friday,17:20:00,evening,fruit
2,1000_monday,2019-09-02 17:44:00,dairy,monday,17:44:00,evening,entrance
3,1000_monday,2019-09-02 17:45:00,dairy,monday,17:44:00,evening,dairy
4,1000_monday,2019-09-02 17:46:00,dairy,monday,17:44:00,evening,dairy


In [61]:
path = "../data/cleaned_data.csv"
df_all_days.to_csv(path, index=False)