In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from rich import inspect
from tqdm.notebook import tqdm

import datetime
import numpy as np
import pandas as pd

# Lift pandas' display limits so every column and every row is rendered
# whenever a DataFrame is shown in the notebook.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)


## Load Data:

In [None]:
days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday"]

# Read one semicolon-separated CSV per weekday and tag each row with its day.
dfs = []
for day in days_of_week:
    df = pd.read_csv(
        f"../data/{day}.csv",
        delimiter=";",
        parse_dates=["timestamp"],
    )
    df = df.assign(
        dow=day,
        # wall-clock time of the record, without the date part
        time=df["timestamp"].dt.time,
        # suffix the id with the weekday -- presumably because customer numbers
        # repeat from one day to the next (TODO confirm against the raw files)
        customer_no=df["customer_no"].astype(str) + "_" + day,
    )
    dfs.append(df)

# Stack the five days and index the result by timestamp.
df_all_days = pd.concat(dfs, ignore_index=True).set_index("timestamp")

df_all_days.sample(5)


In [None]:
# Earliest and latest wall-clock time found in the recordings.
recorded_times = df_all_days["time"]
(recorded_times.min(), recorded_times.max())


## Adding the period of day column, for analyzing data in an aggregate manner:
- morning: before 12:00 (noon)
- afternoon: 12:00 to 16:00 (4 pm)
- evening: from 16:00 (4 pm) onwards

In [None]:
def get_day_period(time):
    """Map a time of day to a coarse period label.

    Parameters
    ----------
    time : object exposing an integer ``hour`` attribute
        e.g. ``datetime.time``, ``datetime.datetime`` or ``pandas.Timestamp``.

    Returns
    -------
    str
        ``"morning"`` for hours before 12, ``"afternoon"`` for hours 12-15,
        ``"evening"`` for hour 16 and later.
    """
    hour = time.hour
    # No lower bound on "morning": an early record must still get a label,
    # otherwise it becomes None and drops out of group-bys on the period.
    if hour < 12:
        return "morning"
    if hour < 16:
        return "afternoon"
    return "evening"


In [None]:
# Label every record with the period of day its `time` value falls into.
df_all_days["day_period"] = df_all_days["time"].map(get_day_period)


## Resampling the DataFrame to have one sample per minute:
Unrecorded timestamps are filled using the "forward fill" method.

In [None]:
# Expand every customer's records to one row per minute; minutes without a
# record repeat the values of the most recent recorded row (forward fill).
df_all_days = (
    df_all_days.groupby("customer_no")
    .resample("1min")
    .ffill()
    # Depending on the pandas version the grouping column may or may not be
    # carried into the resampled frame; tolerate both so reset_index() below
    # never produces a duplicate "customer_no" column nor raises a KeyError.
    .drop(columns="customer_no", errors="ignore")
    .reset_index()
    .sort_values(by=["customer_no", "timestamp"])
)

# The forward fill also copied `time` and `day_period` from the last recorded
# row, so filled minutes would carry a stale time of day. Re-derive both from
# the actual (resampled) timestamp.
df_all_days["time"] = df_all_days["timestamp"].dt.time
df_all_days["day_period"] = df_all_days["time"].apply(get_day_period)

df_all_days.sample(5)


## Adding a "checkout" entry for customers whose last registered location was not the checkout:

In [None]:
# TODO: add an initial entry per customer for the "entrance" location.


# One synthetic "checkout" row (as a dict) is collected for every customer
# whose rows never contain the "checkout" location.
missing_checkout_entries = []
for customer_id, customer_rows in tqdm(df_all_days.groupby("customer_no")):
    if (customer_rows["location"] == "checkout").any():
        continue

    # Start from a copy of the customer's last row (the frame is sorted by
    # customer and timestamp, so the last row is the latest one).
    last_row = customer_rows.iloc[-1]
    checkout_entries = last_row.to_dict()

    # The checkout happens one minute after the last recorded timestamp.
    checkout_timestamp = last_row["timestamp"] + datetime.timedelta(minutes=1)
    checkout_entries["timestamp"] = checkout_timestamp
    checkout_entries["location"] = "checkout"

    # Keep the timestamp-derived columns in sync with the new timestamp
    # instead of inheriting the previous row's values.
    if "time" in checkout_entries:
        checkout_entries["time"] = checkout_timestamp.time()
    if "day_period" in checkout_entries:
        checkout_entries["day_period"] = get_day_period(checkout_timestamp.time())

    missing_checkout_entries.append(checkout_entries)


In [None]:
# Turn the collected checkout dicts into a frame, append them to the main
# frame and restore the per-customer chronological order.
missing_entries_df = pd.DataFrame(missing_checkout_entries)

df_all_days = (
    pd.concat([df_all_days, missing_entries_df], axis=0)
    .sort_values(by=["customer_no", "timestamp"])
    # Both inputs carry their own integer index, so the concatenation holds
    # duplicate labels; rebuild a clean 0..n-1 index in sorted order.
    .reset_index(drop=True)
)


## Adding a column for the previous timestamp's location:

In [None]:
# `from_location` is the location of the same customer's previous row.
# Shifting within each customer keeps one customer's last location from
# leaking into the next customer's first row; a customer's first row has no
# predecessor and therefore starts at the "entrance".
df_all_days["from_location"] = (
    df_all_days.groupby("customer_no")["location"]
    .shift(1)
    .fillna("entrance")
    # kept from the original logic: a previous "checkout" counts as "entrance"
    .replace("checkout", "entrance")
)
df_all_days = df_all_days.rename(columns={"location": "to_location"})
df_all_days.head()


In [None]:
# Persist the cleaned frame as CSV, leaving the row index out of the file.
path = "../data/cleaned_data.csv"
df_all_days.to_csv(path_or_buf=path, index=False)


## EDA - General view of the number of customers per section 

### Entire week overview (all data), per period:

In [None]:
# Number of rows per (location, period of day). Each row is one customer at
# one timestamp, so "customers" here counts customer records, not distinct
# customers.
week_overview_per_section_per_period = (
    df_all_days.groupby(["to_location", "day_period"])["timestamp"]
    .count()
    .reset_index(name="customers")
)
week_overview_per_section_per_period


In [None]:
# Raw array of the afternoon counts, one value per location row.
is_afternoon = week_overview_per_section_per_period["day_period"] == "afternoon"
week_overview_per_section_per_period.loc[is_afternoon, "customers"].values


In [None]:
# Shared figure styling: no grid lines, transparent background, centred title.
# NOTE(review): the white font presumably targets a dark notebook theme -- confirm.
layout = dict(
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    title_x=0.5,
    font=dict(color="white"),
)

title = "Count of customers per location and period of day"
fig = go.Figure()

locations = week_overview_per_section_per_period.to_location.unique().tolist()
day_periods = ["morning", "afternoon", "evening"]

# One bar series per period of day, grouped side by side per location.
for period in day_periods:
    # Align the counts on `locations` explicitly: pairing x and y by position
    # would shift or shorten the bars whenever a location has no rows for the
    # period. Missing combinations are plotted as 0.
    period_counts = (
        week_overview_per_section_per_period.query(
            f" day_period == '{period}' "
        )
        .set_index("to_location")["customers"]
        .reindex(locations, fill_value=0)
    )
    fig.add_trace(
        go.Bar(
            x=locations,
            y=period_counts.values,
            name=period,
        )
    )

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(
    **layout, barmode="group", xaxis_tickangle=-45, title_text=title
)
fig.show()


### Entire Week overview (all data), per hour of the day: 

In [None]:
# Hour of day (taken from the timestamp) for hourly aggregation.
df_all_days["hour_of_day"] = df_all_days["timestamp"].dt.hour

# Number of rows per (location, hour of day), stored in a "customers" column.
week_overview_per_section_per_hour = (
    df_all_days.groupby(["to_location", "hour_of_day"])["timestamp"]
    .count()
    .reset_index(name="customers")
)
week_overview_per_section_per_hour.head()


In [None]:
# Peek at five random rows of the enriched frame.
df_all_days.sample(n=5)


In [None]:
# Shared figure styling: transparent background, title near the left edge.
# NOTE(review): the white font presumably targets a dark notebook theme -- confirm.
layout = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    title_x=0.1,
    font=dict(color="white"),
)

# One stacked subplot (row) per location.
fig = make_subplots(
    rows=len(locations),
    cols=1,
    subplot_titles=locations,
    shared_xaxes=True,
    shared_yaxes=True,
)

for row_no, location in enumerate(locations, start=1):
    # Boolean mask instead of a query string, so a location name containing
    # a quote character cannot break the expression.
    location_counts = week_overview_per_section_per_hour[
        week_overview_per_section_per_hour["to_location"] == location
    ]
    fig.add_trace(
        go.Bar(
            x=location_counts.hour_of_day.values,
            y=location_counts.customers.values,
            name=location,
            text=location_counts.customers.values,
            textposition="auto",
        ),
        row=row_no,
        col=1,
    )

title = "Count of customers per location and hour of day"

# Common upper bound so every subplot uses the same y scale.
y_max = week_overview_per_section_per_hour.customers.max()

fig.update_layout(
    **layout,
    title_text=title,
    height=1000,
    showlegend=False,
)
# Style the axes of every subplot, however many locations there are,
# instead of spelling out xaxis1..xaxis5 / yaxis1..yaxis5 by hand.
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False, range=[0, y_max])

fig.show()
