In [27]:
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from rich import inspect
from tqdm.notebook import tqdm

import datetime
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)


## Load Data:

In [28]:
days_of_week = ["monday", "tuesday", "wednesday", "thursday", "friday"]

dfs = []

for day in days_of_week:
    df = pd.read_csv(
        f"../data/{day}.csv",
        parse_dates=["timestamp"],
        delimiter=";",
    )
    df["dow"] = day
    df["time"] = df.timestamp.dt.time
    df["customer_no"] = df["customer_no"].astype(str) + "_" + df["dow"]
    dfs.append(df)

df_all_days = pd.concat(dfs, ignore_index=True)

df_all_days = df_all_days.set_index("timestamp")

df_all_days.sample(5)


Unnamed: 0_level_0,customer_no,location,dow,time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-09-02 21:40:00,1431_monday,fruit,monday,21:40:00
2019-09-04 07:22:00,42_wednesday,fruit,wednesday,07:22:00
2019-09-05 14:06:00,672_thursday,fruit,thursday,14:06:00
2019-09-06 10:58:00,383_friday,checkout,friday,10:58:00
2019-09-02 11:21:00,380_monday,spices,monday,11:21:00


In [29]:
df_all_days.time.min(), df_all_days.time.max()


(datetime.time(7, 0), datetime.time(21, 50))

## Adding the period of day column, for analyzing data in an aggregate manner:
- morning: up to 12 pm
- afternoon: 12 to 4 pm 
- evening: after 4 pm 

In [30]:
def get_day_period(time):
    period = None
    if 7 <= time.hour < 12:
        period = "morning"
    elif 12 <= time.hour < 16:
        period = "afternoon"
    elif 16 <= time.hour:
        period = "evening"
    return period


In [31]:
df_all_days["day_period"] = df_all_days.time.apply(get_day_period)


## Resampling the Df to have a sample every minute:
Method used to fill unrecorded timestamps is "forward fill"

In [32]:
df_all_days = (
    df_all_days.groupby("customer_no")
    .resample("1Min")
    .ffill()
    .drop(columns="customer_no")
    .reset_index()
    .sort_values(by=["customer_no", "timestamp"])
)
df_all_days.sample(5)


Unnamed: 0,customer_no,timestamp,location,dow,time,day_period
48149,856_friday,2019-09-06 16:03:00,fruit,friday,16:03:00,evening
39122,636_monday,2019-09-02 13:59:00,spices,monday,13:59:00,afternoon
17225,1428_wednesday,2019-09-04 20:36:00,fruit,wednesday,20:36:00,evening
6148,1148_tuesday,2019-09-03 19:04:00,fruit,tuesday,19:03:00,evening
36863,578_wednesday,2019-09-04 12:27:00,drinks,wednesday,12:27:00,afternoon


## Adding a "checkout" entry of some customers whose end registered location was not checkout:

In [33]:
# TODO: add an initial entry per customer for the "entrance" location.


missing_checkout_entries = (
    []
)  # -- to be filled with the dicts of missing entries:
for entries, grp_data in df_all_days.groupby("customer_no"):
    if "checkout" not in grp_data.location.unique():
        # extract last row as a dict:
        checkout_entries = grp_data.iloc[-1].to_dict()

        # the checkout timestamp is the last timestamp + 1 minute:
        checkout_timestamp = grp_data.iloc[-1].timestamp + datetime.timedelta(
            minutes=1
        )
        checkout_entries["timestamp"] = checkout_timestamp

        # update the location accordingly:
        checkout_entries["location"] = "checkout"
        missing_checkout_entries.append(checkout_entries)


In [34]:
# convert entries into df --> add it to the initial df --> re-sort the values per customer, per timestamp:
missing_entries_df = pd.DataFrame(missing_checkout_entries)

df_all_days = pd.concat([df_all_days, missing_entries_df], axis=0)

df_all_days = df_all_days.sort_values(by=["customer_no", "timestamp"])


## Adding a column for the previous timestamp's location:

In [35]:
df_all_days["from_location"] = (
    df_all_days["location"]
    .shift(1)
    .fillna("entrance")
    .replace("checkout", "entrance")
)
df_all_days = df_all_days.rename(columns={"location": "to_location"})
df_all_days.head()


Unnamed: 0,customer_no,timestamp,to_location,dow,time,day_period,from_location
0,1000_friday,2019-09-06 17:19:00,fruit,friday,17:19:00,evening,entrance
1,1000_friday,2019-09-06 17:20:00,checkout,friday,17:20:00,evening,fruit
2,1000_monday,2019-09-02 17:44:00,dairy,monday,17:44:00,evening,entrance
3,1000_monday,2019-09-02 17:45:00,dairy,monday,17:44:00,evening,dairy
4,1000_monday,2019-09-02 17:46:00,dairy,monday,17:44:00,evening,dairy


In [36]:
path = "../data/cleaned_data.csv"
df_all_days.to_csv(path, index=False)


## EDA - General view of the number of customers per section 

### Entire Week overview (all data),  per period: 

In [37]:
week_overview_per_section_per_period = (
    df_all_days.groupby(["to_location", "day_period"])
    .timestamp.count()
    .reset_index()
    .rename(columns={"timestamp": "customers"})
)
week_overview_per_section_per_period


Unnamed: 0,to_location,day_period,customers
0,checkout,afternoon,1806
1,checkout,evening,3284
2,checkout,morning,2355
3,dairy,afternoon,4514
4,dairy,evening,7352
5,dairy,morning,5908
6,drinks,afternoon,2099
7,drinks,evening,4644
8,drinks,morning,2983
9,fruit,afternoon,3198


In [38]:
week_overview_per_section_per_period.query(
    " day_period == 'afternoon' "
).customers.values


array([1806, 4514, 2099, 3198, 1565])

In [39]:
layout = dict(
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False),
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    title_x=0.5,
    font=dict(color="white"),
)

title = "Count of customers per location and period of day"
fig = go.Figure()

locations = week_overview_per_section_per_period.to_location.unique().tolist()
day_periods = ["morning", "afternoon", "evening"]

for period in day_periods:
    fig.add_trace(
        go.Bar(
            x=locations,
            y=week_overview_per_section_per_period.query(
                f" day_period == '{period}' "
            ).customers.values,
            name=period,
        )
    )

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(
    **layout, barmode="group", xaxis_tickangle=-45, title_text=title
)
fig.show()


### Entire Week overview (all data), per hour of the day: 

In [40]:
df_all_days["hour_of_day"] = df_all_days.timestamp.dt.hour

week_overview_per_section_per_hour = (
    df_all_days.groupby(["to_location", "hour_of_day"])
    .timestamp.count()
    .reset_index()
    .rename(columns={"timestamp": "customers"})
)
week_overview_per_section_per_hour.head()


Unnamed: 0,to_location,hour_of_day,customers
0,checkout,7,454
1,checkout,8,677
2,checkout,9,455
3,checkout,10,427
4,checkout,11,342


In [41]:
df_all_days.sample(5)


Unnamed: 0,customer_no,timestamp,to_location,dow,time,day_period,from_location,hour_of_day
45281,783_monday,2019-09-02 15:30:00,dairy,monday,15:29:00,afternoon,dairy,15
17150,1426_friday,2019-09-06 20:39:00,dairy,friday,20:37:00,evening,dairy,20
16936,141_tuesday,2019-09-03 08:22:00,dairy,tuesday,08:17:00,morning,dairy,8
844,101_monday,2019-09-02 07:59:00,fruit,monday,07:56:00,morning,fruit,7
30407,416_tuesday,2019-09-03 11:38:00,fruit,tuesday,11:38:00,morning,entrance,11


In [42]:
layout = dict(
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
    title_x=0.1,
    font=dict(color="white"),
)

fig = make_subplots(
    rows=len(locations),
    cols=1,
    subplot_titles=locations,
    shared_xaxes=True,
    shared_yaxes=True,
)

for i, location in enumerate(locations):
    df = week_overview_per_section_per_hour.query(
        f" to_location == '{location}' "
    )
    fig.add_trace(
        go.Bar(
            x=df.hour_of_day.values,
            y=df.customers.values,
            name=location,
            text=df.customers.values,
            textposition="auto",
        ),
        row=i + 1,
        col=1,
    )

title = "Count of customers per location and hour of day"

y_max = week_overview_per_section_per_hour.customers.max()


fig.update_layout(
    **layout,
    title_text=title,
    height=1000,
    showlegend=False,
    # 1
    xaxis=dict(showgrid=False),
    yaxis=dict(showgrid=False, range=[0, y_max]),
    # 2
    xaxis2=dict(showgrid=False),
    yaxis2=dict(showgrid=False, range=[0, y_max]),
    # 3
    xaxis3=dict(showgrid=False),
    yaxis3=dict(showgrid=False, range=[0, y_max]),
    # 4
    xaxis4=dict(showgrid=False),
    yaxis4=dict(showgrid=False, range=[0, y_max]),
    # 5
    xaxis5=dict(showgrid=False),
    yaxis5=dict(showgrid=False, range=[0, y_max]),
)

fig.show()


In [65]:
modified_data = []

for customer_no, grp_data in df_all_days.groupby("customer_no"):
    if grp_data.iloc[0]["from_location"] != "entrance":
        entrance_row = grp_data.iloc[0].copy()
        entrance_row["from_location"] = "entrance"
        entrance_row["timestamp"] = grp_data.iloc[0]["timestamp"] - datetime.timedelta(minutes=1)
        
        modified_data.append(entrance_row)
    
    modified_data.append(grp_data)

df_all_days = pd.concat(modified_data, ignore_index=True)


In [66]:
df_customer = df_all_days[df_all_days["customer_no"] == "6_tuesday"]
df_customer

Unnamed: 0,customer_no,timestamp,to_location,dow,time,day_period,from_location,hour_of_day
41845,6_tuesday,2019-09-03 07:07:00,dairy,tuesday,07:07:00,morning,entrance,7
41846,6_tuesday,2019-09-03 07:08:00,dairy,tuesday,07:07:00,morning,dairy,7
41847,6_tuesday,2019-09-03 07:09:00,dairy,tuesday,07:07:00,morning,dairy,7
41848,6_tuesday,2019-09-03 07:10:00,dairy,tuesday,07:07:00,morning,dairy,7
41849,6_tuesday,2019-09-03 07:11:00,dairy,tuesday,07:07:00,morning,dairy,7
41850,6_tuesday,2019-09-03 07:12:00,dairy,tuesday,07:07:00,morning,dairy,7
41851,6_tuesday,2019-09-03 07:13:00,dairy,tuesday,07:07:00,morning,dairy,7
41852,6_tuesday,2019-09-03 07:14:00,dairy,tuesday,07:07:00,morning,dairy,7
41853,6_tuesday,2019-09-03 07:15:00,spices,tuesday,07:15:00,morning,dairy,7
41854,6_tuesday,2019-09-03 07:16:00,dairy,tuesday,07:16:00,morning,spices,7


In [79]:
df_all_days.head()

Unnamed: 0,customer_no,timestamp,to_location,dow,time,day_period,from_location,hour_of_day
0,1000_friday,2019-09-06 17:19:00,fruit,friday,17:19:00,evening,entrance,17
1,1000_friday,2019-09-06 17:20:00,checkout,friday,17:20:00,evening,fruit,17
2,1000_monday,2019-09-02 17:44:00,dairy,monday,17:44:00,evening,entrance,17
3,1000_monday,2019-09-02 17:45:00,dairy,monday,17:44:00,evening,dairy,17
4,1000_monday,2019-09-02 17:46:00,dairy,monday,17:44:00,evening,dairy,17


In [94]:
locations = df_all_days["from_location"].unique().tolist()
locations.append("checkout")

transition_matrix = pd.DataFrame(0, index=locations, columns=locations)

for i in range(len(df_all_days) - 1):
    print(i)

    current_location = df_all_days.iloc[i]["from_location"]
    print(current_location)
    next_location = df_all_days.iloc[i+1]["from_location"]
    print(next_location)
    transition_matrix.loc[current_location, next_location] += 1

transition_matrix = transition_matrix.div(transition_matrix.sum(axis=1), axis=0)



0
entrance
fruit
1
fruit
entrance
2
entrance
dairy
3
dairy
dairy
4
dairy
dairy
5
dairy
dairy
6
dairy
dairy
7
dairy
dairy
8
dairy
drinks
9
drinks
entrance
10
entrance
spices
11
spices
spices
12
spices
spices
13
spices
dairy
14
dairy
dairy
15
dairy
dairy
16
dairy
drinks
17
drinks
drinks
18
drinks
drinks
19
drinks
drinks
20
drinks
entrance
21
entrance
fruit
22
fruit
dairy
23
dairy
dairy
24
dairy
dairy
25
dairy
entrance
26
entrance
spices
27
spices
dairy
28
dairy
dairy
29
dairy
dairy
30
dairy
dairy
31
dairy
dairy
32
dairy
dairy
33
dairy
dairy
34
dairy
dairy
35
dairy
dairy
36
dairy
spices
37
spices
spices
38
spices
spices
39
spices
entrance
40
entrance
fruit
41
fruit
fruit
42
fruit
drinks
43
drinks
drinks
44
drinks
spices
45
spices
drinks
46
drinks
drinks
47
drinks
drinks
48
drinks
fruit
49
fruit
fruit
50
fruit
entrance
51
entrance
fruit
52
fruit
entrance
53
entrance
dairy
54
dairy
dairy
55
dairy
dairy
56
dairy
fruit
57
fruit
fruit
58
fruit
fruit
59
fruit
dairy
60
dairy
dairy
61
dairy
dairy