# Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

from tqdm import tqdm

In [None]:
def load_cycle_counts(url: str) -> pd.DataFrame:
    """Read one cycle-count workbook and return it in long format.

    The sheet is read with row index 2 (the third row) as the header. Every
    column other than ``Time`` is treated as a location and unpivoted, giving
    one row per (time, location) pair.

    Args:
        url: Location of the Excel workbook, passed straight to
            ``pandas.read_excel``.

    Returns:
        A frame with lower-cased columns ``time`` (``datetime64[s]``),
        ``location`` (``str``) and ``count`` (``int``).
    """
    wide = pd.read_excel(url, header=2)
    tidy = wide.melt(id_vars=["Time"], var_name="Location", value_name="Count")
    tidy = tidy.rename(columns=str.lower)
    column_types = {"time": "datetime64[s]", "location": str, "count": int}
    return tidy.astype(column_types)

In [None]:
# Monthly Auckland Transport cycle-count workbooks for 2024.
# Only January-March are loaded; the remaining months are listed but disabled.
urls = [
    "https://at.govt.nz/media/bb4h3wd3/at-daily-cycle-counts-january-2024.xlsx",
    "https://at.govt.nz/media/xlcaru0v/at-daily-cycle-counts-feb-2024.xlsx",
    "https://at.govt.nz/media/4g3hzpp5/at-daily-cycle-counts-march-2024.xlsx",
    # "https://at.govt.nz/media/htvezqdn/at-daily-cycle-counts-april-2024.xlsx",
    # "https://at.govt.nz/media/ue5cygl0/at-daily-cycle-counts-may-2024.xlsx",
    # "https://at.govt.nz/media/3icd2jug/at-daily-cycle-counts-june-2024.xlsx",
    # "https://at.govt.nz/media/jbdd1rox/cycle-counts-july-2024.xlsx",
    # "https://at.govt.nz/media/bvadzmqg/cycle-counts-august-2024.xlsx",
    # "https://at.govt.nz/media/lpsfdwbe/auckland-transport-cycle-counts-september-2024.xlsx",
    # "https://at.govt.nz/media/ohbhvmrl/auckland-transport-cycle-movements-october-2024.xlsx",
    # "https://at.govt.nz/media/0fip0hz0/auckland-transport-cycle-movements-data-november-2024.xlsx",
    # "https://at.govt.nz/media/zdumuud2/auckland-transport-cycle-movements-december-2024.xlsx",
]

# Load every workbook (tqdm shows progress) and stack them into one long table
# with a fresh 0..n-1 index.
dfs = [load_cycle_counts(workbook_url) for workbook_url in tqdm(urls)]
cycle_counts = pd.concat(dfs, ignore_index=True)

In [None]:
# Collapse to one row per (time, location) pair, summing the remaining columns
# of any rows that share the same pair.
daily_counts = cycle_counts.groupby(["time", "location"], as_index=False).sum()

In [None]:
# Quick look at a single counter: the 14th distinct location in the data.
location = daily_counts["location"].unique()[13]
is_selected = daily_counts["location"].eq(location)
location_df = daily_counts.loc[is_selected].set_index("time")
location_df.plot(y="count", title=location)

## EDA

### Total counts by day

In [None]:
# Total count per day, summed over all locations.
daily_total = daily_counts.groupby("time")["count"].sum().reset_index()

fig, (ax_series, ax_hist) = plt.subplots(1, 2, figsize=(10, 3.5))

# Left panel: the daily total over time.
ax_series.plot(daily_total["time"], daily_total["count"])
ax_series.set(xlabel="Date", ylabel="Count")

# Right panel: distribution of the daily totals.
ax_hist.hist(daily_total["count"], bins=20)
ax_hist.set(xlabel="Count", ylabel="Frequency")

fig.tight_layout()

### Daily counts by location

In [None]:
# Mean count across locations for each day.
av_daily_counts = daily_counts.groupby("time")["count"].mean().reset_index()

fig, (ax_raw, ax_log) = plt.subplots(2, 1, figsize=(12, 5), sharex=True)

# One faint gray line per location: raw counts on top, log1p counts below.
# sort=False walks the locations in order of first appearance.
for _, site_df in daily_counts.groupby("location", sort=False):
    ax_raw.plot(site_df["time"], site_df["count"], color="gray", alpha=0.5)
    ax_log.plot(site_df["time"], np.log1p(site_df["count"]), color="gray", alpha=0.5)

# Overlay the across-location daily mean in blue on both panels.
ax_raw.plot(av_daily_counts["time"], av_daily_counts["count"], color="blue", linewidth=2)
ax_log.plot(av_daily_counts["time"], np.log1p(av_daily_counts["count"]), color="blue", linewidth=2)

ax_raw.set(ylabel="Count")
ax_log.set(ylabel="Log(Count)", xlabel="Date")

In [None]:
# Mean daily count per location, ordered from lowest to highest.
av_daily_counts_by_loc = (
    daily_counts.groupby("location")["count"]
    .mean()
    .reset_index()
    .sort_values("count", ignore_index=True)
)

mean_counts = av_daily_counts_by_loc["count"].mean()
median_counts = av_daily_counts_by_loc["count"].median()

fig, ax = plt.subplots()
ax.hist(av_daily_counts_by_loc["count"], bins=20)

# Mark the mean and median of the per-location averages with dashed lines.
for marker_value, marker_color, marker_label in (
    (mean_counts, "red", "Mean"),
    (median_counts, "orange", "Median"),
):
    ax.axvline(marker_value, color=marker_color, linestyle="--", lw=2.5, label=marker_label)

ax.set(xlabel="Average Daily Count", ylabel="Frequency")
ax.legend();

### High-demand locations

In [None]:
# Locations whose average daily count is strictly above this cut-off are kept.
threshold = 900
exceeds_threshold = av_daily_counts_by_loc["count"].gt(threshold)
high_demand_locs = av_daily_counts_by_loc.loc[exceeds_threshold]

# Show the selected locations with the highest average first.
high_demand_locs.sort_values("count", ascending=False)

In [None]:
# Daily counts over time for each high-demand location, one line per location.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

locations = high_demand_locs["location"].values
# One evenly spaced viridis colour per location so the lines can be told apart.
colors = cm.viridis(np.linspace(0, 1, len(locations)))
for location, color in zip(locations, colors):
    location_df = daily_counts[daily_counts["location"] == location]
    ax.plot(location_df["time"], location_df["count"], label=location, color=color)

ax.set(xlabel="Date", ylabel="Count")
# The labels passed to plot() are only shown once a legend is drawn; skip it
# when nothing was plotted so matplotlib does not warn about an empty legend.
if len(locations) > 0:
    ax.legend()


In [None]:
# Heatmap of cycle counts by day and location