In [None]:
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
import pandas as pd
import ruptures as rpt
import seaborn as sns

from bikes.data import load_cycle_counts

### Data Loading

In [2]:
# Fetch the raw cycle counts through the project loader. This is the slow step:
# the recorded run below took about 1m18s for 25 items.
cycle_counts = load_cycle_counts()

100%|██████████| 25/25 [01:18<00:00,  3.13s/it]


In [None]:
# Snapshot the freshly loaded counts to CSV (index omitted) so they can be
# reloaded from disk instead of being fetched again.
cycle_counts.to_csv("raw_cycle_counts.csv", index=False)

In [None]:
# Reload the snapshot. parse_dates converts the "date" column back to datetimes,
# which a CSV round trip would otherwise leave as plain strings.
cycle_counts = pd.read_csv("raw_cycle_counts.csv", parse_dates=["date"])

### Changepoint detection and adjustment

In [None]:
def get_chpt_locations(location_df: pd.DataFrame, n_chpts: int):
    """Locate changepoints in one location's count series and return them as dates.

    The rows are ordered by ``date`` and the ``count`` column is segmented with
    ruptures' dynamic-programming search (``rpt.Dynp``) under the ``"l2"`` cost
    model, asking for ``n_chpts`` breakpoints.

    Args:
        location_df: Rows for a single location with ``date`` and ``count``
            columns. The caller's frame is left untouched.
        n_chpts: Number of breakpoints requested from the segmentation.

    Returns:
        A list of ``datetime`` objects: the earliest date, the date at each
        detected breakpoint, and the latest date, in that order.
    """
    # sort_values returns a new frame, so the argument is never modified.
    ordered = location_df.sort_values("date")
    dates = ordered["date"]

    signal = np.array(ordered["count"])
    breakpoints = rpt.Dynp(model="l2").fit(signal).predict(n_bkps=n_chpts)

    # The final entry returned by predict is dropped; only the preceding
    # indices are converted to dates.
    interior = [dates.iloc[pos].to_pydatetime() for pos in breakpoints[:-1]]

    first_date = dates.min().to_pydatetime()
    last_date = dates.max().to_pydatetime()
    return [first_date] + interior + [last_date]


def adjust_scale(
    cycle_counts: pd.DataFrame,
    chpts_by_location: dict[str, list[datetime]],
):
    """Rescale each location's counts so every changepoint segment shares one mean.

    For every location in ``chpts_by_location`` the boundary list is turned into
    consecutive segments. The segment holding the most rows is the anchor, and
    every segment is multiplied by ``anchor_mean / segment_mean`` so that its
    mean matches the anchor's.

    Segments are half-open ``[start, end)`` except the last one, which also
    includes its end boundary. Without that, rows dated exactly on the final
    boundary (typically the series' last day) would belong to no segment and
    would be left unscaled.

    Args:
        cycle_counts: Frame with ``location``, ``date`` and ``count`` columns.
            It is not modified; a rescaled copy is returned.
        chpts_by_location: Maps a location name to its ordered boundaries
            (first date, interior changepoints, last date).

    Returns:
        A copy of ``cycle_counts`` whose ``count`` column is float and has been
        rescaled for the listed locations. Other locations keep their values.
    """
    adjusted = cycle_counts.copy()
    # Rescaled values are fractional, so make the column float up front rather
    # than assigning floats into an integer column.
    adjusted["count"] = adjusted["count"].astype(float)

    for loc, chpts in chpts_by_location.items():
        chpt_streaks = list(zip(chpts, chpts[1:]))
        if not chpt_streaks:
            # Fewer than two boundaries: there is no segment to scale.
            continue

        loc_mask = adjusted["location"] == loc
        final_idx = len(chpt_streaks) - 1
        # Build each segment's row mask once and reuse it for counting,
        # averaging and assignment.
        streak_masks = [
            loc_mask
            & adjusted["date"].between(
                start, end, inclusive="both" if idx == final_idx else "left"
            )
            for idx, (start, end) in enumerate(chpt_streaks)
        ]

        # The longest streak (most rows) is the anchor.
        streak_lengths = [int(mask.sum()) for mask in streak_masks]
        anchor_mask = streak_masks[streak_lengths.index(max(streak_lengths))]
        anchor_mean = adjusted.loc[anchor_mask, "count"].mean()
        if pd.isna(anchor_mean):
            # No usable rows for this location.
            continue

        # Scale every streak to the anchor mean.
        for mask in streak_masks:
            streak_mean = adjusted.loc[mask, "count"].mean()
            if pd.isna(streak_mean) or streak_mean == 0:
                # Empty or all-zero segment: dividing would yield NaN/inf, so
                # leave it as it is.
                continue
            adjusted.loc[mask, "count"] = (
                adjusted.loc[mask, "count"].div(streak_mean).mul(anchor_mean)
            )

    return adjusted

In [None]:
# Page through the sorted location names, five per page, to eyeball each raw
# series and decide how many changepoints it needs.
page, page_size = 11, 5
locs = sorted(cycle_counts["location"].unique())[page * page_size:(page + 1) * page_size]
print(locs)

# An empty page would make plt.subplots(0, 1) raise, so only plot when the page
# actually holds locations.
if locs:
    # squeeze=False always yields a 2-D axes array, so a page with a single
    # location is handled the same way as a full page.
    fig, axes = plt.subplots(
        len(locs), 1, figsize=(10, len(locs) * 1.75), sharex=True, squeeze=False
    )
    for panel, loc in zip(axes[:, 0], locs):
        location_df = cycle_counts[cycle_counts["location"] == loc].sort_values("date")
        panel.plot(location_df["date"].values, location_df["count"].values, label=loc)
        panel.legend()

    fig.tight_layout()

In [None]:
# Locations whose series get a scale adjustment, paired with the number of
# changepoints to request for each: a list of (location, n_chpts) tuples.
locations_and_n_chpts = list(
    {
        "East Coast Road": 1,
        "Grafton Road": 1,
        "Great South Road": 2,
        "Karangahape Road": 2,
        "Lightpath": 1,
    }.items()
)

In [None]:
# Preview the detected changepoints for one configured location (the first
# entry) before adjusting anything.
loc, n_chpts = locations_and_n_chpts[0]
location_df = cycle_counts.loc[cycle_counts["location"] == loc].sort_values("date")

chpts = get_chpt_locations(location_df, n_chpts)

fig, ax = plt.subplots(figsize=(12, 2.5))
ax.plot(location_df["date"].values, location_df["count"].values)
# One red vertical line per boundary returned by the detector.
for chpt in chpts:
    ax.axvline(chpt, color="red")
ax.set_title(loc)
fig.tight_layout();

In [None]:
# Run the changepoint detector for every configured location and collect the
# resulting boundary dates, keyed by location name.
chpts_by_location = {}
for loc, n_chpts in locations_and_n_chpts:
    is_loc = cycle_counts["location"] == loc
    location_df = cycle_counts.loc[is_loc].sort_values("date")
    chpts = get_chpt_locations(location_df, n_chpts)
    chpts_by_location.update({loc: chpts})

In [None]:
# Hand-specified changepoint boundaries per location. Assigning to
# chpts_by_location here replaces whatever value the name held before.
# Every location shares the same first and last boundary; only the interior
# changepoints differ.
_window_start, _window_end = datetime(2022, 1, 1), datetime(2024, 12, 31)
_interior_chpts = {
    "East Coast Road": [datetime(2022, 11, 22)],
    "Grafton Road": [datetime(2023, 1, 16)],
    "Great South Road": [datetime(2022, 8, 29), datetime(2023, 3, 2)],
    "Karangahape Road": [datetime(2024, 4, 15), datetime(2024, 7, 19)],
    "Lightpath": [datetime(2023, 2, 5)],
}
chpts_by_location = {
    loc_name: [_window_start, *interior, _window_end]
    for loc_name, interior in _interior_chpts.items()
}

In [None]:
# Apply the changepoint-based rescaling and rebind cycle_counts to the result,
# so every cell below works with the adjusted counts.
cycle_counts = adjust_scale(cycle_counts, chpts_by_location)

In [None]:
# Re-plot one adjusted location with its changepoint boundaries as a visual check.
# locations_and_n_chpts has five entries (indices 0-4), so the previous index 5
# raised IndexError. Index 0 is the same location shown in the pre-adjustment
# preview, which allows a before/after comparison.
loc, _ = locations_and_n_chpts[0]
location_df = cycle_counts[cycle_counts["location"] == loc].sort_values("date")

fig, ax = plt.subplots(figsize=(12, 2.5))
ax.plot(location_df["date"].values, location_df["count"].values)
for chpt in chpts_by_location[loc]:
    ax.axvline(chpt, color="red")
ax.set(title=loc)
fig.tight_layout();

In [None]:
# Persist the scale-adjusted counts (index omitted) under a separate file name,
# leaving the raw snapshot untouched.
cycle_counts.to_csv("cycle_counts_after_scale_adjust.csv", index=False)

## EDA

### Total counts by day

In [None]:
# Sum the counts of all locations for each date.
daily_total = cycle_counts.groupby("date")[["count"]].sum().reset_index()

fig, ax = plt.subplots(1, 2, figsize=(10, 3.5))
series_ax, hist_ax = ax

# Left panel: the daily total over time, with slanted date labels.
series_ax.plot(daily_total["date"], daily_total["count"])
series_ax.set_ylabel("Count")
plt.setp(series_ax.get_xticklabels(), rotation=45)

# Right panel: distribution of the daily totals.
hist_ax.hist(daily_total["count"], bins=20)
hist_ax.set_xlabel("Count")
hist_ax.set_ylabel("Frequency")

fig.tight_layout()

### Daily counts by location

In [None]:
# Mean count across locations for each date, drawn on top of the per-location traces.
av_daily_counts = cycle_counts.groupby("date")[["count"]].mean().reset_index()

fig, ax = plt.subplots(2, 1, figsize=(12, 5), sharex=True)

# One faint grey trace per location: raw scale on top, log1p scale below.
locations = cycle_counts["location"].unique()
for location in locations:
    # Sort by date, as the other plotting cells do, so each line is drawn
    # chronologically whatever the row order of cycle_counts is.
    location_counts = cycle_counts[cycle_counts["location"] == location].sort_values("date")
    ax[0].plot(location_counts["date"], location_counts["count"], color="gray", alpha=0.25)
    ax[1].plot(location_counts["date"], np.log1p(location_counts["count"]), color="gray", alpha=0.25)

ax[0].plot(av_daily_counts["date"], av_daily_counts["count"], color="blue", linewidth=2)
ax[1].plot(av_daily_counts["date"], np.log1p(av_daily_counts["count"]), color="blue", linewidth=2)

ax[0].set(ylabel="Count")
ax[1].set(ylabel="Log(Count)")
# The trailing semicolon stops the cell from echoing a return value.
fig.tight_layout();

In [None]:
# Average daily count per location, ordered from the quietest to the busiest.
av_daily_counts_by_loc = (
    cycle_counts.groupby("location")[["count"]]
    .mean()
    .reset_index()
    .sort_values("count")
    .reset_index(drop=True)
)

loc_averages = av_daily_counts_by_loc["count"]
mean_counts, median_counts = loc_averages.mean(), loc_averages.median()

# Histogram of the per-location averages with the mean and median marked.
fig, ax = plt.subplots(1, 1)
ax.hist(loc_averages, bins=20)
for marker_value, marker_color, marker_label in (
    (mean_counts, "red", "Mean"),
    (median_counts, "orange", "Median"),
):
    ax.axvline(marker_value, color=marker_color, linestyle="--", lw=2.5, label=marker_label)
ax.set_xlabel("Average Daily Count")
ax.set_ylabel("Frequency")
ax.legend();

### High demand locations

In [None]:
# Locations whose average daily count is strictly above the threshold.
threshold = 600
is_high_demand = av_daily_counts_by_loc["count"] > threshold
high_demand_locs = av_daily_counts_by_loc.loc[is_high_demand]
# Display the selection ordered by average count.
high_demand_locs.sort_values("count")

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# One colour per high-demand location, spread evenly across the coolwarm map
# in the order the locations appear in high_demand_locs.
locations = high_demand_locs["location"].values
colors = cm.coolwarm(np.linspace(0, 1, len(locations)))
for location, color in zip(locations, colors):
    # Keep only the 150 most recent rows of each location so the lines stay readable.
    recent_counts = (
        cycle_counts[cycle_counts["location"] == location].sort_values("date").iloc[-150:]
    )
    ax.plot(
        recent_counts["date"],
        recent_counts["count"],
        label=location,
        color=color,
        lw=2.5,
    )
ax.set(ylabel="Count")
# The labels passed to plot() only show up once a legend is drawn; without it
# the colours cannot be matched to locations.
ax.legend()
fig.tight_layout();

In [None]:
# location x date matrix of counts. Missing days become 0, and only locations
# whose average daily count lies in [400, 1000] are kept, busiest first.
counts_pivot = cycle_counts.pivot_table(columns="date", index="location", values="count", aggfunc="sum")
counts_pivot = counts_pivot.fillna(0).assign(average=lambda x: x.mean(axis=1))
counts_pivot = counts_pivot[counts_pivot["average"].between(400, 1000)]
counts_pivot = counts_pivot.sort_values("average", ascending=False).drop(columns="average")

fig, ax = plt.subplots(1, 1)
# Draw on the axes created above explicitly instead of relying on the
# "current axes" state.
sns.heatmap(counts_pivot, cmap="coolwarm", cbar_kws={"label": "Count"}, ax=ax)

# Heatmap cell k spans [k, k + 1], so ticks go at k + 0.5 to sit on the cell
# centre. Label every 60th date column.
xs = np.arange(0, counts_pivot.shape[1], 60)
xticklabels = [c.strftime("%Y-%m-%d") for c in counts_pivot.columns[xs]]
ax.set_xticks(xs + 0.5)
ax.set_xticklabels(xticklabels)

ys = np.arange(0, counts_pivot.shape[0])
yticklabels = counts_pivot.index[ys]
ax.set_yticks(ys + 0.5)
ax.set_yticklabels(yticklabels, fontsize="small")

ax.set(xlabel="", ylabel="");