In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

# Use seaborn's "whitegrid" theme as the plotting default for this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Location of the synthetic retail dataset, relative to this notebook.
data_path = Path("..") / "data" / "synthetic_retail_sales_daily.csv"

# Load the CSV when it is present; otherwise stop with a hint on how to
# regenerate it.
if data_path.exists():
    df = pd.read_csv(data_path, parse_dates=["date"])
else:
    raise FileNotFoundError(
        f"Missing {data_path}. Run: python ../scripts/generate_synthetic_datasets.py"
    )

df.head()

In [None]:
# Collapse the row-level data into one record per date (summed revenue, units
# and promo values), ordered chronologically and indexed by date.
daily = (
    df.groupby("date", as_index=False)
    .agg(
        revenue=pd.NamedAgg(column="revenue", aggfunc="sum"),
        units=pd.NamedAgg(column="units", aggfunc="sum"),
        promo_days=pd.NamedAgg(column="promo", aggfunc="sum"),
    )
    .sort_values("date")
    .set_index("date")
)
daily.head()

In [None]:
# Line chart of the aggregated daily revenue across the full date range.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(daily.index, daily["revenue"])
ax.set(title="Daily Total Revenue", xlabel="Date", ylabel="Revenue")
plt.show()

## Forecasting (SARIMAX)
We fit a SARIMAX(1,1,1)×(1,1,1,7) model (weekly seasonality) to all but roughly the last 90 days of the daily revenue series, then forecast that held-out window. This is intentionally lightweight and easy to explain in interviews.

In [None]:
# Train/test split, SARIMAX fit, and holdout forecast plot.
HOLDOUT_DAYS = 90     # size of the held-out window, counted back from the last date
SEASONAL_PERIOD = 7   # weekly seasonality is common in retail

# Rows dated on or after split_date form the test set, so the boundary day
# itself is held out.
split_date = daily.index.max() - pd.Timedelta(days=HOLDOUT_DAYS)
train = daily.loc[daily.index < split_date]
test = daily.loc[daily.index >= split_date]

# Fail early with a clear message instead of an opaque error inside the fit.
if train.empty:
    raise ValueError(
        f"No training data before {split_date}; the series must span more than "
        f"{HOLDOUT_DAYS} days."
    )

y_train = train["revenue"]
y_test = test["revenue"]

# Stationarity/invertibility constraints are relaxed, as in the original fit.
model = sm.tsa.statespace.SARIMAX(
    y_train,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, SEASONAL_PERIOD),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
results = model.fit(disp=False)

# One forecast step per held-out observation.
forecast = results.get_forecast(steps=len(y_test))
pred = forecast.predicted_mean
conf_int = forecast.conf_int()  # default alpha=0.05, i.e. a 95% interval

# Relabel the forecast with the holdout dates so it aligns with y_test by label.
# NOTE(review): presumably needed because the training index carries no explicit
# frequency, in which case the forecast index may not be date-based; confirm.
pred.index = y_test.index
conf_int.index = y_test.index

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(y_train.index, y_train, label="train")
ax.plot(y_test.index, y_test, label="test")
(forecast_line,) = ax.plot(pred.index, pred, label="forecast")
# Shade the interval in the forecast's own colour and give it a legend entry;
# otherwise the band takes the first fill colour and reads as part of "train".
ax.fill_between(
    conf_int.index,
    conf_int.iloc[:, 0],
    conf_int.iloc[:, 1],
    color=forecast_line.get_color(),
    alpha=0.2,
    label="95% interval",
)
ax.set(title="SARIMAX Forecast (Holdout)", xlabel="Date", ylabel="Revenue")
ax.legend()
plt.show()

## Anomaly detection (residuals)
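We flag holdout days whose forecast residual is unusually large (absolute z-score of at least 3). The synthetic generator also injects anomaly labels at the row level; here we compare the flagged days against aggregated anomaly days, where a day counts as anomalous if any of its rows carries a label other than `none`.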

In [None]:
# Holdout forecast errors, standardised with their own mean and population
# standard deviation; the small epsilon guards against division by zero.
residuals = y_test - pred
z = residuals.sub(residuals.mean()).div(residuals.std(ddof=0) + 1e-9)
anomaly_flag = z.abs().ge(3.0)

# Ground truth: a holdout day is "injected" when any of its rows carries an
# anomaly label other than "none".
holdout_rows = df.loc[df["date"].isin(y_test.index)]
truth = holdout_rows["anomaly"].ne("none").groupby(holdout_rows["date"]).any()

# Side-by-side view of score, model flag and injected label, one row per day.
comparison = pd.DataFrame({"z_score": z, "flagged": anomaly_flag, "injected": truth})
comparison.head()

In [None]:
# Holdout actuals vs. forecast, with the flagged days highlighted.
# Select the flagged observations once (label-aligned boolean mask) so the
# marker x and y coordinates always come from the same rows.
flagged_points = y_test[comparison["flagged"]]

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(y_test.index, y_test, label="actual")
ax.plot(pred.index, pred, label="forecast")
# zorder=3 draws the markers on top of the lines; scatter collections default
# to a lower zorder than lines and would otherwise be partly hidden.
ax.scatter(
    flagged_points.index,
    flagged_points,
    color="red",
    s=20,
    zorder=3,
    label="flagged",
)
ax.set(title="Holdout Forecast + Flagged Anomalies", xlabel="Date", ylabel="Revenue")
ax.legend()
plt.show()

# Day-level counts: model flags vs. injected ground truth.
print("Flagged days:", int(comparison["flagged"].sum()))
print("Injected anomaly days:", int(comparison["injected"].sum()))