In [4]:
import numpy as np
import pandas as pd
import sqlite3

In [2]:
def _safe_mape(y, yhat):
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    mask = y != 0
    if mask.sum() == 0:
        return np.nan
    return float(np.mean(np.abs((yhat[mask] - y[mask]) / y[mask])))


def compute_naive_store_metrics(df, steps=54):
    """
    Per-store error metrics for a naive "previous observation" forecast.

    For every store the rows are ordered by Date, the last `steps` rows form
    the test set, and each test row is predicted with the row immediately
    before it in that store's series.

    df: long DataFrame with at least ['Store', 'Date', 'Sales']
        assumed already filtered to the evaluation horizon (train+test).
        The input frame is not modified.
        NOTE(review): "yesterday" is the previous *row* per store; calendar
        gaps are not detected - confirm the data is daily and contiguous.
    steps: number of last days per store used as test set (must be >= 1)

    Returns a DataFrame with one row per store that has more than `steps`
    observations (stores with less history are skipped). The columns are
    always the ones listed in `columns` below, even when no store qualifies,
    so downstream column lookups never fail on an empty result.

    Raises ValueError if `steps` is smaller than 1.
    """
    if steps < 1:
        # steps=0 / negative steps would turn the slices below into
        # something other than "last `steps` rows" without any warning.
        raise ValueError(f"steps must be a positive integer, got {steps!r}")

    columns = [
        "Store",
        "rows",
        "sales_sum",
        "wape",
        "bias",
        "err_std",
        "TAE",
        "RMSE",
        "MAPE",
        "abs_err_sum",
        "sq_err_sum",
        "abs_pct_err_sum",
        "nonzero_n",
    ]

    # Work on a private copy of just the columns we need; one sort is enough
    # because groupby keeps the row order inside each group.
    series = df[["Store", "Date", "Sales"]].copy()
    series["Date"] = pd.to_datetime(series["Date"])
    series = series.sort_values(["Store", "Date"])

    store_metrics = []

    for store_id, g in series.groupby("Store"):
        y = g["Sales"].to_numpy(dtype=float)

        if len(y) <= steps:
            # not enough history to form train+test
            continue

        # last `steps` days are test
        test_y = y[-steps:]
        # naive prediction for test day t is y_{t-1}: the same window
        # shifted back by one row (same length as test_y)
        naive_pred = y[-steps - 1:-1]

        err = naive_pred - test_y  # prediction - actual
        abs_err = np.abs(err)

        rows = len(test_y)
        sales_sum = float(test_y.sum())
        abs_err_sum = float(abs_err.sum())
        sq_err_sum = float((err ** 2).sum())

        # Raw sums are kept so that org-level metrics can be pooled over
        # all store-days instead of averaging per-store ratios.
        mask = test_y != 0
        abs_pct_err_sum = float(np.abs(err[mask] / test_y[mask]).sum())
        nonzero_n = int(mask.sum())

        store_metrics.append({
            "Store": store_id,
            "rows": rows,
            "sales_sum": sales_sum,
            # WAPE is undefined when the store sold nothing in the test window
            "wape": abs_err_sum / sales_sum if sales_sum != 0 else np.nan,
            "bias": float(err.mean()),
            "err_std": float(err.std(ddof=0)),
            # TAE: absolute error of the test-window totals
            "TAE": float(np.abs(naive_pred.sum() - test_y.sum())),
            "RMSE": float(np.sqrt(sq_err_sum / rows)),
            "MAPE": _safe_mape(test_y, naive_pred),
            "abs_err_sum": abs_err_sum,
            "sq_err_sum": sq_err_sum,
            "abs_pct_err_sum": abs_pct_err_sum,
            "nonzero_n": nonzero_n,
        })

    return pd.DataFrame(store_metrics, columns=columns)


def summarize_org_metrics(store_df):
    """
    Pool per-store metrics into organization-level metrics.

    store_df: output of compute_naive_store_metrics; the columns read are
        'sales_sum', 'abs_err_sum', 'sq_err_sum', 'rows', 'MAPE',
        'abs_pct_err_sum' and 'nonzero_n'.

    Returns a dict with organization-level metrics comparable
    to those used for SARIMAX/XGBoost:
        WAPE            - total absolute error / total sales
        Weighted_RMSE   - RMSE over all store-day observations
        MAPE_weighted   - point-wise MAPE over all non-zero observations
        MAPE_unweighted - simple average of the store-level MAPEs
        Total_sales     - sum of test-window sales over all stores
        Total_rows      - number of store-day observations

    An empty `store_df` (no store had enough history) yields NaN for every
    ratio metric and zero totals instead of raising a KeyError.
    """
    if store_df.empty:
        # Covers both "no rows" and "no columns at all"; the latter is what
        # a DataFrame built from an empty list of records looks like.
        return {
            "WAPE": np.nan,
            "Weighted_RMSE": np.nan,
            "MAPE_weighted": np.nan,
            "MAPE_unweighted": np.nan,
            "Total_sales": 0.0,
            "Total_rows": 0,
        }

    total_sales = store_df["sales_sum"].sum()
    total_abs_err = store_df["abs_err_sum"].sum()
    total_sq_err = store_df["sq_err_sum"].sum()
    total_rows = store_df["rows"].sum()

    # global WAPE (org-level); undefined when nothing was sold
    org_wape = total_abs_err / total_sales if total_sales != 0 else np.nan

    # "Weighted RMSE" as RMSE over all store-day observations
    org_weighted_rmse = np.sqrt(total_sq_err / total_rows) if total_rows > 0 else np.nan

    # unweighted MAPE = simple average of store-level MAPEs
    org_mape_unweighted = store_df["MAPE"].mean()

    # global point-wise MAPE (true weighted by number of nonzero points)
    total_abs_pct_err = store_df["abs_pct_err_sum"].sum()
    total_nonzero = store_df["nonzero_n"].sum()
    org_mape_weighted = (total_abs_pct_err / total_nonzero) if total_nonzero > 0 else np.nan

    return {
        "WAPE": org_wape,
        "Weighted_RMSE": org_weighted_rmse,
        "MAPE_weighted": org_mape_weighted,
        "MAPE_unweighted": org_mape_unweighted,
        "Total_sales": total_sales,
        "Total_rows": total_rows,
    }


In [6]:
# Load the whole 'rossmann' table into a pandas DataFrame.
# The connection is only needed for this single read, so it is closed as
# soon as the query returns (or fails) instead of staying open for the
# lifetime of the kernel. try/finally is used because sqlite3's own
# `with conn:` block manages transactions and does not close the connection.
sql = "SELECT * FROM rossmann"
conn = sqlite3.connect('rossmann.db')
try:
    df = pd.read_sql(sql, conn)
finally:
    conn.close()

In [10]:
# Evaluate the naive baseline on `df` (the frame loaded from the 'rossmann'
# table), which must provide at least the Store, Date and Sales columns.
# `steps` is the number of trailing observations per store held out as test.
steps = 54

# Store-level metrics: one row per store that has more than `steps` rows
naive_store_df = compute_naive_store_metrics(df, steps=steps)

# Org-level metrics (dict) pooled from the per-store sums
naive_org_summary_dict = summarize_org_metrics(naive_store_df)

# Convert to single-row DataFrame so it can be printed and saved like a table
naive_org_summary = pd.DataFrame([naive_org_summary_dict])

# Quick visual check: first stores plus the org-level summary row
print(naive_store_df.head())
print(naive_org_summary)

# Persist both tables as CSV in the current working directory (no index column)
naive_store_df.to_csv("naive_store_metrics.csv", index=False)
naive_org_summary.to_csv("naive_org_summary.csv", index=False)

   Store  rows  sales_sum      wape        bias      err_std      TAE  \
0      1    54   204677.0  0.403186  -97.462963  2444.876758   5263.0   
1      2    54   232766.0  0.448261 -112.296296  2750.712626   6064.0   
2      3    54   327507.0  0.423362 -153.962963  3760.633777   8314.0   
3      4    54   461174.0  0.427706 -259.166667  5622.940113  13995.0   
4      5    54   214640.0  0.444372  -89.296296  2648.474688   4822.0   

          RMSE      MAPE  abs_err_sum    sq_err_sum  abs_pct_err_sum  \
0  2446.818626  0.242496      82523.0  3.232938e+08        11.397315   
1  2753.003888  0.352289     104340.0  4.092676e+08        16.557596   
2  3763.784133  0.331275     138654.0  7.649678e+08        15.569921   
3  5628.909563  0.260955     197247.0  1.710970e+09        12.264908   
4  2649.979623  0.415378      95380.0  3.792092e+08        19.522751   

   nonzero_n  
0         47  
1         47  
2         47  
3         47  
4         47  
       WAPE  Weighted_RMSE  MAPE_weigh