# In [None]:
# train_sarima_pipeline.py

import os
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error
import joblib

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Observations whose true value is exactly zero are excluded, because the
    relative error is undefined for them.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, positionally aligned with
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` over the non-zero
        observations, multiplied by 100. NaN when there is no non-zero
        observation to score.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    mask = y_true != 0
    # With no usable observation, np.mean would run on an empty slice and
    # emit a RuntimeWarning; return NaN explicitly instead.
    if not mask.any():
        return float("nan")
    relative_errors = np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])
    return np.mean(relative_errors) * 100

def load_monthly(data_path):
    """Read the CSV at `data_path` and return monthly totals.

    Rows whose "Kalendertag" value cannot be parsed as a date are dropped.
    The remaining "BIWNAV AV Neug Wesu" values are summed per calendar month
    and indexed by the first day of that month with an explicit "MS"
    frequency. Months inside the covered range that have no rows appear as
    NaN after the frequency is applied.
    """
    raw = pd.read_csv(data_path, low_memory=False)

    # Unparseable dates become NaT and are filtered out below.
    dates = pd.to_datetime(raw["Kalendertag"], errors="coerce")
    valid = dates.notna()

    # Grouping key: first day of each row's month, labelled "month".
    month_key = dates[valid].dt.to_period("M").dt.to_timestamp().rename("month")
    values = raw.loc[valid, "BIWNAV AV Neug Wesu"]

    totals = values.groupby(month_key).sum()
    # Attach a regular month-start frequency to the index.
    return totals.sort_index().asfreq("MS")

# 1. Load the monthly series and carve off the hold-out window
DATA_PATH = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
TEST_SIZE = 6  # number of trailing observations reserved for the hold-out check

series = load_monthly(DATA_PATH)

# Everything before the last TEST_SIZE observations is used for training/CV;
# the last TEST_SIZE observations are held back for the final evaluation.
train, hold = series.iloc[:-TEST_SIZE], series.iloc[-TEST_SIZE:]

# 2. CV
# Expanding-window cross-validation on `train`: fold i fits on the first `end`
# observations and is scored on the following `vs` observations, so the
# validation windows are the last N_FOLDS consecutive blocks of `train`.
N_FOLDS = 3
print("CV MAPE / MAE per fold:")
n, vs = len(train), TEST_SIZE
for i in range(N_FOLDS):
    end = n - (N_FOLDS - i) * vs
    if end <= 0:
        # A non-positive split point would make iloc slice from the wrong end
        # (or yield an empty window) and silently produce a meaningless fold.
        print(f" Fold {i+1}: skipped (not enough training data)")
        continue
    tr  = train.iloc[:end]
    va  = train.iloc[end:end+vs]

    m = SARIMAX(tr, order=(1,1,1), seasonal_order=(1,1,1,12),
                enforce_stationarity=False, enforce_invertibility=False)
    r = m.fit(disp=False)
    # Out-of-sample forecast covering the validation window.
    f = r.get_forecast(steps=vs).predicted_mean

    mae  = mean_absolute_error(va, f)
    mape = mean_absolute_percentage_error(va, f)
    print(f" Fold {i+1}: MAE={mae:.0f}, MAPE={mape:.2f}%")

# 3. Hold-out
# Fit the same SARIMA specification on the whole training window and score
# its forecast against the held-back observations.
m = SARIMAX(
    train,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
r = m.fit(disp=False)
holdout_forecast = r.get_forecast(steps=TEST_SIZE)
fh = holdout_forecast.predicted_mean

mae = mean_absolute_error(hold, fh)
mape = mean_absolute_percentage_error(hold, fh)
holdout_report = f"\nHold-out (last {TEST_SIZE}): MAE={mae:.0f}, MAPE={mape:.2f}%"
print(holdout_report)

# 4. Retrain & save
# Refit the same specification on the complete series (training + hold-out)
# and persist the fitted results object for the prediction script.
full_model = SARIMAX(
    series,
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
full = full_model.fit(disp=False)

os.makedirs("../models", exist_ok=True)
joblib.dump(full, "../models/global_sarima.pkl")
print("Saved SARIMA to ../models/global_sarima.pkl")


# In [None]:
# predict_sarima_pipeline.py

import os
import pandas as pd
import joblib

def load_monthly(data_path):
    """
    Read the CSV at `data_path` and return the monthly sum of
    "BIWNAV AV Neug Wesu", indexed by month start with an explicit
    "MS" frequency. Rows with an unparseable "Kalendertag" are dropped.
    """
    frame = pd.read_csv(data_path, low_memory=False)
    frame["Kalendertag"] = pd.to_datetime(frame["Kalendertag"], errors="coerce")
    # discard rows whose date could not be parsed (NaT)
    frame = frame.dropna(subset=["Kalendertag"])

    # first day of the month each row falls into
    month_start = frame["Kalendertag"].dt.to_period("M").dt.to_timestamp()
    per_month = (
        frame.assign(month=month_start)
             .groupby("month")["BIWNAV AV Neug Wesu"]
             .sum()
    )
    # a regular month-start frequency lets forecasting code infer the step
    return per_month.sort_index().asfreq("MS")

def forecast_next(n_periods=7,
                  model_path="../models/global_sarima.pkl",
                  data_path="../data/4_PrognoseCase_AV_NG_-_Daten.csv"):
    """
    Forecast `n_periods` steps ahead with the saved SARIMAX results object.

    The forecast is produced entirely from the persisted model, so it
    continues from the end of the data that model was fitted on.

    Parameters
    ----------
    n_periods : number of future periods to forecast.
    model_path : path to the results object written by the training script.
    data_path : accepted for backward compatibility only; the raw data is
        not read, because the forecast does not depend on it.

    Returns
    -------
    pandas Series of predicted means indexed by the forecast periods.

    Raises
    ------
    FileNotFoundError : if no file exists at `model_path`.
    """
    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            f"Model not found at {model_path}. "
            "Please run `train_sarima_pipeline.py` first."
        )

    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(model_path)

    # The raw CSV used to be loaded here but was never used; the read was
    # dropped to avoid pointless I/O and an unrelated failure mode.
    return model.get_forecast(steps=n_periods).predicted_mean

if __name__ == "__main__":
    # Script entry point: print one line per forecast period.
    N_PERIODS = 7
    fc = forecast_next(n_periods=N_PERIODS)

    print(f"Forecast for next {N_PERIODS} months:")
    for month, value in fc.items():
        label = month.strftime('%Y-%m')
        print(f"{label}: {value:,.0f}")