In [12]:
# train_prophet_pipeline

import os
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error
import joblib

# -----------------------------------
# Helpers
# -----------------------------------
def mape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def time_series_splits(df, n_folds=3, val_size=6):
    """
    Yield (train_df, val_df) pairs by sliding a window of length val_size
    backwards through the data, with n_folds total validations.
    """
    n = len(df)
    for i in range(n_folds):
        train_end = n - (n_folds - i) * val_size
        val_start = train_end
        val_end   = val_start + val_size
        if val_end > n:
            break
        yield df.iloc[:train_end], df.iloc[val_start:val_end]

def load_monthly(data_path):
    """
    Load daily data, filter invalid dates, and aggregate to monthly totals.
    Returns a DataFrame with columns ['ds','y'] where ds = month start.
    """
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]]
    # month-start timestamp
    df["ds"] = df["Kalendertag"].dt.to_period("M").dt.to_timestamp()
    df["y"]  = df["BIWNAV AV Neug Wesu"]
    monthly = (
        df.groupby("ds")["y"]
          .sum()
          .reset_index()
          .sort_values("ds")
    )
    return monthly

# -----------------------------------
# Main orchestration
# -----------------------------------
def main(test_size=6, n_folds=3):
    DATA_PATH  = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_DIR  = "../models"
    MODEL_PATH = os.path.join(MODEL_DIR, "global_prophet.pkl")
    os.makedirs(MODEL_DIR, exist_ok=True)

    # 1) Load aggregated monthly series
    monthly = load_monthly(DATA_PATH)

    # 2) Forward‐chaining cross-validation
    print("Validation performance per fold:")
    for idx, (train_df, val_df) in enumerate(
            time_series_splits(monthly, n_folds=n_folds, val_size=test_size), 1):
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False
        )
        model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
        model.fit(train_df)

        # forecast next `test_size` months at month-start
        future = model.make_future_dataframe(periods=test_size, freq="MS")
        fcst   = model.predict(future)[["ds","yhat"]].tail(test_size)

        # align on ds
        merged = (
            val_df[["ds","y"]]
            .merge(fcst, on="ds", how="left")
            .dropna(subset=["yhat"])
        )
        fold_mae  = mean_absolute_error(merged["y"], merged["yhat"])
        fold_mape = mape(merged["y"], merged["yhat"])
        print(f"  Fold {idx}: MAE = {fold_mae:,.0f} | MAPE = {fold_mape:.2f}%")

    # 3) Honest hold-out on last `test_size` months
    hold    = monthly.iloc[-test_size:].copy()
    train_h = monthly.iloc[:-test_size].copy()

    hold_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False
    )
    hold_model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
    hold_model.fit(train_h)

    future_hold = hold_model.make_future_dataframe(periods=test_size, freq="MS")
    fcst_hold   = hold_model.predict(future_hold)[["ds","yhat"]].tail(test_size)

    merged_hold = (
        hold[["ds","y"]]
        .merge(fcst_hold, on="ds", how="left")
        .dropna(subset=["yhat"])
    )
    hold_mae  = mean_absolute_error(merged_hold["y"], merged_hold["yhat"])
    hold_mape = mape(merged_hold["y"], merged_hold["yhat"])
    print(f"\nHold-out (last {test_size} mo): MAE = {hold_mae:,.0f} | MAPE = {hold_mape:.2f}%")

    # 4) Retrain on all data & save the final model
    final_model = Prophet(
        yearly_seasonality=True,
        weekly_seasonality=False,
        daily_seasonality=False
    )
    final_model.add_seasonality(name="monthly", period=30.5, fourier_order=5)
    final_model.fit(monthly)
    joblib.dump(final_model, MODEL_PATH)
    print(f"\nSaved full Prophet model to {MODEL_PATH}")

    # 5) Forecast next month
    next_future = final_model.make_future_dataframe(periods=1, freq="MS")
    next_fc      = final_model.predict(next_future)[["ds","yhat"]].tail(1).iloc[0]
    ds, yhat     = next_fc["ds"], next_fc["yhat"]
    print(f"Forecast for next month ({ds.strftime('%Y-%m')}): {yhat:,.0f}")

if __name__ == "__main__":
    main()

00:02:23 - cmdstanpy - INFO - Chain [1] start processing
00:02:23 - cmdstanpy - INFO - Chain [1] done processing
00:02:23 - cmdstanpy - INFO - Chain [1] start processing
00:02:23 - cmdstanpy - INFO - Chain [1] done processing
00:02:23 - cmdstanpy - INFO - Chain [1] start processing
00:02:23 - cmdstanpy - INFO - Chain [1] done processing
00:02:23 - cmdstanpy - INFO - Chain [1] start processing
00:02:23 - cmdstanpy - INFO - Chain [1] done processing


Validation performance per fold:
  Fold 1: MAE = 171,818,834 | MAPE = 12.25%
  Fold 2: MAE = 40,653,808 | MAPE = 9.35%
  Fold 3: MAE = 108,688,733 | MAPE = 7.87%


00:02:23 - cmdstanpy - INFO - Chain [1] start processing
00:02:23 - cmdstanpy - INFO - Chain [1] done processing



Hold-out (last 6 mo): MAE = 108,688,733 | MAPE = 7.87%

Saved full Prophet model to ../models/global_prophet.pkl
Forecast for next month (2025-01): 155,634,284


In [11]:
import os
import pandas as pd
import joblib

def load_monthly(data_path):
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"] = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["JahrMonat"]   = df["Kalendertag"].dt.to_period("M").dt.to_timestamp()
    return (
        df.groupby("JahrMonat")["BIWNAV AV Neug Wesu"]
          .sum()
          .reset_index()
          .rename(columns={"JahrMonat":"ds", "BIWNAV AV Neug Wesu":"y"})
    )

def forecast(periods=7):
    MODEL_PATH = "../models/global_prophet.pkl"
    DATA_PATH  = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"

    if not os.path.isfile(MODEL_PATH):
        raise FileNotFoundError("Train first with train_prophet.py")

    # 1) Load model & history
    m       = joblib.load(MODEL_PATH)
    history = load_monthly(DATA_PATH)

    # 2) Make future df
    future = m.make_future_dataframe(periods=periods, freq='M')
    forecast = m.predict(future)

    # 3) Extract only the forecasted periods
    fc = forecast[['ds','yhat']].set_index('ds').iloc[-periods:]
    return fc

if __name__ == "__main__":
    df_fc = forecast(periods=7)
    for ds, yhat in df_fc.itertuples():
        print(f"{ds.strftime('%Y-%m')}: {yhat:,.0f}")

2024-12: 224,589,244
2025-01: 290,022,793
2025-02: 474,212,754
2025-03: 673,890,672
2025-04: 728,879,429
2025-05: 927,286,820
2025-06: 1,012,587,355


  dates = pd.date_range(
