In [None]:
# train_pipeline

import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import optuna

# ----------------------------
# Helpers
# ----------------------------
def mean_absolute_percentage_error(y_true, y_pred):
    """
    Return the mean absolute percentage error of y_pred against y_true, in percent.

    Observations whose true value is exactly zero are left out, because the
    relative error is undefined there. If no non-zero true value remains
    (including empty input), NaN is returned.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0
    if not mask.any():
        # Nothing to average: return NaN explicitly instead of letting
        # np.mean warn about an empty slice.
        return float("nan")
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def time_series_splits(df, n_folds=3, val_size=6):
    """
    Yield (train_df, val_df) pairs for expanding-window validation.

    The last n_folds * val_size rows of df are cut into consecutive
    validation windows of val_size rows each; every window is paired with
    all rows that precede it. Folds are produced oldest first.

    Folds that would have no training rows (df is too short) are skipped,
    so fewer than n_folds pairs, possibly none, may be yielded.
    """
    n = len(df)
    for remaining in range(n_folds, 0, -1):
        train_end = n - remaining * val_size
        if train_end < 1:
            # A non-positive end would be read by iloc as "count from the
            # end" (or give an empty training set), so this fold is unusable.
            continue
        val_end = train_end + val_size
        if val_end > n:
            # Only reachable for a non-positive val_size.
            break
        yield df.iloc[:train_end], df.iloc[train_end:val_end]

def load_monthly_all(data_path):
    """
    Read the daily CSV at data_path and aggregate it per region and month.

    Rows whose calendar day lies before the entry date are dropped; rows
    where either date cannot be parsed become NaT and are dropped by the
    same comparison. The remaining rows are summed per (Region, month).

    Returns a DataFrame sorted by Region and JahrMonat with the columns
    Region, JahrMonat (first day of the month), target (summed
    "BIWNAV AV Neug Wesu") and the four summed driver columns.
    """
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")

    # Copy the filtered rows so the column added below is written to an
    # independent frame rather than to a slice of the raw data.
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]].copy()

    # Month bucket expressed as the timestamp of the month's first day.
    df["JahrMonat"] = df["Kalendertag"].dt.to_period("M").dt.to_timestamp()

    return (
        df
        .groupby(["Region", "JahrMonat"])
        .agg({
            "BIWNAV AV Neug Wesu": "sum",
            "Anz. NeuFamilien":    "sum",
            "Anz. Neukunden":      "sum",
            "P AV neu":            "sum",
            "SpB AV":              "sum",
        })
        .reset_index()
        .rename(columns={"BIWNAV AV Neug Wesu": "target"})
        .sort_values(["Region", "JahrMonat"])
        .reset_index(drop=True)
    )

# -----------------------------------
# Main orchestration
# -----------------------------------
def _build_training_frame(region_df):
    """Return one region's feature/label frame, ordered by month, without unlabeled rows."""
    exog_cols = ["Anz. NeuFamilien", "Anz. Neukunden", "P AV neu", "SpB AV"]
    m = region_df.sort_values("JahrMonat").copy()

    # Cyclical encoding of the calendar month
    m["month"]     = m["JahrMonat"].dt.month
    m["month_sin"] = np.sin(2 * np.pi * m["month"] / 12)
    m["month_cos"] = np.cos(2 * np.pi * m["month"] / 12)

    # One- and two-month lags of the driver columns
    for col in exog_cols:
        m[f"{col}_lag1"] = m[col].shift(1)
        m[f"{col}_lag2"] = m[col].shift(2)

    # Label: the target of the following month
    m["target_t_plus_1"] = m["target"].shift(-1)

    # Drop raw + helper cols; only engineered features and the label remain
    m = m.drop(columns=["month", *exog_cols, "target"]).reset_index(drop=True)

    # The most recent month has no label yet
    return m.dropna(subset=["target_t_plus_1"])


def _make_objective(train_val_df, feature_cols, n_folds, val_size):
    """Return an Optuna objective that scores the mean MAPE over time-ordered CV folds."""
    def objective(trial):
        params = {
            "objective":        "reg:squarederror",
            "random_state":     42,
            "verbosity":        0,
            "max_depth":        trial.suggest_int("max_depth", 3, 12),
            "learning_rate":    trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample":        trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "n_estimators":     trial.suggest_int("n_estimators", 100, 500),
        }
        mape_scores = []
        for tr_df, val_df in time_series_splits(train_val_df, n_folds=n_folds, val_size=val_size):
            model = XGBRegressor(**params)
            model.fit(tr_df[feature_cols], tr_df["target_t_plus_1"])
            preds = model.predict(val_df[feature_cols])
            mape_scores.append(
                mean_absolute_percentage_error(val_df["target_t_plus_1"], preds)
            )
        return float(np.mean(mape_scores))

    return objective


def main(TEST_SIZE=6, N_TRIALS=30):
    """
    Tune, evaluate and save one XGBoost next-month model per region.

    For every region in the monthly table this builds seasonal and lagged
    features, tunes hyper-parameters with Optuna on expanding-window folds,
    prints MAE/MAPE on the last TEST_SIZE labelled months, refits on all
    labelled months and writes the model into MODEL_DIR. Regions without
    enough history for the hold-out plus at least one CV fold are skipped.

    Parameters:
        TEST_SIZE: number of most recent labelled months held out for the
            final check; also the validation length of each CV fold.
        N_TRIALS: number of Optuna trials per region.

    Raises:
        ValueError: if TEST_SIZE is smaller than 1.
    """
    if TEST_SIZE < 1:
        raise ValueError("TEST_SIZE must be at least 1")

    DATA_PATH = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_DIR = "../models"
    MAX_FOLDS = 3
    SEED      = 42

    os.makedirs(MODEL_DIR, exist_ok=True)
    monthly_all = load_monthly_all(DATA_PATH)

    for region, region_df in monthly_all.groupby("Region"):
        print(f"\n=== Region: {region} ===")

        full_df = _build_training_frame(region_df)
        if len(full_df) < TEST_SIZE + 6:
            print("→ Skipping (not enough history).")
            continue

        train_val_df = full_df.iloc[:-TEST_SIZE]
        test_df      = full_df.iloc[-TEST_SIZE:]

        # Every CV fold needs at least one training row in front of its
        # validation window; use as many folds (up to MAX_FOLDS) as fit.
        n_folds = min(MAX_FOLDS, (len(train_val_df) - 1) // TEST_SIZE)
        if n_folds < 1:
            print("→ Skipping (not enough history).")
            continue

        feature_cols = [
            c for c in full_df.columns
            if c not in ["Region", "JahrMonat", "target_t_plus_1"]
        ]

        # Seed the sampler so the search is repeatable, like the model itself
        study = optuna.create_study(
            direction="minimize",
            sampler=optuna.samplers.TPESampler(seed=SEED),
        )
        study.optimize(
            _make_objective(train_val_df, feature_cols, n_folds, TEST_SIZE),
            n_trials=N_TRIALS,
            show_progress_bar=True,
        )
        best_params = study.best_trial.params
        print("→ Best params:", best_params)

        # Train on train_val, evaluate hold-out
        tuned = {
            **best_params,
            "objective":    "reg:squarederror",
            "random_state": SEED,
            "verbosity":    0,
        }
        prod_model = XGBRegressor(**tuned)
        prod_model.fit(train_val_df[feature_cols], train_val_df["target_t_plus_1"])
        test_preds = prod_model.predict(test_df[feature_cols])
        test_mae   = mean_absolute_error(test_df["target_t_plus_1"], test_preds)
        test_mape  = mean_absolute_percentage_error(test_df["target_t_plus_1"], test_preds)
        print(f"Hold-out ({TEST_SIZE} mo): MAE={test_mae:.0f}, MAPE={test_mape:.2f}%")

        # Retrain on full data & save
        final_model = XGBRegressor(**tuned)
        final_model.fit(full_df[feature_cols], full_df["target_t_plus_1"])
        # str() keeps this working when the region labels are not strings
        fn = f"{str(region).replace(' ', '_')}_xgb_model.pkl"
        joblib.dump(final_model, os.path.join(MODEL_DIR, fn))
        print(f"Saved tuned model → {fn}")


# Script entry point: run the training pipeline with its default arguments.
if __name__ == "__main__":
    main()


In [None]:
# predict_pipeline

import os
import pandas as pd
import numpy as np
import joblib

def mean_absolute_percentage_error(y_true, y_pred):
    """
    Return the mean absolute percentage error of y_pred against y_true, in percent.

    Entries with a true value of exactly zero are ignored because their
    relative error is undefined. NaN is returned when no non-zero true
    value is left (this includes empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    nonzero = y_true != 0
    if not nonzero.any():
        # Return NaN explicitly rather than averaging an empty slice,
        # which would only produce NaN plus runtime warnings.
        return float("nan")
    rel_err = np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])
    return np.mean(rel_err) * 100

def load_monthly_all(data_path):
    """
    Read the daily CSV at data_path and aggregate it per region and month.

    Rows dated before the entry date are removed; rows whose dates cannot
    be parsed turn into NaT and are removed by the same comparison. What is
    left is summed per (Region, month).

    Returns a DataFrame sorted by Region and JahrMonat with the columns
    Region, JahrMonat (first day of the month), target (summed
    "BIWNAV AV Neug Wesu") and the four summed driver columns.
    """
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")

    # Take a copy of the filtered rows: the assignment below must go to an
    # independent frame, not to a slice of the raw data.
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]].copy()

    # Month bucket expressed as the timestamp of the month's first day.
    df["JahrMonat"] = df["Kalendertag"].dt.to_period("M").dt.to_timestamp()

    return (
        df
        .groupby(["Region", "JahrMonat"])
        .agg({
            "BIWNAV AV Neug Wesu": "sum",
            "Anz. NeuFamilien":    "sum",
            "Anz. Neukunden":      "sum",
            "P AV neu":            "sum",
            "SpB AV":              "sum",
        })
        .reset_index()
        .rename(columns={"BIWNAV AV Neug Wesu": "target"})
        .sort_values(["Region", "JahrMonat"])
        .reset_index(drop=True)
    )

def make_features(monthly):
    """
    Build the model input frame from monthly rows; the input is not modified.

    Adds sine/cosine encodings of the calendar month, one- and two-row lags
    of the four driver columns, the current target (target_t) and the next
    row's target (target_t_plus_1). The raw driver columns, the raw target
    and the helper month column are removed. The result has a fresh
    0..n-1 index.
    """
    driver_cols = ("Anz. NeuFamilien", "Anz. Neukunden", "P AV neu", "SpB AV")
    frame = monthly.copy()

    # Seasonal position on the unit circle
    frame["month"] = frame["JahrMonat"].dt.month
    angle = 2 * np.pi * frame["month"] / 12
    frame["month_sin"] = np.sin(angle)
    frame["month_cos"] = np.cos(angle)

    # Lagged drivers, created driver by driver so the column order is stable
    for name in driver_cols:
        for lag in (1, 2):
            frame[f"{name}_lag{lag}"] = frame[name].shift(lag)

    # Current value and one-step-ahead label of the target
    current = frame["target"]
    frame["target_t"] = current
    frame["target_t_plus_1"] = current.shift(-1)

    # Remove everything that is not an engineered column or an identifier
    to_remove = ["month", *driver_cols, "target"]
    return frame.drop(columns=to_remove).reset_index(drop=True)

def forecast_region(region, model_path, raw_df, n_periods=7):
    """
    Roll a saved one-step-ahead model forward for n_periods months.

    Each step builds features from the history so far, predicts the month
    after the last one, and appends that month to the history with the
    predicted target and the driver values carried over unchanged from the
    previous row.

    Parameters:
        region: region label; not used by the computation, kept so existing
            callers keep working.
        model_path: path of the joblib file holding the fitted model.
        raw_df: monthly rows of one region with the columns Region,
            JahrMonat, target and the four driver columns. Not modified.
        n_periods: number of months to forecast.

    Returns:
        List of (month timestamp, forecast as float) tuples, oldest first.
    """
    # NOTE(security): joblib.load unpickles the file and can therefore run
    # arbitrary code; only load model files from a trusted location.
    model = joblib.load(model_path)

    # Lags and the "last row" logic below need chronological order; the
    # stable sort leaves already ordered input untouched.
    history = raw_df.sort_values("JahrMonat", kind="stable").reset_index(drop=True)

    non_features = {"Region", "JahrMonat", "target_t", "target_t_plus_1"}
    forecasts = []

    for _ in range(n_periods):
        feat_df = make_features(history)
        feature_cols = [c for c in feat_df.columns if c not in non_features]

        # Features of the latest known month predict the month after it
        X_last_df = feat_df[feature_cols].iloc[[-1]]
        # Plain float so the history's target column keeps a numeric dtype
        y_hat = float(model.predict(X_last_df)[0])

        last = history.iloc[-1]
        next_month = last["JahrMonat"] + pd.DateOffset(months=1)
        forecasts.append((next_month, y_hat))

        # Append the forecast month as a new raw row; future driver values
        # are unknown, so the latest observed ones are carried forward.
        new_raw = {
            "Region":            last["Region"],
            "JahrMonat":         next_month,
            "target":            y_hat,
            "Anz. NeuFamilien":  last["Anz. NeuFamilien"],
            "Anz. Neukunden":    last["Anz. Neukunden"],
            "P AV neu":          last["P AV neu"],
            "SpB AV":            last["SpB AV"],
        }
        history = pd.concat([history, pd.DataFrame([new_raw])], ignore_index=True)

    return forecasts

# Script entry point: forecast N_PERIODS months for every region that has a
# saved XGBoost model and print the results.
if __name__ == "__main__":
    DATA_PATH = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_DIR = "../models"
    N_PERIODS = 7

    # Columns handed to the forecaster, in the order it expects them
    RAW_COLS = [
        "Region", "JahrMonat", "target",
        "Anz. NeuFamilien", "Anz. Neukunden", "P AV neu", "SpB AV",
    ]

    monthly_all = load_monthly_all(DATA_PATH)

    for region in monthly_all["Region"].unique():
        # Model files are named "<region with underscores>_xgb_model.pkl";
        # str() keeps this working when the region labels are not strings.
        fname = f"{str(region).replace(' ', '_')}_xgb_model.pkl"
        model_path = os.path.join(MODEL_DIR, fname)

        if not os.path.isfile(model_path):
            print(f"→ No XGB model for region '{region}', skipping.")
            continue

        raw = monthly_all.loc[monthly_all["Region"] == region, RAW_COLS]

        fc = forecast_region(region, model_path, raw, n_periods=N_PERIODS)
        print(f"\n▶ {region} forecast:")
        for dt, val in fc:
            print(f"   {dt.strftime('%Y-%m')}: {val:,.0f}")
