In [None]:
# train_pipeline

import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import optuna

# ----------------------------
# Helpers
# ----------------------------
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def time_series_splits(df, n_folds=3, val_size=6):
    """
    Yield (train_df, val_df) pairs by sliding a window of length val_size
    backwards through the data, with n_folds total validations.
    """
    n = len(df)
    for i in range(n_folds):
        train_end = n - (n_folds - i) * val_size
        val_start = train_end
        val_end   = val_start + val_size
        if val_end > n:
            break
        yield df.iloc[:train_end], df.iloc[val_start:val_end]

def load_monthly_all(data_path):
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]]
    df["JahrMonat"]      = pd.to_datetime(df["Kalendertag"].dt.to_period("M").astype(str))
    return (
        df
        .groupby(["Region","JahrMonat"])
        .agg({
            "BIWNAV AV Neug Wesu":"sum",
            "Anz. NeuFamilien":   "sum",
            "Anz. Neukunden":     "sum",
            "P AV neu":           "sum",
            "SpB AV":             "sum",
        })
        .reset_index()
        .rename(columns={"BIWNAV AV Neug Wesu":"target"})
        .sort_values(["Region","JahrMonat"])
        .reset_index(drop=True)
    )

# -----------------------------------
# Main orchestration
# -----------------------------------
def main(TEST_SIZE=6, N_TRIALS=30):
    DATA_PATH = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_DIR = "../models"

    os.makedirs(MODEL_DIR, exist_ok=True)
    monthly_all = load_monthly_all(DATA_PATH)

    for region, region_df in monthly_all.groupby("Region"):
        print(f"\n=== Region: {region} ===")
        m = region_df.sort_values("JahrMonat").copy()

        # Feature engineering
        m["month"]     = m["JahrMonat"].dt.month
        m["month_sin"] = np.sin(2 * np.pi * m["month"] / 12)
        m["month_cos"] = np.cos(2 * np.pi * m["month"] / 12)
        for col in ["Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV"]:
            m[f"{col}_lag1"] = m[col].shift(1)
            m[f"{col}_lag2"] = m[col].shift(2)
        m["target_t_plus_1"] = m["target"].shift(-1)

        # Drop raw + helper cols
        m = m.drop(columns=[
            "month","Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV","target"
        ]).reset_index(drop=True)

        full_df = m.dropna(subset=["target_t_plus_1"])
        if len(full_df) < TEST_SIZE + 6:
            print("→ Skipping (not enough history).")
            continue

        train_val_df = full_df.iloc[:-TEST_SIZE]
        test_df      = full_df.iloc[-TEST_SIZE:]
        feature_cols = [
            c for c in full_df.columns
            if c not in ["Region","JahrMonat","target_t_plus_1"]
        ]

        # Optuna objective: minimize avg MAPE over 3‐fold TS CV
        def objective(trial):
            params = {
                "objective":        "reg:squarederror",
                "random_state":     42,
                "verbosity":        0,
                "max_depth":        trial.suggest_int("max_depth", 3, 12),
                "learning_rate":    trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
                "subsample":        trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
                "n_estimators":     trial.suggest_int("n_estimators", 100, 500),
            }
            mape_scores = []
            for tr_df, val_df in time_series_splits(train_val_df, n_folds=3, val_size=TEST_SIZE):
                X_tr, y_tr   = tr_df[feature_cols], tr_df["target_t_plus_1"]
                X_val, y_val = val_df[feature_cols], val_df["target_t_plus_1"]
                model = XGBRegressor(**params)
                model.fit(X_tr, y_tr)
                preds = model.predict(X_val)
                mape_scores.append(mean_absolute_percentage_error(y_val, preds))
            return float(np.mean(mape_scores))

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)
        best_params = study.best_trial.params
        print("→ Best params:", best_params)

        # Train on train_val, evaluate hold-out
        tuned = {
            **best_params,
            "objective":    "reg:squarederror",
            "random_state": 42,
            "verbosity":    0,
        }
        prod_model = XGBRegressor(**tuned)
        prod_model.fit(train_val_df[feature_cols], train_val_df["target_t_plus_1"])
        test_preds = prod_model.predict(test_df[feature_cols])
        test_mae   = mean_absolute_error(test_df["target_t_plus_1"], test_preds)
        test_mape  = mean_absolute_percentage_error(test_df["target_t_plus_1"], test_preds)
        print(f"Hold-out ({TEST_SIZE} mo): MAE={test_mae:.0f}, MAPE={test_mape:.2f}%")

        # Retrain on full data & save
        final_model = XGBRegressor(**tuned)
        final_model.fit(full_df[feature_cols], full_df["target_t_plus_1"])
        fn = f"{region.replace(' ','_')}_xgb_model.pkl"
        joblib.dump(final_model, os.path.join(MODEL_DIR, fn))
        print(f"Saved tuned model → {fn}")


if __name__ == "__main__":
    main()


[I 2025-04-23 22:13:49,035] A new study created in memory with name: no-name-3f966a6d-45b7-495c-9ec9-9ae382d8c5b8



=== Region: Region M-O ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:13:50,407] Trial 0 finished with value: 71.1763483177249 and parameters: {'max_depth': 4, 'learning_rate': 0.12435220487508022, 'subsample': 0.6481261176436195, 'colsample_bytree': 0.593551792221262, 'n_estimators': 331}. Best is trial 0 with value: 71.1763483177249.
[I 2025-04-23 22:13:52,793] Trial 1 finished with value: 37.7029029256166 and parameters: {'max_depth': 5, 'learning_rate': 0.16118836888220672, 'subsample': 0.5500304863988297, 'colsample_bytree': 0.5442823770928287, 'n_estimators': 478}. Best is trial 1 with value: 37.7029029256166.
[I 2025-04-23 22:13:55,135] Trial 2 finished with value: 29.14291957857093 and parameters: {'max_depth': 9, 'learning_rate': 0.013629520369023035, 'subsample': 0.8975655349190854, 'colsample_bytree': 0.7671195993281514, 'n_estimators': 320}. Best is trial 2 with value: 29.14291957857093.
[I 2025-04-23 22:13:56,065] Trial 3 finished with value: 35.69146612069298 and parameters: {'max_depth': 5, 'learning_rate': 0.0742799594240

[I 2025-04-23 22:14:49,828] A new study created in memory with name: no-name-fb705fcf-bd56-4286-826e-df174c8a8228


Saved tuned model → Region_M-O_xgb_model.pkl

=== Region: Region Nord ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:14:51,193] Trial 0 finished with value: 50.89417605875777 and parameters: {'max_depth': 4, 'learning_rate': 0.02959545863178666, 'subsample': 0.8753233980722956, 'colsample_bytree': 0.8049999216434196, 'n_estimators': 340}. Best is trial 0 with value: 50.89417605875777.
[I 2025-04-23 22:14:51,859] Trial 1 finished with value: 72.1210022193715 and parameters: {'max_depth': 4, 'learning_rate': 0.14281064147564462, 'subsample': 0.5277908489963855, 'colsample_bytree': 0.8420234626376395, 'n_estimators': 164}. Best is trial 0 with value: 50.89417605875777.
[I 2025-04-23 22:14:53,360] Trial 2 finished with value: 59.70162493839812 and parameters: {'max_depth': 3, 'learning_rate': 0.24346396681831342, 'subsample': 0.6135452508126125, 'colsample_bytree': 0.8961279067883712, 'n_estimators': 476}. Best is trial 0 with value: 50.89417605875777.
[I 2025-04-23 22:14:56,262] Trial 3 finished with value: 76.75247822429685 and parameters: {'max_depth': 11, 'learning_rate': 0.053114624

[I 2025-04-23 22:15:31,492] A new study created in memory with name: no-name-3c771c96-463d-4773-97c4-81be7326cbaf


Saved tuned model → Region_Nord_xgb_model.pkl

=== Region: Region Süd ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:15:32,665] Trial 0 finished with value: 67.87526968613058 and parameters: {'max_depth': 4, 'learning_rate': 0.021671276755740505, 'subsample': 0.506950236608732, 'colsample_bytree': 0.9414476000073193, 'n_estimators': 293}. Best is trial 0 with value: 67.87526968613058.
[I 2025-04-23 22:15:34,461] Trial 1 finished with value: 119.50529543473412 and parameters: {'max_depth': 7, 'learning_rate': 0.22784715312857406, 'subsample': 0.6644411938392878, 'colsample_bytree': 0.7870902114834561, 'n_estimators': 277}. Best is trial 0 with value: 67.87526968613058.
[I 2025-04-23 22:15:36,983] Trial 2 finished with value: 82.79661195574035 and parameters: {'max_depth': 6, 'learning_rate': 0.12145521735021095, 'subsample': 0.9248383096658235, 'colsample_bytree': 0.5366318370259475, 'n_estimators': 445}. Best is trial 0 with value: 67.87526968613058.
[I 2025-04-23 22:15:39,799] Trial 3 finished with value: 112.77712769084332 and parameters: {'max_depth': 7, 'learning_rate': 0.1695928

[I 2025-04-23 22:16:42,349] A new study created in memory with name: no-name-c5e84db0-1595-48e2-9f32-201c004ad98d


Saved tuned model → Region_Süd_xgb_model.pkl

=== Region: Region West ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:16:44,704] Trial 0 finished with value: 41.077699130927705 and parameters: {'max_depth': 5, 'learning_rate': 0.06913550822111443, 'subsample': 0.7415358681942342, 'colsample_bytree': 0.9967955683173302, 'n_estimators': 486}. Best is trial 0 with value: 41.077699130927705.
[I 2025-04-23 22:16:47,803] Trial 1 finished with value: 44.08416902995683 and parameters: {'max_depth': 9, 'learning_rate': 0.010552508652871903, 'subsample': 0.6896242804396344, 'colsample_bytree': 0.7248186622214439, 'n_estimators': 436}. Best is trial 0 with value: 41.077699130927705.
[I 2025-04-23 22:16:49,237] Trial 2 finished with value: 49.50057797136626 and parameters: {'max_depth': 6, 'learning_rate': 0.050568986796494304, 'subsample': 0.706774557046695, 'colsample_bytree': 0.8406581453680968, 'n_estimators': 253}. Best is trial 0 with value: 41.077699130927705.
[I 2025-04-23 22:16:50,242] Trial 3 finished with value: 54.17720257391199 and parameters: {'max_depth': 4, 'learning_rate': 0.0130

[I 2025-04-23 22:18:01,064] A new study created in memory with name: no-name-09206328-1da8-41fd-a88f-e4d4b6fe85dd


Saved tuned model → Region_West_xgb_model.pkl

=== Region: Vertrieb M-O ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:18:04,716] Trial 0 finished with value: 93.00121620200684 and parameters: {'max_depth': 8, 'learning_rate': 0.022078481506036397, 'subsample': 0.6180140093157834, 'colsample_bytree': 0.6484304890416078, 'n_estimators': 500}. Best is trial 0 with value: 93.00121620200684.
[I 2025-04-23 22:18:07,433] Trial 1 finished with value: 50.67502251413688 and parameters: {'max_depth': 7, 'learning_rate': 0.10329089452847351, 'subsample': 0.6127652988743824, 'colsample_bytree': 0.907500408251466, 'n_estimators': 416}. Best is trial 1 with value: 50.67502251413688.
[I 2025-04-23 22:18:10,804] Trial 2 finished with value: 65.56145361623948 and parameters: {'max_depth': 11, 'learning_rate': 0.027023429322462832, 'subsample': 0.660966184861162, 'colsample_bytree': 0.8478555464352182, 'n_estimators': 368}. Best is trial 1 with value: 50.67502251413688.
[I 2025-04-23 22:18:12,159] Trial 3 finished with value: 92.1754486275966 and parameters: {'max_depth': 4, 'learning_rate': 0.045461756

[I 2025-04-23 22:19:02,075] A new study created in memory with name: no-name-cf8f5bfe-1692-4b75-bdc9-24a1371439e3


Saved tuned model → Vertrieb_M-O_xgb_model.pkl

=== Region: Vertrieb N-W ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:19:03,450] Trial 0 finished with value: 114.13748993294382 and parameters: {'max_depth': 4, 'learning_rate': 0.03231974084393157, 'subsample': 0.9988442147551311, 'colsample_bytree': 0.5040561853158487, 'n_estimators': 342}. Best is trial 0 with value: 114.13748993294382.
[I 2025-04-23 22:19:06,388] Trial 1 finished with value: 123.944411265862 and parameters: {'max_depth': 12, 'learning_rate': 0.011773648782812916, 'subsample': 0.7452254540287155, 'colsample_bytree': 0.5187562964899111, 'n_estimators': 332}. Best is trial 0 with value: 114.13748993294382.
[I 2025-04-23 22:19:08,350] Trial 2 finished with value: 91.49216562980183 and parameters: {'max_depth': 6, 'learning_rate': 0.023816313744436688, 'subsample': 0.6722789767439398, 'colsample_bytree': 0.7651602824682704, 'n_estimators': 345}. Best is trial 2 with value: 91.49216562980183.
[I 2025-04-23 22:19:09,421] Trial 3 finished with value: 87.58049776628006 and parameters: {'max_depth': 5, 'learning_rate': 0.0127

[I 2025-04-23 22:20:00,377] A new study created in memory with name: no-name-fc1f67c9-6438-482b-833e-0858fc13fca8


Saved tuned model → Vertrieb_N-W_xgb_model.pkl

=== Region: Vertrieb S ===


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:20:01,732] Trial 0 finished with value: 71.41541303824212 and parameters: {'max_depth': 5, 'learning_rate': 0.031433126892498525, 'subsample': 0.8875265361388862, 'colsample_bytree': 0.7474245111734532, 'n_estimators': 270}. Best is trial 0 with value: 71.41541303824212.
[I 2025-04-23 22:20:03,224] Trial 1 finished with value: 122.98258271020158 and parameters: {'max_depth': 10, 'learning_rate': 0.014111562380875742, 'subsample': 0.9342457401862911, 'colsample_bytree': 0.5261071547880248, 'n_estimators': 172}. Best is trial 0 with value: 71.41541303824212.
[I 2025-04-23 22:20:04,977] Trial 2 finished with value: 100.13265560500507 and parameters: {'max_depth': 5, 'learning_rate': 0.010684133533383952, 'subsample': 0.6299016567928021, 'colsample_bytree': 0.5566180888061507, 'n_estimators': 358}. Best is trial 0 with value: 71.41541303824212.
[I 2025-04-23 22:20:06,372] Trial 3 finished with value: 78.5702493027464 and parameters: {'max_depth': 12, 'learning_rate': 0.189

In [2]:
# predict_pipeline

import os
import pandas as pd
import numpy as np
import joblib

def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def load_monthly_all(data_path):
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]]
    df["JahrMonat"] = pd.to_datetime(df["Kalendertag"].dt.to_period("M").astype(str))
    return (
        df
        .groupby(["Region","JahrMonat"])
        .agg({
            "BIWNAV AV Neug Wesu":"sum",
            "Anz. NeuFamilien":   "sum",
            "Anz. Neukunden":     "sum",
            "P AV neu":           "sum",
            "SpB AV":             "sum",
        })
        .reset_index()
        .rename(columns={"BIWNAV AV Neug Wesu":"target"})
        .sort_values(["Region","JahrMonat"])
        .reset_index(drop=True)
    )

def make_features(monthly):
    m = monthly.copy()
    # time features
    m["month"]     = m["JahrMonat"].dt.month
    m["month_sin"] = np.sin(2*np.pi*m["month"]/12)
    m["month_cos"] = np.cos(2*np.pi*m["month"]/12)
    # exogenous lags
    for col in ["Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV"]:
        m[f"{col}_lag1"] = m[col].shift(1)
        m[f"{col}_lag2"] = m[col].shift(2)
    # shifted target
    m["target_t"]        = m["target"]
    m["target_t_plus_1"] = m["target"].shift(-1)
    # drop exactly as in training
    m = m.drop(columns=[
        "month",
        "Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV",
        "target"
    ])
    return m.reset_index(drop=True)

def forecast_region(region, model_path, raw_df, n_periods=7):
    model = joblib.load(model_path)
    history = raw_df.copy().reset_index(drop=True)
    forecasts = []

    for _ in range(n_periods):
        feat_df = make_features(history)
        feature_cols = [
            c for c in feat_df.columns
            if c not in ["Region","JahrMonat","target_t","target_t_plus_1"]
        ]

        X_last_df = feat_df[feature_cols].iloc[[-1]]
        y_hat     = model.predict(X_last_df)[0]

        next_month = history["JahrMonat"].iloc[-1] + pd.DateOffset(months=1)
        forecasts.append((next_month, y_hat))

        # append new raw row
        last = history.iloc[-1]
        new_raw = {
            "Region":            last["Region"],
            "JahrMonat":         next_month,
            "target":            y_hat,
            "Anz. NeuFamilien":  last["Anz. NeuFamilien"],
            "Anz. Neukunden":    last["Anz. Neukunden"],
            "P AV neu":          last["P AV neu"],
            "SpB AV":            last["SpB AV"],
        }
        history = pd.concat([history, pd.DataFrame([new_raw])], ignore_index=True)

    return forecasts

if __name__ == "__main__":
    DATA_PATH = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_DIR = "../models"
    N_PERIODS = 7

    monthly_all = load_monthly_all(DATA_PATH)

    for region in monthly_all["Region"].unique():
        # look for the XGB model filename, not RF
        fname = f"{region.replace(' ', '_')}_xgb_model.pkl"
        model_path = os.path.join(MODEL_DIR, fname)

        if not os.path.isfile(model_path):
            print(f"→ No XGB model for region '{region}', skipping.")
            continue

        raw = monthly_all[monthly_all["Region"] == region][[
            "Region","JahrMonat","target",
            "Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV"
        ]]

        fc = forecast_region(region, model_path, raw, n_periods=N_PERIODS)
        print(f"\n▶ {region} forecast:")
        for dt, val in fc:
            print(f"   {dt.strftime('%Y-%m')}: {val:,.0f}")



▶ Region M-O forecast:
   2025-01: 7,660,856
   2025-02: 20,838,056
   2025-03: 27,552,576
   2025-04: 25,816,902
   2025-05: 24,766,528
   2025-06: 25,035,296
   2025-07: 28,090,496

▶ Region Nord forecast:
   2025-01: 27,254,244
   2025-02: 38,389,000
   2025-03: 43,481,520
   2025-04: 56,661,388
   2025-05: 66,474,236
   2025-06: 78,716,296
   2025-07: 81,416,560

▶ Region Süd forecast:
   2025-01: 15,537,219
   2025-02: 21,990,030
   2025-03: 22,179,284
   2025-04: 28,705,136
   2025-05: 38,455,448
   2025-06: 72,721,016
   2025-07: 75,737,368

▶ Region West forecast:
   2025-01: 14,437,361
   2025-02: 30,039,400
   2025-03: 32,408,718
   2025-04: 44,663,032
   2025-05: 50,143,964
   2025-06: 71,562,448
   2025-07: 75,342,464

▶ Vertrieb M-O forecast:
   2025-01: 29,480,394
   2025-02: 43,541,052
   2025-03: 67,059,416
   2025-04: 99,413,216
   2025-05: 157,285,248
   2025-06: 179,251,968
   2025-07: 193,097,344

▶ Vertrieb N-W forecast:
   2025-01: 71,063,360
   2025-02: 93,868,3