In [13]:
# train_pipeline

import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import optuna

# -----------------------------------
# Helpers
# -----------------------------------
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def time_series_splits(df, n_folds=3, val_size=6):
    """Yield (train_df, val_df) forward‐chaining splits."""
    n = len(df)
    for i in range(n_folds):
        train_end = n - (n_folds - i) * val_size
        val_start = train_end
        val_end   = val_start + val_size
        if val_end > n:
            break
        yield df.iloc[:train_end], df.iloc[val_start:val_end]

def load_and_preprocess(data_path):
    """
    Load CSV, filter dates, aggregate to monthly, create
    the same features & dropped columns as in training.
    """
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]]
    df["JahrMonat"]      = pd.to_datetime(df["Kalendertag"].dt.to_period("M").astype(str))

    monthly = (
        df.groupby("JahrMonat")
          .agg({
            "BIWNAV AV Neug Wesu":"sum",
            "Anz. NeuFamilien":    "sum",
            "Anz. Neukunden":      "sum",
            "P AV neu":            "sum",
            "SpB AV":              "sum",
          })
          .reset_index()
          .rename(columns={"BIWNAV AV Neug Wesu":"target"})
          .sort_values("JahrMonat")
          .reset_index(drop=True)
    )

    # Feature engineering
    monthly["month"]     = monthly["JahrMonat"].dt.month
    monthly["month_sin"] = np.sin(2 * np.pi * monthly["month"] / 12)
    monthly["month_cos"] = np.cos(2 * np.pi * monthly["month"] / 12)
    for col in ["Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV"]:
        monthly[f"{col}_lag1"] = monthly[col].shift(1)
        monthly[f"{col}_lag2"] = monthly[col].shift(2)
    monthly["target_t_plus_1"] = monthly["target"].shift(-1)

    monthly = (
        monthly
        .drop(columns=[
            "month",
            "Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV",
            "target"
        ])
        .dropna(subset=["target_t_plus_1"])
        .reset_index(drop=True)
    )
    return monthly

# -----------------------------------
# Main orchestration
# -----------------------------------
def main(TEST_SIZE=6, N_TRIALS=30):
    DATA_PATH  = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    MODEL_PATH = "../models/global_xgb_model.pkl"

    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    monthly       = load_and_preprocess(DATA_PATH)
    train_val_df  = monthly.iloc[:-TEST_SIZE]
    test_df       = monthly.iloc[-TEST_SIZE:]
    feature_cols  = [c for c in monthly.columns if c not in ["JahrMonat","target_t_plus_1"]]

    # 1) Optuna hyperparameter tuning
    def objective(trial):
        params = {
            "objective":        "reg:squarederror",
            "tree_method":      "hist",
            "random_state":     42,
            "verbosity":        0,
            "max_depth":        trial.suggest_int("max_depth", 3, 12),
            "learning_rate":    trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample":        trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "n_estimators":     trial.suggest_int("n_estimators", 100, 500),
        }
        mape_scores = []
        for tr_df, val_df in time_series_splits(train_val_df, n_folds=3, val_size=TEST_SIZE):
            X_tr, y_tr   = tr_df[feature_cols], tr_df["target_t_plus_1"]
            X_val, y_val = val_df[feature_cols], val_df["target_t_plus_1"]
            model = XGBRegressor(**params)
            model.fit(X_tr, y_tr)
            preds = model.predict(X_val)
            mape_scores.append(mean_absolute_percentage_error(y_val, preds))
        return float(np.mean(mape_scores))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)
    best_params = study.best_trial.params
    print("Best hyperparameters:", best_params)

    tuned = {
        **best_params,
        "objective":    "reg:squarederror",
        "random_state": 42,
        "verbosity":    0,
    }

    # 2) Train on train_val and evaluate on hold-out
    prod_model = XGBRegressor(**tuned)
    prod_model.fit(train_val_df[feature_cols], train_val_df["target_t_plus_1"])
    test_preds = prod_model.predict(test_df[feature_cols])
    test_mae   = mean_absolute_error(test_df["target_t_plus_1"], test_preds)
    test_mape  = mean_absolute_percentage_error(test_df["target_t_plus_1"], test_preds)
    print(f"Hold‐out Test ({TEST_SIZE} mo): MAE={test_mae:.0f}, MAPE={test_mape:.2f}%")

    # 3) Retrain on full data and save
    final_model = XGBRegressor(**tuned)
    final_model.fit(monthly[feature_cols], monthly["target_t_plus_1"])
    joblib.dump(final_model, MODEL_PATH)
    print(f"Saved tuned model to {MODEL_PATH}")

if __name__ == "__main__":
    main()

[I 2025-04-23 22:29:53,460] A new study created in memory with name: no-name-d978315b-c0c9-4057-9b46-a0f8af854af5


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-04-23 22:29:55,177] Trial 0 finished with value: 11.176479343716721 and parameters: {'max_depth': 9, 'learning_rate': 0.27860476988145516, 'subsample': 0.5585388479136153, 'colsample_bytree': 0.756262202262777, 'n_estimators': 201}. Best is trial 0 with value: 11.176479343716721.
[I 2025-04-23 22:29:57,032] Trial 1 finished with value: 10.664759540617956 and parameters: {'max_depth': 6, 'learning_rate': 0.07917615836556838, 'subsample': 0.6599234049749177, 'colsample_bytree': 0.5863936511111316, 'n_estimators': 325}. Best is trial 1 with value: 10.664759540617956.
[I 2025-04-23 22:29:58,465] Trial 2 finished with value: 9.455923782853079 and parameters: {'max_depth': 5, 'learning_rate': 0.08025879108764446, 'subsample': 0.8602068824456328, 'colsample_bytree': 0.699945804709013, 'n_estimators': 295}. Best is trial 2 with value: 9.455923782853079.
[I 2025-04-23 22:30:00,286] Trial 3 finished with value: 8.709272316788171 and parameters: {'max_depth': 10, 'learning_rate': 0.055073

In [None]:
# predict_pipeline

import os
import pandas as pd
import numpy as np
import joblib

def load_and_preprocess(data_path):
    """
    Load the CSV, filter dates, aggregate to monthly and
    apply the same feature‐engineering/dropping as in training.
    Returns the prepared `monthly` DataFrame.
    """
    df = pd.read_csv(data_path, low_memory=False)
    df["Kalendertag"]    = pd.to_datetime(df["Kalendertag"], errors="coerce")
    df["Eintrittsdatum"] = pd.to_datetime(df["Eintrittsdatum"], errors="coerce")
    df = df[df["Kalendertag"] >= df["Eintrittsdatum"]]
    df["JahrMonat"] = pd.to_datetime(df["Kalendertag"].dt.to_period("M").astype(str))

    monthly = df.groupby("JahrMonat").agg({
        "BIWNAV AV Neug Wesu": "sum",
        "Anz. NeuFamilien":     "sum",
        "Anz. Neukunden":       "sum",
        "P AV neu":             "sum",
        "SpB AV":               "sum",
    }).reset_index().rename(columns={"BIWNAV AV Neug Wesu": "target"})

    monthly = monthly.sort_values("JahrMonat").copy()
    monthly["target_t"]      = monthly["target"]
    monthly["month"]         = monthly["JahrMonat"].dt.month
    monthly["month_sin"]     = np.sin(2 * np.pi * monthly["month"] / 12)
    monthly["month_cos"]     = np.cos(2 * np.pi * monthly["month"] / 12)

    for col in ["Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV"]:
        monthly[f"{col}_lag1"] = monthly[col].shift(1)
        monthly[f"{col}_lag2"] = monthly[col].shift(2)

    monthly["target_t_plus_1"] = monthly["target"].shift(-1)

    # drop exactly the same columns as in training
    monthly = monthly.drop(columns=[
        "month",
        "Anz. NeuFamilien","Anz. Neukunden","P AV neu","SpB AV",
        "target"
    ])
    return monthly.reset_index(drop=True)

def forecast_periods(model_path, data_path, n_periods=7):
    """
    Load the XGBoost model & data, then iteratively forecast `n_periods` ahead.
    Returns a list of (JahrMonat (Timestamp), forecast_value).
    """
    if not os.path.isfile(model_path):
        raise FileNotFoundError(f"Model not found at {model_path}. Please run training first.")
    model = joblib.load(model_path)

    monthly = load_and_preprocess(data_path)
    feature_cols = [c for c in monthly.columns if c not in ["JahrMonat","target_t","target_t_plus_1"]]

    # seed with last known features
    last_feats = monthly.iloc[-1][feature_cols].copy()
    last_date  = monthly.iloc[-1]["JahrMonat"]

    forecasts = []
    for _ in range(n_periods):
        next_date = last_date + pd.DateOffset(months=1)

        # update cycle
        sin = np.sin(2 * np.pi * next_date.month / 12)
        cos = np.cos(2 * np.pi * next_date.month / 12)
        feats = last_feats.copy()
        feats["month_sin"], feats["month_cos"] = sin, cos

        # build 1×n DataFrame so sklearn sees names
        X_pred_df = pd.DataFrame([feats], columns=feature_cols)
        y_pred    = model.predict(X_pred_df)[0]

        forecasts.append((next_date, y_pred))

        # advance
        last_date  = next_date
        # if you need to feed predictions back into lags/target, do it here:
        # e.g. last_feats["target_lag1"] = last_feats["target_lag2"]
        #       last_feats["target_lag2"] = y_pred

    return forecasts


if __name__ == "__main__":
    # point to your tuned XGBoost artifact
    MODEL_PATH = "../models/global_xgb_model.pkl"
    DATA_PATH  = "../data/4_PrognoseCase_AV_NG_-_Daten.csv"
    results    = forecast_periods(MODEL_PATH, DATA_PATH, n_periods=7)

    for period, value in results:
        print(f"{period.strftime('%Y-%m')}: {value:,.0f}")


2025-01: 182,819,312
2025-02: 282,813,248
2025-03: 462,352,448
2025-04: 704,966,656
2025-05: 721,588,736
2025-06: 722,822,976
2025-07: 1,795,055,232
