A (cluster model) + B (stabilizer features) + C (treatment outlier) Tambah D (Tweedie) dan E (hierarchy) sebagai refinement.

In [None]:
# app/profiling/sku_profiler.py

import pandas as pd
import numpy as np

def build_sku_profile(df: pd.DataFrame) -> pd.DataFrame:
    df["qty"] = df["qty"].astype(float)

    profile = (
        df.groupby(["cabang", "sku"])
          .agg(
              n_months=("periode", "nunique"),
              qty_mean=("qty", "mean"),
              qty_std=("qty", "std"),
              qty_max=("qty", "max"),
              qty_min=("qty", "min"),
              total_qty=("qty", "sum"),
              zero_months=("qty", lambda x: (x == 0).sum()),
          )
          .reset_index()
    )

    profile["zero_ratio"] = profile["zero_months"] / profile["n_months"]
    profile["cv"] = profile["qty_std"] / profile["qty_mean"].replace(0, np.nan)

    # demand_level_bin
    profile["demand_level"] = pd.qcut(profile["qty_mean"], q=4, labels=[0,1,2,3])

    return profile


In [3]:
# app/profiling/clustering.py

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

def run_sku_clustering(profile: pd.DataFrame, n_clusters=4) -> pd.DataFrame:
    
    cluster_feats = ["qty_mean", "cv", "zero_ratio"]
    prof_clean = profile.dropna(subset=cluster_feats).copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(prof_clean[cluster_feats].values)

    km = KMeans(n_clusters=n_clusters, random_state=1337, n_init="auto")
    prof_clean["cluster"] = km.fit_predict(X_scaled)

    profile = profile.merge(
        prof_clean[["cabang", "sku", "cluster"]],
        on=["cabang", "sku"],
        how="left"
    )

    profile["cluster"] = profile["cluster"].fillna(-1).astype(int)
    return profile


In [14]:
# app/features/stabilizer_features.py

import pandas as pd
import numpy as np

def add_stabilizer_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Tambah fitur stabilizer:
    - qty_mean_cs, qty_std_cs, qty_cnt_cs: dihitung HANYA dari data train (is_train == 1)
    - seasonal_ratio_12, volatility_score: dihitung di level panel dengan rolling causal
    """
    df = df.copy()

    # Stats SKU HANYA dari train (supaya tidak ngintip test)
    df_train = df[df["is_train"] == 1].copy()

    sku_stats = (
        df_train.groupby(["cabang","sku"])["qty"]
        .agg(["mean","std","count"])
        .reset_index()
    )
    sku_stats.columns = ["cabang","sku","qty_mean_cs","qty_std_cs","qty_cnt_cs"]

    # merge ke full panel (train + test)
    df = df.merge(sku_stats, on=["cabang","sku"], how="left")

    # rolling + seasonal ratio pakai seri full, tapi rolling itu causal (hanya masa lalu)
    df = df.sort_values(["cabang","sku","periode"])

    df["roll_mean_3"] = (
        df.groupby(["cabang","sku"])["qty"]
          .transform(lambda x: x.rolling(3, min_periods=1).mean())
    )
    df["roll_mean_12"] = (
        df.groupby(["cabang","sku"])["qty"]
          .transform(lambda x: x.rolling(12, min_periods=1).mean())
    )
    df["seasonal_ratio_12"] = df["roll_mean_3"] / (df["roll_mean_12"] + 1e-9)

    # volatility_score dari stats train
    df["volatility_score"] = df["qty_std_cs"] / (df["qty_mean_cs"] + 1e-9)

    return df


In [15]:
# app/features/outlier_handler.py

import pandas as pd

def winsorize_outliers(df: pd.DataFrame, clip_ratio=0.01) -> pd.DataFrame:
    """
    Winsorize qty pakai quantile dari TRAIN saja, lalu diaplikasikan ke seluruh periode.
    """
    df = df.copy()

    df_train = df[df["is_train"] == 1].copy()

    q_stats = (
        df_train.groupby(["cabang","sku"])["qty"]
        .quantile([clip_ratio, 1 - clip_ratio])
        .unstack()
        .reset_index()
    )
    q_stats.columns = ["cabang","sku","q_low","q_high"]

    df = df.merge(q_stats, on=["cabang","sku"], how="left")

    # kalau ada SKU yang tidak punya train (edge case), pakai qty apa adanya
    df["qty_wins"] = df["qty"]
    mask = df["q_low"].notna()

    df.loc[mask, "qty_wins"] = df.loc[mask].apply(
        lambda row: max(min(row["qty"], row["q_high"]), row["q_low"]), axis=1
    )

    return df


In [16]:
# app/features/hierarchy_features.py

import pandas as pd

def add_hierarchy_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    def sku_family(s):
        s = str(s).upper().strip()

        if s.endswith("CHAR"):
            return "char"
        if s.endswith("CPOX"):
            return "cpox"
        if s.endswith("CSW"):
            return "csw"
        if s.endswith("CSB") or "KSB" in s:
            return "csb"

        # fallback: prefix 4 huruf
        return s[:4]

    df["family"] = df["sku"].apply(sku_family)
    return df


In [7]:
# app/modeling/tweedie_params.py

def get_tweedie_params():
    return {
        "objective": "tweedie",
        "tweedie_variance_power": 1.25,
        "metric": "rmse",
        "verbosity": -1,
        "force_row_wise": True,
        "seed": 1337,
    }


In [None]:
# app/modeling/lgbm_trainer_cluster.py

import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler


def train_lgbm_per_cluster(
    df: pd.DataFrame,
    cluster_id: int,
    feature_cols: list,
    log_target=True,
    n_trials=40,
):
    df_c = df[df["cluster"] == cluster_id].copy()
    if df_c.empty:
        print("Cluster", cluster_id, "kosong. Skip.")
        return None

    df_c = df_c.sort_values(["cabang","sku","periode"]).reset_index(drop=True)

    if log_target:
        df_c["tgt"] = np.log1p(df_c["qty_wins"])
    else:
        df_c["tgt"] = df_c["qty_wins"]

    train_all = df_c[df_c["is_train"] == 1].copy()
    val_cutoff = pd.Timestamp("2024-02-01")

    train_inner = train_all[train_all["periode"] < val_cutoff]
    val_inner   = train_all[train_all["periode"] >= val_cutoff]

    if train_inner.empty or val_inner.empty:
        return None

    X_train = train_inner[feature_cols]
    X_val   = val_inner[feature_cols]

    y_train = train_inner["tgt"].values
    y_val   = val_inner["tgt"].values

    # optuna objective
    def objective(trial):
        params = get_tweedie_params()
        params.update({
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 31, 255),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 12),
        })

        train_set = lgb.Dataset(X_train, y_train)
        val_set   = lgb.Dataset(X_val, y_val)

        model = lgb.train(
            params,
            train_set,
            num_boost_round=2000,
            valid_sets=[train_set, val_set],
            valid_names=["train","val"],
            callbacks=[
                lgb.early_stopping(stopping_rounds=100, verbose=False)
            ],
        )

        pred_val = model.predict(X_val, num_iteration=model.best_iteration)

        if log_target:
            pred_val = np.expm1(pred_val)

        true_val = val_inner["qty"].values
        rmse_val = np.sqrt(np.mean((true_val - pred_val)**2))
        return rmse_val

    study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=1337))
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

    best_iter = study.best_trial.user_attrs.get("best_iteration", 200)
    best_params = study.best_params

    final_params = get_tweedie_params()
    final_params.update(best_params)

    train_full = df_c[df_c["is_train"] == 1]
    X_full = train_full[feature_cols]
    y_full = train_full["tgt"].values

    model = lgb.train(
        final_params,
        lgb.Dataset(X_full, y_full),
        num_boost_round=best_iter,
    )

    return model


In [10]:
# app/inference/predict_cluster_pipeline.py

import numpy as np
import pandas as pd
import lightgbm as lgb

def load_cluster_models(path_dict):
    """path_dict: { cluster_id : model_path }"""
    models = {}
    for cid, path in path_dict.items():
        models[cid] = lgb.Booster(model_file=str(path))
    return models

def predict_full(df, models, feature_cols):
    df = df.copy()
    preds = []

    for cid, model in models.items():
        df_c = df[df["cluster"] == cid].copy()
        if df_c.empty:
            continue

        pred = model.predict(df_c[feature_cols])
        pred = np.expm1(pred)
        df_c["pred_qty"] = pred
        preds.append(df_c)

    return pd.concat(preds, axis=0).sort_index()


In [21]:
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

# ==============================
# METRIC FUNCTIONS
# ==============================
def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))


def mse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)


def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))


def mape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0


def smape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true) + np.abs(y_pred), eps)
    return np.mean(2.0 * np.abs(y_true - y_pred) / denom) * 100.0


# ==============================
# PATH CONFIG
# ==============================
PROJECT_ROOT   = Path(r"D:\Documents\Skripsi\demand-forecasting")
DATASET15_DIR  = PROJECT_ROOT / "data" / "dataset_15"

DATA_PATH      = DATASET15_DIR / "lgbm_dataset_15_fullfeat.csv"

OUT_ROOT       = PROJECT_ROOT / "outputs" / "lgbm_15_clusters_tweedie_noleak"
MODEL_DIR      = OUT_ROOT / "models"
METRIC_DIR     = OUT_ROOT / "metrics"
PLOT_DIR       = OUT_ROOT / "plots_per_series"
DIAG_DIR       = OUT_ROOT / "diagnostics"

for d in [OUT_ROOT, MODEL_DIR, METRIC_DIR, PLOT_DIR, DIAG_DIR]:
    d.mkdir(parents=True, exist_ok=True)


def main():
    print("====================================")
    print("RUN FULL TRAINING (NO LEAK): A+B+C+D+E")
    print("====================================")
    print("Load data:", DATA_PATH)

    df = pd.read_csv(DATA_PATH, parse_dates=["periode"])
    print("Rows:", len(df))

    df["qty"] = df["qty"].astype(float)
    df = df.sort_values(["cabang","sku","periode"]).reset_index(drop=True)

    # ----------------------------------
    # Step 2: build SKU profile dari TRAIN
    # ----------------------------------
    print("\n[STEP 2] Build SKU profile dari TRAIN...")
    df_train = df[df["is_train"] == 1].copy()
    profile = build_sku_profile(df_train)
    PROFILE_PATH = DATASET15_DIR / "cluster_profiles_raw_train_only.csv"
    profile.to_csv(PROFILE_PATH, index=False)
    print("Saved raw train profile to:", PROFILE_PATH)

    # ----------------------------------
    # Step 3: clustering (A) dari TRAIN
    # ----------------------------------
    print("\n[STEP 3] Clustering SKU (TRAIN only)...")
    profile_clustered = run_sku_clustering(profile, n_clusters=4)
    PROFILE_CLUSTER_PATH = DATASET15_DIR / "cluster_profiles_lgbm15_train_only.csv"
    profile_clustered.to_csv(PROFILE_CLUSTER_PATH, index=False)
    print("Saved clustered profile to:", PROFILE_CLUSTER_PATH)
    print("Cluster summary (train stats):")
    print(
        profile_clustered.groupby("cluster")[["qty_mean", "cv", "zero_ratio", "total_qty"]]
        .mean()
        .round(2)
        .to_string()
    )

    # ----------------------------------
    # Step 4: merge cluster + demand_level ke panel penuh
    # ----------------------------------
    print("\n[STEP 4] Merge cluster dan demand_level ke panel (train+test)...")
    df = df.merge(
        profile_clustered[["cabang", "sku", "cluster", "demand_level"]],
        on=["cabang", "sku"],
        how="left",
    )
    df["cluster"] = df["cluster"].fillna(-1).astype(int)

    # ----------------------------------
    # Step 5: add hierarchy features (E)
    # ----------------------------------
    print("\n[STEP 5] Tambah hierarchy features (family)...")
    df = add_hierarchy_features(df)

    # Encode family -> family_idx (numeric)
    if "family" in df.columns:
        family_map = {
            fam: idx for idx, fam in enumerate(sorted(df["family"].astype(str).unique()))
        }
        df["family_idx"] = df["family"].astype(str).map(family_map).astype("int16")
        print("Family mapping:", family_map)

    # ----------------------------------
    # Step 6: add stabilizer features (B) - pakai stats TRAIN
    # ----------------------------------
    print("\n[STEP 6] Tambah stabilizer features (no leak)...")
    df = add_stabilizer_features(df)

    # ----------------------------------
    # Step 7: outlier treatment (C) - quantile dari TRAIN
    # ----------------------------------
    print("\n[STEP 7] Winsorize outliers per SKU (no leak)...")
    df = winsorize_outliers(df)

    # backup log1p original qty juga, kalau mau analisis
    df["log_qty"] = np.log1p(df["qty"])
    df["log_qty_wins"] = np.log1p(df["qty_wins"])

    df = df.sort_values(["cabang", "sku", "periode"]).reset_index(drop=True)

    # ----------------------------------
    # Step 8: feature columns
    # ----------------------------------
    drop_cols = [
        "area",
        "cabang",
        "sku",
        "periode",
        "qty",
        "qty_wins",
        "log_qty",
        "log_qty_wins",
        "is_train",
        "is_test",
        "sample_weight",
        "family",
    ]

    feature_cols = [c for c in df.columns if c not in drop_cols]

    print("\n[STEP 8] Num features:", len(feature_cols))
    print("Contoh fitur:", feature_cols[:20])

    obj_cols = df[feature_cols].select_dtypes(include=["object"]).columns.tolist()
    if obj_cols:
        print("WARNING: Masih ada kolom object di feature_cols:", obj_cols)

    # ----------------------------------
    # Step 9: train per cluster
    # ----------------------------------
    print("\n[STEP 9] Training LGBM per cluster (Tweedie, no leak)...")
    cluster_ids = sorted(df["cluster"].dropna().unique())
    models: Dict[int, lgb.Booster] = {}

    for cid in cluster_ids:
        if cid == -1:
            print(f"Cluster {cid} = -1 (unknown), skip training.")
            continue

        print("\n====================================")
        print(f"TRAINING CLUSTER {cid}")
        print("====================================")

        model = train_lgbm_per_cluster(
            df=df,
            cluster_id=int(cid),
            feature_cols=feature_cols,
            log_target=True,
            n_trials=40,
        )

        if model is None:
            print(f"Cluster {cid}: model is None, skip saving.")
            continue

        models[cid] = model

        model_path = MODEL_DIR / f"lgbm_15_cluster_{cid}.txt"
        model.save_model(str(model_path))
        print(f"Cluster {cid}: model saved to {model_path}")

    if not models:
        raise RuntimeError("Tidak ada model yang berhasil dilatih. Cek cluster atau flag is_train.")

    # ----------------------------------
    # Step 10: prediksi penuh
    # ----------------------------------
    print("\n[STEP 10] Prediksi penuh (train + test) per cluster...")
    df_pred_list = []

    for cid, model in models.items():
        df_c = df[df["cluster"] == cid].copy()
        if df_c.empty:
            continue

        X_c = df_c[feature_cols]
        pred_log = model.predict(X_c)
        pred_qty = np.expm1(pred_log)

        df_c["pred_qty"] = pred_qty
        df_pred_list.append(df_c)

    df_pred = pd.concat(df_pred_list, axis=0).sort_index()
    PRED_PATH = OUT_ROOT / "panel_with_predictions.csv"
    df_pred.to_csv(PRED_PATH, index=False)
    print("Saved full panel with predictions to:", PRED_PATH)

    # ----------------------------------
    # Step 11: GLOBAL METRICS
    # ----------------------------------
    print("\n[STEP 11] Global metrics train/test...")
    metrics_global = []

    for split_name, mask in [
        ("train", df_pred["is_train"] == 1),
        ("test", df_pred["is_test"] == 1),
    ]:
        if not mask.any():
            continue

        yt = df_pred.loc[mask, "qty"].values
        yp = df_pred.loc[mask, "pred_qty"].values

        metrics_global.append({
            "split": split_name,
            "n_obs": int(len(yt)),
            "MSE": mse(yt, yp),
            "RMSE": rmse(yt, yp),
            "MAE": mae(yt, yp),
            "MAPE": mape(yt, yp),
            "sMAPE": smape(yt, yp),
        })

    global_df = pd.DataFrame(metrics_global)
    GLOBAL_METRIC_PATH = METRIC_DIR / "global_metrics_clusters_tweedie_noleak.csv"
    global_df.to_csv(GLOBAL_METRIC_PATH, index=False)
    print("Saved global metrics to:", GLOBAL_METRIC_PATH)
    print(global_df.to_string(index=False))

    # ----------------------------------
    # Step 12: METRICS PER SERIES
    # ----------------------------------
    print("\n[STEP 12] Metrics per cabang–SKU...")
    rows = []

    for (cab, sku), g in df_pred.groupby(["cabang", "sku"], sort=False):
        g_tr = g[g["is_train"] == 1]
        g_te = g[g["is_test"] == 1]

        row = {
            "cabang": cab,
            "sku": sku,
            "cluster": g["cluster"].iloc[0],
            "n_train": int(len(g_tr)),
            "n_test": int(len(g_te)),
        }

        if len(g_tr) > 0:
            yt_tr = g_tr["qty"].values
            yp_tr = g_tr["pred_qty"].values
            row.update({
                "train_mae": mae(yt_tr, yp_tr),
                "train_mse": mse(yt_tr, yp_tr),
                "train_rmse": rmse(yt_tr, yp_tr),
                "train_mape": mape(yt_tr, yp_tr),
                "train_smape": smape(yt_tr, yp_tr),
            })
        else:
            row.update({
                "train_mae": np.nan,
                "train_mse": np.nan,
                "train_rmse": np.nan,
                "train_mape": np.nan,
                "train_smape": np.nan,
            })

        if len(g_te) > 0:
            yt_te = g_te["qty"].values
            yp_te = g_te["pred_qty"].values
            row.update({
                "test_mae": mae(yt_te, yp_te),
                "test_mse": mse(yt_te, yp_te),
                "test_rmse": rmse(yt_te, yp_te),
                "test_mape": mape(yt_te, yp_te),
                "test_smape": smape(yt_te, yp_te),
            })
        else:
            row.update({
                "test_mae": np.nan,
                "test_mse": np.nan,
                "test_rmse": np.nan,
                "test_mape": np.nan,
                "test_smape": np.nan,
            })

        rows.append(row)

    metrics_series = pd.DataFrame(rows)
    metrics_series["gap_RMSE"] = metrics_series["test_rmse"] - metrics_series["train_rmse"]
    metrics_series["ratio_RMSE"] = metrics_series["test_rmse"] / metrics_series["train_rmse"]

    SERIES_METRIC_PATH = METRIC_DIR / "metrics_by_series_clusters_tweedie_noleak.csv"
    metrics_series.to_csv(SERIES_METRIC_PATH, index=False)
    print("Saved metrics per series to:", SERIES_METRIC_PATH)
    print(metrics_series.head(10).to_string(index=False))

    # ----------------------------------
    # Step 13: PLOT ACTUAL vs PRED (TEST)
    # ----------------------------------
    print(f"\n[STEP 13] Plot actual vs pred TEST per seri ke: {PLOT_DIR}")

    test_only = df_pred[df_pred["is_test"] == 1].copy()

    for (cab, sku), g in test_only.groupby(["cabang", "sku"], sort=False):
        g = g.sort_values("periode")

        if g["qty"].notna().sum() == 0:
            continue

        plt.figure(figsize=(10, 5))
        plt.plot(g["periode"], g["qty"], marker="o", label="Actual qty")
        plt.plot(g["periode"], g["pred_qty"], marker="x", label="Predicted qty")
        plt.xlabel("Periode")
        plt.ylabel("Qty")
        plt.title(f"Actual vs Predicted - TEST\nCabang {cab}, SKU {sku}")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        fname = f"{cab}_{sku}_test_actual_vs_pred.png".replace("/", "-")
        plt.savefig(PLOT_DIR / fname, dpi=200)
        plt.close()

    # ----------------------------------
    # Step 14: DIAGNOSTICS (residual, overfit, dll)
    # ----------------------------------
    print(f"\n[STEP 14] Diagnostics ke: {DIAG_DIR}")

    df_resid = df_pred.copy()
    df_resid["resid"] = df_resid["qty"].astype(float) - df_resid["pred_qty"].astype(float)
    df_resid["abs_resid"] = df_resid["resid"].abs()

    # Histogram residual global
    plt.figure(figsize=(8, 5))
    plt.hist(df_resid["resid"], bins=80)
    plt.xlabel("Residual (qty - pred_qty)")
    plt.ylabel("Frekuensi")
    plt.title("Histogram residual global (train + test)")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "hist_residual_global.png", dpi=200)
    plt.close()

    # Residual vs predicted
    plt.figure(figsize=(8, 5))
    plt.scatter(df_resid["pred_qty"], df_resid["resid"], alpha=0.3)
    plt.axhline(0, color="red", linestyle="--")
    plt.xlabel("Predicted qty")
    plt.ylabel("Residual")
    plt.title("Residual vs predicted qty")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "scatter_resid_vs_pred.png", dpi=200)
    plt.close()

    # Top outliers (train)
    TOP_N = 50
    top_outliers = (
        df_resid[df_resid["is_train"] == 1]
        .sort_values("abs_resid", ascending=False)
        .head(TOP_N)
        [["area", "cabang", "sku", "periode", "qty", "pred_qty", "resid", "abs_resid"]]
    )
    OUTLIER_PATH = DIAG_DIR / "top_outliers_train.csv"
    top_outliers.to_csv(OUTLIER_PATH, index=False)
    print("Saved top outliers train to:", OUTLIER_PATH)

    # Hist ratio_RMSE
    plt.figure(figsize=(8, 5))
    plt.hist(metrics_series["ratio_RMSE"].dropna(), bins=30)
    plt.xlabel("ratio_RMSE = test_rmse / train_rmse")
    plt.ylabel("Jumlah seri")
    plt.title("Distribusi ratio_RMSE antar seri")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "hist_ratio_RMSE.png", dpi=200)
    plt.close()

    # Scatter train vs test RMSE
    plt.figure(figsize=(6, 6))
    plt.scatter(metrics_series["train_rmse"], metrics_series["test_rmse"], alpha=0.7)
    max_val = np.nanmax([
        metrics_series["train_rmse"].max(),
        metrics_series["test_rmse"].max()
    ])
    plt.plot([0, max_val], [0, max_val], "r--")
    plt.xlabel("Train RMSE")
    plt.ylabel("Test RMSE")
    plt.title("Train vs Test RMSE per cabang–SKU")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "scatter_train_vs_test_RMSE.png", dpi=200)
    plt.close()

    # Seri overfit / underfit
    overfit_series = metrics_series[metrics_series["ratio_RMSE"] > 1.3].copy()
    under_series   = metrics_series[metrics_series["ratio_RMSE"] < 0.8].copy()

    print("\nSeri dengan ratio_RMSE > 1.3 (indikasi sulit di test / overfit lokal):")
    if len(overfit_series) > 0:
        print(
            overfit_series[["cabang", "sku", "cluster", "train_rmse", "test_rmse", "ratio_RMSE"]]
            .sort_values("ratio_RMSE", ascending=False)
            .head(20)
            .to_string(index=False)
        )
    else:
        print("Tidak ada.")

    print("\nSeri dengan ratio_RMSE < 0.8 (train lebih jelek dari test):")
    if len(under_series) > 0:
        print(
            under_series[["cabang", "sku", "cluster", "train_rmse", "test_rmse", "ratio_RMSE"]]
            .sort_values("ratio_RMSE")
            .head(20)
            .to_string(index=False)
        )
    else:
        print("Tidak ada.")

    print("\nSELESAI: A+B+C+D+E (NO LEAK) + diagnostics lengkap.")


if __name__ == "__main__":
    main()


RUN FULL TRAINING (NO LEAK): A+B+C+D+E
Load data: D:\Documents\Skripsi\demand-forecasting\data\dataset_15\lgbm_dataset_15_fullfeat.csv
Rows: 4965

[STEP 2] Build SKU profile dari TRAIN...
Saved raw train profile to: D:\Documents\Skripsi\demand-forecasting\data\dataset_15\cluster_profiles_raw_train_only.csv

[STEP 3] Clustering SKU (TRAIN only)...
Saved clustered profile to: D:\Documents\Skripsi\demand-forecasting\data\dataset_15\cluster_profiles_lgbm15_train_only.csv
Cluster summary (train stats):
         qty_mean    cv  zero_ratio  total_qty
cluster                                       
0         1382.52  0.39         0.0   56683.43
1         4741.34  0.54         0.0  194394.92
2         3926.01  1.14         0.0  160966.57
3         1623.43  0.80         0.0   66560.77

[STEP 4] Merge cluster dan demand_level ke panel (train+test)...

[STEP 5] Tambah hierarchy features (family)...
Family mapping: {'APQR': 0, 'ATUV': 1, 'AUVW': 2, 'BBCD': 3, 'BKLM': 4, 'BUVW': 5, 'BVWX': 6, 'CFGH':

[I 2025-11-23 20:51:22,806] A new study created in memory with name: no-name-a403765d-c47e-4402-b153-9cc84b24759a



[STEP 8] Num features: 40
Contoh fitur: ['event_flag', 'event_flag_lag1', 'holiday_count', 'holiday_count_lag1', 'rainfall_lag1', 'imputed', 'spike_flag', 'month', 'year', 'qtr', 'qty_lag1', 'qty_lag2', 'qty_lag3', 'qty_lag4', 'qty_lag5', 'qty_lag6', 'qty_lag7', 'qty_lag8', 'qty_lag9', 'qty_lag10']

[STEP 9] Training LGBM per cluster (Tweedie, no leak)...

TRAINING CLUSTER 0


[I 2025-11-23 20:51:26,936] Trial 0 finished with value: 121.59964812496591 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 121.59964812496591.
[I 2025-11-23 20:51:28,538] Trial 1 finished with value: 126.00291693328303 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 0 with value: 121.59964812496591.
[I 2025-11-23 20:51:33,919] Trial 2 finished with value: 107.21132632409115 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 107

Cluster 0: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\models\lgbm_15_cluster_0.txt

TRAINING CLUSTER 1


[I 2025-11-23 20:55:08,362] Trial 0 finished with value: 2131.3749911918694 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 2131.3749911918694.
[I 2025-11-23 20:55:08,555] Trial 1 finished with value: 2024.649452623456 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 1 with value: 2024.649452623456.
[I 2025-11-23 20:55:09,015] Trial 2 finished with value: 1864.41206721324 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 1864.41

Cluster 1: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\models\lgbm_15_cluster_1.txt

TRAINING CLUSTER 2


[I 2025-11-23 20:55:34,220] Trial 0 finished with value: 5019.127933989106 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 5019.127933989106.
[I 2025-11-23 20:55:34,331] Trial 1 finished with value: 4483.913725156995 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 1 with value: 4483.913725156995.
[I 2025-11-23 20:55:34,540] Trial 2 finished with value: 4594.299331087771 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 1 with value: 4483.913

Cluster 2: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\models\lgbm_15_cluster_2.txt

TRAINING CLUSTER 3


[I 2025-11-23 20:55:45,686] Trial 0 finished with value: 613.9524689397414 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 613.9524689397414.
[I 2025-11-23 20:55:45,925] Trial 1 finished with value: 657.0625738189198 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 0 with value: 613.9524689397414.
[I 2025-11-23 20:55:47,039] Trial 2 finished with value: 537.1565237500789 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 537.1565

Cluster 3: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\models\lgbm_15_cluster_3.txt

[STEP 10] Prediksi penuh (train + test) per cluster...
Saved full panel with predictions to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\panel_with_predictions.csv

[STEP 11] Global metrics train/test...
Saved global metrics to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\metrics\global_metrics_clusters_tweedie_noleak.csv
split  n_obs           MSE       RMSE        MAE      MAPE     sMAPE
train   4920 313683.507019 560.074555 173.828192  8.962125  8.243627
 test     45 626566.313539 791.559419 512.546156 16.429938 15.987120

[STEP 12] Metrics per cabang–SKU...
Saved metrics per series to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_15_clusters_tweedie_noleak\metrics\metrics_by_series_clusters_tweedie_noleak.csv
cabang          sku  cluster  n_train  n_test  train_mae     

train semua sku eligible

In [24]:
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

# ==============================
# METRIC FUNCTIONS
# ==============================
def mae(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(y_true - y_pred))


def mse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)


def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))


def mape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0


def smape(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true) + np.abs(y_pred), eps)
    return np.mean(2.0 * np.abs(y_true - y_pred) / denom) * 100.0


# ==============================
# PATH CONFIG
# ==============================
PROJECT_ROOT    = Path(r"D:\Documents\Skripsi\demand-forecasting")
DATASETFULL_DIR = PROJECT_ROOT / "data" / "dataset_full"

# PAKAI DATASET FULL
DATA_PATH = DATASETFULL_DIR / "lgbm_dataset_full_fullfeat.csv"

OUT_ROOT  = PROJECT_ROOT / "outputs" / "lgbm_full_clusters_tweedie_noleak"
MODEL_DIR = OUT_ROOT / "models"
METRIC_DIR = OUT_ROOT / "metrics"
PLOT_DIR  = OUT_ROOT / "plots_per_series"
DIAG_DIR  = OUT_ROOT / "diagnostics"

for d in [OUT_ROOT, MODEL_DIR, METRIC_DIR, PLOT_DIR, DIAG_DIR]:
    d.mkdir(parents=True, exist_ok=True)


def main():
    print("====================================")
    print("RUN FULL TRAINING (NO LEAK): A+B+C+D+E - DATASET FULL")
    print("====================================")
    print("Load data:", DATA_PATH)

    df = pd.read_csv(DATA_PATH, parse_dates=["periode"])
    print("Rows:", len(df))

    df["qty"] = df["qty"].astype(float)
    df = df.sort_values(["cabang", "sku", "periode"]).reset_index(drop=True)

    # ----------------------------------
    # Step 2: build SKU profile dari TRAIN
    # ----------------------------------
    print("\n[STEP 2] Build SKU profile dari TRAIN...")
    df_train = df[df["is_train"] == 1].copy()
    profile = build_sku_profile(df_train)

    PROFILE_PATH = DATASETFULL_DIR / "cluster_profiles_raw_train_only.csv"
    profile.to_csv(PROFILE_PATH, index=False)
    print("Saved raw train profile to:", PROFILE_PATH)

    # ----------------------------------
    # Step 3: clustering (A) dari TRAIN
    # ----------------------------------
    print("\n[STEP 3] Clustering SKU (TRAIN only)...")
    profile_clustered = run_sku_clustering(profile, n_clusters=4)

    PROFILE_CLUSTER_PATH = DATASETFULL_DIR / "cluster_profiles_full_train_only.csv"
    profile_clustered.to_csv(PROFILE_CLUSTER_PATH, index=False)
    print("Saved clustered profile to:", PROFILE_CLUSTER_PATH)
    print("Cluster summary (train stats):")
    print(
        profile_clustered.groupby("cluster")[["qty_mean", "cv", "zero_ratio", "total_qty"]]
        .mean()
        .round(2)
        .to_string()
    )

    # ----------------------------------
    # Step 4: merge cluster + demand_level ke panel penuh
    # ----------------------------------
    print("\n[STEP 4] Merge cluster dan demand_level ke panel (train+test)...")
    df = df.merge(
        profile_clustered[["cabang", "sku", "cluster", "demand_level"]],
        on=["cabang", "sku"],
        how="left",
    )
    df["cluster"] = df["cluster"].fillna(-1).astype(int)

    # ----------------------------------
    # Step 5: add hierarchy features (E)
    # ----------------------------------
    print("\n[STEP 5] Tambah hierarchy features (family)...")
    df = add_hierarchy_features(df)

    # Encode family -> family_idx (numeric)
    if "family" in df.columns:
        family_map = {
            fam: idx for idx, fam in enumerate(sorted(df["family"].astype(str).unique()))
        }
        df["family_idx"] = df["family"].astype(str).map(family_map).astype("int16")
        print("Family mapping:", family_map)

    # ----------------------------------
    # Step 6: add stabilizer features (B) - pakai stats TRAIN
    # ----------------------------------
    print("\n[STEP 6] Tambah stabilizer features (no leak)...")
    df = add_stabilizer_features(df)

    # ----------------------------------
    # Step 7: outlier treatment (C) - quantile dari TRAIN
    # ----------------------------------
    print("\n[STEP 7] Winsorize outliers per SKU (no leak)...")
    df = winsorize_outliers(df)

    # backup log1p original qty juga, kalau mau analisis
    df["log_qty"] = np.log1p(df["qty"])
    df["log_qty_wins"] = np.log1p(df["qty_wins"])

    df = df.sort_values(["cabang", "sku", "periode"]).reset_index(drop=True)

    # ----------------------------------
    # Step 8: feature columns
    # ----------------------------------
    drop_cols = [
        "area",
        "cabang",
        "sku",
        "periode",
        "qty",
        "qty_wins",
        "log_qty",
        "log_qty_wins",
        "is_train",
        "is_test",
        "sample_weight",
        "family",
    ]

    feature_cols = [c for c in df.columns if c not in drop_cols]

    print("\n[STEP 8] Num features:", len(feature_cols))
    print("Contoh fitur:", feature_cols[:20])

    obj_cols = df[feature_cols].select_dtypes(include=["object"]).columns.tolist()
    if obj_cols:
        print("WARNING: Masih ada kolom object di feature_cols:", obj_cols)

    # ----------------------------------
    # Step 9: train per cluster
    # ----------------------------------
    print("\n[STEP 9] Training LGBM per cluster (Tweedie, no leak)...")
    cluster_ids = sorted(df["cluster"].dropna().unique())
    models: Dict[int, lgb.Booster] = {}

    for cid in cluster_ids:
        if cid == -1:
            print(f"Cluster {cid} = -1 (unknown), skip training.")
            continue

        print("\n====================================")
        print(f"TRAINING CLUSTER {cid}")
        print("====================================")

        model = train_lgbm_per_cluster(
            df=df,
            cluster_id=int(cid),
            feature_cols=feature_cols,
            log_target=True,   # kalau fungsi kamu tidak pakai arg ini, hapus saja
            n_trials=40,
        )

        if model is None:
            print(f"Cluster {cid}: model is None, skip saving.")
            continue

        models[cid] = model

        model_path = MODEL_DIR / f"lgbm_full_cluster_{cid}.txt"
        model.save_model(str(model_path))
        print(f"Cluster {cid}: model saved to {model_path}")

    if not models:
        raise RuntimeError("Tidak ada model yang berhasil dilatih. Cek cluster atau flag is_train.")

    # ----------------------------------
    # Step 10: prediksi penuh
    # ----------------------------------
    print("\n[STEP 10] Prediksi penuh (train + test) per cluster...")
    df_pred_list = []

    for cid, model in models.items():
        df_c = df[df["cluster"] == cid].copy()
        if df_c.empty:
            continue

        X_c = df_c[feature_cols]
        pred_log = model.predict(X_c)
        pred_qty = np.expm1(pred_log)

        df_c["pred_qty"] = pred_qty
        df_pred_list.append(df_c)

    df_pred = pd.concat(df_pred_list, axis=0).sort_index()
    PRED_PATH = OUT_ROOT / "panel_with_predictions.csv"
    df_pred.to_csv(PRED_PATH, index=False)
    print("Saved full panel with predictions to:", PRED_PATH)

    # ----------------------------------
    # Step 11: GLOBAL METRICS
    # ----------------------------------
    print("\n[STEP 11] Global metrics train/test...")
    metrics_global = []

    for split_name, mask in [
        ("train", df_pred["is_train"] == 1),
        ("test", df_pred["is_test"] == 1),
    ]:
        if not mask.any():
            continue

        yt = df_pred.loc[mask, "qty"].values
        yp = df_pred.loc[mask, "pred_qty"].values

        metrics_global.append({
            "split": split_name,
            "n_obs": int(len(yt)),
            "MSE": mse(yt, yp),
            "RMSE": rmse(yt, yp),
            "MAE": mae(yt, yp),
            "MAPE": mape(yt, yp),
            "sMAPE": smape(yt, yp),
        })

    global_df = pd.DataFrame(metrics_global)
    GLOBAL_METRIC_PATH = METRIC_DIR / "global_metrics_clusters_tweedie_full_noleak.csv"
    global_df.to_csv(GLOBAL_METRIC_PATH, index=False)
    print("Saved global metrics to:", GLOBAL_METRIC_PATH)
    print(global_df.to_string(index=False))

    # ----------------------------------
    # Step 12: METRICS PER SERIES
    # ----------------------------------
    print("\n[STEP 12] Metrics per cabang–SKU...")
    rows = []

    for (cab, sku), g in df_pred.groupby(["cabang", "sku"], sort=False):
        g_tr = g[g["is_train"] == 1]
        g_te = g[g["is_test"] == 1]

        row = {
            "cabang": cab,
            "sku": sku,
            "cluster": g["cluster"].iloc[0],
            "n_train": int(len(g_tr)),
            "n_test": int(len(g_te)),
        }

        if len(g_tr) > 0:
            yt_tr = g_tr["qty"].values
            yp_tr = g_tr["pred_qty"].values
            row.update({
                "train_mae": mae(yt_tr, yp_tr),
                "train_mse": mse(yt_tr, yp_tr),
                "train_rmse": rmse(yt_tr, yp_tr),
                "train_mape": mape(yt_tr, yp_tr),
                "train_smape": smape(yt_tr, yp_tr),
            })
        else:
            row.update({
                "train_mae": np.nan,
                "train_mse": np.nan,
                "train_rmse": np.nan,
                "train_mape": np.nan,
                "train_smape": np.nan,
            })

        if len(g_te) > 0:
            yt_te = g_te["qty"].values
            yp_te = g_te["pred_qty"].values
            row.update({
                "test_mae": mae(yt_te, yp_te),
                "test_mse": mse(yt_te, yp_te),
                "test_rmse": rmse(yt_te, yp_te),
                "test_mape": mape(yt_te, yp_te),
                "test_smape": smape(yt_te, yp_te),
            })
        else:
            row.update({
                "test_mae": np.nan,
                "test_mse": np.nan,
                "test_rmse": np.nan,
                "test_mape": np.nan,
                "test_smape": np.nan,
            })

        rows.append(row)

    metrics_series = pd.DataFrame(rows)
    metrics_series["gap_RMSE"] = metrics_series["test_rmse"] - metrics_series["train_rmse"]
    metrics_series["ratio_RMSE"] = metrics_series["test_rmse"] / metrics_series["train_rmse"]

    SERIES_METRIC_PATH = METRIC_DIR / "metrics_by_series_clusters_tweedie_full_noleak.csv"
    metrics_series.to_csv(SERIES_METRIC_PATH, index=False)
    print("Saved metrics per series to:", SERIES_METRIC_PATH)
    print(metrics_series.head(10).to_string(index=False))

    # ----------------------------------
    # Step 13: PLOT ACTUAL vs PRED (TEST)
    # ----------------------------------
    print(f"\n[STEP 13] Plot actual vs pred TEST per seri ke: {PLOT_DIR}")

    test_only = df_pred[df_pred["is_test"] == 1].copy()

    for (cab, sku), g in test_only.groupby(["cabang", "sku"], sort=False):
        g = g.sort_values("periode")

        if g["qty"].notna().sum() == 0:
            continue

        plt.figure(figsize=(10, 5))
        plt.plot(g["periode"], g["qty"], marker="o", label="Actual qty")
        plt.plot(g["periode"], g["pred_qty"], marker="x", label="Predicted qty")
        plt.xlabel("Periode")
        plt.ylabel("Qty")
        plt.title(f"Actual vs Predicted - TEST\nCabang {cab}, SKU {sku}")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()

        fname = f"{cab}_{sku}_test_actual_vs_pred.png".replace("/", "-")
        plt.savefig(PLOT_DIR / fname, dpi=200)
        plt.close()

    # ----------------------------------
    # Step 14: DIAGNOSTICS (residual, overfit, dll)
    # ----------------------------------
    print(f"\n[STEP 14] Diagnostics ke: {DIAG_DIR}")

    df_resid = df_pred.copy()
    df_resid["resid"] = df_resid["qty"].astype(float) - df_resid["pred_qty"].astype(float)
    df_resid["abs_resid"] = df_resid["resid"].abs()

    # Histogram residual global
    plt.figure(figsize=(8, 5))
    plt.hist(df_resid["resid"], bins=80)
    plt.xlabel("Residual (qty - pred_qty)")
    plt.ylabel("Frekuensi")
    plt.title("Histogram residual global (train + test)")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "hist_residual_global.png", dpi=200)
    plt.close()

    # Residual vs predicted
    plt.figure(figsize=(8, 5))
    plt.scatter(df_resid["pred_qty"], df_resid["resid"], alpha=0.3)
    plt.axhline(0, color="red", linestyle="--")
    plt.xlabel("Predicted qty")
    plt.ylabel("Residual")
    plt.title("Residual vs predicted qty")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "scatter_resid_vs_pred.png", dpi=200)
    plt.close()

    # Top outliers (train)
    TOP_N = 50
    top_outliers = (
        df_resid[df_resid["is_train"] == 1]
        .sort_values("abs_resid", ascending=False)
        .head(TOP_N)
        [["area", "cabang", "sku", "periode", "qty", "pred_qty", "resid", "abs_resid"]]
    )
    OUTLIER_PATH = DIAG_DIR / "top_outliers_train_full.csv"
    top_outliers.to_csv(OUTLIER_PATH, index=False)
    print("Saved top outliers train to:", OUTLIER_PATH)

    # Hist ratio_RMSE
    plt.figure(figsize=(8, 5))
    plt.hist(metrics_series["ratio_RMSE"].dropna(), bins=30)
    plt.xlabel("ratio_RMSE = test_rmse / train_rmse")
    plt.ylabel("Jumlah seri")
    plt.title("Distribusi ratio_RMSE antar seri")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "hist_ratio_RMSE.png", dpi=200)
    plt.close()

    # Scatter train vs test RMSE
    plt.figure(figsize=(6, 6))
    plt.scatter(metrics_series["train_rmse"], metrics_series["test_rmse"], alpha=0.7)
    max_val = np.nanmax([
        metrics_series["train_rmse"].max(),
        metrics_series["test_rmse"].max()
    ])
    plt.plot([0, max_val], [0, max_val], "r--")
    plt.xlabel("Train RMSE")
    plt.ylabel("Test RMSE")
    plt.title("Train vs Test RMSE per cabang–SKU")
    plt.tight_layout()
    plt.savefig(DIAG_DIR / "scatter_train_vs_test_RMSE.png", dpi=200)
    plt.close()

    # Seri overfit / underfit
    overfit_series = metrics_series[metrics_series["ratio_RMSE"] > 1.3].copy()
    under_series   = metrics_series[metrics_series["ratio_RMSE"] < 0.8].copy()

    print("\nSeri dengan ratio_RMSE > 1.3 (indikasi sulit di test / overfit lokal):")
    if len(overfit_series) > 0:
        print(
            overfit_series[["cabang", "sku", "cluster", "train_rmse", "test_rmse", "ratio_RMSE"]]
            .sort_values("ratio_RMSE", ascending=False)
            .head(20)
            .to_string(index=False)
        )
    else:
        print("Tidak ada.")

    print("\nSeri dengan ratio_RMSE < 0.8 (train lebih jelek dari test):")
    if len(under_series) > 0:
        print(
            under_series[["cabang", "sku", "cluster", "train_rmse", "test_rmse", "ratio_RMSE"]]
            .sort_values("ratio_RMSE")
            .head(20)
            .to_string(index=False)
        )
    else:
        print("Tidak ada.")

    print("\nSELESAI: A+B+C+D+E (NO LEAK, DATASET FULL) + diagnostics lengkap.")


if __name__ == "__main__":
    main()


RUN FULL TRAINING (NO LEAK): A+B+C+D+E - DATASET FULL
Load data: D:\Documents\Skripsi\demand-forecasting\data\dataset_full\lgbm_dataset_full_fullfeat.csv
Rows: 158863

[STEP 2] Build SKU profile dari TRAIN...
Saved raw train profile to: D:\Documents\Skripsi\demand-forecasting\data\dataset_full\cluster_profiles_raw_train_only.csv

[STEP 3] Clustering SKU (TRAIN only)...
Saved clustered profile to: D:\Documents\Skripsi\demand-forecasting\data\dataset_full\cluster_profiles_full_train_only.csv
Cluster summary (train stats):
         qty_mean    cv  zero_ratio  total_qty
cluster                                       
0           53.35  1.19        0.30    2185.03
1          116.64  0.64        0.02    4780.90
2         2409.69  0.58        0.00   98797.44
3           14.43  1.82        0.52     587.28

[STEP 4] Merge cluster dan demand_level ke panel (train+test)...

[STEP 5] Tambah hierarchy features (family)...
Family mapping: {'AIJK': 0, 'APQR': 1, 'AQRS': 2, 'ARST': 3, 'ATUV': 4, 'AUVW'

[I 2025-11-24 02:16:28,113] A new study created in memory with name: no-name-fa8854f9-eb87-49bf-a269-c9bfa9a815cd



[STEP 8] Num features: 40
Contoh fitur: ['event_flag', 'event_flag_lag1', 'holiday_count', 'holiday_count_lag1', 'rainfall_lag1', 'imputed', 'spike_flag', 'month', 'year', 'qtr', 'qty_lag1', 'qty_lag2', 'qty_lag3', 'qty_lag4', 'qty_lag5', 'qty_lag6', 'qty_lag7', 'qty_lag8', 'qty_lag9', 'qty_lag10']

[STEP 9] Training LGBM per cluster (Tweedie, no leak)...

TRAINING CLUSTER 0


[I 2025-11-24 02:16:30,517] Trial 0 finished with value: 164.37121692825158 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 164.37121692825158.
[I 2025-11-24 02:16:30,938] Trial 1 finished with value: 163.30382246291862 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 1 with value: 163.30382246291862.
[I 2025-11-24 02:16:32,779] Trial 2 finished with value: 156.65571579358547 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 156

Cluster 0: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\models\lgbm_full_cluster_0.txt

TRAINING CLUSTER 1


[I 2025-11-24 02:17:45,759] A new study created in memory with name: no-name-a5c66818-9aa5-4186-8213-261ab728b76e
[I 2025-11-24 02:18:08,283] Trial 0 finished with value: 35.63993085063561 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 35.63993085063561.
[I 2025-11-24 02:18:21,219] Trial 1 finished with value: 42.70448914025789 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 0 with value: 35.63993085063561.
[I 2025-11-24 02:18:50,119] Trial 2 finished with value: 26.735313936919482 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.

Cluster 1: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\models\lgbm_full_cluster_1.txt

TRAINING CLUSTER 2


[I 2025-11-24 02:35:16,228] Trial 0 finished with value: 1087.0862007958247 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 1087.0862007958247.
[I 2025-11-24 02:35:16,407] Trial 1 finished with value: 1324.0815780783455 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 0 with value: 1087.0862007958247.
[I 2025-11-24 02:35:17,172] Trial 2 finished with value: 973.772697085323 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 973.7

Cluster 2: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\models\lgbm_full_cluster_2.txt

TRAINING CLUSTER 3


[I 2025-11-24 02:36:16,237] Trial 0 finished with value: 57.01806957753861 and parameters: {'learning_rate': 0.021923099845827587, 'num_leaves': 66, 'max_depth': 5, 'min_data_in_leaf': 103, 'feature_fraction': 0.7284002162080668, 'bagging_fraction': 0.8073571282390148, 'bagging_freq': 4}. Best is trial 0 with value: 57.01806957753861.
[I 2025-11-24 02:36:16,964] Trial 1 finished with value: 61.18928751999034 and parameters: {'learning_rate': 0.18617280148093396, 'num_leaves': 195, 'max_depth': 3, 'min_data_in_leaf': 89, 'feature_fraction': 0.8514004718158846, 'bagging_fraction': 0.6500231705342397, 'bagging_freq': 12}. Best is trial 0 with value: 57.01806957753861.
[I 2025-11-24 02:36:20,298] Trial 2 finished with value: 53.30257546124513 and parameters: {'learning_rate': 0.03772670263489984, 'num_leaves': 208, 'max_depth': 9, 'min_data_in_leaf': 85, 'feature_fraction': 0.7664415758283992, 'bagging_fraction': 0.8337032512441286, 'bagging_freq': 10}. Best is trial 2 with value: 53.30257

Cluster 3: model saved to D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\models\lgbm_full_cluster_3.txt

[STEP 10] Prediksi penuh (train + test) per cluster...
Saved full panel with predictions to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\panel_with_predictions.csv

[STEP 11] Global metrics train/test...
Saved global metrics to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\metrics\global_metrics_clusters_tweedie_full_noleak.csv
split  n_obs          MSE        RMSE        MAE         MAPE     sMAPE
train 158803 6.398103e+03   79.988145  11.512020 5.877468e+07 38.497883
 test     45 1.218703e+06 1103.948951 614.911569 1.426396e+01 14.458012

[STEP 12] Metrics per cabang–SKU...
Saved metrics per series to: D:\Documents\Skripsi\demand-forecasting\outputs\lgbm_full_clusters_tweedie_noleak\metrics\metrics_by_series_clusters_tweedie_full_noleak.csv
cabang         sku  cluster  n_t