In [1]:
"""
retrain_catboost.py — 重新訓練 CatBoostRegressor（移除 OUTPUT_BIN 洩漏）
=======================================================================
• 讀取 FINAL_DATA.csv
• 特徵工程：車次每司機、遠距比例、月份 sin / cos
• 類別特徵：星期、明日天氣（字串型）
• 5‑Fold 交叉驗證列出 Strict / ±1 / ±2 平均值
• 取交叉驗證各折最佳迭代中位數 → 全量資料重訓
• 儲存模型為 driver_catboost_v2.cbm

指令：
    python retrain_catboost.py
依賴：
    pip install catboost pandas numpy scikit-learn
"""

import numpy as np
import pandas as pd
from pathlib import Path
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold

DATA_PATH = Path("FINAL_DATA.csv")
MODEL_OUT = Path("driver_catboost_v2.cbm")

if not DATA_PATH.exists():
    raise FileNotFoundError(f"找不到 {DATA_PATH}")

CAT_COLS = ["星期", "明日天氣"]

# -----------------------------
# 1. 讀檔 & 特徵工程
# -----------------------------

df = pd.read_csv(DATA_PATH).dropna()

df["車次每司機"] = df["明日車次"] / (df["今日在職"] + 1e-5)
df["遠距比例"] = df["明日遠距離"] / (df["明日車次"] + 1e-5)
df["月份_sin"] = np.sin(2 * np.pi * df["月份"] / 12)
df["月份_cos"] = np.cos(2 * np.pi * df["月份"] / 12)

# 類別轉字串
for c in CAT_COLS:
    df[c] = df[c].astype(str)

X = df.drop(columns=["OUTPUT"])
y = df["OUTPUT"].values

# -----------------------------
# 2. 交叉驗證
# -----------------------------

def acc_tol(y_true, y_pred, tol):
    return (np.abs(np.round(y_true) - np.round(y_pred)) <= tol).mean() * 100

kf = KFold(n_splits=5, shuffle=True, random_state=90)
strict_scores, pm1_scores, pm2_scores = [], [], []
best_iters = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    train_pool = Pool(X.iloc[train_idx], y[train_idx], cat_features=CAT_COLS)
    test_pool = Pool(X.iloc[test_idx], y[test_idx], cat_features=CAT_COLS)

    model = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.03,
        depth=6,
        loss_function="RMSE",
        random_seed=fold,
        early_stopping_rounds=100,
        verbose=False,
    )
    model.fit(train_pool, eval_set=test_pool)

    pred = model.predict(test_pool)
    strict_scores.append(acc_tol(y[test_idx], pred, 0))
    pm1_scores.append(acc_tol(y[test_idx], pred, 1))
    pm2_scores.append(acc_tol(y[test_idx], pred, 2))
    best_iters.append(model.get_best_iteration())

print("=== 5‑Fold CV 平均結果 ===")
print(f"Strict  : {np.mean(strict_scores):.2f} % ± {np.std(strict_scores):.2f}")
print(f"±1      : {np.mean(pm1_scores):.2f} % ± {np.std(pm1_scores):.2f}")
print(f"±2      : {np.mean(pm2_scores):.2f} % ± {np.std(pm2_scores):.2f}")

# -----------------------------
# 3. 全資料訓練並保存
# -----------------------------
full_pool = Pool(X, y, cat_features=CAT_COLS)
median_iter = int(np.median(best_iters)) if best_iters else 2000

final_model = CatBoostRegressor(
    iterations=median_iter,
    learning_rate=0.03,
    depth=6,
    loss_function="RMSE",
    random_seed=123,
    verbose=False,
)
final_model.fit(full_pool)
final_model.save_model(str(MODEL_OUT))
print(f"\n✅ 已儲存模型至 {MODEL_OUT}")


=== 5‑Fold CV 平均結果 ===
Strict  : 25.64 % ± 4.74
±1      : 69.61 % ± 4.13
±2      : 88.35 % ± 3.36

✅ 已儲存模型至 driver_catboost_v2.cbm
