In [None]:
from boosting import BoostRegressor
import numpy as np
import time
# ================ USAGE EXAMPLE ================
if __name__ == "__main__":
    # Generate sample data
    np.random.seed(42)
    n_samples, n_features = 5000, 10
    X = np.random.randn(n_samples, n_features)
    y = np.sum(X[:, :3], axis=1) + 0.1 * np.random.randn(n_samples)
    
    # Split data
    split_idx = int(0.8 * n_samples)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    
    print("🔥 Comparing Tree Growth Strategies")
    print("=" * 50)
    
    # 1. Level-wise (original)
    start_time = time.time()
    
    model_level = BoostRegressor(
        n_estimators=50,
        learning_rate=0.1,
        adaptive_lr=False,
        # lr_schedule="cosine_restart",  # Original approach
        max_depth=6,
        tree_learner="leaf",  # Original approach
        tree_method="binned",
        binned_mode="hist",
        verbose=True,
        batch_size=1,
        use_gpu=False,
        use_goss=True,
        use_neural=False,  # Original approach
        enable_interactions=False,
    )

    
    model_level.fit(X_train, y_train, eval_set=(X_test, y_test))
    level_time = time.time() - start_time
    level_pred = model_level.predict(X_test)
    level_mse = np.mean((y_test - level_pred) ** 2)
    print(f"   Time: {level_time:.2f}s")
    print(f"   Test MSE: {level_mse:.6f}")
    print(f"   Trees: {len(model_level.trees)}")
    
    # print feature importances
    importances = model_level.feature_importances()
    print("   Feature Importances:", importances)
    # # plot feature importances like shap
    # import matplotlib.pyplot as plt
    # plt.bar(range(n_features), importances)
    # plt.xlabel("Feature")
    # plt.ylabel("Importance")
    # plt.title("Feature Importances")
    # plt.show()
    
    # # # # compare with scikit learn's GradientBoostingRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    model_sklearn = GradientBoostingRegressor(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=6,
        verbose=1,
    )
    model_sklearn.fit(X_train, y_train)
    sklearn_time = time.time() - start_time
    sklearn_pred = model_sklearn.predict(X_test)
    sklearn_mse = np.mean((y_test - sklearn_pred) ** 2)
    print(f"   Scikit-learn Time: {sklearn_time:.2f}s")
    print(f"   Scikit-learn Test MSE: {sklearn_mse:.6f}")
    print(f"   Scikit-learn Trees: {len(model_sklearn.estimators_)}")




🔥 Comparing Tree Growth Strategies
Training leaf-wise trees
  Loss: mse, Base: -0.0095
[  10] Train: 0.590026, Val: 0.663195, Time: 0.11s
[  20] Train: 0.135681, Val: 0.176386, Time: 0.22s
[  30] Train: 0.052351, Val: 0.082435, Time: 0.32s
[  40] Train: 0.033992, Val: 0.054255, Time: 0.38s
[  50] Train: 0.027840, Val: 0.047505, Time: 0.48s
Training completed in 0.48s
   Time: 0.49s
   Test MSE: 0.047505
   Trees: 50
   Feature Importances: [3.52513675e-01 3.15337841e-01 3.28674262e-01 4.20885014e-04
 2.95784968e-04 4.58280092e-04 6.03707692e-04 6.48259849e-04
 4.22619366e-04 6.24684671e-04]
      Iter       Train Loss   Remaining Time 
         1           2.6504            0.68s
         2           2.2393            0.67s
         3           1.8968            0.64s
         4           1.6071            0.62s
         5           1.3665            0.61s
         6           1.1653            0.60s
         7           0.9935            0.59s
         8           0.8490            0.

In [18]:
from boosting import BoostRegressor
import numpy as np
import time

# Optional dependencies
try:
    from xgboost import XGBRegressor
    _HAS_XGB = True
except Exception:
    _HAS_XGB = False

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def metrics(y_true, y_pred):
    return {
        "MSE": mean_squared_error(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2":  r2_score(y_true, y_pred),
    }


def benchmark(name, model, X_train, y_train, X_test, y_test, get_importance=None):
    """Fit, time, predict, score, and (optionally) collect feature importance."""
    t0 = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - t0

    y_pred = model.predict(X_test)
    m = metrics(y_test, y_pred)

    importance = None
    if get_importance is not None:
        try:
            importance = get_importance(model)
        except Exception:
            importance = None

    return {
        "name": name,
        "time_s": fit_time,
        **m,
        "n_trees": getattr(model, "n_estimators", getattr(model, "n_estimators_", None)),
        "feature_importance": importance,
        "model": model,
    }


if __name__ == "__main__":
    # ----------------- data -----------------
    np.random.seed(42)
    n_samples, n_features = 5000, 20
    X = np.random.randn(n_samples, n_features)
    y = np.sum(X[:, :3], axis=1) + 0.1 * np.random.randn(n_samples)

    # Split
    split_idx = int(0.8 * n_samples)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    print("🔥 Comparing Regressors")
    print("=" * 60)

    results = []

    # 1) Your BoostRegressor (as before)
    model_level = BoostRegressor(
        n_estimators=200,
        learning_rate=0.09,
        adaptive_lr=False,
        max_depth=6,
        tree_learner="leaf",
        tree_method="binned",
        binned_mode="hist",
        verbose=True,
        batch_size=1,
        use_gpu=False,
        use_goss=True,
        use_neural=False,
        enable_interactions=False,
    )
    res_boost = benchmark(
        "BoostRegressor",
        model_level,
        X_train, y_train, X_test, y_test,
        get_importance=lambda m: getattr(m, "feature_importances", lambda: None)(),
    )
    results.append(res_boost)

    # 2) Scikit-learn GradientBoostingRegressor (kept for reference)
    model_sklearn_gbr = GradientBoostingRegressor(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=6,
        random_state=4,
        verbose=0,
    )
    res_gbr = benchmark(
        "sklearn-GBR",
        model_sklearn_gbr,
        X_train, y_train, X_test, y_test,
        get_importance=lambda m: getattr(m, "feature_importances_", None),
    )
    results.append(res_gbr)

    # 3) Random Forest
    rf = RandomForestRegressor(
        n_estimators=200,          # RF benefits from more trees; adjust if you want strict parity
        max_depth=6,
        n_jobs=-1,
        random_state=4,
        verbose=0,
    )
    res_rf = benchmark(
        "RandomForest",
        rf,
        X_train, y_train, X_test, y_test,
        get_importance=lambda m: getattr(m, "feature_importances_", None),
    )
    results.append(res_rf)

    # 4) XGBoost (if available)
    if _HAS_XGB:
        xgb = XGBRegressor(
            n_estimators=200,      # modest; XGB often likes more, but this keeps it snappy
            max_depth=6,
            learning_rate=0.1,
            subsample=1.0,
            colsample_bytree=1.0,
            reg_lambda=1.0,
            tree_method="hist",    # fast, robust default
            random_state=4,
            verbosity=0,
            n_jobs=-1,
        )
        res_xgb = benchmark(
            "XGBoost",
            xgb,
            X_train, y_train, X_test, y_test,
            get_importance=lambda m: getattr(m, "feature_importances_", None),
        )
        results.append(res_xgb)
    else:
        print("⚠️  xgboost not installed; skipping XGBoost comparison.")

    # ----------------- pretty print -----------------
    # Sort by MSE ascending
    results_sorted = sorted(results, key=lambda r: r["MSE"])
    w = max(len(r["name"]) for r in results_sorted)
    print(f"\n{'Model'.ljust(w)} |   Time (s) |        MSE |        MAE |        R² | Trees")
    print("-" * (w + 62))
    for r in results_sorted:
        trees = r["n_trees"] if r["n_trees"] is not None else "-"
        print(
            f"{r['name'].ljust(w)} | {r['time_s']:10.3f} | {r['MSE']:10.6f} | "
            f"{r['MAE']:10.6f} | {r['R2']:8.4f} | {trees}"
        )

    # ----------------- feature importances (quick glance) -----------------
    # Show top-5 importances for each model that provides them
    print("\nTop-5 feature importances (if available):")
    for r in results_sorted:
        imp = r["feature_importance"]
        if imp is None:
            print(f" - {r['name']}: (no importances)")
            continue
        # Ensure numpy array
        imp = np.asarray(imp)
        top_idx = np.argsort(imp)[::-1][:5]
        pairs = ", ".join([f"f{j}={imp[j]:.3f}" for j in top_idx])
        print(f" - {r['name']}: {pairs}")


🔥 Comparing Regressors
Training leaf-wise trees
  Loss: mse, Base: -0.0180
[  10] Train: 0.676082, Time: 0.18s
[  20] Train: 0.172662, Time: 0.32s
[  30] Train: 0.066244, Time: 0.47s
[  40] Train: 0.038745, Time: 0.58s
[  50] Train: 0.029299, Time: 0.67s
[  60] Train: 0.027196, Time: 0.74s
[  70] Train: 0.025173, Time: 0.85s
[  80] Train: 0.026411, Time: 0.98s
[  90] Train: 0.024041, Time: 1.16s
[ 100] Train: 0.019182, Time: 1.33s
[ 110] Train: 0.018191, Time: 1.53s
[ 120] Train: 0.017076, Time: 1.69s
[ 130] Train: 0.016655, Time: 1.82s
[ 140] Train: 0.036784, Time: 2.09s
[ 150] Train: 0.037602, Time: 2.36s
[ 160] Train: 0.013887, Time: 2.58s
[ 170] Train: 0.018052, Time: 2.81s
[ 180] Train: 0.020789, Time: 3.16s
[ 190] Train: 0.039319, Time: 3.52s
[ 200] Train: 0.020572, Time: 3.73s
Training completed in 3.73s

Model          |   Time (s) |        MSE |        MAE |        R² | Trees
----------------------------------------------------------------------------
BoostRegressor |      3.7

In [2]:
from shap import *

BoostRegressor = add_shap_to_boostregressor(BoostRegressor)

model_level = BoostRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=6,
    tree_learner="leaf",  # Original approach
    tree_method="hist",
    verbose=True,
    batch_size=1
)

model_level.fit(X_train, y_train, eval_set=(X_test, y_test))
level_time = time.time() - start_time
# Set background for proper expected value
model_level.set_shap_background(X_train[:100])  # Use sample of training data

# Compute SHAP values (should have much lower additivity errors)
shap_values = model_level.shap_values(X_test[:10], debug=True)

# Validate the fix
# Explain individual predictions
explanation = model_level.explain_prediction(X_test[0])
# Get feature importance
#importance = model_level.shap_feature_importance(X_test[:100])
#print("Feature Importance:", importance)

# Analyze model behavior
#shap_values = model_level.shap_values(X_test[:50])

🚀 Training with Leaf-wise trees (batch_size=1
   Objective: reg:squarederror, Loss: mse, Base score: -0.0180
   NODE layers disabled.
   GOSS: top_rate=0.2, other_rate=0.1
   DART: rate_drop=0.1, skip_drop=0.5, normalize_type=tree, one_drop=no
   Binning: hist (hist), 256 bins
   Training on 4000 samples with 20 features


ValueError: tree_method must be 'binned' or 'exact'

In [None]:
# Add SHAP to your model
from your_corrected_shap import add_shap_to_boostregressor

add_shap_to_boostregressor(BoostRegressor)

# Train model
model = BoostRegressor(n_estimators=100)
model.fit(X_train, y_train)



In [None]:
shap_values