In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from new_strategy import Asset, BetSizingMethod, get_bet_sizing
import nbimporter
from backtest import Backtest
from meta_strategy import MetaLabelingStrategy
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
import shap
import os
from datetime import datetime
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
from scipy.stats import binom
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

In [None]:
# Configure the root logger once for this notebook: INFO level, with a
# "timestamp - level - message" record layout.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# ---------------------- MetaModelHandler ---------------------- #
class MetaModelHandler:
    def __init__(self):
        self.long_model = None
        self.short_model = None
        self.long_scaler = None
        self.short_scaler = None
        self.feature_cols = []

    def train(self, trades_df, long_feature_cols, short_feature_cols, asset_name, method_name):
        """Tune, fit and evaluate one LightGBM meta-model per trade direction.

        Rows without a ``meta_label`` are dropped and a ``set`` column, if
        present, is discarded. The frame is then split on ``direction`` into
        ``'long'`` and ``'short'`` trades (rows with NaNs in that direction's
        feature columns are removed) and each subset goes through the same
        pipeline, LONG first and SHORT second.

        Args:
            trades_df: DataFrame with at least ``meta_label``, ``direction``
                and the feature columns. An ``asset`` column, when present,
                is forwarded to the SHAP routine.
            long_feature_cols: feature column names for the long model.
            short_feature_cols: feature column names for the short model.
            asset_name: used in SHAP artifact file names and plot titles.
            method_name: used in SHAP artifact file names and plot titles.

        Side effects:
            Sets ``{long,short}_feature_cols``, ``{long,short}_scaler``
            (always ``None``; features are used unscaled),
            ``{long,short}_model``, ``raw_{long,short}_model``,
            ``{long,short}_raw_metrics``, ``{long,short}_oof`` and
            ``{long,short}_lift``. Prints progress, shows calibration plots
            and writes SHAP artifacts via ``save_oof_shap_artifacts``.
        """
        self.long_feature_cols = long_feature_cols
        self.short_feature_cols = short_feature_cols

        trades_df = trades_df.dropna(subset=['meta_label'])

        # A legacy train/val marker column is not used any more.
        if 'set' in trades_df.columns:
            trades_df = trades_df.drop('set', axis=1)

        long_trades = trades_df[trades_df['direction'] == 'long'].dropna(subset=long_feature_cols)
        short_trades = trades_df[trades_df['direction'] == 'short'].dropna(subset=short_feature_cols)

        self._train_direction(
            long_trades, long_feature_cols, "long", asset_name, method_name,
            plot_model_name="Raw LONG Model (OOF)",
        )
        self._train_direction(
            short_trades, short_feature_cols, "short", asset_name, method_name,
            plot_model_name="Raw SHORT Model",
        )

    def _train_direction(self, trades, feature_cols, direction, asset_name, method_name,
                         plot_model_name, n_splits=5):
        """Run the full tune/explain/fit/evaluate pipeline for one direction.

        Args:
            trades: trades of a single direction, already NaN-filtered.
            feature_cols: feature column names used for this direction.
            direction: ``"long"`` or ``"short"``; used as attribute prefix,
                SHAP artifact title and (upper-cased) log label.
            asset_name: forwarded to ``save_oof_shap_artifacts``.
            method_name: forwarded to ``save_oof_shap_artifacts``.
            plot_model_name: ``model_name`` shown on the calibration plot.
            n_splits: number of time-series folds used by every CV step.
        """
        label = direction.upper()
        X = trades[feature_cols].values  # raw values; no scaling is applied
        y = trades["meta_label"]

        setattr(self, f"{direction}_scaler", None)

        self._sanity_check(X, y, label)

        print(f"[Optuna] Tuning {label} model...")
        best_params = self.optimize_model_cv(X, y, n_trials=50, n_splits=n_splits, metric="auc")

        model = LGBMClassifier(**best_params)

        # Out-of-fold SHAP importance, saved as CSV + PNG.
        asset_series = trades["asset"] if "asset" in trades.columns else None
        global_imp, _portable_imp, _per_fold_imp = self.oof_shap_importance_lgb(
            X, y,
            feature_names=feature_cols,
            base_params=best_params,
            n_splits=n_splits,
            asset_series=asset_series,
            embargo=0,  # bump if your windows overlap near split boundary
        )
        self.save_oof_shap_artifacts(global_imp, asset_name, method_name, title=direction)

        # Informational only: features carrying >= 5% of total OOF |SHAP|.
        share = global_imp["mean_abs_shap"] / global_imp["mean_abs_shap"].sum()
        selected = global_imp.loc[share >= 0.05, "feature"].tolist()
        print(f"[{label}] Selected features (pct >= 5%):", selected)

        # Final fit on all rows of this direction.
        model.fit(X, y)
        setattr(self, f"{direction}_model", model)
        setattr(self, f"raw_{direction}_model", model)

        metrics, oof, lift = self._evaluate_raw_model(X, y, model, label=label, n_splits=n_splits)
        setattr(self, f"{direction}_raw_metrics", metrics)
        setattr(self, f"{direction}_oof", oof)
        setattr(self, f"{direction}_lift", lift)

        # _oof_scores leaves rows that never fall in a validation fold at a
        # placeholder 0.0; plot only rows that received a real prediction.
        scored = np.zeros(len(y), dtype=bool)
        for _, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
            scored[val_idx] = True
        self.plot_calibration_with_ci(
            y_true=np.asarray(y)[scored],
            y_probs=np.asarray(oof)[scored],
            model_name=plot_model_name,
            label=label,
        )


    def evaluate_calibration(self, raw_model, calibrated_model, X, y, label):
        """Report Brier score and calibration plot for a calibrated model.

        The calibrated model is always evaluated on ``(X, y)``; when
        ``raw_model`` is not ``None`` the same report is produced for it
        afterwards so the two can be compared.
        """
        calibrated_scores = calibrated_model.predict_proba(X)[:, 1]
        print(f"\n📉 Calibration Evaluation — {label}")
        calibrated_brier = brier_score_loss(y, calibrated_scores)
        print(f"  → Brier score (calibrated): {calibrated_brier:.4f}")
        self.plot_calibration_with_ci(
            y_true=y,
            y_probs=calibrated_scores,
            model_name="Calibrated Model",
            label=label,
        )

        # Nothing to compare against.
        if raw_model is None:
            return

        uncalibrated_scores = raw_model.predict_proba(X)[:, 1]
        uncalibrated_brier = brier_score_loss(y, uncalibrated_scores)
        print(f"  → Brier score (raw):        {uncalibrated_brier:.4f}")
        self.plot_calibration_with_ci(
            y_true=y,
            y_probs=uncalibrated_scores,
            model_name="Raw Model",
            label=label,
        )

    def optimize_model_cv(self, X, y, n_trials=80, n_splits=5, metric="auc", patience=150, seed=42):
        """
        Optuna hyperparameter search for an LGBMClassifier using
        TimeSeriesSplit folds with early stopping.

        Args:
            X: feature matrix indexed positionally as ``X[idx]``
               (assumes a NumPy array — TODO confirm for other callers).
            y: labels; a pandas object (``.iloc``) or positionally indexable array.
            n_trials: number of Optuna trials.
            n_splits: number of TimeSeriesSplit folds.
            metric: ``"auc"`` scores folds by ROC AUC; any other value scores
                by F1 at a 0.5 threshold while early stopping monitors
                ``binary_logloss``.
            patience: early-stopping rounds passed to ``lgb.early_stopping``.
            seed: seed for the TPE sampler and the models' ``random_state``.

        Returns:
            dict of constructor parameters for ``LGBMClassifier``: the best
            trial's parameters plus fixed settings, with ``n_estimators`` set
            to ``max(100, 1.1 * mean best iteration)`` of the best trial.

        Supports boosting_type in {"gbdt","goss"}.
        """
        import numpy as np
        import lightgbm as lgb
        import optuna
        from sklearn.model_selection import TimeSeriesSplit
        from sklearn.metrics import roc_auc_score, f1_score

        tscv = TimeSeriesSplit(n_splits=n_splits)

        def objective(trial):
            # NOTE: the order of suggest_* calls is part of the seeded search;
            # do not reorder them.
            boosting = trial.suggest_categorical("boosting_type", ["gbdt", "goss"])
            params = {
                "objective": "binary",
                "boosting_type": boosting,
                "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.08, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 16, 256, log=True),
                "max_depth": trial.suggest_int("max_depth", -1, 12),
                "min_child_samples": trial.suggest_int("min_child_samples", 10, 300),
                "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
                "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
                "n_estimators": 4000,  # early stopping will cap this
                "random_state": seed,
                "n_jobs": -1,
                "verbosity": -1,
            }
            # GOSS gets its own sampling rates and has row subsampling
            # disabled; GBDT tunes subsample / subsample_freq instead.
            if boosting == "goss":
                params["top_rate"] = trial.suggest_float("top_rate", 0.1, 0.4)
                params["other_rate"] = trial.suggest_float("other_rate", 0.0, 0.2)
                params["subsample"] = 1.0
                params["subsample_freq"] = 0
            else:
                params["subsample"] = trial.suggest_float("subsample", 0.6, 1.0)
                params["subsample_freq"] = trial.suggest_int("subsample_freq", 0, 7)

            scores, best_iters = [], []
            for tr, va in tscv.split(X):
                m = lgb.LGBMClassifier(**params)
                y_tr = y.iloc[tr] if hasattr(y, "iloc") else y[tr]
                y_va = y.iloc[va] if hasattr(y, "iloc") else y[va]
                # The validation fold doubles as the early-stopping set.
                m.fit(
                    X[tr], y_tr,
                    eval_set=[(X[va], y_va)],
                    eval_metric=("auc" if metric == "auc" else "binary_logloss"),
                    callbacks=[lgb.early_stopping(patience, verbose=False)],
                )
                p = m.predict_proba(X[va])[:, 1]
                fold_score = roc_auc_score(y_va, p) if metric == "auc" else f1_score(y_va, (p >= 0.5).astype(int))
                scores.append(fold_score)
                # Fall back to the configured cap when the attribute is missing.
                best_iters.append(getattr(m, "best_iteration_", params["n_estimators"]))

            # Remember the average stopping point so the final model can reuse it.
            trial.set_user_attr("mean_best_iter", int(np.mean(best_iters)))
            return float(np.mean(scores))

        study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=seed))
        study.optimize(objective, n_trials=n_trials, show_progress_bar=False)

        best = study.best_params
        mean_best_iter = int(study.best_trial.user_attrs.get("mean_best_iter", 800))
        # Re-attach the fixed settings; n_estimators gets a 10% margin over
        # the mean early-stopping iteration, with a floor of 100.
        best.update({
            "objective": "binary",
            "n_estimators": max(100, int(mean_best_iter * 1.1)),
            "random_state": seed,
            "n_jobs": -1,
            "verbosity": -1,
        })
        if best.get("boosting_type") == "goss":
            best.setdefault("subsample", 1.0)
            best.setdefault("subsample_freq", 0)

        print("✅ Best parameters:", best)
        print(f"📈 Best CV {metric.upper()}: {study.best_value:.4f} (mean best_iter ≈ {mean_best_iter})")
        return best

    
    def evaluate_calibration_cv(self, calibrated_model, X, y, label):
        """Compare raw vs. isotonic-calibrated Brier scores over time-series folds.

        For each of 5 ``TimeSeriesSplit`` folds a fresh ``LGBMClassifier`` with
        the wrapped estimator's parameters is fitted on the training part;
        raw probabilities are taken on the validation part, then an isotonic
        calibrator is fitted AND evaluated on that same validation part.
        The calibrated numbers are therefore optimistic by construction.

        Args:
            calibrated_model: object exposing the wrapped estimator as
                ``base_estimator`` or ``estimator`` (attribute name differs
                between scikit-learn versions).
            X: feature matrix (converted with ``np.asarray``).
            y: labels; pandas Series or array-like.
            label: text used in printed output and the plot title.

        Raises:
            AttributeError: if no wrapped estimator can be found.
        """
        print(f"\n📉 Calibration Evaluation (CV) — {label}")

        # Resolve the wrapped estimator once; it does not change between folds.
        base_est = getattr(calibrated_model, "base_estimator", None)
        if base_est is None:
            base_est = getattr(calibrated_model, "estimator", None)
        if base_est is None:
            raise AttributeError(
                "calibrated_model exposes neither 'base_estimator' nor 'estimator'"
            )
        base_params = base_est.get_params()

        X = np.asarray(X)
        is_pandas = hasattr(y, "iloc")

        raw_parts, calibrated_parts, true_parts = [], [], []
        for train_idx, val_idx in TimeSeriesSplit(n_splits=5).split(X):
            X_train_fold, X_val_fold = X[train_idx], X[val_idx]
            if is_pandas:
                y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
            else:
                y_arr = np.asarray(y)
                y_train_fold, y_val_fold = y_arr[train_idx], y_arr[val_idx]

            base_model = LGBMClassifier(**base_params)
            base_model.fit(X_train_fold, y_train_fold)

            # Raw predictions on the validation fold.
            raw_parts.append(base_model.predict_proba(X_val_fold)[:, 1])

            # Calibrate on the validation fold and score the same fold
            # (optimistic but consistent across folds).
            # NOTE(review): newer scikit-learn releases deprecate cv='prefit';
            # confirm against the pinned version.
            fold_calibrated = CalibratedClassifierCV(base_model, method='isotonic', cv='prefit')
            fold_calibrated.fit(X_val_fold, y_val_fold)
            calibrated_parts.append(fold_calibrated.predict_proba(X_val_fold)[:, 1])

            true_parts.append(np.asarray(y_val_fold))

        cv_probs_raw = np.concatenate(raw_parts)
        cv_probs_calibrated = np.concatenate(calibrated_parts)
        cv_true = np.concatenate(true_parts)

        # Brier scores (the calibrated one is optimistic, see docstring).
        brier_calibrated = brier_score_loss(cv_true, cv_probs_calibrated)
        brier_raw = brier_score_loss(cv_true, cv_probs_raw)
        print(f"  → Brier score (raw):        {brier_raw:.4f}")
        print(f"  → Brier score (calibrated): {brier_calibrated:.4f}")
        print(f"  → Improvement:              {brier_raw - brier_calibrated:.4f}")

        self.plot_calibration_comparison(
            y_true=cv_true,
            y_probs_raw=cv_probs_raw,
            y_probs_calibrated=cv_probs_calibrated,
            label=label
        )

    
    def plot_calibration_comparison(self, y_true, y_probs_raw, y_probs_calibrated, label='LONG', n_bins=10):
        """Plot both raw and calibrated predictions on the same chart"""
        try:
            print(f"📊 Creating calibration comparison plot - {label}")
            
            y_true = np.array(y_true)
            y_probs_raw = np.array(y_probs_raw)
            y_probs_calibrated = np.array(y_probs_calibrated)
            
            if len(y_true) < 10:
                print(f"⚠️ Too few samples: {len(y_true)}")
                return
            
            actual_bins = min(10, len(y_true) // 10)
            if actual_bins < 2:
                actual_bins = 2
            
            # Get calibration curves for both
            frac_pos_raw, mean_pred_raw = calibration_curve(y_true, y_probs_raw, n_bins=actual_bins)
            frac_pos_cal, mean_pred_cal = calibration_curve(y_true, y_probs_calibrated, n_bins=actual_bins)
            
            # Calculate Brier scores
            brier_raw = np.mean((y_probs_raw - y_true) ** 2)
            brier_cal = np.mean((y_probs_calibrated - y_true) ** 2)
            
            # Create comparison plot
            fig, ax = plt.subplots(figsize=(10, 8))
            
            # Plot both curves
            ax.plot(mean_pred_raw, frac_pos_raw, 'o-', 
                    linewidth=2, markersize=8, label=f'Raw Model (Brier: {brier_raw:.4f})', color='red')
            ax.plot(mean_pred_cal, frac_pos_cal, 's-', 
                    linewidth=2, markersize=8, label=f'Calibrated Model (Brier: {brier_cal:.4f})', color='blue')
            
            # Perfect calibration reference line
            ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfect Calibration')
            
            # Labels and formatting
            ax.set_xlabel('Mean Predicted Probability')
            ax.set_ylabel('Fraction of Positives') 
            ax.set_title(f'Calibration Comparison - {label}\nImprovement: {brier_raw - brier_cal:.4f}')
            ax.legend()
            ax.grid(True, alpha=0.3)
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            
            plt.tight_layout()
            plt.show()
            plt.close()
            
            print(f"✅ Calibration comparison plot created successfully")
            
        except Exception as e:
            print(f"❌ Calibration comparison plot failed: {e}")
            try:
                plt.close()
            except:
                pass

    def _sanity_check(self, X, y, label):
        """Sanity check for data quality"""
        print(f"\n📊 Sanity Check for {label} dataset")
        print("  → Shape:", X.shape)
        print("  → NaNs in X:", np.isnan(X).sum())
        print("  → All-zero columns:", (X == 0).all(axis=0).sum())
        print("  → y balance:", np.bincount(y.astype(int)) if len(np.unique(y)) == 2 else y.value_counts())
                    
    def plot_shap_values(self, model, X, feature_names, title, asset_name, method_name):
        """Save a SHAP bar chart and a mean-|SHAP| CSV for a fitted tree model.

        At most 500 randomly chosen rows of ``X`` are explained. Output goes
        to ``results_metalabel/shap``. Errors raised while explaining,
        plotting or saving are printed and swallowed.
        """
        plt.close()
        plt.style.use('default')

        print(f"[SHAP] Generating plot for: {title}")

        # Unwrap an object that exposes a fitted ``base_estimator_``.
        model = getattr(model, "base_estimator_", model)

        try:
            tree_explainer = shap.TreeExplainer(model, feature_perturbation="interventional")

            # Explain a bounded random subset for speed.
            n_rows = len(X)
            picked = np.random.choice(n_rows, min(500, n_rows), replace=False)
            if hasattr(X, 'iloc'):
                subset = X.iloc[picked]
            else:
                subset = X[picked]

            explanation = tree_explainer(subset)

            shap.plots.bar(explanation, max_display=len(feature_names), show=False)

            heading = (
                f"SHAP Feature Importance — {title.capitalize()} — {asset_name.upper()} — {method_name.upper()}"
            )
            plt.gcf().suptitle(heading, fontsize=14)
            plt.tight_layout(rect=[0, 0, 1, 0.95])

            output_dir = "results_metalabel/shap"
            os.makedirs(output_dir, exist_ok=True)

            # Shared suffix for both artefact names.
            safe_title = title.lower().replace(" ", "_")
            suffix = f"{asset_name.lower()}_{method_name.lower()}_{safe_title}"

            full_path = os.path.join(output_dir, f"shap_{suffix}.png")
            plt.savefig(full_path, dpi=300)
            plt.close()
            print(f"[SHAP] Saved to {full_path}")

            # Per-feature mean absolute SHAP value, largest first.
            importance = pd.DataFrame({
                'feature': feature_names,
                'mean_abs_shap_value': np.abs(explanation.values).mean(axis=0),
            })
            importance = importance.sort_values(by='mean_abs_shap_value', ascending=False)

            summary_path = os.path.join(output_dir, f"shap_summary_{suffix}.csv")
            importance.to_csv(summary_path, index=False)
            print(f"[SHAP] Summary CSV saved to {summary_path}")

        except Exception as e:
            print(f"[SHAP] Error generating SHAP plot: {e}")
            print(f"[SHAP] Skipping SHAP analysis for {title}")
    

    def is_trade_approved(self, features: dict, direction: str, threshold: float = 0.3) -> bool:
        if direction == 'long':
            feature_list = self.long_feature_cols
            model = self.long_model
        else:
            feature_list = self.short_feature_cols
            model = self.short_model
            

        cleaned = {}
        for k in feature_list:
            val = features.get(k, 0)
            if pd.isna(val) or val in [np.inf, -np.inf]:
                cleaned[k] = 0
            else:
                cleaned[k] = val

        df = pd.DataFrame([cleaned])[feature_list]
        X = df.values  # Use raw values, no scaling
        prob = model.predict_proba(X)[0, 1]

        print(f"[MetaModel] Direction: {direction}, Prob: {prob:.3f}, Threshold: {threshold}, Approved: {prob >= threshold}")
        return prob >= threshold

    def plot_calibration_with_ci(self, y_true, y_probs, model_name='Calibrated Model', label='LONG', n_bins=10):
        """Simple replacement - no confidence intervals, just basic calibration plot"""
        try:
            print(f"📊 Creating simple calibration plot for {model_name} - {label}")
            
            # Convert to numpy arrays
            y_true = np.array(y_true)
            y_probs = np.array(y_probs)
            
            # Basic checks
            if len(y_true) < 10:
                print(f"⚠️ Too few samples: {len(y_true)}")
                return
            
            # Use fewer bins for reliability
            actual_bins = min(5, len(y_true) // 10)
            if actual_bins < 2:
                actual_bins = 2
            
            # Get calibration curve
            fraction_of_positives, mean_predicted_value = calibration_curve(
                y_true, y_probs, n_bins=actual_bins
            )
            
            # Calculate Brier score
            brier = np.mean((y_probs - y_true) ** 2)
            
            # Create simple plot
            fig, ax = plt.subplots(figsize=(8, 8))
            
            # Plot calibration line
            ax.plot(mean_predicted_value, fraction_of_positives, 'o-', 
                    linewidth=2, markersize=8, label=f'{model_name}')
            
            # Perfect calibration reference line
            ax.plot([0, 1], [0, 1], '--', color='gray', label='Perfect Calibration')
            
            # Labels and formatting
            ax.set_xlabel('Mean Predicted Probability')
            ax.set_ylabel('Fraction of Positives') 
            ax.set_title(f'Calibration Plot - {label} - {model_name}\nBrier Score: {brier:.4f}')
            ax.legend()
            ax.grid(True, alpha=0.3)
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            
            plt.tight_layout()
            plt.show()
            plt.close()
            
            print(f"✅ Calibration plot created successfully")
            
        except Exception as e:
            print(f"❌ Calibration plot failed: {e}")
            try:
                plt.close()
            except:
                pass    
    def oof_shap_importance_lgb(
        self,
        X, y,
        feature_names,
        base_params,
        n_splits=5,
        asset_series=None,  # kept for compatibility; not used
        embargo=0
    ):
        """
        Out-of-fold SHAP importance for LightGBM.

        For each TimeSeriesSplit fold, an LGBMClassifier built from
        ``base_params`` is fitted on the training rows and explained with
        ``shap.TreeExplainer`` on the validation rows only.

        Args:
            X: feature matrix (converted with ``np.asarray``), shape (n, p).
            y: labels; pandas object or positionally indexable array.
            feature_names: names for the p columns, in column order.
            base_params: keyword arguments for ``LGBMClassifier``.
            n_splits: number of TimeSeriesSplit folds.
            asset_series: accepted but ignored.
            embargo: when > 0, training rows within ``embargo`` positions of
                the first validation row are dropped and the first
                ``embargo`` validation rows are skipped.

        Returns:
            (global_df, None, per_fold_df)
            per_fold_df columns: ['feature', 'mean_abs_shap', 'fold'];
            global_df columns: ['feature', 'mean_abs_shap'], the mean of the
            per-fold values, sorted descending.
        """
        import numpy as np, pandas as pd
        from sklearn.model_selection import TimeSeriesSplit
        from lightgbm import LGBMClassifier

        X = np.asarray(X)
        n, p = X.shape
        shap_vals = np.zeros((n, p), dtype=float)  # only validation rows will be filled
        in_val = np.zeros(n, dtype=bool)
        fold_id = np.full(n, -1, dtype=int)

        tscv = TimeSeriesSplit(n_splits=n_splits)
        # ``.iloc`` itself supports [] with positional indices, so y_arr[idx]
        # works for both pandas objects and plain arrays.
        y_arr = y.iloc if hasattr(y, "iloc") else y

        for f, (tr, va) in enumerate(tscv.split(X)):
            # optional embargo to reduce boundary leakage
            if embargo > 0:
                tr = tr[tr < (va[0] - embargo)]
                va = va[embargo:] if len(va) > embargo else va[:0]
            # A fold emptied by the embargo is skipped entirely.
            if len(tr) == 0 or len(va) == 0:
                continue

            m = LGBMClassifier(**base_params)
            m.fit(X[tr], y_arr[tr])

            explainer = shap.TreeExplainer(m, feature_perturbation="interventional")  # model_output defaults to raw
            sv = explainer(X[va])                          # shap.Explanation
            # NOTE(review): the assignment below needs sv.values to be exactly
            # (len(va), p). No bias column is dropped here, contrary to what
            # earlier comments said — confirm the shape for the installed
            # shap version.
            contrib = sv.values
            shap_vals[va] = contrib

            in_val[va] = True
            fold_id[va] = f

        # keep only OOF rows
        shap_oof = shap_vals[in_val]
        folds_oof = fold_id[in_val]
        abs_shap = np.abs(shap_oof)

        # per-fold mean(|SHAP|)
        import pandas as pd
        per_fold = []
        for f in np.unique(folds_oof):
            idx = (folds_oof == f)
            mean_abs = abs_shap[idx].mean(axis=0)
            per_fold.append(pd.DataFrame({"feature": feature_names, "mean_abs_shap": mean_abs, "fold": f}))
        # NOTE: pd.concat raises if every fold was skipped (empty list).
        per_fold_df = pd.concat(per_fold, ignore_index=True).sort_values(
            ["fold", "mean_abs_shap"], ascending=[True, False]
        )

        # global = mean across folds (each fold weighted equally)
        global_df = (
            per_fold_df.groupby("feature", as_index=False)["mean_abs_shap"]
            .mean()
            .sort_values("mean_abs_shap", ascending=False)
            .reset_index(drop=True)
        )

        return global_df, None, per_fold_df
    
    def save_oof_shap_artifacts(self, imp_df, asset_name, method_name, title, outdir="results_metalabel/shap"):
        """
        Persist OOF SHAP importance as a CSV and a horizontal bar chart.

        ``imp_df`` must provide columns ['feature', 'mean_abs_shap'].
        Files written into ``outdir`` (created if missing):
            shap_summary_{asset}_{method}_{title}.csv
                columns: feature, mean_abs_shap_value, pct_of_total
            shap_{asset}_{method}_{title}.png
        """
        import os
        import pandas as pd
        import matplotlib.pyplot as plt

        os.makedirs(outdir, exist_ok=True)

        table = imp_df.copy()
        shap_sum = table["mean_abs_shap"].sum()
        # Guard against an all-zero importance vector.
        denominator = shap_sum if shap_sum > 0 else 1.0
        table["pct_of_total"] = table["mean_abs_shap"] / denominator
        table = (
            table.sort_values("mean_abs_shap", ascending=False)
            .reset_index(drop=True)
            .rename(columns={"mean_abs_shap": "mean_abs_shap_value"})
        )

        # CSV keeps the schema of the earlier non-OOF export.
        csv_name = f"shap_summary_{asset_name.lower()}_{method_name.lower()}_{title.lower()}.csv"
        csv_path = os.path.join(outdir, csv_name)
        table[["feature", "mean_abs_shap_value", "pct_of_total"]].to_csv(csv_path, index=False)
        print(f"[SHAP-OOF] Summary CSV saved to {csv_path}")

        # Bar chart of every feature; reversed so the largest is on top.
        n_features = len(table)
        fig, ax = plt.subplots(figsize=(10, max(4, 0.35 * n_features)))
        names_bottom_up = table["feature"].iloc[::-1]
        values_bottom_up = table["mean_abs_shap_value"].iloc[::-1]
        ax.barh(names_bottom_up, values_bottom_up)
        ax.set_xlabel("Mean |SHAP| (OOF)")
        ax.set_ylabel("Feature")
        ax.set_title(
            f"SHAP Feature Importance — {title.capitalize()} — {asset_name.upper()} — {method_name.upper()} (OOF)"
        )
        plt.tight_layout()

        png_name = f"shap_{asset_name.lower()}_{method_name.lower()}_{title.lower()}.png"
        png_path = os.path.join(outdir, png_name)
        plt.savefig(png_path, dpi=300)
        plt.close()
        print(f"[SHAP-OOF] Plot saved to {png_path}")


    def _oof_scores(self, X, y, base_params, n_splits=5):
        """Return out-of-fold positive-class probabilities (TimeSeriesSplit).

        A fresh ``LGBMClassifier(**base_params)`` is fitted per fold and used
        to score that fold's validation rows. The result has ``len(y)``
        entries; rows that never appear in a validation fold keep the
        initial value 0.0.
        """
        import numpy as np
        from sklearn.model_selection import TimeSeriesSplit
        from lightgbm import LGBMClassifier

        labels_are_pandas = hasattr(y, "iloc")
        predictions = np.zeros(len(y), dtype=float)
        splitter = TimeSeriesSplit(n_splits=n_splits)

        for train_idx, val_idx in splitter.split(X):
            fold_model = LGBMClassifier(**base_params)
            if labels_are_pandas:
                fold_labels = y.iloc[train_idx]
            else:
                fold_labels = y[train_idx]
            fold_model.fit(X[train_idx], fold_labels)
            predictions[val_idx] = fold_model.predict_proba(X[val_idx])[:, 1]

        return predictions

    def _decile_lift(self, probs, y, n=10):
        """Hit rate and lift per score decile (top=decile 1)."""
        import numpy as np, pandas as pd
        order = np.argsort(-probs)
        cuts = np.array_split(order, n)
        base = float(y.mean())
        rows = []
        for i, idx in enumerate(cuts, 1):
            rate = float(y.iloc[idx].mean() if hasattr(y, "iloc") else y[idx].mean())
            lift = (rate / base) if base > 0 else float("nan")
            rows.append((i, len(idx), rate, lift))
        return pd.DataFrame(rows, columns=["decile","count","pos_rate","lift"])

    def _evaluate_raw_model(self, X, y, fitted_model, label="LONG", n_splits=5):
        """Print in-sample score spread, OOF metrics and top-decile lift.

        Args:
            X: feature matrix, indexed positionally.
            y: labels; pandas Series or array-like.
            fitted_model: fitted classifier; its ``get_params()`` are reused
                to refit per fold in ``_oof_scores``.
            label: text prefix for printed lines.
            n_splits: number of TimeSeriesSplit folds.

        Returns:
            (metrics, oof, lift_df) where ``oof`` is the full-length array
            from ``_oof_scores`` (unscored rows remain 0.0), while
            ``metrics`` (except ``BaseRate``, which is over all rows as
            before) and ``lift_df`` are computed only on rows that were
            actually scored out-of-fold.
        """
        import numpy as np
        from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss
        from sklearn.model_selection import TimeSeriesSplit

        # In-sample spread (sanity check)
        p_in = fitted_model.predict_proba(X)[:, 1]
        print(f"\n[{label}] score spread: min {p_in.min():.3f} | max {p_in.max():.3f} | "
              f"std {p_in.std():.3f} | q05 {np.quantile(p_in,.05):.3f} | q95 {np.quantile(p_in,.95):.3f}")

        base_params = fitted_model.get_params()
        oof = self._oof_scores(X, y, base_params, n_splits=n_splits)

        # _oof_scores uses the same splitter; rows outside every validation
        # fold hold a placeholder 0.0 and must not enter the metrics.
        scored = np.zeros(len(oof), dtype=bool)
        for _, val_idx in TimeSeriesSplit(n_splits=n_splits).split(X):
            scored[val_idx] = True
        y_scored = y.iloc[scored] if hasattr(y, "iloc") else np.asarray(y)[scored]
        oof_scored = oof[scored]

        # log_loss no longer accepts ``eps`` in current scikit-learn; clip
        # explicitly to keep the previous numerical guard.
        clipped = np.clip(oof_scored, 1e-15, 1 - 1e-15)
        metrics = {
            "AUC_OOF": float(roc_auc_score(y_scored, oof_scored)),
            "PR_AUC_OOF": float(average_precision_score(y_scored, oof_scored)),
            "Brier_OOF": float(brier_score_loss(y_scored, oof_scored)),
            "LogLoss_OOF": float(log_loss(y_scored, clipped)),
            "BaseRate": float(np.mean(y)),
        }
        print(f"[{label}] OOF metrics → {metrics}")

        # Decile lift (top few lines)
        lift_df = self._decile_lift(oof_scored, y_scored, n=10)
        print(f"[{label}] Top deciles (OOF):")
        print(lift_df.head(3).to_string(index=False))
        return metrics, oof, lift_df


    


