In [7]:
from boosting import BoostRegressor
import numpy as np
import time
# ================ USAGE EXAMPLE ================
if __name__ == "__main__":
    # Synthetic regression problem: the target is the sum of the first three
    # features plus a small amount of Gaussian noise; the other features are noise.
    np.random.seed(42)
    n_samples, n_features = 5000, 10
    X = np.random.randn(n_samples, n_features)
    y = np.sum(X[:, :3], axis=1) + 0.1 * np.random.randn(n_samples)
    
    # Ordered 80/20 train/test split (the rows are i.i.d., so no shuffle is needed)
    split_idx = int(0.8 * n_samples)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    
    print("🔥 Comparing Tree Growth Strategies")
    print("=" * 50)
    
    # 1. Leaf-wise growth.
    # The label used to read "Level-wise", but the model below is configured with
    # tree_learner="leaf" and the library's own training log reports
    # "Leaf-wise trees", so the printed label now matches what is actually run.
    # NOTE(review): to benchmark level-wise growth instead, change `tree_learner`
    # (valid values are defined by BoostRegressor, not visible here) and this label.
    print("\n1️⃣ Leaf-wise Tree Growth:")
    start_time = time.time()
    
    model_level = BoostRegressor(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=6,
        tree_learner="leaf",  # leaf-wise tree growth
        tree_method="exact",
        verbose=True,
        batch_size=1
    )

    # compare with scikit learn's GradientBoostingRegressor
    # from sklearn.ensemble import GradientBoostingRegressor
    # model_sklearn = GradientBoostingRegressor(
    #     n_estimators=50,
    #     learning_rate=0.1,
    #     max_depth=6,
    #     verbose=1,
    #     random_state=42
    # )
    # model_sklearn.fit(X_train, y_train)
    # sklearn_time = time.time() - start_time
    # sklearn_pred = model_sklearn.predict(X_test)
    # sklearn_mse = np.mean((y_test - sklearn_pred) ** 2)
    # print(f"   Scikit-learn Time: {sklearn_time:.2f}s")
    # print(f"   Scikit-learn Test MSE: {sklearn_mse:.6f}")
    # print(f"   Scikit-learn Trees: {len(model_sklearn.estimators_)}")
    
    # The held-out split doubles as the eval_set, so the reported "Val" metric
    # and the test MSE below are computed on the same rows.
    model_level.fit(X_train, y_train, eval_set=(X_test, y_test))
    level_time = time.time() - start_time
    level_pred = model_level.predict(X_test)
    level_mse = np.mean((y_test - level_pred) ** 2)
    
    print(f"   Time: {level_time:.2f}s")
    print(f"   Test MSE: {level_mse:.6f}")
    print(f"   Trees: {len(model_level.trees)}")
    

🔥 Comparing Tree Growth Strategies

1️⃣ Level-wise Tree Growth:
🚀 Training with Leaf-wise trees (DART=on, GOSS=on), batch_size=1
[  10] Train: 0.596217, Val: 0.680102, Time: 0.04s
[  20] Train: 0.130052, Val: 0.176376, Time: 0.08s
[  30] Train: 0.041564, Val: 0.070371, Time: 0.12s
[  40] Train: 0.085109, Val: 0.117890, Time: 0.15s
[  50] Train: 0.022351, Val: 0.041749, Time: 0.18s
✅ Training completed in 0.18s, 50 trees
   Time: 0.18s
   Test MSE: 0.041749
   Trees: 50


In [2]:
import numpy as np
from numba import njit
from typing import Tuple, List, Optional
from dataclasses import dataclass

@njit
def _build_histogram_numba(values: np.ndarray, 
                          gradients: np.ndarray, 
                          hessians: np.ndarray,
                          bin_edges: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Accumulate per-bin gradient and hessian sums for a single feature.

    ``bin_edges`` holds ``n_bins + 1`` ascending edges. Each sample is assigned
    to the first bin whose upper edge is >= its value, so anything at or below
    ``bin_edges[1]`` lands in bin 0. NaN samples are skipped, and samples above
    the last edge are not counted.

    Returns:
        ``(hist_g, hist_h)``: two float64 arrays of length ``n_bins``.
    """
    n_bins = len(bin_edges) - 1
    grad_sums = np.zeros(n_bins, dtype=np.float64)
    hess_sums = np.zeros(n_bins, dtype=np.float64)

    # Upper boundary of every bin; sliced once rather than once per sample.
    upper_edges = bin_edges[1:]

    for row in range(len(values)):
        x = values[row]
        if not np.isnan(x):
            # Binary search for the bin; slot == n_bins means x is out of range.
            slot = np.searchsorted(upper_edges, x)
            if slot < n_bins:
                grad_sums[slot] += gradients[row]
                hess_sums[slot] += hessians[row]

    return grad_sums, hess_sums


@njit
def _find_split_gains_numba(hist_g: np.ndarray, hist_h: np.ndarray, 
                           lambda_reg: float, gamma: float) -> np.ndarray:
    """Compute the split gain at every boundary between adjacent histogram bins.

    Split point ``i`` puts bins ``0..i`` on the left and bins ``i+1..`` on the
    right. The gain is

        0.5 * (G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda)
               - G^2 / (H + lambda)) - gamma

    Args:
        hist_g: Per-bin gradient sums.
        hist_h: Per-bin hessian sums (same length as ``hist_g``).
        lambda_reg: L2 regularisation term added to each hessian sum.
        gamma: Constant penalty subtracted from every gain.

    Returns:
        float64 array of length ``max(len(hist_g) - 1, 0)``. Entries stay at
        ``-inf`` where either side has a non-positive hessian sum, or
        everywhere when the total hessian is non-positive.
    """
    n_bins = len(hist_g)

    # Fewer than two bins means there is no boundary to split on. Returning
    # early also avoids a negative array size and indexing an empty cumsum
    # when the histogram is empty.
    if n_bins < 2:
        return np.empty(0, dtype=np.float64)

    gains = np.full(n_bins - 1, -np.inf, dtype=np.float64)
    
    # Prefix sums give the left-side totals for every split point
    cum_g = np.cumsum(hist_g)
    cum_h = np.cumsum(hist_h)
    total_g = cum_g[-1]
    total_h = cum_h[-1]
    
    if total_h <= 0:
        return gains
    
    # Score of the unsplit node
    parent_score = total_g * total_g / (total_h + lambda_reg)
    
    # Calculate gains for all split points
    for i in range(n_bins - 1):
        g_left = cum_g[i]
        h_left = cum_h[i]
        g_right = total_g - g_left
        h_right = total_h - h_left
        
        # A side with no hessian mass cannot form a valid child
        if h_left > 0 and h_right > 0:
            left_score = g_left * g_left / (h_left + lambda_reg)
            right_score = g_right * g_right / (h_right + lambda_reg)
            gain = 0.5 * (left_score + right_score - parent_score) - gamma
            gains[i] = gain
    
    return gains


@dataclass
class BinningConfig:
    """Tunable limits for the multi-level (coarse, then refined) binning strategy."""

    # Number of bins in the coarse pass (reduced from 64 for better efficiency)
    coarse_bins: int = 32
    # Hard cap on total bins
    max_total_bins: int = 128
    # Maximum regions to refine
    top_regions: int = 3
    # Minimum samples to justify refinement
    min_region_samples: int = 50
    # Skip refinement below this gain
    min_gain_threshold: float = 0.1
    # Merge regions if they overlap by this many bins
    overlap_merge_threshold: int = 2


class MultiLevelBinner:
    """
    Two-pass feature binner: coarse edges first, then extra edges inside the
    regions whose coarse split gains look most promising, under a bin budget.
    """
    
    def __init__(self, config: "Optional[BinningConfig]" = None):
        """Store ``config``; a default ``BinningConfig`` is used when it is falsy."""
        self.config = config or BinningConfig()
        
    def create_adaptive_bins(self, 
                           feature_values: np.ndarray, 
                           gradients: np.ndarray,
                           hessians: np.ndarray,
                           lambda_reg: float = 1.0,
                           gamma: float = 0.0) -> Tuple[np.ndarray, List[Tuple[int, int, float]]]:
        """
        Build bin edges for one feature, refining around high-gain regions.

        Args:
            feature_values: Raw feature column; non-finite entries are ignored
                when edges are computed.
            gradients: Per-sample gradients aligned with ``feature_values``.
            hessians: Per-sample hessians aligned with ``feature_values``.
            lambda_reg: Regularisation passed to the gain computation.
            gamma: Gain penalty passed to the gain computation.

        Returns:
            ``(edges, regions)``. ``edges`` is an ascending array of bin edges.
            ``regions`` lists the ``(start, end, max_gain)`` regions selected
            for refinement, and is empty when only coarse/uniform binning was
            used. A selected region is reported even if it ended up adding no
            edges (for example because it held too few samples).
        """
        n_samples = len(feature_values)
        finite_values = feature_values[np.isfinite(feature_values)]
        n_unique = len(np.unique(finite_values))
        
        # Small or low-cardinality features do not justify the two-pass scheme
        if (n_samples < self.config.min_region_samples * 2 or 
                n_unique <= self.config.coarse_bins):
            uniform_bins = self._create_uniform_bins(
                finite_values, min(n_unique, self.config.max_total_bins)
            )
            return uniform_bins, []
        
        # Step 1: Create coarse bins
        coarse_edges = self._create_uniform_bins(finite_values, self.config.coarse_bins)
        
        # Step 2: Build coarse histogram
        coarse_hist_g, coarse_hist_h = self._build_histogram(
            feature_values, gradients, hessians, coarse_edges
        )
        
        # Step 3: Find and merge promising regions
        regions = self._find_promising_regions_fixed(
            coarse_hist_g, coarse_hist_h, coarse_edges, lambda_reg, gamma
        )
        
        if not regions:
            return coarse_edges, []
        
        # Step 4: Create refined bins with budget constraints
        refined_edges = self._create_refined_bins_with_budget(
            finite_values, coarse_edges, regions
        )
        
        return refined_edges, regions
    
    def _create_uniform_bins(self, values: np.ndarray, n_bins: int) -> np.ndarray:
        """
        Return ascending bin edges for ``values``.

        - Empty input yields the placeholder edges ``[0.0, 1.0]``.
        - When there are at most ``n_bins`` distinct values, the distinct
          values themselves are returned as edges.
        - Otherwise ``n_bins + 1`` quantiles are taken over the *distinct*
          values (not the raw samples) and de-duplicated.
        """
        if len(values) == 0:
            return np.array([0.0, 1.0])
        
        unique_vals = np.unique(values)
        if len(unique_vals) <= n_bins:
            return unique_vals
        
        quantiles = np.linspace(0, 1, n_bins + 1)
        bin_edges = np.quantile(unique_vals, quantiles)
        return np.unique(bin_edges)
    
    def _build_histogram(self, values: np.ndarray, 
                        gradients: np.ndarray, 
                        hessians: np.ndarray,
                        bin_edges: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Build gradient/hessian histograms (delegates to the numba kernel)."""
        return _build_histogram_numba(values, gradients, hessians, bin_edges)
    
    def _find_promising_regions_fixed(self, 
                                    hist_g: np.ndarray,
                                    hist_h: np.ndarray, 
                                    bin_edges: np.ndarray,
                                    lambda_reg: float,
                                    gamma: float) -> List[Tuple[int, int, float]]:
        """
        Pick the highest-gain split points and merge neighbours into regions.

        ``bin_edges`` is accepted for signature compatibility but not used.

        Returns:
            Up to ``config.top_regions`` tuples ``(start, end, max_gain)``
            sorted by descending gain. ``start`` is the first split index in
            the merged group and ``end`` is one past the last one.
        """
        gains = _find_split_gains_numba(hist_g, hist_h, lambda_reg, gamma)
        threshold = self.config.min_gain_threshold
        
        # Nothing clears the threshold: no refinement
        if not np.any(gains > threshold):
            return []
        
        # Take more candidates than regions wanted, since neighbours get merged
        top_indices = np.argsort(gains)[-self.config.top_regions * 3:]
        top_indices = top_indices[gains[top_indices] > threshold]
        
        if len(top_indices) == 0:
            return []
        
        # Sort by index to process left to right
        top_indices = np.sort(top_indices)
        
        # Every candidate already exceeds the threshold, so each merged region
        # does too and no further gain check is needed when saving it.
        merged_regions = []
        current_start = top_indices[0]
        current_end = top_indices[0] + 1
        current_max_gain = gains[top_indices[0]]
        
        for idx in top_indices[1:]:
            if idx <= current_end + self.config.overlap_merge_threshold:
                # Close enough to the current region: extend it
                current_end = max(current_end, idx + 1)
                current_max_gain = max(current_max_gain, gains[idx])
            else:
                # Save current region and start new one
                merged_regions.append((current_start, current_end, current_max_gain))
                current_start = idx
                current_end = idx + 1
                current_max_gain = gains[idx]
        
        # Don't forget the last region
        merged_regions.append((current_start, current_end, current_max_gain))
        
        # Sort by gain and take top regions
        merged_regions.sort(key=lambda x: x[2], reverse=True)
        return merged_regions[:self.config.top_regions]
    
    def _create_refined_bins_with_budget(self, 
                                       values: np.ndarray,
                                       coarse_edges: np.ndarray, 
                                       regions: List[Tuple[int, int, float]]) -> np.ndarray:
        """
        Add extra edges inside ``regions`` while respecting the bin budget.

        The budget is ``config.max_total_bins - len(coarse_edges)``. Each
        region's share is proportional to its gain, measured against that
        *initial* budget (minimum 4) and then capped by what is still unspent.
        Sizing shares against the shrinking remainder would starve
        later regions of their proportional share. A region is skipped when it
        holds fewer than ``config.min_region_samples`` values.

        A region with share ``b`` may contribute up to ``b + 1`` edges, so the
        result has at most ``max_total_bins + 1`` edges, i.e. at most
        ``max_total_bins`` bins.

        Returns:
            Ascending array of all edges; ``coarse_edges`` itself when there
            is no budget or the total gain is non-positive.
        """
        all_edges = set(coarse_edges)
        
        total_budget = self.config.max_total_bins - len(coarse_edges)
        if total_budget <= 0:
            return coarse_edges
        
        total_gain = sum(gain for _, _, gain in regions)
        if total_gain <= 0:
            return coarse_edges
        
        remaining_budget = total_budget
        last_edge_idx = len(coarse_edges) - 1
        
        for start_idx, end_idx, gain in regions:
            if remaining_budget <= 0:
                break
                
            # Proportional share of the initial budget, capped by what is left
            region_budget = max(4, int(total_budget * gain / total_gain))
            region_budget = min(region_budget, remaining_budget)
            
            # Region spans from the lower edge of bin `start_idx` to the upper
            # edge of bin `end_idx` (clamped to the last coarse edge)
            region_start = coarse_edges[start_idx]
            region_end = coarse_edges[min(end_idx + 1, last_edge_idx)]
            
            in_region = (values >= region_start) & (values <= region_end)
            region_values = values[in_region]
            
            if len(region_values) < self.config.min_region_samples:
                continue
            
            region_edges = self._create_uniform_bins(region_values, region_budget)
            
            # Only edges that are truly new consume budget
            new_edges = set(region_edges) - all_edges
            if new_edges:
                all_edges.update(new_edges)
                remaining_budget -= len(new_edges)
        
        # Return sorted unique edges
        return np.array(sorted(all_edges))


class AdaptiveMultiLevelBinner(MultiLevelBinner):
    """
    Multi-level binner that tunes its configuration per feature, based on the
    feature's cardinality, gradient correlation and skewness.
    """
    
    def create_adaptive_bins(self, 
                           feature_values: np.ndarray,
                           gradients: np.ndarray,
                           hessians: np.ndarray,
                           feature_idx: Optional[int] = None,
                           lambda_reg: float = 1.0,
                           gamma: float = 0.0) -> Tuple[np.ndarray, List[Tuple[int, int, float]]]:
        """
        Bin one feature using a configuration adapted to its statistics.

        Note that ``feature_idx`` sits before ``lambda_reg`` here, so the
        positional order differs from ``MultiLevelBinner.create_adaptive_bins``;
        pass ``lambda_reg``/``gamma`` by keyword. ``feature_idx`` is currently
        unused.

        Returns:
            The ``(edges, regions)`` pair produced by the parent implementation
            under the adapted configuration.
        """
        stats = self._analyze_feature(feature_values, gradients)
        adapted_config = self._adapt_config(stats)
        
        # Swap the config in for the duration of the call and always restore it,
        # even if binning raises.
        original_config = self.config
        self.config = adapted_config
        
        try:
            result = super().create_adaptive_bins(
                feature_values, gradients, hessians, lambda_reg, gamma
            )
        finally:
            self.config = original_config
        
        return result
    
    def _analyze_feature(self, values: np.ndarray, gradients: np.ndarray) -> dict:
        """
        Summarise a feature over its finite entries.

        Returns a dict with the keys ``n_unique``, ``n_samples``,
        ``missing_rate``, ``correlation`` (Pearson correlation with the
        gradients) and ``skewness``. With fewer than 10 finite values, or when
        a standard deviation is effectively zero, the corresponding moment
        statistics are reported as ``0.0``.
        """
        finite_mask = np.isfinite(values)
        finite_vals = values[finite_mask]
        finite_grads = gradients[finite_mask]
        n_total = len(values)
        
        stats = {
            'n_unique': len(np.unique(finite_vals)),
            'n_samples': len(finite_vals),
            'missing_rate': 1.0 - len(finite_vals) / n_total if n_total else 0.0,
            'correlation': 0.0,
            'skewness': 0.0,
        }
        
        # Too few points for meaningful moment statistics. The same keys are
        # returned as in the full path, and n_unique is a true distinct count.
        if len(finite_vals) < 10:
            return stats
        
        std_val = np.std(finite_vals)
        
        # Correlation is undefined when either series is (near-)constant
        if std_val > 1e-8 and np.std(finite_grads) > 1e-8:
            stats['correlation'] = np.corrcoef(finite_vals, finite_grads)[0, 1]
        
        # Third standardised moment
        if std_val > 1e-8:
            mean_val = np.mean(finite_vals)
            stats['skewness'] = np.mean(((finite_vals - mean_val) / std_val) ** 3)
        
        return stats
    
    def _adapt_config(self, stats: dict) -> BinningConfig:
        """
        Derive a per-feature configuration from ``stats``.

        The result starts as a copy of ``self.config`` so that fields this
        method does not tune (``min_region_samples``,
        ``overlap_merge_threshold``) keep the values the binner was
        constructed with. When no rule matches, the copy is returned
        unchanged; with a default ``BinningConfig`` that is the balanced
        32 / 128 / 3 / 0.1 setting.
        """
        from dataclasses import replace
        
        config = replace(self.config)
        
        # For categorical-like features (few unique values)
        if stats['n_unique'] < 20:
            config.coarse_bins = min(16, stats['n_unique'])
            config.max_total_bins = min(32, stats['n_unique'] * 2)
            config.top_regions = 1
            config.min_gain_threshold = 0.01
        
        # For features with strong gradient correlation
        elif abs(stats.get('correlation', 0)) > 0.3:
            config.coarse_bins = 24
            config.max_total_bins = 96
            config.top_regions = 4
            config.min_gain_threshold = 0.05
        
        # For highly skewed features
        elif abs(stats.get('skewness', 0)) > 2:
            config.coarse_bins = 20
            config.max_total_bins = 80
            config.top_regions = 2  # Focus on fewer regions
            config.min_gain_threshold = 0.1
        
        # Default case: keep the binner's own configuration as-is
        return config


def demonstrate_multilevel_binning():
    """Print a comparison of plain vs. adaptive binning on three synthetic features."""
    np.random.seed(42)
    n_samples = 10000
    
    # Three feature shapes: skewed (exponential), flat (uniform), and
    # categorical-like (five discrete levels with unequal frequencies).
    skewed = np.random.exponential(2, n_samples)
    flat = np.random.uniform(0, 100, n_samples)
    discrete = np.random.choice([1, 2, 3, 4, 5], n_samples, p=[0.5, 0.2, 0.15, 0.1, 0.05])
    
    # Gradients depend weakly on every feature, plus Gaussian noise
    noise = np.random.normal(0, 0.5, n_samples)
    gradients = 0.1 * np.log(skewed + 1) + 0.05 * flat + 0.2 * discrete + noise
    
    # Small positive hessians
    hessians = np.random.uniform(0.1, 0.3, n_samples)
    
    binner = AdaptiveMultiLevelBinner()
    
    cases = (
        ('Exponential (skewed)', skewed),
        ('Uniform', flat),
        ('Categorical', discrete),
    )
    
    print("=== Refined Multi-Level Binning Results ===\n")
    
    for idx, (name, feature) in enumerate(cases):
        finite = feature[np.isfinite(feature)]
        
        print(f"=== {name} Feature ===")
        print(f"Unique values: {len(np.unique(finite))}")
        
        # Baseline: single-pass binning with a 128-bin target
        uniform_bins = binner._create_uniform_bins(finite, 128)
        print(f"Uniform bins (target 128): {len(uniform_bins)}")
        
        # Two-pass adaptive binning
        adaptive_bins, regions = binner.create_adaptive_bins(
            feature, gradients, hessians, feature_idx=idx
        )
        n_adaptive = len(adaptive_bins)
        print(f"Adaptive bins: {n_adaptive}")
        print(f"Refinement regions: {len(regions)}")
        
        if regions:
            print("Promising regions:")
            for start, end, gain in regions:
                print(f"  Bins {start}-{end}: gain = {gain:.4f}")
        
        # Ratio of baseline edge count to adaptive edge count
        if n_adaptive > 0:
            efficiency = len(uniform_bins) / n_adaptive
        else:
            efficiency = 0
        print(f"Efficiency ratio: {efficiency:.2f} (>1 means fewer bins)\n")


if __name__ == "__main__":
    demonstrate_multilevel_binning()

=== Refined Multi-Level Binning Results ===

=== Exponential (skewed) Feature ===
Unique values: 10000
Uniform bins (target 128): 129
Adaptive bins: 129
Refinement regions: 1
Promising regions:
  Bins 16-17: gain = 17.8097
Efficiency ratio: 1.00 (>1 means fewer bins)

=== Uniform Feature ===
Unique values: 10000
Uniform bins (target 128): 129
Adaptive bins: 97
Refinement regions: 1
Promising regions:
  Bins 5-17: gain = 39108.9701
Efficiency ratio: 1.33 (>1 means fewer bins)

=== Categorical Feature ===
Unique values: 5
Uniform bins (target 128): 5
Adaptive bins: 5
Refinement regions: 0
Efficiency ratio: 1.00 (>1 means fewer bins)



In [3]:
from boosting_tree import *
# NOTE(review): the recorded output of this cell is
# "NameError: name 'compare_exact_methods' is not defined", i.e. the star import
# above did not bring this name into scope. Presumably the function was renamed,
# removed, or is excluded from the module's exports — TODO confirm and either
# import it explicitly or delete this cell.
compare_exact_methods()

NameError: name 'compare_exact_methods' is not defined

In [None]:
from shap import *

# Rebind BoostRegressor to the class returned by the SHAP helper so the
# set_shap_background / shap_values / explain_prediction calls below are available.
BoostRegressor = add_shap_to_boostregressor(BoostRegressor)

model_level = BoostRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=6,
    tree_learner="leaf",  # leaf-wise tree growth
    tree_method="hist",
    verbose=True,
    batch_size=1
)

# Restart the clock for this fit. Reusing `start_time` left over from the
# earlier training cell would make `level_time` include everything that ran
# (and all the wall-clock time that passed) since that cell.
start_time = time.time()
model_level.fit(X_train, y_train, eval_set=(X_test, y_test))
level_time = time.time() - start_time
# Set background for proper expected value
model_level.set_shap_background(X_train[:100])  # Use sample of training data

# Compute SHAP values (should have much lower additivity errors)
shap_values = model_level.shap_values(X_test[:10], debug=True)

# Explain an individual prediction
explanation = model_level.explain_prediction(X_test[0])
# Get feature importance
#importance = model_level.shap_feature_importance(X_test[:100])
#print("Feature Importance:", importance)

# Analyze model behavior
#shap_values = model_level.shap_values(X_test[:50])

🚀 Training with Leaf-wise trees (DART=on, GOSS=on), batch_size=1
[  10] Train: 0.624518, Val: 0.696365, Time: 0.04s
[  20] Train: 0.178994, Val: 0.224378, Time: 0.08s
[  30] Train: 0.052540, Val: 0.085379, Time: 0.11s
[  40] Train: 0.031857, Val: 0.056084, Time: 0.13s
[  50] Train: 0.026562, Val: 0.050178, Time: 0.15s
✅ Training completed in 0.15s, 50 trees
[TreeSHAP] Computing SHAP for 10 samples, 10 features, 50 trees
  Sample 0: Adjusting SHAP sum from 0.30939024 to 0.33801284
  Sample 1: Adjusting SHAP sum from 1.23546252 to 1.26408512
  Sample 2: Adjusting SHAP sum from 1.61465677 to 1.64327937
  Sample 3: Adjusting SHAP sum from 0.49134412 to 0.51996672
  Sample 4: Adjusting SHAP sum from -0.03185380 to -0.00323120
  Sample 5: Adjusting SHAP sum from 3.24775689 to 3.27637949
  Sample 6: Adjusting SHAP sum from -1.02974121 to -1.00111861
  Sample 7: Adjusting SHAP sum from -1.83947136 to -1.81084876
  Sample 8: Adjusting SHAP sum from -2.52004928 to -2.49142668
  Sample 9: Adjusti

In [None]:
# Add SHAP to your model
# NOTE(review): `your_corrected_shap` looks like a placeholder module name — the
# recorded output of this cell is "ModuleNotFoundError: No module named
# 'your_corrected_shap'". An earlier cell calls `add_shap_to_boostregressor`
# after star imports, so the import presumably needs to point at whichever local
# module actually provides it — TODO confirm.
from your_corrected_shap import add_shap_to_boostregressor

# NOTE(review): the return value is discarded here, whereas the earlier SHAP cell
# rebinds `BoostRegressor` to it; confirm the helper patches the class in place.
add_shap_to_boostregressor(BoostRegressor)

# Train model
# Relies on X_train / y_train created by the first training cell.
model = BoostRegressor(n_estimators=100)
model.fit(X_train, y_train)



ModuleNotFoundError: No module named 'your_corrected_shap'

In [None]:
# Display the SHAP values assigned by an earlier cell (computed there for
# X_test[:10]); this cell fails on a fresh kernel unless that cell has run.
shap_values