In [3]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from xgboost import XGBClassifier, XGBRegressor, plot_importance
import matplotlib.pyplot as plt
from lightgbm import LGBMClassifier
from scipy.stats import randint
import pandas as pd
import numpy as np

In [2]:
# Synthetic binary classification problem; weights=[0.7, 0.3] requests an
# approximately 70/30 class mix, and the fixed seed keeps it reproducible.
x, y = make_classification(
    n_samples=2000,
    n_classes=2,
    weights=[0.7, 0.3],
    random_state=42,
)

# Hold out 20% of the samples for testing (seeded so the split is repeatable).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

**Greedy Ensemble Selection**

In [4]:
def greedy_ensemble_selection(base_models, X, y, max_models=5, cv=5,
                              scoring='accuracy'):
    """Greedily build a soft-voting ensemble by forward selection.

    Starting from an empty ensemble, each round tries adding every model that
    has not been picked yet, scores the resulting soft-voting
    ``VotingClassifier`` with cross-validation, and keeps the addition that
    yields the highest mean score. Selection stops when ``max_models`` models
    have been chosen or when no addition strictly improves the current score.

    Parameters
    ----------
    base_models : sequence of estimators
        Candidate models. They are combined with ``voting='soft'``, so each
        one has to support probability predictions.
    X, y : array-like
        Training data and labels forwarded to ``cross_val_score``.
    max_models : int, default=5
        Upper bound on the number of models in the ensemble.
    cv : int or CV splitter, default=5
        Forwarded to ``cross_val_score``.
    scoring : str or callable, default='accuracy'
        Forwarded to ``cross_val_score``. Higher values must mean better.

    Returns
    -------
    selected : list of int
        Indices into ``base_models`` in the order they were added.
    best_score : float
        Mean cross-validated score of the selected ensemble, or ``0`` when
        nothing was selected.
    """
    selected = []
    remaining = list(range(len(base_models)))
    # Start below every possible score so that scorers returning negative
    # values (e.g. 'neg_log_loss') can still select a first model.
    best_score = float('-inf')

    for _ in range(min(max_models, len(base_models))):
        best_idx = None
        best_new_score = best_score

        for idx in remaining:
            candidate = selected + [idx]

            # Soft-voting ensemble made of the current picks plus this model
            voting = VotingClassifier(
                estimators=[(f'model_{i}', base_models[i]) for i in candidate],
                voting='soft'
            )

            score = cross_val_score(
                voting, X, y, cv=cv, scoring=scoring, n_jobs=-1
            ).mean()

            # Strict comparison: ties keep the smaller ensemble / earlier model
            if score > best_new_score:
                best_new_score = score
                best_idx = idx

        if best_idx is None:
            # No remaining model improves the ensemble any further
            break

        selected.append(best_idx)
        remaining.remove(best_idx)
        best_score = best_new_score

    if not selected:
        return selected, 0
    return selected, best_score

In [5]:
# Candidate pool for the greedy search. The list order matters: the function
# reports its picks as positions in this list. SVC is built with
# probability=True so it can take part in soft voting.
base_models_list = [
    RandomForestClassifier(random_state=42, n_estimators=50),
    GradientBoostingClassifier(random_state=42, n_estimators=50),
    SVC(random_state=42, probability=True),
    KNeighborsClassifier(),
    LogisticRegression(max_iter=1000, random_state=42),
]

# Forward-select at most three of the candidates on the training split.
selected_indices, score = greedy_ensemble_selection(
    base_models=base_models_list,
    X=x_train,
    y=y_train,
    max_models=3,
)

print(f"Selected models: {selected_indices}")
print(f"Ensemble score: {score:.3f}")

Selected models: [0]
Ensemble score: 0.916


**Ensemble Pruning**

In [6]:
def ensemble_pruning(ensemble, X, y, threshold=0.01):
    """Prune an ensemble by backward elimination of its members.

    The full ``ensemble`` is scored first with 5-fold cross-validated
    accuracy. Then, repeatedly, every single-member removal is evaluated and
    the removal giving the highest score is applied, provided that score
    exceeds the current one by more than ``threshold``. Pruning stops when no
    removal qualifies or only one member is left.

    Reduced ensembles are always rebuilt as unweighted soft-voting
    ``VotingClassifier`` objects, regardless of how ``ensemble`` itself is
    configured.

    Parameters
    ----------
    ensemble : estimator with an ``estimators`` attribute
        Typically a ``VotingClassifier`` whose ``estimators`` is a list of
        ``(name, estimator)`` pairs. Bare (non-tuple) entries are used as-is.
    X, y : array-like
        Data and labels forwarded to ``cross_val_score``.
    threshold : float, default=0.01
        Minimum gain over the current score required to remove a member.

    Returns
    -------
    selected : list of int
        Positions in ``ensemble.estimators`` of the members that were kept.
    current_score : float
        Mean cross-validated accuracy of the final ensemble.
    """
    # ``estimators`` stores (name, estimator) pairs. Unwrap them so only the
    # estimator object is handed to the reduced VotingClassifier; passing the
    # whole pair would nest it as (new_name, (old_name, estimator)).
    base_models = [
        member[1] if isinstance(member, tuple) else member
        for member in ensemble.estimators
    ]
    # Entries disabled with the 'drop' placeholder are not real members.
    selected = [
        i for i, model in enumerate(base_models)
        if not (isinstance(model, str) and model == 'drop')
    ]

    # Baseline: the ensemble exactly as given
    current_score = cross_val_score(
        ensemble, X, y, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()

    while len(selected) > 1:
        best_removal = None
        best_new_score = float('-inf')

        for idx in selected:
            test_selected = [i for i in selected if i != idx]

            test_ensemble = VotingClassifier(
                estimators=[(f'model_{i}', base_models[i]) for i in test_selected],
                voting='soft'
            )

            score = cross_val_score(
                test_ensemble, X, y, cv=5, scoring='accuracy', n_jobs=-1
            ).mean()

            # A removal qualifies when it beats the *current* score by more
            # than the threshold; among qualifying removals keep the best.
            # (Comparing against the running best plus the threshold would
            # reject a better removal found later in the same pass.)
            if score > current_score + threshold and score > best_new_score:
                best_new_score = score
                best_removal = idx

        if best_removal is None:
            # No single removal helps enough: pruning has converged
            break

        selected.remove(best_removal)
        current_score = best_new_score

    return selected, current_score