In [6]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import cohen_kappa_score

import pandas as pd
import numpy as np

In [2]:
# Synthetic binary classification problem with a 70/30 class imbalance.
x, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.7, 0.3],
    random_state=42,
)

# Hold out 20% of the samples for evaluation; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.2)

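**Measuring Diversity**

Diversity is measured as one minus the average pairwise Cohen's kappa between the models' predictions: the less the models agree with each other, the more diverse the ensemble.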

In [5]:
def ensemble_diversity(models, x, y=None):
    """Measure how much a set of fitted models disagree on the same inputs.

    Every model predicts on ``x``; Cohen's kappa is computed for each
    unordered pair of prediction vectors and the scores are averaged.
    Diversity is reported as ``1 - average kappa``, so a higher value means
    the models agree less.

    Parameters
    ----------
    models : sequence
        At least two objects exposing ``predict(x)``.
    x : array-like
        Samples passed unchanged to every model's ``predict``.
    y : array-like, optional
        Not used by the computation. Accepted so that existing calls which
        pass the true labels keep working.

    Returns
    -------
    tuple
        ``(diversity, avg_kappa)`` where ``diversity == 1 - avg_kappa``.

    Raises
    ------
    ValueError
        If fewer than two models are given: there is no pair to compare, and
        averaging an empty list of scores would silently yield NaN.
    """
    n_models = len(models)
    if n_models < 2:
        raise ValueError(
            f"ensemble_diversity needs at least two models, got {n_models}"
        )

    # One row of predictions per model.
    predictions = np.array([model.predict(x) for model in models])

    # Pairwise agreement (kappa score) over every unordered pair (i < j).
    kappa_scores = [
        cohen_kappa_score(predictions[i], predictions[j])
        for i in range(n_models)
        for j in range(i + 1, n_models)
    ]

    # Lower kappa = more diverse, so invert it: higher diversity = less agreement.
    avg_kappa = np.mean(kappa_scores)
    diversity = 1 - avg_kappa

    return diversity, avg_kappa

In [7]:
# Two tree-based ensembles used as the baseline pair.
models = [
    RandomForestClassifier(random_state=42, n_estimators=50),
    GradientBoostingClassifier(random_state=42, n_estimators=50),
]

# Fit every model on the same training split.
for model in models:
    model.fit(x_train, y_train)

# Compare the fitted models' predictions on the held-out split.
diversity, kappa = ensemble_diversity(models, x_test, y_test)

print(f"Ensemble Diversity: {diversity:.3f}")
print(f"Average Kappa: {kappa:.3f}")

Ensemble Diversity: 0.107
Average Kappa: 0.893


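**Creating Diverse Models**

Three ways to make ensemble members disagree more: use different algorithms, vary the hyperparameters of a single algorithm, or train on different feature subsets.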

In [8]:
from sklearn.feature_selection import SelectKBest, f_classif

# Strategy 1: Different algorithms
# (name, estimator) pairs mixing a tree ensemble, a neighbour-based model
# and a naive Bayes model.
diverse_models = [
    ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
    ('nb', GaussianNB()),
]

# Strategy 2: Different hyperparameters
# Same algorithm and seed, with growing forest size and tree depth.
rf_models = [
    RandomForestClassifier(max_depth=5, n_estimators=50, random_state=42),
    RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42),
    RandomForestClassifier(max_depth=15, n_estimators=200, random_state=42),
]

# Strategy 3: Different feature subsets
# Keep the k best features by ANOVA F-score for several values of k, storing
# each fitted selector together with the reduced training matrix.
feature_sets = []
for k in (5, 10, 15):
    selector = SelectKBest(score_func=f_classif, k=k)
    x_selected = selector.fit_transform(x_train, y_train)
    feature_sets.append((selector, x_selected))