In [1]:
# Exhaustive Wrapper Selection
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load data
# Fetch the dataset a single time and reuse the returned bunch for the
# feature matrix, the target vector and the column names.
dataset = load_breast_cancer()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)
y = dataset.target
X.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Hold out a test partition; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [4]:
# Base estimator: handed to the feature selector and later refit on the chosen columns
model = LogisticRegression(
    max_iter=5000,  # iteration cap passed to the solver
)

In [6]:
# Exhaustive Selection: evaluate all feature subsets of size 1–2
# (30 single features + 435 pairs = 465 candidate subsets, each scored by
# 5-fold cross-validated accuracy). Raising max_features grows the number of
# candidates combinatorially, so increase it with care.
efs = ExhaustiveFeatureSelector(
    model,
    min_features=1,
    max_features=2,
    scoring='accuracy',
    print_progress=True,
    cv=5,
    n_jobs=-1
)
# Assign the result so the fitted selector's repr is not echoed as cell output
efs = efs.fit(X_train, y_train)

Features: 465/465

In [7]:
# Read the winning subset (and its selector score) off the fitted selector
best_score = efs.best_score_
best_idx = list(efs.best_idx_)
# Map positional indices back to column labels of the original frame
best_features = X.columns[best_idx]
print("Exhaustive Selection Best Features:", best_features.tolist())


Exhaustive Selection Best Features: ['worst texture', 'worst perimeter']


In [8]:
# Refit on the selected columns only, then measure accuracy on the held-out split
X_train_selected = X_train[best_features]
X_test_selected = X_test[best_features]
model.fit(X_train_selected, y_train)
y_pred = model.predict(X_test_selected)
acc = accuracy_score(y_test, y_pred)
print(f"Test Accuracy with Exhaustive Selection ({len(best_features)} features): {acc:.4f}")

Test Accuracy with Exhaustive Selection (2 features): 0.9720


## 1. Automated Feature-Count Tuning

**Purpose:**  
Automatically determine the best number of features (`k`) by evaluating cross-validated accuracy for each `k`.

**Key Changes:**
- Loop over `k` in the desired range.
- Fit `ExhaustiveFeatureSelector` with `min_features=k` and `max_features=k`.
- Track and select `k` with the highest `best_score_`.

**Impact:**  
Removes the need to choose the feature count by hand: the subset size is selected by its cross-validated score rather than fixed in advance.

---

## 2. Stratified K-Fold Integration

**Purpose:**  
Embed stratified cross-validation directly within exhaustive selection to maintain class balance across folds.

**Key Changes:**
- Create a `StratifiedKFold` object with `n_splits=5`.
- Pass `cv=cv` to `ExhaustiveFeatureSelector`.

**Impact:**  
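Provides more reliable subset evaluation by preserving target-class proportions during cross-validation. Note that an integer `cv` already produces stratified folds when the estimator is a classifier; passing an explicit `StratifiedKFold` additionally lets you control shuffling and `random_state`.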

---

## 3. Custom Scoring Functions

**Purpose:**  
Optimize feature selection for metrics beyond accuracy, such as F1-score and balanced accuracy.

**Key Changes:**
- Change the `scoring` parameter in `ExhaustiveFeatureSelector` to `'f1'` or `'balanced_accuracy'`.

**Impact:**  
Produces feature subsets that better align with specific performance goals (e.g., handling imbalanced classes).

---

## 4. Application to a New Dataset (Iris)

**Purpose:**  
Demonstrate the generality of the exhaustive-wrapper pipeline on a dataset with different characteristics.

**Key Changes:**
- Replace `load_breast_cancer` with `load_iris`.
- Adjust `max_features` to suit the smaller feature set.

**Impact:**  
Shows that the same feature-selection logic can be applied across domains without structural code changes.