In [34]:
# Backward Selection
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [35]:
# Fetch the breast-cancer feature matrix and labels, then wrap the features
# in a DataFrame so columns can be picked by label in later cells
raw_features, y = load_breast_cancer(return_X_y=True)
X = pd.DataFrame(raw_features)

In [36]:
# Hold out a test partition (library-default split size); the fixed seed
# keeps the partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [37]:
# Logistic-regression estimator with the solver iteration cap set to 5000
model = LogisticRegression(
    max_iter=5000,
)

In [None]:
# Backward elimination: begin with every feature and drop one per step
# until only five remain
sfs_backward = SequentialFeatureSelector(
    estimator=model,
    direction='backward',
    n_features_to_select=5,
)
sfs_backward.fit(X_train, y_train)

# Boolean mask over the columns: True marks a feature the selector kept
keep_mask = sfs_backward.get_support()
selected_features = X.columns[keep_mask]
print("Backward Selection Chosen Features:", list(selected_features))

In [None]:
# Refit on the retained columns only, then score on the held-out split
train_subset = X_train[selected_features]
test_subset = X_test[selected_features]

model.fit(train_subset, y_train)
y_pred = model.predict(test_subset)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy with Backward Selection (5 features): {acc:.4f}")


## Analysis:
- Backward selection starts from the full feature set and removes, one at a time, the feature whose removal hurts model performance the least. This can help identify a small subset of highly predictive features.
- You may observe slightly improved or comparable accuracy to using all features with fewer variables, indicating a good feature subset.

## 1. Automated Feature‐Count Tuning
#### Adapt your script to automatically select the optimal n_features_to_select from a range (e.g., 1–30) by choosing the value that maximizes test accuracy.

- Loop over possible feature counts.

- Record test accuracy for each.

- Print the best feature count and its accuracy.



In [None]:
# Sweep the number of retained features and keep the count that gives the
# highest test accuracy.
# NOTE: k is chosen on the test set (as the exercise asks), so best_acc is an
# optimistic estimate of performance on unseen data.
n_total = X_train.shape[1]
best_k, best_acc = None, 0
for k in range(1, n_total + 1):
    if k < n_total:
        sfs = SequentialFeatureSelector(
            model, n_features_to_select=k, direction='backward'
        ).fit(X_train, y_train)
        feats = X_train.columns[sfs.get_support()]
    else:
        # SequentialFeatureSelector rejects n_features_to_select >= n_features
        # with a ValueError, so the "keep every feature" case is evaluated
        # directly without a selector.
        feats = X_train.columns
    model.fit(X_train[feats], y_train)
    acc = accuracy_score(y_test, model.predict(X_test[feats]))
    # Strict '>' keeps the smallest k when several counts tie on accuracy
    if acc > best_acc:
        best_k, best_acc = k, acc

print(f"Best feature count: {best_k} → Accuracy: {best_acc:.4f}")

## 2. Stratified K-Fold Integration
#### Replace the single train/test split with Stratified K-Fold during feature selection:

- Use StratifiedKFold(n_splits=5) within SequentialFeatureSelector.

- Report mean and standard deviation of accuracy across folds for your final feature set.

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

# One shuffled, seeded 5-fold stratified splitter is shared by the selector
# and by the final scoring step
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

sfs = SequentialFeatureSelector(
    estimator=model,
    n_features_to_select=5,
    direction='backward',
    scoring='accuracy',
    cv=kf,
)
sfs.fit(X, y)

# NOTE(review): the features are chosen using all of X/y and then scored on
# the same folds, so the reported CV accuracy is likely optimistic.
feats = X.columns[sfs.get_support()]
scores = cross_val_score(model, X[feats], y, cv=kf)
fold_mean, fold_std = scores.mean(), scores.std()

print("Selected Features:", list(feats))
print(f"CV Accuracy: {fold_mean:.4f} ± {fold_std:.4f}")


## 3. Custom Scoring Function
#### Experiment with a different scoring metric in backward selection (e.g., F1-score or balanced accuracy) to handle class imbalance:

- Pass scoring='f1' or scoring='balanced_accuracy' to SequentialFeatureSelector.

- Compare the selected feature lists and test performance under each metric.

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Backward selection down to five features, ranking candidates by
# balanced accuracy
sfs_bal = SequentialFeatureSelector(
    estimator=model,
    scoring='balanced_accuracy',
    direction='backward',
    n_features_to_select=5,
).fit(X_train, y_train)

kept_mask = sfs_bal.get_support()
feats_bal = X_train.columns[kept_mask]

# Refit on the chosen columns before scoring the held-out split
model.fit(X_train[feats_bal], y_train)
print("Balanced Accuracy Features:", list(feats_bal))

test_pred = model.predict(X_test[feats_bal])
bal_acc = balanced_accuracy_score(y_test, test_pred)
print(f"Balanced Accuracy on Test Set: {bal_acc:.4f}")

## 4. Application to a New Dataset
#### Apply your backward‐selection pipeline unchanged to a different classification dataset (e.g., the Iris or Wine dataset):

- Load a new dataset from sklearn.datasets.

- Compare which features are selected and the resulting model accuracy.

- Each exercise reuses your original code structure and deepens your grasp of backward wrapper selection through tuning, validation, stability, and computational considerations.

In [None]:
from sklearn.datasets import load_wine

# Same backward-selection pipeline, applied to the Wine dataset
# (loaded as a DataFrame/Series pair)
X_wine, y_wine = load_wine(return_X_y=True, as_frame=True)
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X_wine,
    y_wine,
    random_state=42,
)

sfs_wine = SequentialFeatureSelector(
    estimator=model,
    direction='backward',
    n_features_to_select=5,
)
sfs_wine.fit(X_tr, y_tr)

# Keep only the columns the selector retained, refit, and score on the test split
feats = X_tr.columns[sfs_wine.get_support()]
model.fit(X_tr[feats], y_tr)
wine_pred = model.predict(X_ts[feats])
acc = accuracy_score(y_ts, wine_pred)

print(f"Backward Features (Wine): {list(feats)}")
print(f"Wine Dataset Accuracy: {acc:.4f}")
