In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Build the feature table (one column per measurement) and the class-label vector
iris = load_iris()
y = iris.target
X = pd.DataFrame(iris.data, columns=iris.feature_names)

print("Original Features:\n", X.head(), "\n")

# -------- 1) FILTER METHOD: keep features whose |Pearson r| with the target is high --------
print("1) Filter Method: Feature selection based on correlation with target")

# np.corrcoef returns a 2x2 matrix; entry [0, 1] is the feature-vs-target
# coefficient. Only its magnitude is used, so the sign is discarded.
correlations = [abs(np.corrcoef(X[name], y)[0, 1]) for name in X.columns]

# Tabulate the scores, strongest first
corr_df = pd.DataFrame(
    {'Feature': X.columns, 'AbsCorrelation': correlations}
).sort_values(by='AbsCorrelation', ascending=False)

print(corr_df)

# Retain every feature scoring above 0.5 (the cut-off is a tunable choice)
above_cutoff = corr_df['AbsCorrelation'] > 0.5
selected_features = corr_df.loc[above_cutoff, 'Feature'].tolist()
print("Selected features by filter method:", selected_features, "\n")

X_filtered = X[selected_features]

# -------- 2) WRAPPER METHOD: Recursive Feature Elimination (RFE) --------
print("2) Wrapper Method: Recursive Feature Elimination (RFE) with Logistic Regression")

# Wrap a logistic-regression estimator in RFE and ask it to keep two features
model = LogisticRegression(max_iter=500)
rfe = RFE(model, n_features_to_select=2)
rfe.fit(X, y)

# rfe.support_ is a boolean mask aligned with X.columns; report each decision
for name, kept in zip(X.columns, rfe.support_):
    status = 'Selected' if kept else 'Not Selected'
    print(f"{name}: {status}")

X_rfe = X.loc[:, rfe.support_]
print("Selected features by wrapper method:", X_rfe.columns.tolist(), "\n")

# -------- 3) EMBEDDED METHOD: Feature importance from Random Forest --------
print("3) Embedded Method: Feature importance using Random Forest")

# Random forests are stochastic (bootstrap samples, random feature subsets).
# Pin the seed so the importances, and therefore the selected feature list,
# are identical on every Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# One importance score per column of X, in column order
importances = rf.feature_importances_

feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)

# Keep the features whose importance is strictly above the mean importance
mean_importance = importances.mean()
is_above_mean = feature_importance_df['Importance'] > mean_importance
selected_rf = feature_importance_df.loc[is_above_mean, 'Feature'].tolist()
print("Selected features by embedded method:", selected_rf, "\n")

X_rf_selected = X[selected_rf]

# -------- Summary of selected features --------
print("Summary of selected features:")
# One line per selection strategy, in the order the methods were run above
for method_label, chosen in (
    ("Filter method", selected_features),
    ("Wrapper method", X_rfe.columns.tolist()),
    ("Embedded method", selected_rf),
):
    print(f"{method_label}: {chosen}")


Original Features:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2 

1) Filter Method: Feature selection based on correlation with target
             Feature  AbsCorrelation
3   petal width (cm)        0.956547
2  petal length (cm)        0.949035
0  sepal length (cm)        0.782561
1   sepal width (cm)        0.426658
Selected features by filter method: ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)'] 

2) Wrapper Method: Recursive Feature Elimination (RFE) with Logistic Regression
sepal length (cm): Not Selected
sepal width (cm): Not Selected
petal length (cm): Selec

In [None]:
import pandas as pd
from sklearn.datasets import load_iris, make_regression
from sklearn.feature_selection import SelectKBest, chi2, VarianceThreshold, RFE, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# -------- Load datasets --------
# Classification data: Iris measurements as a DataFrame plus integer class labels
iris = load_iris()
y_iris = iris.target
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)

# Regression data: synthetic, seeded so the generated samples are reproducible
X_reg, y_reg = make_regression(
    n_samples=100,
    n_features=10,
    noise=0.1,
    random_state=42,
)

print("Original Iris features:\n", X_iris.head(), "\n")

# -------- 1) Filter: Chi-Square Test (for classification with discretized features) --------
print("1) Filter Method: Chi-Square Test")


def _to_bin_codes(column):
    """Replace a numeric column with the integer code (0-9) of its equal-width bin."""
    return pd.cut(column, bins=10, labels=False)


# The features are binned into integer codes before being scored with chi2
X_discrete = X_iris.apply(_to_bin_codes)
chi2_selector = SelectKBest(chi2, k=2)
X_chi2 = chi2_selector.fit_transform(X_discrete, y_iris)

# get_support() is a boolean mask over the input columns; map it back to names
chi2_mask = chi2_selector.get_support()
selected_chi2 = X_iris.columns[chi2_mask]
print("Selected features by Chi-Square:", selected_chi2.tolist(), "\n")

# -------- 2) Filter: Variance Threshold (remove low variance features) --------
print("2) Filter Method: Variance Threshold")

# Demonstration input: the Iris features plus one constant column.
# assign() returns a new frame, so X_iris itself is left unmodified.
X_var = X_iris.assign(low_variance=1)

# threshold=0.0 drops only the columns whose variance is exactly zero
selector = VarianceThreshold(threshold=0.0)
X_var_reduced = selector.fit_transform(X_var)

print("Original features shape:", X_var.shape)
print("Reduced features shape after VarianceThreshold:", X_var_reduced.shape, "\n")

# -------- 3) Wrapper: Recursive Feature Elimination (RFE) --------
print("3) Wrapper Method: RFE with Logistic Regression")

# Fit RFE around a logistic-regression estimator, keeping two features
model = LogisticRegression(max_iter=500)
rfe = RFE(model, n_features_to_select=2).fit(X_iris, y_iris)

# Translate the boolean selection mask back into column names
rfe_mask = rfe.get_support()
selected_rfe = X_iris.columns[rfe_mask]
print("Selected features by RFE:", selected_rfe.tolist(), "\n")

# -------- 4) Embedded: Lasso for regression --------
print("4) Embedded Method: Lasso for regression feature selection")

# Fit an L1-penalised linear model on the synthetic regression data
lasso = Lasso(alpha=0.1)
lasso.fit(X_reg, y_reg)

# Column indices whose fitted coefficient is non-zero. The ndarray method is
# used instead of the `np` module because this cell never imports numpy, so it
# must not depend on `np` being left in the kernel by an earlier cell.
selected_lasso = lasso.coef_.nonzero()[0]
print("Selected features by Lasso:", selected_lasso.tolist(), "\n")

# -------- Bonus: Sequential Feature Selector --------
print("Bonus) Wrapper: Sequential Forward Selection with KNN")

# Forward selection around a 3-nearest-neighbours classifier, stopping at two features
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SequentialFeatureSelector(
    knn,
    n_features_to_select=2,
    direction='forward',
)
sfs.fit(X_iris, y_iris)

# Boolean mask over the input columns -> names of the chosen features
sfs_mask = sfs.get_support()
selected_sfs = X_iris.columns[sfs_mask]
print("Selected features by Sequential Forward Selection:", selected_sfs.tolist())



Original Iris features:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2 

1) Filter Method: Chi-Square Test
Selected features by Chi-Square: ['petal length (cm)', 'petal width (cm)'] 

2) Filter Method: Variance Threshold


ValueError: Length of values (50) does not match length of index (150)