In [3]:
# 03_feature_selection.ipynb

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from collections import Counter
import pandas as pd
# Read the preprocessed heart-disease table from disk.
df_clean = pd.read_csv("heart_disease_clean.csv")

# Work on a copy so that df_clean is not affected by changes made to df_model.
df_model = df_clean.copy()

# Feature matrix: every column except the "num" target.
X = df_model.drop(columns="num")
# Target vector: the "num" column cast to integers.
y = df_model["num"].astype(int)

# --- Random Forest Feature Importance ---
# Fit a seeded forest on all features, then rank the columns by the
# importances it reports (highest first). The original row labels are kept,
# so the index shows each feature's position in X.columns.
rf = RandomForestClassifier(random_state=42).fit(X, y)
importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": rf.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)

print("Top 10 Features (Random Forest):")
print(importance_df.iloc[:10])

# --- RFE (Logistic Regression) ---
# Recursive feature elimination keeps the 8 features that the logistic model
# ranks highest; selected_rfe holds their column names.
# random_state is pinned because scikit-learn documents the liblinear solver
# as using it to shuffle the data; this keeps the selection reproducible and
# matches the seed already used for the random forest.
# NOTE(review): RFE ranks features by coefficient magnitude, which is only
# comparable across features on a common scale -- assumes
# heart_disease_clean.csv is already scaled; TODO confirm.
model = LogisticRegression(max_iter=1000, solver="liblinear", random_state=42)
rfe = RFE(model, n_features_to_select=8)
rfe.fit(X, y)
selected_rfe = X.columns[rfe.support_].tolist()

# --- Chi² test (categorical only) ---
# Columns submitted to the chi-square test; all other columns are left out.
categorical_cols = [
    "sex",
    "cp_1", "cp_2", "cp_3", "cp_4",
    "fbs",
    "restecg_0", "restecg_1", "restecg_2",
    "exang",
    "slope_1", "slope_2", "slope_3",
    "thal_3", "thal_6", "thal_7",
]

# k="all" keeps every column: the selector is used only to obtain the
# per-feature scores and p-values, not to drop anything.
categorical_frame = df_model[categorical_cols]
chi2_selector = SelectKBest(score_func=chi2, k="all").fit(categorical_frame, y)

# One row per tested column, strongest association first.
chi2_scores = pd.DataFrame(
    {
        "Feature": categorical_cols,
        "Chi2_Score": chi2_selector.scores_,
        "p_value": chi2_selector.pvalues_,
    }
).sort_values(by="Chi2_Score", ascending=False)

# --- Build final feature set based on consensus (≥2 methods) ---
# --------------------------------------------------------------
# Voting scheme:
#   * Random Forest votes for its 10 most important features.
#   * RFE votes for the features it retained.
#   * Chi-square votes for every tested feature with p < 0.05.
# A feature is kept when it collects at least 2 of the 3 votes.
#
# NOTE(review): chi-square is only run on categorical_cols, so a column
# outside that list can collect at most two votes and survives only if
# both the forest and RFE pick it.
# --------------------------------------------------------------
top_rf = importance_df.head(10)["Feature"].tolist()

is_significant = chi2_scores["p_value"] < 0.05
selected_chi2 = chi2_scores.loc[is_significant, "Feature"].tolist()

# Each method lists a feature at most once, so a feature's count in the
# concatenated list equals the number of methods that voted for it.
all_selected = [*top_rf, *selected_rfe, *selected_chi2]
vote_counts = Counter(all_selected)
# Iterating the Counter preserves first-appearance order of the features.
final_features = [name for name in vote_counts if vote_counts[name] >= 2]

print("\n✅ Final Selected Features:", final_features)
X_reduced = X[final_features]


Top 10 Features (Random Forest):
     Feature  Importance
12   thalach    0.126205
20   oldpeak    0.113218
7       chol    0.099233
0        age    0.098495
21        ca    0.093341
6   trestbps    0.093259
5       cp_4    0.052339
19    thal_7    0.046792
17    thal_3    0.043225
16     exang    0.041992

✅ Final Selected Features: ['ca', 'cp_4', 'thal_7', 'thal_3', 'exang', 'cp_2', 'cp_3', 'slope_1', 'thal_6']


