In [None]:
import os

import pandas as pd
from google.colab import drive

# Make the user's Google Drive available under /content/drive.
drive.mount('/content/drive')

# Location of the project's data folder on the mounted drive.
project_path = "/content/drive/MyDrive/HeartProject"
data_path = os.path.join(project_path, "data")

# Load the feature matrix and the target column.
X_scaled_df = pd.read_csv(os.path.join(data_path, "X_scaled.csv"))
# squeeze("columns") turns a one-column frame into a Series but never collapses
# further; a bare squeeze() would return a scalar for a single-row file.
y = pd.read_csv(os.path.join(data_path, "y.csv")).squeeze("columns")

# Fail fast with a clear message if the two files are out of sync, instead of
# letting a later model-fitting cell raise a less obvious error.
if len(X_scaled_df) != len(y):
    raise ValueError(
        f"X_scaled.csv has {len(X_scaled_df)} rows but y.csv has {len(y)} rows"
    )

print("✅ Data loaded successfully from Google Drive!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Data loaded successfully from Google Drive!


# Step 3: Feature Selection

To improve model performance and reduce noise, we need to select the most important features.  

We will apply three different methods:
1. **Random Forest Feature Importance** → ranks features based on how useful they are in classification.
2. **Recursive Feature Elimination (RFE)** → recursively removes the least important features using Logistic Regression.
3. **Chi-Square Test** → a statistical test of dependence between each feature and the target (it requires non-negative feature values, so the features are rescaled to [0, 1] first).  

Finally, we will combine results to select the **top common features** for our final model.


Random Forest Feature Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a seeded random forest on all features; the fitted model exposes one
# importance score per input column.
rf = RandomForestClassifier(random_state=42).fit(X_scaled_df, y)

# Label each score with its column name and rank from most to least important.
importances = pd.Series(
    rf.feature_importances_,
    index=X_scaled_df.columns,
).sort_values(ascending=False)

print("Top 10 features by RF importance:")
print(importances.head(10))


Top 10 features by RF importance:
thalach      0.130526
oldpeak      0.123987
chol         0.114644
age          0.114296
ca           0.099617
trestbps     0.093183
cp_4.0       0.063869
thal_7.0     0.060529
exang        0.042995
slope_2.0    0.033848
dtype: float64


RFE (Recursive Feature Elimination)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator whose fitted coefficients drive the elimination.
log_reg = LogisticRegression(solver="liblinear", max_iter=1000)

# Recursively drop features until 10 remain, then fit on the full data.
rfe = RFE(estimator=log_reg, n_features_to_select=10).fit(X_scaled_df, y)

# rfe.support_ is a boolean mask over the input columns (True = kept).
selected_features = X_scaled_df.columns[rfe.support_]
print("RFE selected features:", list(selected_features))


RFE selected features: ['age', 'sex', 'fbs', 'thalach', 'oldpeak', 'ca', 'cp_4.0', 'thal_7.0', 'slope_2.0', 'restecg_2.0']


Chi-Square Test

In [None]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# chi2 needs non-negative inputs, so map every column onto [0, 1] first.
X_chi2 = MinMaxScaler().fit_transform(X_scaled_df)

# Score each rescaled feature against the target and keep the 10 best.
chi2_selector = SelectKBest(score_func=chi2, k=10).fit(X_chi2, y)

# get_support() returns a boolean mask aligned with the original columns.
chi2_features = X_scaled_df.columns[chi2_selector.get_support()]
print("Chi-Square selected features:", list(chi2_features))


Chi-Square selected features: ['sex', 'exang', 'oldpeak', 'ca', 'cp_2.0', 'cp_3.0', 'cp_4.0', 'thal_6.0', 'thal_7.0', 'slope_2.0']


Combine Results

In [None]:
# Keep only the features that all three selectors agree on.
# Random Forest contributes its top 15 ranked features; RFE and Chi-Square each
# contribute the 10 features they selected.
rf_top_features = importances.head(15).index
common_features = set(rf_top_features) & set(selected_features) & set(chi2_features)

# A set of strings iterates in an order that can differ between kernel restarts
# (string hashing is randomized per process), which would make the column order
# of X_final irreproducible. Walk the RF ranking instead so the final list is
# deterministic and ordered from most to least important.
final_features = [name for name in rf_top_features if name in common_features]

print("Common features (RF ∩ RFE ∩ Chi2):", final_features)

# An empty intersection would silently yield a zero-column frame; stop here
# with an explicit error rather than failing later during model training.
if not final_features:
    raise ValueError("No feature was selected by all three methods (RF, RFE, Chi2).")

# Final features to use
X_final = X_scaled_df[final_features]

print("Final features to use:", final_features)


Common features (RF ∩ RFE ∩ Chi2): ['thal_7.0', 'cp_4.0', 'oldpeak', 'sex', 'ca', 'slope_2.0']
Final features to use: ['thal_7.0', 'cp_4.0', 'oldpeak', 'sex', 'ca', 'slope_2.0']
