In [1]:
# compare_features.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# ======================
# 1. Load the datasets
# ======================
# Two CSV files, one per candidate feature set. Both are expected to carry
# a "target" column, which is used as the label below.
selected = pd.read_csv(
    r"D:\DATA_ANALYSIS\ML_\1\Heart_Disease_Project\data\selected_features.csv"
)
top10 = pd.read_csv(
    r"D:\DATA_ANALYSIS\ML_\1\Heart_Disease_Project\data\top10_features.csv"
)

# Separate the label vector (y) from the feature matrix (X) for each set.
# drop() returns a new frame, so `selected` / `top10` keep their "target" column.
y_sel = selected["target"]
X_sel = selected.drop(columns="target")

y_top = top10["target"]
X_top = top10.drop(columns="target")

# ======================
# 2. Train/Test Split
# ======================
# Both feature sets use the same hold-out settings: 20% of the rows go to
# the test split and the shuffle is seeded with 42. Each split is
# stratified on its own target column.
split_options = {"test_size": 0.2, "random_state": 42}

(
    X_train_sel,
    X_test_sel,
    y_train_sel,
    y_test_sel,
) = train_test_split(X_sel, y_sel, stratify=y_sel, **split_options)

(
    X_train_top,
    X_test_top,
    y_train_top,
    y_test_top,
) = train_test_split(X_top, y_top, stratify=y_top, **split_options)

# ======================
# 3. Scale Data
# ======================
# Standardize each feature set with its OWN scaler. Each scaler is fitted
# on the training split only and then applied unchanged to the matching
# test split, so no test-set statistics leak into the scaling.
#
# A single shared StandardScaler would be refitted by the second
# fit_transform() call, discarding the statistics learned for the selected
# features and making the result depend on the exact statement order.

# --- Selected features
scaler_sel = StandardScaler()
X_train_sel = scaler_sel.fit_transform(X_train_sel)
X_test_sel = scaler_sel.transform(X_test_sel)

# --- Top10 features
scaler_top = StandardScaler()
X_train_top = scaler_top.fit_transform(X_train_top)
X_test_top = scaler_top.transform(X_test_top)

# Backward-compatible alias: the name `scaler` previously ended up fitted on
# the top10 training data, so keep it pointing at that scaler.
scaler = scaler_top

# ======================
# 4. Train the Model
# ======================
# One RandomForestClassifier per feature set, both seeded with
# random_state=42. Separate instances keep the selected-features model
# available after the top10 model has been trained (refitting one shared
# estimator would overwrite it) and remove the requirement that every
# prediction for one feature set be taken before the next fit() call.

# --- Selected features
rf_sel = RandomForestClassifier(random_state=42)
rf_sel.fit(X_train_sel, y_train_sel)
y_pred_sel = rf_sel.predict(X_test_sel)
# Column 1 of predict_proba: presumably the probability of the positive
# class of a binary 0/1 target -- confirm against rf_sel.classes_.
y_proba_sel = rf_sel.predict_proba(X_test_sel)[:, 1]

# --- Top10 features
rf_top = RandomForestClassifier(random_state=42)
rf_top.fit(X_train_top, y_train_top)
y_pred_top = rf_top.predict(X_test_top)
y_proba_top = rf_top.predict_proba(X_test_top)[:, 1]

# Backward-compatible alias: the name `rf` previously ended up fitted on the
# top10 training data, so keep it pointing at that model.
rf = rf_top

# ======================
# 5. Evaluate Results
# ======================
# One (label, true labels, hard predictions, class-1 scores) tuple per
# feature set; each becomes one row of the comparison table.
evaluations = [
    ("selected_features", y_test_sel, y_pred_sel, y_proba_sel),
    ("top10_features", y_test_top, y_pred_top, y_proba_top),
]

# Accuracy and F1 are computed from the hard predictions, AUC from the
# predicted scores. Key order in each dict fixes the column order.
results = pd.DataFrame(
    [
        {
            "Feature Set": label,
            "Accuracy": accuracy_score(y_true, y_pred),
            "F1-score": f1_score(y_true, y_pred),
            "AUC": roc_auc_score(y_true, y_proba),
        }
        for label, y_true, y_pred, y_proba in evaluations
    ]
)

print("\n=== Feature Set Comparison ===")
print(results)



=== Feature Set Comparison ===
         Feature Set  Accuracy  F1-score       AUC
0  selected_features  0.770492  0.774194  0.878788
1     top10_features  0.868852  0.866667  0.939394
