In [12]:
import os, sys, json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

sys.path.append(os.path.abspath("../src"))

from feature_selection import vote_feature_selection
from models import (
    train_knn, train_decision_tree, train_svm,
    train_gradient_boosting, train_adaboost, train_xgboost,
     evaluate_model, evaluate_thresholds,
)


In [13]:
df = pd.read_csv("../data/processed/heart_disease_clean.csv")

target = "Heart Disease Status"
X = df.drop(columns=[target])
y = df[target]

print("Dataset shape:", df.shape)
print("Target distribution:\n", y.value_counts())


Dataset shape: (10000, 25)
Target distribution:
 Heart Disease Status
0    8000
1    2000
Name: count, dtype: int64


In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (8000, 24)
Test shape: (2000, 24)


In [15]:
selected_features, votes = vote_feature_selection(X_train, y_train, top_n=18)

print("Final selected features:", selected_features)
print("\nVote counts:")
print(votes)


Final selected features: ['Sleep Hours', 'Alcohol Consumption_Medium', 'Homocysteine Level', 'Family Heart Disease', 'Stress Level_Medium', 'Age', 'Smoking', 'Exercise Habits_Medium', 'Blood Pressure', 'Low HDL Cholesterol', 'Fasting Blood Sugar', 'Stress Level_Low', 'High LDL Cholesterol', 'Sugar Consumption_Low', 'Alcohol Consumption_Low', 'Sugar Consumption_Medium', 'BMI', 'Gender', 'Exercise Habits_Low', 'Cholesterol Level', 'Triglyceride Level', 'CRP Level']

Vote counts:
Sleep Hours                   5
Alcohol Consumption_Medium    5
Homocysteine Level            5
Family Heart Disease          5
Stress Level_Medium           5
Age                           5
Smoking                       5
Exercise Habits_Medium        4
Blood Pressure                4
Low HDL Cholesterol           4
Fasting Blood Sugar           4
Stress Level_Low              4
High LDL Cholesterol          4
Sugar Consumption_Low         4
Alcohol Consumption_Low       4
Sugar Consumption_Medium      4
BMI   

In [16]:
with open("../data/processed/selected_features.json", "w") as f:
    json.dump(selected_features, f)


In [17]:
X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]


In [18]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_sel, y_train)

print("After SMOTE:", X_train_smote.shape, pd.Series(y_train_smote).value_counts())


After SMOTE: (12800, 22) Heart Disease Status
0    6400
1    6400
Name: count, dtype: int64


In [19]:
knn_model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])

knn_model.fit(X_train_smote, y_train_smote)

In [20]:
svm_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=2.0, probability=True, class_weight='balanced'))
])

svm_model.fit(X_train_smote, y_train_smote)

In [21]:
tree_model = train_decision_tree(X_train_smote, y_train_smote)
gb_model = train_gradient_boosting(X_train_smote, y_train_smote)
ada_model = train_adaboost(X_train_smote, y_train_smote)
xgb_model = train_xgboost(X_train_smote, y_train_smote)
# brf_model = train_balanced_rf(X_train_smote, y_train_smote)




In [23]:
results = []
results.append(evaluate_model(knn_model, X_test_sel, y_test, "KNN"))
results.append(evaluate_model(svm_model, X_test_sel, y_test, "SVM"))
results.append(evaluate_model(tree_model, X_test_sel, y_test, "Decision Tree"))
results.append(evaluate_model(gb_model, X_test_sel, y_test, "Gradient Boosting"))
results.append(evaluate_model(ada_model, X_test_sel, y_test, "AdaBoost"))
results.append(evaluate_model(xgb_model, X_test_sel, y_test, "XGBoost"))
# results.append(evaluate_model(brf_model, X_test_sel, y_test, "Balanced Random Forest"))



===== KNN =====
Accuracy: 0.548
Precision: 0.20422535211267606
Recall: 0.435
F1 Score: 0.2779552715654952

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.58      0.67      1600
           1       0.20      0.43      0.28       400

    accuracy                           0.55      2000
   macro avg       0.50      0.51      0.47      2000
weighted avg       0.68      0.55      0.59      2000

Confusion Matrix:
 [[922 678]
 [226 174]]


===== SVM =====
Accuracy: 0.7295
Precision: 0.19480519480519481
Recall: 0.1125
F1 Score: 0.14263074484944532

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.88      0.84      1600
           1       0.19      0.11      0.14       400

    accuracy                           0.73      2000
   macro avg       0.50      0.50      0.49      2000
weighted avg       0.68      0.73      0.70      2000

Confusion Matrix:
 [[1414  186]
 [ 3

In [24]:
pd.DataFrame(results)


Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.548,0.204225,0.435,0.277955
1,SVM,0.7295,0.194805,0.1125,0.142631
2,Decision Tree,0.8,0.0,0.0,0.0
3,Gradient Boosting,0.8,0.0,0.0,0.0
4,AdaBoost,0.794,0.2,0.01,0.019048
5,XGBoost,0.7995,0.0,0.0,0.0


In [26]:
threshold_tables = {
    "KNN": evaluate_thresholds(knn_model, X_test_sel, y_test, "KNN"),
    "SVM": evaluate_thresholds(svm_model, X_test_sel, y_test, "SVM"),
    "Decision Tree": evaluate_thresholds(tree_model, X_test_sel, y_test, "Decision Tree"),
    "Gradient Boosting": evaluate_thresholds(gb_model, X_test_sel, y_test, "Gradient Boosting"),
    "AdaBoost": evaluate_thresholds(ada_model, X_test_sel, y_test, "AdaBoost"),
    "XGBoost": evaluate_thresholds(xgb_model, X_test_sel, y_test, "XGBoost"),
    # "Balanced RF": evaluate_thresholds(brf_model, X_test_sel, y_test, "Balanced RF"),
}

threshold_tables


{'KNN':     threshold  precision  recall        f1
 0        0.05   0.204060  0.9550  0.336268
 1        0.10   0.204060  0.9550  0.336268
 2        0.15   0.204700  0.8275  0.328210
 3        0.20   0.204700  0.8275  0.328210
 4        0.25   0.204700  0.8275  0.328210
 5        0.30   0.208500  0.6500  0.315726
 6        0.35   0.208500  0.6500  0.315726
 7        0.40   0.208500  0.6500  0.315726
 8        0.45   0.204225  0.4350  0.277955
 9        0.50   0.204225  0.4350  0.277955
 10       0.55   0.204225  0.4350  0.277955
 11       0.60   0.194332  0.2400  0.214765
 12       0.65   0.194332  0.2400  0.214765
 13       0.70   0.194332  0.2400  0.214765
 14       0.75   0.201794  0.1125  0.144462
 15       0.80   0.201794  0.1125  0.144462
 16       0.85   0.201794  0.1125  0.144462
 17       0.90   0.300000  0.0375  0.066667
 18       0.95   0.300000  0.0375  0.066667,
 'SVM':     threshold  precision  recall        f1
 0        0.05   0.206490  0.8750  0.334129
 1        0.10   

In [27]:
df[target].value_counts(normalize=True)


Heart Disease Status
0    0.8
1    0.2
Name: proportion, dtype: float64

In [28]:
df.corr()[target].sort_values()


Stress Level_Low             -0.022296
Gender                       -0.017200
Alcohol Consumption_Medium   -0.017103
Blood Pressure               -0.013876
Age                          -0.009231
Sugar Consumption_Medium     -0.008540
Family Heart Disease         -0.007500
CRP Level                    -0.006009
Low HDL Cholesterol          -0.006000
Exercise Habits_Low          -0.004902
Sleep Hours                  -0.003821
Diabetes                     -0.002700
Fasting Blood Sugar          -0.002248
Sugar Consumption_Low        -0.002108
High Blood Pressure           0.002200
Smoking                       0.002701
Cholesterol Level             0.002703
Triglyceride Level            0.002904
Alcohol Consumption_Low       0.006014
Exercise Habits_Medium        0.006683
High LDL Cholesterol          0.008201
Homocysteine Level            0.008302
BMI                           0.019682
Stress Level_Medium           0.024894
Heart Disease Status          1.000000
Name: Heart Disease Statu

In [29]:
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X, y)


In [30]:
mi

array([0.00000000e+00, 2.51144265e-04, 0.00000000e+00, 4.94248193e-03,
       5.35873184e-03, 1.15739632e-03, 0.00000000e+00, 0.00000000e+00,
       1.21349248e-03, 0.00000000e+00, 0.00000000e+00, 3.85186486e-03,
       5.25621277e-03, 3.54739912e-03, 0.00000000e+00, 2.37440843e-03,
       0.00000000e+00, 1.92610128e-03, 3.30893153e-03, 0.00000000e+00,
       2.13709949e-05, 0.00000000e+00, 1.10325963e-02, 2.47506272e-03])

In [31]:
df1 = pd.read_csv("../data/raw/heart.csv")

target1 = "target"
X1 = df.drop(columns=[target])
y1 = df[target]



In [32]:
df1[target1].value_counts(normalize=True)


target
1    0.513171
0    0.486829
Name: proportion, dtype: float64

In [33]:
df1.corr()[target1].sort_values()


oldpeak    -0.438441
exang      -0.438029
ca         -0.382085
thal       -0.337838
sex        -0.279501
age        -0.229324
trestbps   -0.138772
chol       -0.099966
fbs        -0.041164
restecg     0.134468
slope       0.345512
thalach     0.422895
cp          0.434854
target      1.000000
Name: target, dtype: float64

In [34]:
mi1 = mutual_info_classif(X1, y1)
mi1

array([0.00501567, 0.        , 0.00272728, 0.00085179, 0.        ,
       0.        , 0.00436137, 0.        , 0.        , 0.0027373 ,
       0.        , 0.00373974, 0.        , 0.00868715, 0.        ,
       0.0022464 , 0.0101551 , 0.00620331, 0.        , 0.        ,
       0.00100395, 0.        , 0.00301849, 0.        ])