In [2]:
import os, sys, json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

sys.path.append(os.path.abspath("../src"))

from feature_selection import vote_feature_selection
from models import (
    train_knn, train_decision_tree, train_svm,
    train_gradient_boosting, train_adaboost, train_xgboost,
    train_balanced_rf, evaluate_model, evaluate_thresholds,
)


In [3]:
df = pd.read_csv("../data/processed/heart_disease_clean.csv")

target = "Heart Disease Status"
X = df.drop(columns=[target])
y = df[target]

print("Dataset shape:", df.shape)
print("Target distribution:\n", y.value_counts())


Dataset shape: (10000, 25)
Target distribution:
 Heart Disease Status
0    8000
1    2000
Name: count, dtype: int64


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (8000, 24)
Test shape: (2000, 24)


In [4]:
selected_features, votes = vote_feature_selection(X_train, y_train, top_n=18)

print("Final selected features:", selected_features)
print("\nVote counts:")
print(votes)


Final selected features: ['BMI', 'Alcohol Consumption_Medium', 'Fasting Blood Sugar', 'Homocysteine Level', 'Sleep Hours', 'Family Heart Disease', 'Smoking', 'Age', 'Blood Pressure', 'Low HDL Cholesterol', 'High LDL Cholesterol', 'Alcohol Consumption_Low', 'Stress Level_Low', 'Stress Level_Medium', 'Sugar Consumption_Medium', 'Gender', 'Exercise Habits_Medium', 'Sugar Consumption_Low', 'Cholesterol Level', 'CRP Level', 'Exercise Habits_Low']

Vote counts:
BMI                           5
Alcohol Consumption_Medium    5
Fasting Blood Sugar           5
Homocysteine Level            5
Sleep Hours                   5
Family Heart Disease          5
Smoking                       5
Age                           5
Blood Pressure                5
Low HDL Cholesterol           5
High LDL Cholesterol          4
Alcohol Consumption_Low       4
Stress Level_Low              4
Stress Level_Medium           4
Sugar Consumption_Medium      4
Gender                        4
Exercise Habits_Medium      

In [5]:
with open("../data/processed/selected_features.json", "w") as f:
    json.dump(selected_features, f)


In [6]:
X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]


In [7]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train_sel, y_train)

print("After SMOTE:", X_train_smote.shape, pd.Series(y_train_smote).value_counts())


After SMOTE: (12800, 21) Heart Disease Status
0    6400
1    6400
Name: count, dtype: int64


In [8]:
knn_model = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=7))
])

knn_model.fit(X_train_smote, y_train_smote)

0,1,2
,steps,"[('scaler', ...), ('knn', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [9]:
svm_model = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=2.0, probability=True, class_weight='balanced'))
])

svm_model.fit(X_train_smote, y_train_smote)

0,1,2
,steps,"[('scaler', ...), ('svm', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,C,2.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [10]:
tree_model = train_decision_tree(X_train_smote, y_train_smote)
gb_model = train_gradient_boosting(X_train_smote, y_train_smote)
ada_model = train_adaboost(X_train_smote, y_train_smote)
xgb_model = train_xgboost(X_train_smote, y_train_smote)
brf_model = train_balanced_rf(X_train_smote, y_train_smote)


In [11]:
results = []
results.append(evaluate_model(knn_model, X_test_sel, y_test, "KNN"))
results.append(evaluate_model(svm_model, X_test_sel, y_test, "SVM"))
results.append(evaluate_model(tree_model, X_test_sel, y_test, "Decision Tree"))
results.append(evaluate_model(gb_model, X_test_sel, y_test, "Gradient Boosting"))
results.append(evaluate_model(ada_model, X_test_sel, y_test, "AdaBoost"))
results.append(evaluate_model(xgb_model, X_test_sel, y_test, "XGBoost"))
results.append(evaluate_model(brf_model, X_test_sel, y_test, "Balanced Random Forest"))


===== KNN =====
Accuracy: 0.548
Precision: 0.20352941176470588
Recall: 0.4325
F1 Score: 0.2768

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.58      0.67      1600
           1       0.20      0.43      0.28       400

    accuracy                           0.55      2000
   macro avg       0.50      0.50      0.47      2000
weighted avg       0.68      0.55      0.59      2000

Confusion Matrix:
 [[923 677]
 [227 173]]

===== SVM =====
Accuracy: 0.731
Precision: 0.19736842105263158
Recall: 0.1125
F1 Score: 0.14331210191082802

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.89      0.84      1600
           1       0.20      0.11      0.14       400

    accuracy                           0.73      2000
   macro avg       0.50      0.50      0.49      2000
weighted avg       0.68      0.73      0.70      2000

Confusion Matrix:
 [[1417  183]
 [ 355   45]]

===

In [12]:
pd.DataFrame(results)


Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.548,0.203529,0.4325,0.2768
1,SVM,0.731,0.197368,0.1125,0.143312
2,Decision Tree,0.6635,0.217391,0.2625,0.237826
3,Gradient Boosting,0.7905,0.12,0.0075,0.014118
4,AdaBoost,0.7925,0.173913,0.01,0.018913
5,XGBoost,0.7875,0.162162,0.015,0.02746
6,Balanced Random Forest,0.7955,0.2,0.0075,0.014458


In [13]:
threshold_tables = {
    "KNN": evaluate_thresholds(knn_model, X_test_sel, y_test, "KNN"),
    "SVM": evaluate_thresholds(svm_model, X_test_sel, y_test, "SVM"),
    "Decision Tree": evaluate_thresholds(tree_model, X_test_sel, y_test, "Decision Tree"),
    "Gradient Boosting": evaluate_thresholds(gb_model, X_test_sel, y_test, "Gradient Boosting"),
    "AdaBoost": evaluate_thresholds(ada_model, X_test_sel, y_test, "AdaBoost"),
    "XGBoost": evaluate_thresholds(xgb_model, X_test_sel, y_test, "XGBoost"),
    "Balanced RF": evaluate_thresholds(brf_model, X_test_sel, y_test, "Balanced RF"),
}

threshold_tables


{'KNN':     threshold  precision  recall        f1
 0        0.05   0.199457  0.9175  0.327679
 1        0.10   0.199457  0.9175  0.327679
 2        0.15   0.207356  0.8175  0.330804
 3        0.20   0.207356  0.8175  0.330804
 4        0.25   0.207356  0.8175  0.330804
 5        0.30   0.210654  0.6525  0.318487
 6        0.35   0.210654  0.6525  0.318487
 7        0.40   0.210654  0.6525  0.318487
 8        0.45   0.203529  0.4325  0.276800
 9        0.50   0.203529  0.4325  0.276800
 10       0.55   0.203529  0.4325  0.276800
 11       0.60   0.187008  0.2375  0.209251
 12       0.65   0.187008  0.2375  0.209251
 13       0.70   0.187008  0.2375  0.209251
 14       0.75   0.217391  0.1250  0.158730
 15       0.80   0.217391  0.1250  0.158730
 16       0.85   0.217391  0.1250  0.158730
 17       0.90   0.222222  0.0350  0.060475
 18       0.95   0.222222  0.0350  0.060475,
 'SVM':     threshold  precision  recall        f1
 0        0.05   0.200118  0.8475  0.323782
 1        0.10   

In [4]:
df[target].value_counts(normalize=True)


Heart Disease Status
0    0.8
1    0.2
Name: proportion, dtype: float64

In [5]:
df.corr()[target].sort_values()


Stress Level_Low             -0.022296
Gender                       -0.017200
Alcohol Consumption_Medium   -0.017103
Blood Pressure               -0.013876
Age                          -0.009231
Sugar Consumption_Medium     -0.008540
Family Heart Disease         -0.007500
CRP Level                    -0.006009
Low HDL Cholesterol          -0.006000
Exercise Habits_Low          -0.004902
Sleep Hours                  -0.003821
Diabetes                     -0.002700
Fasting Blood Sugar          -0.002248
Sugar Consumption_Low        -0.002108
High Blood Pressure           0.002200
Smoking                       0.002701
Cholesterol Level             0.002703
Triglyceride Level            0.002904
Alcohol Consumption_Low       0.006014
Exercise Habits_Medium        0.006683
High LDL Cholesterol          0.008201
Homocysteine Level            0.008302
BMI                           0.019682
Stress Level_Medium           0.024894
Heart Disease Status          1.000000
Name: Heart Disease Statu

In [7]:
from sklearn.feature_selection import mutual_info_classif
mi = mutual_info_classif(X, y)


In [8]:
mi

array([0.00000000e+00, 2.45752682e-03, 4.65804660e-03, 0.00000000e+00,
       1.61561202e-03, 0.00000000e+00, 4.26060197e-03, 0.00000000e+00,
       0.00000000e+00, 7.93061351e-03, 3.61629288e-03, 3.73687044e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.29940478e-03,
       0.00000000e+00, 4.23047817e-03, 2.49757248e-03, 3.27523624e-03,
       3.12515535e-03, 2.60938983e-05, 6.38003363e-03, 2.63134662e-03])

In [9]:
df1 = pd.read_csv("../data/raw/heart.csv")

target1 = "target"
X1 = df.drop(columns=[target])
y1 = df[target]



In [11]:
df1[target1].value_counts(normalize=True)


target
1    0.513171
0    0.486829
Name: proportion, dtype: float64

In [12]:
df1.corr()[target1].sort_values()


oldpeak    -0.438441
exang      -0.438029
ca         -0.382085
thal       -0.337838
sex        -0.279501
age        -0.229324
trestbps   -0.138772
chol       -0.099966
fbs        -0.041164
restecg     0.134468
slope       0.345512
thalach     0.422895
cp          0.434854
target      1.000000
Name: target, dtype: float64

In [13]:
mi1 = mutual_info_classif(X1, y1)
mi1

array([0.        , 0.00395415, 0.        , 0.        , 0.00218878,
       0.00739463, 0.00350132, 0.        , 0.00508231, 0.00338976,
       0.00090575, 0.00374225, 0.00706926, 0.        , 0.        ,
       0.00231565, 0.00108939, 0.        , 0.00195953, 0.00161415,
       0.00489046, 0.        , 0.00525857, 0.        ])