In [1]:
import json
import os
import sys

import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Make the project's local modules in ../src importable before importing them.
sys.path.append(os.path.abspath("../src"))

from feature_selection import vote_feature_selection
from models import (
    train_knn, train_decision_tree, train_svm,
    train_gradient_boosting, train_adaboost, train_xgboost,
    evaluate_model, evaluate_thresholds
)


<h3>Load Data</h3>

In [2]:
# Load the processed dataset and separate the label column from the features.
df = pd.read_csv("../data/processed/heart.csv")

target = "target"
X = df.drop(columns=[target])
y = df[target]

print("Dataset shape:", df.shape)
print("Target distribution:\n", y.value_counts(normalize=True))

# Exact duplicate rows can end up on both sides of a random train/test split,
# which makes held-out metrics look better than they really are. Report the
# count so the reader can judge the evaluation below.
# NOTE(review): if this is non-zero, consider de-duplicating before splitting.
n_duplicate_rows = int(df.duplicated().sum())
print("Duplicate rows:", n_duplicate_rows)


Dataset shape: (1025, 14)
Target distribution:
 target
1    0.513171
0    0.486829
Name: proportion, dtype: float64


<h3>Train/Test Split</h3>

In [3]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance comparable in both partitions, and the fixed seed makes the split
# repeatable across runs.
split_kwargs = dict(test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

for label, frame in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, frame.shape)


Train shape: (820, 13)
Test shape: (205, 13)


<h3>Feature Selection</h3>

In [4]:
# Number of top-ranked features requested from the voting-based selector.
TOP_N = 18
n_available = X_train.shape[1]

# NOTE(review): in the recorded run X_train has 13 columns, and the output shows
# all 13 selected with identical vote counts, i.e. TOP_N=18 does not narrow the
# feature set at all. Surface that instead of letting it pass silently.
if TOP_N >= n_available:
    print(
        f"WARNING: top_n={TOP_N} >= number of available features ({n_available}); "
        "feature selection is not expected to drop any column."
    )

selected_features, votes = vote_feature_selection(X_train, y_train, top_n=TOP_N)

print("\nSelected Features:", selected_features)
print("\nVotes:\n", votes)

# Persist the chosen column names so downstream steps can reuse the same subset.
with open("../data/processed/selected_features.json", "w") as f:
    json.dump(selected_features, f)

# Restrict both partitions to the selected columns (same columns, same order).
X_train_sel = X_train[selected_features]
X_test_sel = X_test[selected_features]





Selected Features: ['exang', 'cp', 'ca', 'oldpeak', 'sex', 'slope', 'thalach', 'thal', 'age', 'restecg', 'fbs', 'trestbps', 'chol']

Votes:
 exang       5
cp          5
ca          5
oldpeak     5
sex         5
slope       5
thalach     5
thal        5
age         5
restecg     5
fbs         5
trestbps    5
chol        5
Name: count, dtype: int64


<h3>Models</h3>

In [5]:
# Fit every candidate classifier on the same selected-feature training data.
# The generator is consumed left to right, so the training order is unchanged:
# KNN, SVM, decision tree, gradient boosting, AdaBoost, XGBoost.
# NOTE(review): StandardScaler and Pipeline are imported at the top but are not
# used in the cells visible here -- confirm that the KNN and SVM trainers in
# ../src/models scale their inputs internally.
(
    knn_model,
    svm_model,
    tree_model,
    gb_model,
    ada_model,
    xgb_model,
) = (
    trainer(X_train_sel, y_train)
    for trainer in (
        train_knn,
        train_svm,
        train_decision_tree,
        train_gradient_boosting,
        train_adaboost,
        train_xgboost,
    )
)


<h3>Evaluation</h3>

In [6]:
# Score each fitted model on the held-out split. evaluate_model is called once
# per (model, label) pair, in the order listed, and its return values are
# collected into `results` for the summary table.
labelled_models = [
    (knn_model, "KNN"),
    (svm_model, "SVM"),
    (tree_model, "Decision Tree"),
    (gb_model, "Gradient Boosting"),
    (ada_model, "AdaBoost"),
    (xgb_model, "XGBoost"),
]

results = [
    evaluate_model(fitted, X_test_sel, y_test, label)
    for fitted, label in labelled_models
]


===== KNN =====
Accuracy: 0.6926829268292682
Precision: 0.6944444444444444
Recall: 0.7142857142857143
F1 Score: 0.704225352112676

Classification Report:
               precision    recall  f1-score   support

           0       0.69      0.67      0.68       100
           1       0.69      0.71      0.70       105

    accuracy                           0.69       205
   macro avg       0.69      0.69      0.69       205
weighted avg       0.69      0.69      0.69       205

Confusion Matrix:
 [[67 33]
 [30 75]]


===== SVM =====
Accuracy: 0.7170731707317073
Precision: 0.7079646017699115
Recall: 0.7619047619047619
F1 Score: 0.7339449541284404

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.67      0.70       100
           1       0.71      0.76      0.73       105

    accuracy                           0.72       205
   macro avg       0.72      0.72      0.72       205
weighted avg       0.72      0.72      0.72      

In [7]:
# Show the collected per-model metrics as one table for side-by-side comparison.
pd.DataFrame(data=results)

Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.692683,0.694444,0.714286,0.704225
1,SVM,0.717073,0.707965,0.761905,0.733945
2,Decision Tree,0.834146,0.84466,0.828571,0.836538
3,Gradient Boosting,0.970732,0.971429,0.971429,0.971429
4,AdaBoost,0.970732,0.962617,0.980952,0.971698
5,XGBoost,0.956098,0.970588,0.942857,0.956522


<h3>Threshold Optimization</h3>

In [8]:
# Build one threshold-sweep table per model, keyed by its display name.
# NOTE(review): the sweep runs on the test split; choosing an operating
# threshold from these tables would tune on test data -- consider a separate
# validation split for that decision.
models_by_name = {
    "KNN": knn_model,
    "SVM": svm_model,
    "Decision Tree": tree_model,
    "Gradient Boosting": gb_model,
    "AdaBoost": ada_model,
    "XGBoost": xgb_model,
}

threshold_tables = {
    name: evaluate_thresholds(fitted, X_test_sel, y_test)
    for name, fitted in models_by_name.items()
}

threshold_tables

{'KNN':     threshold  precision    recall        f1
 0        0.05   0.603550  0.971429  0.744526
 1        0.10   0.603550  0.971429  0.744526
 2        0.15   0.615385  0.914286  0.735632
 3        0.20   0.615385  0.914286  0.735632
 4        0.25   0.615385  0.914286  0.735632
 5        0.30   0.645669  0.780952  0.706897
 6        0.35   0.645669  0.780952  0.706897
 7        0.40   0.645669  0.780952  0.706897
 8        0.45   0.694444  0.714286  0.704225
 9        0.50   0.694444  0.714286  0.704225
 10       0.55   0.694444  0.714286  0.704225
 11       0.60   0.759494  0.571429  0.652174
 12       0.65   0.759494  0.571429  0.652174
 13       0.70   0.759494  0.571429  0.652174
 14       0.75   0.852459  0.495238  0.626506
 15       0.80   0.852459  0.495238  0.626506
 16       0.85   0.852459  0.495238  0.626506
 17       0.90   1.000000  0.380952  0.551724
 18       0.95   1.000000  0.380952  0.551724,
 'SVM':     threshold  precision    recall        f1
 0        0.05   0.

In [9]:
# Write the metric table to the processed-data folder (no index column),
# report where it went, and display the table inline.
results_path = "../data/processed/model_results.csv"
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf=results_path, index=False)

print("Saved model results to:", results_path)
results_df


Saved model results to: ../data/processed/model_results.csv


Unnamed: 0,model,accuracy,precision,recall,f1
0,KNN,0.692683,0.694444,0.714286,0.704225
1,SVM,0.717073,0.707965,0.761905,0.733945
2,Decision Tree,0.834146,0.84466,0.828571,0.836538
3,Gradient Boosting,0.970732,0.971429,0.971429,0.971429
4,AdaBoost,0.970732,0.962617,0.980952,0.971698
5,XGBoost,0.956098,0.970588,0.942857,0.956522


In [10]:
# Persist every fitted model with joblib so it can be reloaded later without
# retraining. Output file names are unchanged.
model_dir = "../data/models"
os.makedirs(model_dir, exist_ok=True)

# Output file name -> fitted estimator.
model_files = {
    "knn_model.pkl": knn_model,
    "svm_model.pkl": svm_model,
    "decision_tree.pkl": tree_model,
    "gradient_boosting.pkl": gb_model,
    "adaboost.pkl": ada_model,
    "xgboost.pkl": xgb_model,
}

for filename, fitted in model_files.items():
    joblib.dump(fitted, f"{model_dir}/{filename}")

print("Models saved successfully!")


Models saved successfully!


In [11]:
# Save the held-out features and labels (without the index) alongside the
# other processed artifacts.
for frame, csv_path in (
    (X_test_sel, "../data/processed/X_test_selected.csv"),
    (y_test, "../data/processed/y_test.csv"),
):
    frame.to_csv(csv_path, index=False)
