In [73]:
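# Install the exact versions this notebook was developed with (uncomment to run).
# %pip install scikit-learn==1.4.2
# %pip install imbalanced-learn==0.12.0
# Alternative (unpinned; may break compatibility): upgrade both to the latest release.
# %pip install -U scikit-learn imbalanced-learn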


In [74]:
import sys, os, json
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Make ../src importable so the `models` import below can resolve
# (presumably that is where the module lives — confirm).
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from models import (
    train_knn, train_decision_tree, train_svm,
    train_gradient_boosting, train_adaboost, train_xgboost,
    evaluate_model
)



<h3>Load Preprocessed Dataset</h3>

In [75]:
# Load the preprocessed dataset and name the label column.
df = pd.read_csv("../data/processed/heart_disease_clean.csv")
target = "Heart Disease Status"

# Load the selected feature names (assumed to be a JSON list of column names).
# The encoding is explicit because open() otherwise falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open("../data/processed/selected_features.json", "r", encoding="utf-8") as f:
    selected_features = json.load(f)

# Guard against label leakage: the target must never be one of the inputs.
if target in selected_features:
    raise ValueError(
        f"Target column {target!r} must not appear in selected_features."
    )

# Feature matrix and label vector used by every cell below.
X = df[selected_features]
y = df[target]



In [76]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Resample the training partition only with SMOTE; X_test / y_test are not
# passed to the sampler.
sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

# Report the size and class balance of the resampled training set.
class_counts = pd.Series(y_train).value_counts()
print("Training data shape:", X_train.shape)
print("Class distribution after SMOTE:\n", class_counts)



Training data shape: (12800, 15)
Class distribution after SMOTE:
 Heart Disease Status
0    6400
1    6400
Name: count, dtype: int64


<h3>Train Machine Learning Models</h3>

In [77]:
# K-Nearest Neighbors classifier, fitted on the SMOTE-resampled training data.
# NOTE(review): the original note says k = 5; that value would be set inside
# train_knn (not visible in this notebook) — confirm.
knn_model = train_knn(X_train, y_train)

In [78]:
# Decision Tree classifier, fitted on the SMOTE-resampled training data.
# Hyperparameters are whatever train_decision_tree sets in the `models`
# module (not visible in this notebook).
tree_model = train_decision_tree(X_train, y_train)


In [79]:
# Support Vector Machine (SVM) classifier, fitted on the SMOTE-resampled
# training data.
# NOTE(review): no feature scaling happens in this notebook; presumably it is
# handled upstream or inside train_svm — confirm.
svm_model = train_svm(X_train, y_train)


In [80]:
# Gradient Boosting classifier, fitted on the SMOTE-resampled training data.
# Hyperparameters are whatever train_gradient_boosting sets in the `models`
# module (not visible in this notebook).
gb_model = train_gradient_boosting(X_train, y_train)


In [81]:
# AdaBoost classifier, fitted on the SMOTE-resampled training data.
# Hyperparameters are whatever train_adaboost sets in the `models` module
# (not visible in this notebook).
ada_model = train_adaboost(X_train, y_train)


In [82]:
# XGBoost classifier, fitted on the SMOTE-resampled training data.
# NOTE(review): the saved output of this cell shows XGBoost warning that the
# "use_label_encoder" parameter is not used; presumably train_xgboost still
# passes it — consider dropping it in the `models` module.
xgb_model = train_xgboost(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<h3>Evaluate All Models on the Test Set</h3>

In [83]:
# Score every fitted model on the held-out test split, in a fixed order.
# evaluate_model prints a report per model; whatever it returns is collected
# in `results`.
trained_models = [
    ("KNN", knn_model),
    ("Decision Tree", tree_model),
    ("SVM", svm_model),
    ("Gradient Boosting", gb_model),
    ("AdaBoost", ada_model),
    ("XGBoost", xgb_model),
]

results = []
for display_name, estimator in trained_models:
    results.append(evaluate_model(estimator, X_test, y_test, display_name))


===== KNN =====
Accuracy: 0.532
Precision: 0.20484581497797358
Recall: 0.465
F1 Score: 0.28440366972477066

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.55      0.65      1600
           1       0.20      0.47      0.28       400

    accuracy                           0.53      2000
   macro avg       0.50      0.51      0.47      2000
weighted avg       0.68      0.53      0.58      2000


===== Decision Tree =====
Accuracy: 0.632
Precision: 0.19784172661870503
Recall: 0.275
F1 Score: 0.2301255230125523

Classification Report:

              precision    recall  f1-score   support

           0       0.80      0.72      0.76      1600
           1       0.20      0.28      0.23       400

    accuracy                           0.63      2000
   macro avg       0.50      0.50      0.49      2000
weighted avg       0.68      0.63      0.65      2000


===== SVM =====
Accuracy: 0.5795
Precision: 0.18273381294964028
Recall: