# Heart Disease Prediction - Model Comparison
This notebook compares several ML models to predict the likelihood of having diabetes.

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv("heart.csv")
df.head(304)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## Prepare the data

In [5]:
X = df.drop(columns=["output"])
y = df["output"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Train and evaluate models

In [6]:
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(probability=True),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, preds)
    auc = roc_auc_score(y_test, proba)

    results.append((name, acc, auc))
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}, AUC: {auc:.4f}")
    print(classification_report(y_test, preds))
    print("-" * 40)

best_model_name, best_acc, best_auc = max(results, key=lambda x: x[2])
print(f"Best Model: {best_model_name} (AUC = {best_auc:.4f})")

best_model = models[best_model_name]


Model: Random Forest
Accuracy: 0.8361, AUC: 0.9203
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.84      0.84      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

----------------------------------------
Model: Logistic Regression
Accuracy: 0.8689, AUC: 0.9267
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        29
           1       0.88      0.88      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

----------------------------------------
Model: SVM
Accuracy: 0.7049, AUC: 0.8394
              precision    recall  f1-score   support

           0       0.79      0.52      0.62        29
      

## Make a Prediction with Best Model

In [9]:
input_data = {
    "age": 45,
    "sex": 0,
    "cp": 2,
    "trtbps": 125,
    "chol": 250,
    "fbs": 1,
    "restecg": 1,
    "thalachh": 160,
    "exng": 0,
    "oldpeak": 1.5,
    "slp": 2,
    "caa": 0,
    "thall": 1,
}

input_df = pd.DataFrame([input_data])
#input_features = input_df.drop(columns=["FamilyHistory"])

proba = best_model.predict_proba(input_df)[0][1]
print(f"Chance of having heart disease: {proba * 100:.2f}%")

prediction = best_model.predict(input_df)[0]
print("Prediction:", "Has heart disease" if prediction == 1 else "No heart disease")


Chance of having heart disease: 99.05%
Prediction: Has heart disease


In [10]:
import joblib
joblib.dump(best_model, "best_heart_disease_model.pkl")

['best_heart_disease_model.pkl']