In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
import pickle



In [3]:
df = pd.read_csv("C:/Users/SESA804787/OneDrive - Schneider Electric/SE/WILP/Machine Learning/Assignment 2/heart.csv")
df.head()


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Separate features and target
X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]

# Identify categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(exclude=["object"]).columns

# One-hot encode categoricals
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Feature scaling (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Convert back to DataFrame
X_scaled = pd.DataFrame(X_scaled, columns=X_encoded.columns)

X_scaled.head()


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,-0.551341,1.382928,-0.832432,0.515952,2.075177,-0.532838,-0.229679,0.814275,-0.490449,-0.823556,-1.002181,1.150674
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664,-1.938163,-0.481887,1.876744,-0.229679,0.814275,-0.490449,-0.823556,0.997824,-0.869056
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432,0.515952,2.075177,-0.532838,-0.229679,-1.228087,2.038947,-0.823556,-1.002181,1.150674
3,-0.584556,0.302825,0.13904,-0.551341,-1.132156,0.574711,-1.938163,-0.481887,-0.532838,-0.229679,0.814275,-0.490449,1.214246,0.997824,-0.869056
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432,0.515952,-0.481887,1.876744,-0.229679,0.814275,-0.490449,-0.823556,-1.002181,1.150674


In [5]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


((734, 15), (184, 15))

In [6]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss")
}

results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    results[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "ROC_AUC": roc_auc_score(y_test, y_proba),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
    }

results


{'Logistic Regression': {'Accuracy': 0.8858695652173914,
  'ROC_AUC': 0.9296987087517934,
  'Precision': 0.8715596330275229,
  'Recall': 0.9313725490196079,
  'F1 Score': 0.9004739336492891,
  'MCC': 0.7693833294492657},
 'Decision Tree': {'Accuracy': 0.8043478260869565,
  'ROC_AUC': 0.7984218077474893,
  'Precision': 0.8055555555555556,
  'Recall': 0.8529411764705882,
  'F1 Score': 0.8285714285714286,
  'MCC': 0.6024886789428129},
 'KNN': {'Accuracy': 0.8858695652173914,
  'ROC_AUC': 0.936154949784792,
  'Precision': 0.8857142857142857,
  'Recall': 0.9117647058823529,
  'F1 Score': 0.8985507246376812,
  'MCC': 0.7686001458761952},
 'Naive Bayes': {'Accuracy': 0.9130434782608695,
  'ROC_AUC': 0.9451219512195121,
  'Precision': 0.93,
  'Recall': 0.9117647058823529,
  'F1 Score': 0.9207920792079208,
  'MCC': 0.8246260961195151},
 'Random Forest': {'Accuracy': 0.8858695652173914,
  'ROC_AUC': 0.9280846484935438,
  'Precision': 0.8857142857142857,
  'Recall': 0.9117647058823529,
  'F1 Scor

In [7]:
results_df = pd.DataFrame(results).T
results_df


Unnamed: 0,Accuracy,ROC_AUC,Precision,Recall,F1 Score,MCC
Logistic Regression,0.88587,0.929699,0.87156,0.931373,0.900474,0.769383
Decision Tree,0.804348,0.798422,0.805556,0.852941,0.828571,0.602489
KNN,0.88587,0.936155,0.885714,0.911765,0.898551,0.7686
Naive Bayes,0.913043,0.945122,0.93,0.911765,0.920792,0.824626
Random Forest,0.88587,0.928085,0.885714,0.911765,0.898551,0.7686
XGBoost,0.858696,0.921927,0.872549,0.872549,0.872549,0.714012


In [8]:
for name, model in models.items():
    filename = name.replace(" ", "_") + ".pkl"
    with open(filename, "wb") as f:
        pickle.dump(model, f)

# Save scaler too
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
