# Machine Learning Assignment 2
## Heart Disease Prediction 

**Objective:** To train and evaluate machine learning models that can classify whether a patient has heart disease (target variable) based on input health measurements. 

**Metric:** 
1. Accuracy
2. AUC Score
3. Precision
4. Recall
5. F1 Score
6. Matthews Correlation Coeffi cient (MCC Score)

In [1]:
# Import core data manipulation and numerical computation libraries
import pandas as pd  # For loading and manipulating tabular data
import numpy as np   # For numerical operations and array handling
import matplotlib.pyplot as plt  # For creating visualizations and plots
import seaborn as sns  # For enhanced statistical visualizations
import warnings
warnings.filterwarnings('ignore')

# Import machine learning libraries from scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV  # train_test_split: splits data into training and testing sets, GridSearchCV: performs hyperparameter tuning
from sklearn.linear_model import LinearRegression, Ridge, Lasso  # Linear regression models: LinearRegression (basic), Ridge (L2 regularization), Lasso (L1 regularization)
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler  # PolynomialFeatures: creates polynomial and interaction features, StandardScaler: normalizes features to mean=0, std=1
from sklearn.pipeline import Pipeline  # Creates a sequence of data transformations and model fitting steps

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Configure visualization settings for consistent plot appearance
sns.set_style('whitegrid')  # Sets seaborn plot style with white background and grid lines
plt.rcParams['figure.figsize'] = (12, 6)  # Sets default figure size to 12 inches wide by 6 inches tall

print("Libraries imported successfully.")

Libraries imported successfully.


###  Exploratory Data Analysis (EDA)

In [2]:
# Importing the dataset:  Import Train data
train_path = "heart.csv"
train = pd.read_csv(train_path)


# Rows and Column count ( no of samples and features with output in train data , without output and some other feature ommision  in test data)
print("Train shape:", train.shape)

display(train.head(3))
display(train.dtypes)

# finding out any missing value to do the data cleansing (inputing the missing value)
print("\nMissing values (train):")
print(train.isna().sum().sort_values(ascending=False))


Train shape: (1025, 14)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0


age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object


Missing values (train):
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [3]:
# === Display statistical summary of all features ===
print("\n--- Statistical Summary of Numeric Features ---")
print(train.describe())  # Shows count, mean, std, min, 25%, 50%, 75%, max for each numeric column

# Display comprehensive information about the training dataset structure
print("--- Dataset Info ---")
print(train.info())  # Shows column names, data types, non-null counts, and memory usage

# Check for missing values in each column (important for data quality)
print("\n--- Missing Values ---")
print(train.isnull().sum())  # Returns count of null/NaN values per column


--- Statistical Summary of Numeric Features ---
               age          sex           cp     trestbps        chol  \
count  1025.000000  1025.000000  1025.000000  1025.000000  1025.00000   
mean     54.434146     0.695610     0.942439   131.611707   246.00000   
std       9.072290     0.460373     1.029641    17.516718    51.59251   
min      29.000000     0.000000     0.000000    94.000000   126.00000   
25%      48.000000     0.000000     0.000000   120.000000   211.00000   
50%      56.000000     1.000000     1.000000   130.000000   240.00000   
75%      61.000000     1.000000     2.000000   140.000000   275.00000   
max      77.000000     1.000000     3.000000   200.000000   564.00000   

               fbs      restecg      thalach        exang      oldpeak  \
count  1025.000000  1025.000000  1025.000000  1025.000000  1025.000000   
mean      0.149268     0.529756   149.114146     0.336585     1.071512   
std       0.356527     0.527878    23.005724     0.472772     1.175053 

In [4]:
## Do feature engineering

In [5]:
from sklearn.preprocessing import StandardScaler

def feature_engineering(df, scaler=None, fit=True):
    df = df.copy()

    # Fill missing values
    for col in df.columns:
        if df[col].dtype == "object":
            df[col] = df[col].fillna(df[col].mode()[0])
        else:
            df[col] = df[col].fillna(df[col].median())

    # Create new features
    df["chol_age_ratio"] = df["chol"] / df["age"]
    df["high_risk"] = ((df["trestbps"] > 140) & (df["chol"] > 240)).astype(int)

    if "target" in df.columns:
        X = df.drop("target", axis=1)
        y = df["target"]
    else:
        X = df
        y = None

    num_cols = X.select_dtypes(include=["int64", "float64"]).columns

    if fit:
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])
    else:
        X[num_cols] = scaler.transform(X[num_cols])

    return X, y, scaler


X, y, scaler = feature_engineering(train, fit=True)


In [6]:
## Model training

In [7]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 20% for testing, 80% for training
    random_state=42,    # same result every time
    stratify=y          # keeps class balance
)



# Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

# Decision Tree Classifier
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred_dt = model_dt.predict(X_test)

# K-Nearest Neighbor (KNN)
model_knn = KNeighborsClassifier(n_neighbors=5)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

# Naive Bayes (Gaussian)
model_nb = GaussianNB()
model_nb.fit(X_train, y_train)
y_pred_nb = model_nb.predict(X_test)

# Random Forest
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

#XGBoost
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

In [8]:
## Evaluation Metrics

In [9]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score,
    precision_score, recall_score,
    f1_score, matthews_corrcoef
)


models = {
    "Logistic Regression": model_lr,
    "Decision Tree": model_dt,
    "KNN": model_knn,
    "Naive Bayes": model_nb,
    "Random Forest": model_rf,
    "XGBoost": model_xgb
}

for name, model in models.items():
    print(f"\n===== {name} =====")
    
    y_pred = model.predict(X_test)
    
    # AUC needs probabilities
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_prob) * 100
    else:
        auc = None
    
    acc = accuracy_score(y_test, y_pred) * 100
    prec = precision_score(y_test, y_pred) * 100
    rec = recall_score(y_test, y_pred) * 100
    f1 = f1_score(y_test, y_pred) * 100
    mcc = matthews_corrcoef(y_test, y_pred) * 100

    print(f"Accuracy : {acc:.2f}%")
    print(f"AUC Score: {auc:.2f}%" if auc is not None else "AUC Score: N/A")
    print(f"Precision: {prec:.2f}%")
    print(f"Recall   : {rec:.2f}%")
    print(f"F1 Score : {f1:.2f}%")
    print(f"MCC Score: {mcc:.2f}%")


===== Logistic Regression =====
Accuracy : 80.98%
AUC Score: 92.87%
Precision: 76.19%
Recall   : 91.43%
F1 Score : 83.12%
MCC Score: 63.09%

===== Decision Tree =====
Accuracy : 98.54%
AUC Score: 98.57%
Precision: 100.00%
Recall   : 97.14%
F1 Score : 98.55%
MCC Score: 97.12%

===== KNN =====
Accuracy : 84.88%
AUC Score: 95.65%
Precision: 87.00%
Recall   : 82.86%
F1 Score : 84.88%
MCC Score: 69.86%

===== Naive Bayes =====
Accuracy : 82.44%
AUC Score: 90.15%
Precision: 79.49%
Recall   : 88.57%
F1 Score : 83.78%
MCC Score: 65.21%

===== Random Forest =====
Accuracy : 100.00%
AUC Score: 100.00%
Precision: 100.00%
Recall   : 100.00%
F1 Score : 100.00%
MCC Score: 100.00%

===== XGBoost =====
Accuracy : 100.00%
AUC Score: 100.00%
Precision: 100.00%
Recall   : 100.00%
F1 Score : 100.00%
MCC Score: 100.00%


#Saving the model as pkl file

In [10]:
import joblib

joblib.dump(model_lr,  "logistic.pkl")
joblib.dump(model_dt,  "decision_tree.pkl")
joblib.dump(model_knn, "knn.pkl")
joblib.dump(model_nb,  "naive_bayes.pkl")
joblib.dump(model_rf,  "random_forest.pkl")
joblib.dump(model_xgb, "xgboost.pkl")
joblib.dump(scaler, "scaler.pkl")

print("All models saved as pickle files.")

All models saved as pickle files.
