In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
import seaborn as sns
import matplotlib.pyplot as plt

In [36]:
# Load dataset
df = pd.read_csv("heart_disease_2000_rows.csv")

# Separate features and target
X = df.drop("Heart_Disease_Presence", axis=1)
y = df["Heart_Disease_Presence"]

# Remove duplicates for test set
df_expanded_unique = df.drop_duplicates()
print(df_expanded_unique.shape)

# Split dataset (test set stays original, unique)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
# SCALING
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Encode Categorical Features & Scale Numeric Features
categorical_features = ["Sex", "Chest_Pain_Type", "Fasting_Blood_Sugar", 
                        "Resting_ECG", "Exercise_Angina", "ST_Slope", 
                        "Major_Vessels", "Thalassemia"]

numeric_features = ["Age", "Resting_Blood_Pressure", "Serum_Cholesterol", 
                    "Max_Heart_Rate", "ST_Depression"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(drop="first"), categorical_features)
])

(302, 14)


In [37]:
# Separate classes
df_1 = df[df["Heart_Disease_Presence"] == 1]
df_0 = df[df["Heart_Disease_Presence"] == 0]

# Desired counts
n_1 = 1500   # 75%
n_0 = 500    # 25%

# Resample with replacement
df_1_resampled = df_1.sample(n=n_1, replace=True, random_state=42)
df_0_resampled = df_0.sample(n=n_0, replace=True, random_state=42)

# Combine and shuffle
df_expanded = pd.concat([df_1_resampled, df_0_resampled]) \
                 .sample(frac=1, random_state=42) \
                 .reset_index(drop=True)

print(df_expanded.shape)
print(df_expanded["Heart_Disease_Presence"].value_counts(normalize=True))

(2000, 14)
Heart_Disease_Presence
1    0.75
0    0.25
Name: proportion, dtype: float64


In [38]:
y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7475
              precision    recall  f1-score   support

           0       0.88      0.50      0.64       178
           1       0.70      0.95      0.81       222

    accuracy                           0.75       400
   macro avg       0.79      0.72      0.72       400
weighted avg       0.78      0.75      0.73       400





In [43]:
#Model 1: Logistic Regression 
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        C=0.01,       
        max_iter=200
    ))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred)
print("Accuracy:", acc_lr)


Accuracy: 0.8225


In [45]:
#Model 2: Logistic Regression + GridSearchCV
param_grid = {
    "lr__C": [0.01, 0.1, 1, 10],
    "lr__penalty": ["l2"]
}

grid_lr = GridSearchCV(
    lr_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy"
)
grid_lr.fit(X_train, y_train)
y_pred_grid = grid_lr.predict(X_test)
acc_grid = accuracy_score(y_test, y_pred_grid)
print("Accuracy:",acc_grid)

Accuracy: 0.845


In [46]:
#Model 3 â€“ Gradient Descent (Logistic Regression with SGD)
sgd_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd", SGDClassifier(
        loss="log_loss",
        penalty="l2",
        alpha=0.0001,
        max_iter=2000,
        random_state=42
    ))
])

sgd_pipeline.fit(X_train, y_train)

y_pred_sgd = sgd_pipeline.predict(X_test)
acc_gd = accuracy_score(y_test, y_pred_sgd)
print("Accuracy:",acc_gd)

Accuracy: 0.8475


In [48]:
#Model 4: Random Forest
rf_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=300, max_depth=6, random_state=42))
])

rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
acc_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", acc_rf)

Accuracy: 0.97


In [49]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Logistic + GridSearch","Gradient Descent", "Random Forest"],
    "Accuracy": [acc_lr, acc_grid, acc_gd, acc_rf]
})

print(results)

                   Model  Accuracy
0    Logistic Regression    0.8225
1  Logistic + GridSearch    0.8450
2       Gradient Descent    0.8475
3          Random Forest    0.9700


In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    log_loss,
    roc_auc_score
)

param_grid = {
    "C": [0.01, 0.1, 1, 10],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"]
}

lr = LogisticRegression(max_iter=1000)

grid = GridSearchCV(
    lr,
    param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

grid.fit(X_train, y_train)

best_model = grid.best_estimator_

print("Best hyperparameters:", grid.best_params_)


Best hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}


In [52]:
y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.85      0.79      0.82       178
           1       0.84      0.89      0.86       222

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400



In [53]:
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[141  37]
 [ 25 197]]


In [54]:
y_prob = best_model.predict_proba(X_test)
loss = log_loss(y_test, y_prob)

print("Log Loss:", loss)


Log Loss: 0.36683655275078436
