In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import randint


In [2]:
# Location of the feature table. The default is the path the notebook was
# originally written against; set the HEART_DATA_CSV environment variable to
# point at the file on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "HEART_DATA_CSV",
    r"D:\DATA_ANALYSIS\ML_\1\Heart_Disease_Project\data\top10_features.csv",
)
data = pd.read_csv(DATA_PATH)

# Fail early, with a readable message, if the label column is absent.
if "target" not in data.columns:
    raise KeyError(f"'target' column not found in {DATA_PATH}")

# Feature matrix = every column except the label; label vector = "target".
X = data.drop(columns="target")
y = data["target"]

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling statistics are learned from the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:

# Baseline: logistic regression with library defaults (apart from the
# iteration cap and seed), trained on the scaled training features.
log_reg = LogisticRegression(random_state=42, max_iter=1000).fit(
    X_train_scaled, y_train
)

# Score the untuned model on the scaled hold-out split.
baseline_pred = log_reg.predict(X_test_scaled)
baseline_acc = accuracy_score(y_true=y_test, y_pred=baseline_pred)

print("Baseline Logistic Regression Accuracy:", baseline_acc)

Baseline Logistic Regression Accuracy: 0.8852459016393442


In [4]:
# Exhaustive search space for logistic regression: five regularisation
# strengths crossed with two solvers, L2 penalty only.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=["liblinear", "lbfgs"],
    penalty=["l2"],
)

# 5-fold cross-validated grid search on the scaled training split, selecting
# by accuracy and using all available cores.
# NOTE(review): X_train_scaled was standardised with statistics from the
# whole training split, so each fold's validation part influenced the scaler.
# Putting the scaler and model in a Pipeline inside the search would avoid this.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
).fit(X_train_scaled, y_train)

print("\nBest Params (LogReg GridSearch):", grid_search.best_params_)
print("Best CV Score (LogReg):", grid_search.best_score_)

# Score the refitted best configuration on the scaled hold-out split.
logreg_best = grid_search.best_estimator_
logreg_pred = logreg_best.predict(X_test_scaled)
logreg_acc = accuracy_score(y_true=y_test, y_pred=logreg_pred)
print("Tuned Logistic Regression Accuracy:", logreg_acc)



Best Params (LogReg GridSearch): {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV Score (LogReg): 0.8305272108843538
Tuned Logistic Regression Accuracy: 0.9016393442622951


In [5]:
# Random forest with a fixed seed; tuned below on the unscaled features.
rf = RandomForestClassifier(random_state=42)

# Integer sampling distributions for the randomized search
# (scipy's randint excludes the upper bound).
param_dist = dict(
    n_estimators=randint(50, 300),
    max_depth=randint(2, 10),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
)

# 20 sampled configurations, each scored by 5-fold CV accuracy; the search
# itself is seeded so the sampled configurations are repeatable.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

print("\nBest Params (RandomForest RandomSearch):", random_search.best_params_)
print("Best CV Score (RandomForest):", random_search.best_score_)

# Score the refitted best forest on the unscaled hold-out split.
rf_best = random_search.best_estimator_
rf_pred = rf_best.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
print("Tuned Random Forest Accuracy:", rf_acc)



Best Params (RandomForest RandomSearch): {'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 108}
Best CV Score (RandomForest): 0.8262755102040817
Tuned Random Forest Accuracy: 0.8852459016393442


In [6]:

# Side-by-side hold-out accuracy of the three models, to three decimals.
print(
    "\n=== Model Comparison ===",
    f"Baseline Logistic Regression: {baseline_acc:.3f}",
    f"Tuned Logistic Regression:    {logreg_acc:.3f}",
    f"Tuned Random Forest:          {rf_acc:.3f}",
    sep="\n",
)



=== Model Comparison ===
Baseline Logistic Regression: 0.885
Tuned Logistic Regression:    0.902
Tuned Random Forest:          0.885
