In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)
from scipy.stats import randint


# Absolute location of the input table on the author's machine
# (the file name suggests a ten-feature subset -- confirm against the
# feature-selection step that produced it).
DATA_PATH = r"D:\DATA_ANALYSIS\ML_\1\Heart_Disease_Project\data\top10_features.csv"
data = pd.read_csv(DATA_PATH)

# Everything except the "target" column is a predictor.
y = data["target"]
X = data.drop(columns="target")

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Logistic Regression -----------------------------------------------------
# Exhaustive search over five regularisation strengths and two solvers,
# always with an L2 penalty, scored by 5-fold cross-validated accuracy.
# NOTE(review): the scaler was fit on the whole training partition before
# this cross-validation, so each validation fold has already influenced the
# scaling statistics -- consider a Pipeline if that matters for the study.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=["liblinear", "lbfgs"],
    penalty=["l2"],
)
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
logreg_best = grid_search.best_estimator_

# Hard labels for the threshold metrics; column 1 of predict_proba (the
# second class) for the AUC.
y_pred = logreg_best.predict(X_test_scaled)
y_proba = logreg_best.predict_proba(X_test_scaled)[:, 1]

logreg_results = {
    "Model": "Logistic Regression",
    **{
        label: scorer(y_test, y_pred)
        for label, scorer in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "AUC": roc_auc_score(y_test, y_proba),
}


# --- Random Forest -----------------------------------------------------------
# Randomised search: 20 parameter draws, each scored by 5-fold
# cross-validated accuracy. The forest is fit on the unscaled features.
rf = RandomForestClassifier(random_state=42)
# scipy's randint(low, high) samples integers from low (inclusive) up to
# high (exclusive).
param_dist = dict(
    n_estimators=randint(50, 300),
    max_depth=randint(2, 10),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 10),
)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)
rf_best = random_search.best_estimator_

# Hard labels for the threshold metrics; column 1 of predict_proba (the
# second class) for the AUC.
y_pred = rf_best.predict(X_test)
y_proba = rf_best.predict_proba(X_test)[:, 1]

rf_results = {
    "Model": "Random Forest",
    **{
        label: scorer(y_test, y_pred)
        for label, scorer in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1", f1_score),
        )
    },
    "AUC": roc_auc_score(y_test, y_proba),
}


# XGBoost
# Randomised search: 15 parameter combinations drawn from the grid below,
# each scored by 5-fold cross-validated accuracy, on the unscaled features.
#
# `use_label_encoder` is intentionally not passed: current XGBoost releases
# no longer recognise it and emit a
# 'Parameters: { "use_label_encoder" } are not used.' warning on every fit
# (i.e. once per CV fold and candidate). Dropping it does not change the model.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)
param_dist_xgb = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [2, 3, 4, 5],
    "subsample": [0.8, 1],
    "colsample_bytree": [0.8, 1]
}
random_search_xgb = RandomizedSearchCV(
    xgb, param_distributions=param_dist_xgb, n_iter=15, cv=5,
    scoring="accuracy", n_jobs=-1, random_state=42
)
random_search_xgb.fit(X_train, y_train)
xgb_best = random_search_xgb.best_estimator_

# Hard labels for the threshold metrics; column 1 of predict_proba (the
# second class) for the AUC.
y_pred = xgb_best.predict(X_test)
y_proba = xgb_best.predict_proba(X_test)[:, 1]

xgb_results = {
    "Model": "XGBoost",
    "Accuracy": accuracy_score(y_test, y_pred),
    "Precision": precision_score(y_test, y_pred),
    "Recall": recall_score(y_test, y_pred),
    "F1": f1_score(y_test, y_pred),
    "AUC": roc_auc_score(y_test, y_proba)
}


# One row per model, built from the three per-model metric dictionaries.
final_results = pd.DataFrame([logreg_results, rf_results, xgb_results])

# Report the metrics as percentages rather than fractions.
metric_columns = ["Accuracy", "Precision", "Recall", "F1", "AUC"]
final_results[metric_columns] = final_results[metric_columns].mul(100)

print("\n=== Final Model Comparison ===")
print(final_results.round(2))



=== Final Model Comparison ===
                 Model  Accuracy  Precision  Recall     F1    AUC
0  Logistic Regression     90.16      89.29   89.29  89.29  96.10
1        Random Forest     88.52      86.21   89.29  87.72  95.02
2              XGBoost     85.25      82.76   85.71  84.21  92.53


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
