In [2]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Optional local Kaggle CSV. A missing file is not fatal: df_kaggle is set to
# None and the combine step further down checks for that.
kaggle_file_path = "heart.csv"
try:
    df_kaggle = pd.read_csv(kaggle_file_path)
except FileNotFoundError:
    df_kaggle = None
    print("Warning: Kaggle dataset not found. Proceeding with Cleveland dataset only.")
else:
    print("Kaggle dataset loaded with shape:", df_kaggle.shape)

# Fetch the processed Cleveland file from the UCI repository. Column names are
# supplied through `names`, so every row of the file is read as data.
cleveland_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]

# '?' entries are treated as missing: convert them to NaN, drop every row that
# contains one, then cast all columns to float.
df_cleveland = (
    pd.read_csv(cleveland_url, names=column_names)
    .replace('?', np.nan)
    .dropna()
    .astype(float)
)

# Collapse the target to a binary label: values above 0 become 1, the rest 0.
df_cleveland["target"] = (df_cleveland["target"] > 0).astype("int64")

# Build the modelling frame: Kaggle rows followed by Cleveland rows with a fresh
# 0..n-1 index when the Kaggle file was loaded, otherwise a copy of Cleveland.
# NOTE(review): nothing here checks that df_kaggle has the same columns as
# df_cleveland, or that the two sources do not contain the same records.
# Confirm both before trusting the held-out accuracy.
df_combined = (
    df_cleveland.copy()
    if df_kaggle is None
    else pd.concat([df_kaggle, df_cleveland], ignore_index=True)
)

# Separate the feature columns from the label, then hold out 20% of the rows.
# The split is seeded (random_state=42) and stratified on y.
X = df_combined.drop("target", axis=1)
y = df_combined["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression (liblinear solver, fixed seed): fit on the scaled
# training rows, then report accuracy on the scaled held-out rows.
log_reg = LogisticRegression(solver='liblinear', random_state=42).fit(
    X_train_scaled, y_train
)
y_pred_log = log_reg.predict(X_test_scaled)
accuracy_log = accuracy_score(y_test, y_pred_log)
print(f"Logistic Regression Accuracy: {accuracy_log:.2f}")

# Train XGBoost
# `use_label_encoder` is deliberately not passed: the XGBoost version this
# notebook ran against ignores it and only emits a
# 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_model = xgb.XGBClassifier(eval_metric="logloss", random_state=42)
xgb_model.fit(X_train_scaled, y_train)

# Score on the scaled held-out rows.
y_pred_xgb = xgb_model.predict(X_test_scaled)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Model Accuracy: {accuracy_xgb:.2f}")

# Gaussian Naïve Bayes with default settings: fit on the scaled training rows,
# then report accuracy on the scaled held-out rows.
nb_model = GaussianNB().fit(X_train_scaled, y_train)
y_pred_nb = nb_model.predict(X_test_scaled)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Naïve Bayes Accuracy: {accuracy_nb:.2f}")

# Write the three fitted classifiers and the fitted scaler to the working
# directory with joblib, one file per object.
for _artifact, _filename in (
    (log_reg, "log_reg_model.pkl"),
    (xgb_model, "xgb_model.pkl"),
    (nb_model, "nb_model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(_artifact, _filename)
print("Models and Scaler saved successfully!")

# Define Risk Level Function
def risk_level(prob, low_threshold=0.4, high_threshold=0.7):
    """Map a predicted probability to a risk band label.

    Args:
        prob: Predicted probability of the positive class.
        low_threshold: Values strictly below this are "Low Risk".
            Defaults to 0.4.
        high_threshold: Values at or above this are "High Risk".
            Defaults to 0.7.

    Returns:
        "Low Risk" when ``prob < low_threshold``, "Moderate Risk" when
        ``low_threshold <= prob < high_threshold``, otherwise "High Risk".

    Raises:
        ValueError: If ``prob`` is NaN, or if ``low_threshold`` is greater
            than ``high_threshold``.
    """
    if low_threshold > high_threshold:
        raise ValueError(
            f"low_threshold ({low_threshold}) must not exceed high_threshold ({high_threshold})"
        )
    # NaN compares False against every threshold, so without this guard it
    # would fall through to the final branch and be reported as "High Risk".
    if np.isnan(prob):
        raise ValueError("prob must be a number, got NaN")

    if prob < low_threshold:
        return "Low Risk"
    if prob < high_threshold:
        return "Moderate Risk"
    return "High Risk"

# Manual Testing Function
def manual_test(values=None):
    """Score one record with all three trained models and print a report.

    Args:
        values: Optional sequence of numeric feature values, ordered like
            ``X.columns``. When omitted (the default), each value is read
            interactively with ``input()``. Passing the values explicitly
            lets the cell run without waiting on stdin.

    Returns:
        None. For each model this prints the predicted label (a probability
        of at least 0.5 is reported as "Heart Disease Found"), the band
        returned by ``risk_level`` and the positive-class probability to two
        decimals. If an interactive entry is not numeric, a message is
        printed and the function returns without scoring.

    Raises:
        ValueError: If ``values`` is supplied but does not contain exactly
            one entry per feature column. Conversion errors from ``float()``
            on supplied values also propagate.
    """
    decision_threshold = 0.5

    if values is None:
        print("\nEnter the following test parameters:")
        try:
            values = [float(input(f"{col}: ")) for col in X.columns]
        except ValueError:
            print("Invalid input. Please enter numeric values where appropriate.")
            return
    else:
        values = [float(v) for v in values]
        if len(values) != len(X.columns):
            raise ValueError(
                f"expected {len(X.columns)} feature values, got {len(values)}"
            )

    # Wrap the values in a one-row frame with the training column names, then
    # apply the scaler that was fitted on the training data.
    input_data = pd.DataFrame([values], columns=X.columns)
    input_scaled = scaler.transform(input_data)

    models = (
        ("Logistic Regression", log_reg),
        ("XGBoost Model", xgb_model),
        ("Naïve Bayes", nb_model),
    )
    # Score with every model before printing, so a failure in one model
    # cannot leave a partially printed report.
    scored = [
        (label, model.predict_proba(input_scaled)[0, 1]) for label, model in models
    ]

    for label, prob in scored:
        print(f"\n--- {label} ---")
        print(f"Prediction: {'Heart Disease Found' if prob >= decision_threshold else 'No Heart Disease'}")
        print(f"Risk Level: {risk_level(prob)}")
        print(f"Probability: {prob:.2f}")

# Run Manual Test
# Called without arguments, manual_test() reads one value per feature column
# via input(), so this call waits for keyboard input each time the cell runs.
manual_test()




Parameters: { "use_label_encoder" } are not used.



Logistic Regression Accuracy: 0.83
XGBoost Model Accuracy: 0.87
Naïve Bayes Accuracy: 0.88
Models and Scaler saved successfully!

Enter the following test parameters:

--- Logistic Regression ---
Prediction: Heart Disease Found
Risk Level: High Risk
Probability: 1.00

--- XGBoost Model ---
Prediction: No Heart Disease
Risk Level: Moderate Risk
Probability: 0.47

--- Naïve Bayes ---
Prediction: Heart Disease Found
Risk Level: High Risk
Probability: 1.00
