In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    roc_curve, auc, precision_recall_curve, confusion_matrix, classification_report, roc_auc_score
)
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve, cross_val_score
from sklearn.preprocessing import StandardScaler

# Uncomment if running in a new environment
# !pip install shap

## Load and preprocess the data

In [None]:
# Load your main dataset
df = pd.read_csv("../datasets/heart_disease_cleaned.csv")

# Example: Assume target column is named 'target'
X = df.drop('target', axis=1)
y = df['target']

# Split for training/validation (stratified so both splits keep the class balance of y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize features (if needed).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
# The scaler returns plain arrays; the original row index is passed back in when
# rebuilding the DataFrames so the features stay index-aligned with
# y_train / y_test (otherwise they would get a fresh 0..n-1 index).
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)

## Define and fit the best models
(Replace the parameters below with the best values found by GridSearchCV, if available.)

In [None]:
# Update these with your best parameters from GridSearchCV if available!
# Settings shared by all three classifiers: balanced class weights and a fixed seed.
shared_kwargs = dict(class_weight='balanced', random_state=42)

logreg = LogisticRegression(C=1.0, penalty='l2', max_iter=1000, **shared_kwargs)
rf = RandomForestClassifier(max_depth=None, n_estimators=100, **shared_kwargs)
# probability=True is required so that the SVM exposes predict_proba later on.
svm = SVC(kernel='rbf', C=1, probability=True, **shared_kwargs)

# Fit in the same order as the definitions above.
for estimator in (logreg, rf):
    estimator.fit(X_train, y_train)
# Kept as the final bare expression so the cell still displays the fitted SVM.
svm.fit(X_train, y_train)

In [None]:
# Compare the fitted classifiers on the held-out test set with ROC and
# precision-recall curves drawn side by side.
models = {'Logistic Regression': logreg, 'Random Forest': rf, 'SVM': svm}

# Score the test set once per model and reuse the result for both panels
# (previously predict_proba was evaluated twice for every model).
# Column 1 is used as the positive-class score -- assumes a binary target whose
# positive label is the second entry of each model's classes_ (TODO confirm).
test_scores = {
    name: model.predict_proba(X_test)[:, 1] for name, model in models.items()
}

fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(12, 5))

# ROC Curve
for name, y_score in test_scores.items():
    fpr, tpr, _ = roc_curve(y_test, y_score)
    ax_roc.plot(fpr, tpr, label=f"{name} (AUC={auc(fpr, tpr):.2f})")
ax_roc.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve")
ax_roc.legend()

# Precision-Recall Curve
for name, y_score in test_scores.items():
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    ax_pr.plot(recall, precision, label=name)
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Precision-Recall Curve")
ax_pr.legend()

fig.tight_layout()
plt.show()

In [None]:
# One confusion-matrix heatmap per model, laid out in a single row.
# The grid is sized from `models` instead of a hard-coded 1x3 layout, so adding
# or removing a model does not break the figure. With three models the figure
# size is the same (15, 4) as before.
n_models = len(models)
# squeeze=False keeps `axes` two-dimensional even when there is a single model.
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)

for ax, (name, model) in zip(axes[0], models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

fig.tight_layout()
plt.show()

In [None]:
# Text summary (precision / recall / F1 per class) for every model on the test set.
for label, clf in models.items():
    report = classification_report(y_test, clf.predict(X_test))
    # Header, report and a trailing blank line, each terminated by a newline.
    print(f"=== {label} ===", report, "", sep="\n")

In [None]:
import shap

# Choose your best model (example: Random Forest)
# The model was already fitted on (X_train, y_train) in the model-definition
# cell, so it is not refitted here; refitting with the same data and seed only
# repeats the work.
best_model = rf

# X_train is passed as the background data for the explainer and is also the
# data being explained.
explainer = shap.Explainer(best_model, X_train)
shap_values = explainer(X_train)

# For a classifier the explanation can carry a trailing per-class axis
# (samples, features, classes). The summary plot is meant for a
# (samples, features) matrix, so keep a single class.
# NOTE(review): index 1 is assumed to be the positive class of a binary target
# -- confirm against best_model.classes_.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

shap.summary_plot(shap_values, X_train, feature_names=X_train.columns)

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of each feature for the chosen model, measured on the
# test split with 10 shuffles per feature and a fixed seed.
result = permutation_importance(
    best_model,
    X_test,
    y_test,
    random_state=42,
    n_repeats=10,
)
# Ascending order, so the largest mean importance ends up at the top of the chart.
sorted_idx = np.argsort(result.importances_mean)

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(X_test.columns[sorted_idx], result.importances_mean[sorted_idx])
ax.set_xlabel("Permutation Importance")
ax.set_title("Feature Importance (Permutation)")
plt.show()

In [None]:
# Learning curve for the chosen model: ROC-AUC under 5-fold cross-validation at
# ten training-set sizes from 10% to 100% of the training data.
# NOTE(review): random_state is documented as only taking effect together with
# shuffle=True, which is not set here -- confirm whether shuffling was intended.
train_sizes, train_scores, val_scores = learning_curve(
    best_model,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)

# Average over the CV folds (axis 1) to get one score per training-set size.
train_scores_mean = train_scores.mean(axis=1)
val_scores_mean = val_scores.mean(axis=1)

fig, ax = plt.subplots(figsize=(8, 6))
curves = (
    (train_scores_mean, "Training score"),
    (val_scores_mean, "Validation score"),
)
for mean_scores, curve_label in curves:
    ax.plot(train_sizes, mean_scores, 'o-', label=curve_label)
ax.set_xlabel("Training Set Size")
ax.set_ylabel("ROC-AUC Score")
ax.set_title("Learning Curve")
ax.legend()
plt.show()

In [None]:
# Load Cleveland dataset
# The file is read without a header row (header=None), so the columns get
# integer labels 0..n-1 until they are renamed.
# The UCI "processed.cleveland.data" file marks missing values with '?';
# na_values turns them into NaN so the affected columns stay numeric and can be
# imputed or dropped before scaling.
cleveland = pd.read_csv(
    "../datasets/processed.cleveland.data", header=None, na_values="?"
)
# You may need to preprocess/rename columns to match your main dataset
# Example: cleveland.columns = X.columns
# NOTE(review): X excludes the 'target' column, so the label column needs its own
# name, and the label may need the same encoding as the main dataset's target
# -- confirm before evaluating.

# Handle missing values, encode, and scale as needed
# X_clev = ...
# y_clev = ...

# Example (update as needed; requires the columns to be renamed first, because
# header=None leaves no 'target' label):
# X_clev = cleveland.drop('target', axis=1)
# y_clev = cleveland['target']
# X_clev = pd.DataFrame(scaler.transform(X_clev), columns=X.columns)

# Evaluate best model on Cleveland data
# cleveland_score = best_model.score(X_clev, y_clev)
# print(f"Best model accuracy on Cleveland dataset: {cleveland_score:.3f}")

You can update the model parameters in the cells above with the actual best parameters found by GridSearchCV for more accurate reporting.