In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    auc,
    average_precision_score,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split

from data_preparation import load_data, build_preprocessor

## Load and preprocess the data

In [None]:
# Load the dataset via the project helper (presumably features X and labels y;
# verify against data_preparation.load_data).
X, y = load_data()

# Stratified 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The preprocessor is fitted on the training fold only and then applied,
# already fitted, to the held-out fold.
preprocessor = build_preprocessor(X_train)
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

## Define and fit the best models
(If GridSearchCV results are available, replace the parameters below with the best ones it found.)

In [None]:
def fit_and_predict(model, X_fit, y_fit, X_eval):
    """Fit ``model`` and return its predictions on ``X_eval``.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        It is fitted in place.
    X_fit, y_fit
        Training features and labels passed to ``model.fit``.
    X_eval
        Features to predict on.

    Returns
    -------
    tuple
        ``(labels, scores)``: ``labels`` is ``model.predict(X_eval)`` and
        ``scores`` is column 1 of ``model.predict_proba(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    labels = model.predict(X_eval)
    # Column 1 is taken as the positive-class probability; this assumes a
    # binary target whose positive class is second in model.classes_ -- TODO confirm.
    scores = model.predict_proba(X_eval)[:, 1]
    return labels, scores


# Logistic Regression (update with best params if available)
logreg = LogisticRegression(max_iter=1000, C=1, penalty='l2', class_weight='balanced', solver='liblinear')
y_pred_logreg, y_proba_logreg = fit_and_predict(logreg, X_train_processed, y_train, X_test_processed)

# Random Forest (update with best params if available)
rf = RandomForestClassifier(n_estimators=100, max_depth=None, class_weight='balanced', random_state=42)
y_pred_rf, y_proba_rf = fit_and_predict(rf, X_train_processed, y_train, X_test_processed)

# SVM (update with best params if available)
# probability=True is required for predict_proba on SVC.
svm = SVC(probability=True, C=1, kernel='rbf', class_weight='balanced', random_state=42)
y_pred_svm, y_proba_svm = fit_and_predict(svm, X_train_processed, y_train, X_test_processed)

## Plot ROC Curves

In [None]:
# One (display name, positive-class score) pair per fitted model, in legend order.
roc_inputs = (
    ('Logistic Regression', y_proba_logreg),
    ('Random Forest', y_proba_rf),
    ('SVM', y_proba_svm),
)

fig, ax = plt.subplots(figsize=(8, 6))
for model_name, scores in roc_inputs:
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    auc_value = roc_auc_score(y_test, scores)
    ax.plot(false_pos_rate, true_pos_rate, label=f'{model_name} (AUC = {auc_value:.2f})')

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve Comparison')
ax.legend(loc='lower right')
plt.tight_layout()
plt.show()

## Plot Precision-Recall Curves

In [None]:
# One (display name, positive-class score) pair per fitted model, in legend order.
pr_inputs = (
    ('Logistic Regression', y_proba_logreg),
    ('Random Forest', y_proba_rf),
    ('SVM', y_proba_svm),
)

fig, ax = plt.subplots(figsize=(8, 6))
for model_name, scores in pr_inputs:
    precision, recall, _ = precision_recall_curve(y_test, scores)
    ap_value = average_precision_score(y_test, scores)
    # Recall goes on the x-axis, precision on the y-axis.
    ax.plot(recall, precision, label=f'{model_name} (AP = {ap_value:.2f})')

ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve Comparison')
ax.legend(loc='lower left')
plt.tight_layout()
plt.show()

## Show Confusion Matrices

In [None]:
# Hard test-set predictions keyed by display name; dict order sets panel order.
predictions_by_model = {
    'Logistic Regression': y_pred_logreg,
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
}

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for panel, (model_name, predicted) in zip(axes, predictions_by_model.items()):
    matrix = confusion_matrix(y_test, predicted)
    # fmt='d' prints the cell counts as integers.
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set(
        title=f'{model_name} Confusion Matrix',
        xlabel='Predicted',
        ylabel='Actual',
    )
plt.tight_layout()
plt.show()

## Summarize Best Parameters and Scores
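(The table below reports test-set scores for the models as configured above. If GridSearchCV results are available, update the model parameters with your actual best parameters and re-run so the table reflects them.)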

In [None]:
# (display name, hard predictions, positive-class scores) for each fitted model.
# Reusing the cached test-set outputs avoids running inference again, as
# `model.score(...)` would, and keeps the Accuracy column tied to the same
# predictions shown in the confusion matrices.
model_outputs = [
    ('Logistic Regression', y_pred_logreg, y_proba_logreg),
    ('Random Forest', y_pred_rf, y_proba_rf),
    ('SVM', y_pred_svm, y_proba_svm),
]

summary = pd.DataFrame({
    'Model': [name for name, _, _ in model_outputs],
    'ROC-AUC': [roc_auc_score(y_test, proba) for _, _, proba in model_outputs],
    'Average Precision': [average_precision_score(y_test, proba) for _, _, proba in model_outputs],
    # Classifier `.score` is mean accuracy on `predict` output, so this is the same value.
    'Accuracy': [accuracy_score(y_test, pred) for _, pred, _ in model_outputs],
})
display(summary)

You can update the model parameters in the cells above with the actual best parameters found by GridSearchCV for more accurate reporting.