In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    RocCurveDisplay
)
from sklearn.decomposition import PCA # To recreate our PCA data

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the base processed data.
# The four CSVs are read relative to the notebook's working directory.
try:
    X_train_full = pd.read_csv('../data/processed/X_train.csv')
    X_test_full = pd.read_csv('../data/processed/X_test.csv')
    # .values.ravel() flattens the single-column label frames into 1-D arrays.
    y_train = pd.read_csv('../data/processed/y_train.csv').values.ravel()
    y_test = pd.read_csv('../data/processed/y_test.csv').values.ravel()
    print("Base data loaded successfully.")
except FileNotFoundError:
    print("Error: Processed data not found. Please run the preprocessing notebook first.")
    # Re-raise: everything below needs the frames above, so continuing would
    # only fail later with a confusing NameError instead of the real cause.
    raise

# --- 1. Prepare Full Feature Set ---
# Drop the 'id' column if it exists, to avoid data leakage.
# Each frame is handled independently (errors='ignore' makes the drop a no-op
# when the column is absent), so train and test cannot end up with different
# column sets if only one of the files still carries 'id'.
X_train_full = X_train_full.drop(columns='id', errors='ignore')
X_test_full = X_test_full.drop(columns='id', errors='ignore')

# --- 2. Prepare PCA Feature Set ---
# The projection is fitted on the training split only and then applied to the
# test split. random_state is pinned (42, like every other estimator in this
# notebook) because, depending on the scikit-learn version, the default
# svd_solver='auto' may select the randomized solver for data of this shape.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_full)
X_test_pca = pca.transform(X_test_full)

# --- 3. Prepare RFE Feature Set ---
# Use the list of features we selected in the previous notebook.
# Selecting by name raises KeyError if any of these columns is missing.
rfe_features = [
    'ca', 'sex_Male', 'dataset_Switzerland', 'dataset_VA Long Beach',
    'cp_atypical angina', 'cp_non-anginal', 'cp_typical angina',
    'exang_True', 'slope_flat', 'thal_normal'
]
X_train_rfe = X_train_full[rfe_features]
X_test_rfe = X_test_full[rfe_features]

print("All three feature sets (Full, PCA, RFE) are ready for modeling.")
print(f"Full features shape: {X_train_full.shape}")
print(f"PCA features shape: {X_train_pca.shape}")
print(f"RFE features shape: {X_train_rfe.shape}")

Base data loaded successfully.
All three feature sets (Full, PCA, RFE) are ready for modeling.
Full features shape: (736, 21)
PCA features shape: (736, 2)
RFE features shape: (736, 10)


In [3]:
# Create a helper function for model evaluation
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name="Model"):
    """
    Fit a classifier on the training split and print its test-set metrics.

    Output, in order: a ``--- <model_name> ---`` header, the classification
    report for the test predictions, the ROC AUC score, and a dashed footer.
    Nothing is returned.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict`` plus either ``predict_proba``
        or ``decision_function``. It is fitted in place, so after the call it
        holds the fit for the data passed here.
    X_train, y_train
        Training features and labels handed to ``model.fit``.
    X_test, y_test
        Held-out features and labels used for every printed metric.
    model_name : str, default "Model"
        Label shown in the header line.

    Raises
    ------
    AttributeError
        If the model offers neither ``predict_proba`` nor
        ``decision_function``, or if one of them fails internally with an
        AttributeError (no longer masked by a fallback).
    """
    print(f"--- {model_name} ---")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Print evaluation metrics
    print(classification_report(y_test, y_pred))

    # Print ROC AUC Score.
    # The score source is chosen with an explicit capability check instead of
    # catching AttributeError around the whole computation: a blanket except
    # would also swallow AttributeErrors raised *inside* predict_proba or
    # roc_auc_score and quietly report a decision-function score instead.
    if hasattr(model, "predict_proba"):
        # Column 1 is taken as the positive-class score, i.e. this assumes a
        # binary target.
        roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        print(f"ROC AUC Score: {roc_auc:.4f}")
    else:
        # Some models like SVC don't have predict_proba unless enabled;
        # the signed decision scores rank samples just as well for ROC AUC.
        roc_auc = roc_auc_score(y_test, model.decision_function(X_test))
        print(f"ROC AUC Score (from decision function): {roc_auc:.4f}")

    print("-" * 28 + "\n")


print("Evaluation function 'evaluate_model' is defined and ready.")

Evaluation function 'evaluate_model' is defined and ready.


In [4]:
# Logistic Regression baseline.
# max_iter is raised to 1000 so the solver has room to converge.
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Fit and score the same estimator on each feature set in turn
# (full -> PCA -> RFE); every call refits it from scratch.
for run_name, features_train, features_test in (
    ("Logistic Regression (Full Features)", X_train_full, X_test_full),
    ("Logistic Regression (PCA Features)", X_train_pca, X_test_pca),
    ("Logistic Regression (RFE Features)", X_train_rfe, X_test_rfe),
):
    evaluate_model(
        log_reg, features_train, y_train, features_test, y_test,
        model_name=run_name,
    )

--- Logistic Regression (Full Features) ---
              precision    recall  f1-score   support

           0       0.84      0.74      0.79        82
           1       0.81      0.88      0.85       102

    accuracy                           0.82       184
   macro avg       0.82      0.81      0.82       184
weighted avg       0.82      0.82      0.82       184

ROC AUC Score: 0.9211
----------------------------

--- Logistic Regression (PCA Features) ---
              precision    recall  f1-score   support

           0       0.72      0.72      0.72        82
           1       0.77      0.77      0.77       102

    accuracy                           0.75       184
   macro avg       0.75      0.75      0.75       184
weighted avg       0.75      0.75      0.75       184

ROC AUC Score: 0.8091
----------------------------

--- Logistic Regression (RFE Features) ---
              precision    recall  f1-score   support

           0       0.83      0.79      0.81        82
   

In [5]:
# Decision Tree baseline.
# random_state is fixed so the fitted tree is reproducible.
tree_clf = DecisionTreeClassifier(random_state=42)

# Fit and score the same estimator on each feature set in turn
# (full -> PCA -> RFE); every call refits it from scratch.
for run_name, features_train, features_test in (
    ("Decision Tree (Full Features)", X_train_full, X_test_full),
    ("Decision Tree (PCA Features)", X_train_pca, X_test_pca),
    ("Decision Tree (RFE Features)", X_train_rfe, X_test_rfe),
):
    evaluate_model(
        tree_clf, features_train, y_train, features_test, y_test,
        model_name=run_name,
    )

--- Decision Tree (Full Features) ---
              precision    recall  f1-score   support

           0       0.77      0.67      0.72        82
           1       0.76      0.84      0.80       102

    accuracy                           0.77       184
   macro avg       0.77      0.76      0.76       184
weighted avg       0.77      0.77      0.76       184

ROC AUC Score: 0.7569
----------------------------

--- Decision Tree (PCA Features) ---
              precision    recall  f1-score   support

           0       0.65      0.65      0.65        82
           1       0.72      0.73      0.72       102

    accuracy                           0.69       184
   macro avg       0.69      0.69      0.69       184
weighted avg       0.69      0.69      0.69       184

ROC AUC Score: 0.6859
----------------------------

--- Decision Tree (RFE Features) ---
              precision    recall  f1-score   support

           0       0.83      0.77      0.80        82
           1       0.

In [6]:
# Random Forest baseline.
# n_estimators=100 builds an ensemble of 100 decision trees.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit and score the same estimator on each feature set in turn
# (full -> PCA -> RFE); every call refits it from scratch.
for run_name, features_train, features_test in (
    ("Random Forest (Full Features)", X_train_full, X_test_full),
    ("Random Forest (PCA Features)", X_train_pca, X_test_pca),
    ("Random Forest (RFE Features)", X_train_rfe, X_test_rfe),
):
    evaluate_model(
        rf_clf, features_train, y_train, features_test, y_test,
        model_name=run_name,
    )

--- Random Forest (Full Features) ---
              precision    recall  f1-score   support

           0       0.85      0.78      0.82        82
           1       0.83      0.89      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184

ROC AUC Score: 0.9258
----------------------------

--- Random Forest (PCA Features) ---
              precision    recall  f1-score   support

           0       0.69      0.60      0.64        82
           1       0.71      0.78      0.74       102

    accuracy                           0.70       184
   macro avg       0.70      0.69      0.69       184
weighted avg       0.70      0.70      0.70       184

ROC AUC Score: 0.7768
----------------------------

--- Random Forest (RFE Features) ---
              precision    recall  f1-score   support

           0       0.84      0.77      0.80        82
           1       0.

In [7]:
# Support Vector Machine (SVC) baseline.
# probability=True enables predict_proba, which the ROC AUC computation uses.
svm_clf = SVC(probability=True, random_state=42)

# Fit and score the same estimator on each feature set in turn
# (full -> PCA -> RFE); every call refits it from scratch.
for run_name, features_train, features_test in (
    ("Support Vector Machine (Full Features)", X_train_full, X_test_full),
    ("Support Vector Machine (PCA Features)", X_train_pca, X_test_pca),
    ("Support Vector Machine (RFE Features)", X_train_rfe, X_test_rfe),
):
    evaluate_model(
        svm_clf, features_train, y_train, features_test, y_test,
        model_name=run_name,
    )

--- Support Vector Machine (Full Features) ---
              precision    recall  f1-score   support

           0       0.87      0.76      0.81        82
           1       0.82      0.91      0.87       102

    accuracy                           0.84       184
   macro avg       0.85      0.83      0.84       184
weighted avg       0.85      0.84      0.84       184

ROC AUC Score: 0.9175
----------------------------

--- Support Vector Machine (PCA Features) ---
              precision    recall  f1-score   support

           0       0.76      0.73      0.75        82
           1       0.79      0.81      0.80       102

    accuracy                           0.78       184
   macro avg       0.77      0.77      0.77       184
weighted avg       0.78      0.78      0.78       184

ROC AUC Score: 0.8130
----------------------------

--- Support Vector Machine (RFE Features) ---
              precision    recall  f1-score   support

           0       0.86      0.77      0.81     