In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    roc_auc_score, precision_recall_curve, average_precision_score
)

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# passed as `random_state` to the split and estimators in later cells.
RANDOM_STATE = 30
np.random.seed(RANDOM_STATE)

In [7]:
# Global matplotlib look-and-feel; individual figures override figsize as needed.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

In [9]:
# Load the raw workbook. header=1 takes the column names from the sheet's
# second row (0-based row index 1) and skips the row above it.
df = pd.read_excel('default.xls', header=1)

In [15]:
# Give the target column a short name; every later cell refers to it as 'DEFAULT'.
df = df.rename(columns={'default payment next month': 'DEFAULT'})

In [11]:
# Quick size / completeness summary of the loaded frame.
n_rows, n_cols = df.shape
n_missing = df.isnull().sum().sum()
print(f"    - Total observations: {n_rows:,}")
print(f"    - Total features: {n_cols}")
print(f"    - Missing values: {n_missing}")

    - Total observations: 30,000
    - Total features: 25
    - Missing values: 0


In [17]:
# Class balance of the target: absolute count and share of all rows per class.
default_counts = df['DEFAULT'].value_counts()
n_total = len(df)
for class_value, class_name in ((0, 'No Default'), (1, 'Default')):
    class_n = default_counts[class_value]
    print(f"    - {class_name} ({class_value}): {class_n:,} ({class_n/n_total*100:.2f}%)")

    - No Default (0): 23,364 (77.88%)
    - Default (1): 6,636 (22.12%)


In [19]:
# Human-readable data dictionary, printed as a bullet list.
# NOTE(review): the loaded file has no PAY_1 column (the repayment columns are
# PAY_0, PAY_2 ... PAY_6), so the "PAY_0 to PAY_6" label is shorthand.
feature_desc = dict([
    ('LIMIT_BAL', 'Credit limit (NT dollars)'),
    ('SEX', 'Gender (1=male, 2=female)'),
    ('EDUCATION', 'Education level (1=grad school, 2=university, 3=high school, 4=others)'),
    ('MARRIAGE', 'Marital status (1=married, 2=single, 3=others)'),
    ('AGE', 'Age in years'),
    ('PAY_0 to PAY_6', 'Repayment status (-1=paid duly, 1-9=months of delay)'),
    ('BILL_AMT1 to BILL_AMT6', 'Bill statement amount (NT dollars)'),
    ('PAY_AMT1 to PAY_AMT6', 'Previous payment amount (NT dollars)'),
])
for feature_name in feature_desc:
    print(f"    - {feature_name}: {feature_desc[feature_name]}")

    - LIMIT_BAL: Credit limit (NT dollars)
    - SEX: Gender (1=male, 2=female)
    - EDUCATION: Education level (1=grad school, 2=university, 3=high school, 4=others)
    - MARRIAGE: Marital status (1=married, 2=single, 3=others)
    - AGE: Age in years
    - PAY_0 to PAY_6: Repayment status (-1=paid duly, 1-9=months of delay)
    - BILL_AMT1 to BILL_AMT6: Bill statement amount (NT dollars)
    - PAY_AMT1 to PAY_AMT6: Previous payment amount (NT dollars)


In [21]:
# Summary statistics for every numeric column, rounded for readability.
summary_stats = df.describe().round(2)
print("\n1.4 Descriptive Statistics:")
print(summary_stats.to_string())


1.4 Descriptive Statistics:
             ID   LIMIT_BAL       SEX  EDUCATION  MARRIAGE       AGE     PAY_0     PAY_2     PAY_3     PAY_4     PAY_5     PAY_6  BILL_AMT1  BILL_AMT2   BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6   PAY_AMT1    PAY_AMT2   PAY_AMT3   PAY_AMT4   PAY_AMT5   PAY_AMT6   DEFAULT
count  30000.00    30000.00  30000.00   30000.00  30000.00  30000.00  30000.00  30000.00  30000.00  30000.00  30000.00  30000.00   30000.00   30000.00    30000.00   30000.00   30000.00   30000.00   30000.00    30000.00   30000.00   30000.00   30000.00   30000.00  30000.00
mean   15000.50   167484.32      1.60       1.85      1.55     35.49     -0.02     -0.13     -0.17     -0.22     -0.27     -0.29   51223.33   49179.08    47013.15   43262.95   40311.40   38871.76    5663.58     5921.16    5225.68    4826.08    4799.39    5215.50      0.22
std     8660.40   129747.66      0.49       0.79      0.52      9.22      1.12      1.20      1.20      1.17      1.13      1.15   73635.86   71173.77  

In [23]:
# ID is a row identifier and must not be used as a model feature.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone used to raise KeyError.
df = df.drop(columns='ID', errors='ignore')

In [25]:
# Separate the target vector from the feature matrix.
y = df['DEFAULT']
X = df.drop(columns='DEFAULT')

In [27]:
# Sanity check: rows must match between features and target.
feature_shape, target_shape = X.shape, y.shape
print(f"\n2.1 Feature Matrix Shape: {feature_shape}")
print(f"2.2 Target Vector Shape: {target_shape}")


2.1 Feature Matrix Shape: (30000, 23)
2.2 Target Vector Shape: (30000,)


In [29]:
# 70/30 hold-out split, stratified on the target so both parts keep the same
# default rate; seeded for reproducibility.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test = split_parts

In [31]:
# Report the size of each split, then confirm stratification kept the default rate.
split_summary = (
    ('Training', X_train, y_train),
    ('Test', X_test, y_test),
)
for split_name, split_X, split_y in split_summary:
    print(f"    - {split_name} set: {split_X.shape[0]:,} observations")
for split_name, split_X, split_y in split_summary:
    print(f"    - {split_name} default rate: {split_y.mean()*100:.2f}%")

    - Training set: 21,000 observations
    - Test set: 9,000 observations
    - Training default rate: 22.12%
    - Test default rate: 22.12%


In [33]:
# Standardise features for logistic regression. The scaler learns its mean/std
# from the training split only and is then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
# Baseline logistic regression on the standardised training features.
# max_iter is raised to 1000; C is left at the estimator's default here and
# explored separately in the cross-validation cell further down.
log_reg = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

In [37]:
# Test-set outputs of the baseline model: positive-class probability
# (column 1 of predict_proba) and the hard 0/1 label.
y_prob_log = log_reg.predict_proba(X_test_scaled)[:, 1]
y_pred_log = log_reg.predict(X_test_scaled)

In [39]:
# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
print("\nModel Performance on Test Set:")
label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall (Sensitivity)", recall_score),
    ("F1-Score", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(f"    - {metric_label}: {metric_fn(y_test, y_pred_log):.4f}")
print(f"    - ROC-AUC: {roc_auc_score(y_test, y_prob_log):.4f}")


Model Performance on Test Set:
    - Accuracy: 0.8134
    - Precision: 0.7566
    - Recall (Sensitivity): 0.2310
    - F1-Score: 0.3540
    - ROC-AUC: 0.7319


In [41]:
# Confusion matrix layout: rows are actual classes, columns are predicted classes.
cm_log = confusion_matrix(y_test, y_pred_log)
(tn_log, fp_log), (fn_log, tp_log) = cm_log
print("\nConfusion Matrix:")
print(f"    TN={tn_log:,}  FP={fp_log:,}")
print(f"    FN={fn_log:,}  TP={tp_log:,}")


Confusion Matrix:
    TN=6,861  FP=148
    FN=1,531  TP=460


In [43]:
# Per-class precision / recall / F1 for the baseline logistic regression.
report_log = classification_report(
    y_test, y_pred_log, target_names=['No Default', 'Default']
)
print("\nClassification Report:")
print(report_log)


Classification Report:
              precision    recall  f1-score   support

  No Default       0.82      0.98      0.89      7009
     Default       0.76      0.23      0.35      1991

    accuracy                           0.81      9000
   macro avg       0.79      0.60      0.62      9000
weighted avg       0.80      0.81      0.77      9000



In [45]:
# Coefficient table ranked by absolute magnitude. The model was fitted on
# standardised inputs, so each coefficient (and its exp(), the odds ratio)
# refers to a one-standard-deviation change in the feature.
lr_coefs = log_reg.coef_[0]
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr_coefs,
    'Odds Ratio': np.exp(lr_coefs),
})
coef_df = coef_df.sort_values('Coefficient', key=abs, ascending=False)

In [47]:
# Ten largest coefficients by absolute value (coef_df is already sorted).
top10_coef_table = coef_df.head(10).to_string(index=False)
print("\nTop 10 Most Important Features:")
print(top10_coef_table)


Top 10 Most Important Features:
  Feature  Coefficient  Odds Ratio
    PAY_0     0.622023    1.862692
BILL_AMT1    -0.380773    0.683333
 PAY_AMT1    -0.205788    0.814005
 PAY_AMT2    -0.202061    0.817045
BILL_AMT2     0.133641    1.142982
    PAY_2     0.112872    1.119489
 PAY_AMT4    -0.106355    0.899106
BILL_AMT5     0.098040    1.103007
 MARRIAGE    -0.096231    0.908254
BILL_AMT3     0.095426    1.100128


In [49]:
# Fitted intercept (bias term) of the baseline logistic regression.
lr_intercept = log_reg.intercept_[0]
print(f"\nIntercept: {lr_intercept:.4f}")


Intercept: -1.4557


In [51]:
# Candidate inverse-regularisation strengths on a log grid, plus an empty
# list that the cross-validation cell fills with one mean AUC per candidate.
C_values, cv_scores_log = [0.001, 0.01, 0.1, 1, 10, 100], list()

In [55]:
# 5-fold cross-validated ROC-AUC for every candidate C on the training split.
#
# The accumulator is reset here (it is first created in the previous cell) so
# that re-running this cell does not append a second round of scores and leave
# cv_scores_log longer than C_values.
#
# NOTE(review): X_train_scaled was standardised with statistics from the whole
# training split, so each validation fold has influenced the scaling. Wrapping
# the scaler and the model in a Pipeline would remove that leakage - confirm
# whether it matters for this analysis.
cv_scores_log = []

print("\nCross-Validation Results for Different C Values:")
print(f"{'C Value':<12} {'Mean AUC':<12} {'Std AUC':<12}")
print("-" * 36)

for C in C_values:
    lr = LogisticRegression(C=C, random_state=RANDOM_STATE, max_iter=1000)
    scores = cross_val_score(lr, X_train_scaled, y_train, cv=5, scoring='roc_auc')
    cv_scores_log.append(scores.mean())
    print(f"{C:<12} {scores.mean():.4f}      {scores.std():.4f}")

# argmax returns the first maximum, so an exact tie resolves to the smallest C.
best_C = C_values[np.argmax(cv_scores_log)]
print(f"\nBest C value: {best_C}")


Cross-Validation Results for Different C Values:
C Value      Mean AUC     Std AUC     
------------------------------------
0.001        0.7131      0.0066
0.01         0.7176      0.0065
0.1          0.7176      0.0065
1            0.7176      0.0065
10           0.7176      0.0065
100          0.7176      0.0065

Best C value: 0.1


In [57]:
# Refit logistic regression with the cross-validated C and score the test split.
log_reg_tuned = LogisticRegression(
    C=best_C, max_iter=1000, random_state=RANDOM_STATE
).fit(X_train_scaled, y_train)
y_prob_log_tuned = log_reg_tuned.predict_proba(X_test_scaled)[:, 1]
y_pred_log_tuned = log_reg_tuned.predict(X_test_scaled)

In [59]:
# Headline test metrics of the tuned logistic regression.
acc_log_tuned = accuracy_score(y_test, y_pred_log_tuned)
auc_log_tuned = roc_auc_score(y_test, y_prob_log_tuned)
print(f"\nTuned Logistic Regression Performance (C={best_C}):")
print(f"    - Accuracy: {acc_log_tuned:.4f}")
print(f"    - ROC-AUC: {auc_log_tuned:.4f}")


Tuned Logistic Regression Performance (C=0.1):
    - Accuracy: 0.8132
    - ROC-AUC: 0.7317


In [61]:
# Baseline random forest: 100 trees, all cores (n_jobs=-1), seeded.
# It is trained on the unscaled features (X_train, not X_train_scaled).
rf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train, y_train) 

In [63]:
# Test-set outputs of the baseline forest: positive-class probability and hard label.
y_prob_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

In [65]:
# Same metric set as for logistic regression, here for the baseline forest.
rf_label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall (Sensitivity)", recall_score),
    ("F1-Score", f1_score),
)
for metric_label, metric_fn in rf_label_metrics:
    print(f"    - {metric_label}: {metric_fn(y_test, y_pred_rf):.4f}")
print(f"    - ROC-AUC: {roc_auc_score(y_test, y_prob_rf):.4f}")

    - Accuracy: 0.8182
    - Precision: 0.6601
    - Recall (Sensitivity): 0.3677
    - F1-Score: 0.4723
    - ROC-AUC: 0.7652


In [67]:
# Confusion matrix of the baseline forest (rows = actual, columns = predicted).
cm_rf = confusion_matrix(y_test, y_pred_rf)
(tn_rf, fp_rf), (fn_rf, tp_rf) = cm_rf
print("\nConfusion Matrix:")
print(f"    TN={tn_rf:,}  FP={fp_rf:,}")
print(f"    FN={fn_rf:,}  TP={tp_rf:,}")


Confusion Matrix:
    TN=6,632  FP=377
    FN=1,259  TP=732


In [69]:
# Per-class precision / recall / F1 for the baseline random forest.
report_rf = classification_report(
    y_test, y_pred_rf, target_names=['No Default', 'Default']
)
print("\nClassification Report:")
print(report_rf)


Classification Report:
              precision    recall  f1-score   support

  No Default       0.84      0.95      0.89      7009
     Default       0.66      0.37      0.47      1991

    accuracy                           0.82      9000
   macro avg       0.75      0.66      0.68      9000
weighted avg       0.80      0.82      0.80      9000



In [71]:
# Feature importances of the baseline forest, largest first.
rf_importances = rf.feature_importances_
feat_importance = pd.DataFrame({'Feature': X.columns, 'Importance': rf_importances})
feat_importance = feat_importance.sort_values(by='Importance', ascending=False)

In [73]:
# Ten most important features (feat_importance is already sorted).
top10_importance_table = feat_importance.head(10).to_string(index=False)
print("\nTop 10 Most Important Features:")
print(top10_importance_table)


Top 10 Most Important Features:
  Feature  Importance
    PAY_0    0.098183
      AGE    0.066634
BILL_AMT1    0.059773
LIMIT_BAL    0.059747
BILL_AMT2    0.055341
BILL_AMT3    0.052566
 PAY_AMT1    0.051867
BILL_AMT4    0.051425
BILL_AMT6    0.050775
BILL_AMT5    0.050393


In [75]:
# Random-forest search space: 2 * 3 * 2 * 2 = 24 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [79]:
# Exhaustive grid search over param_grid, scored by 5-fold cross-validated
# ROC-AUC on the training split. This is the most expensive cell in the
# notebook: every parameter combination is fitted once per fold.
# NOTE(review): n_jobs=-1 is set on both the forest and the search, i.e.
# parallelism is requested at two levels - confirm this does not oversubscribe
# the machine's cores.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=0
)
rf_grid.fit(X_train, y_train)

In [81]:
# Winning hyper-parameters and their mean cross-validated AUC.
best_rf_params = rf_grid.best_params_
best_rf_cv_auc = rf_grid.best_score_
print(f"\nBest Parameters: {best_rf_params}")
print(f"Best Cross-Validation AUC: {best_rf_cv_auc:.4f}")


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best Cross-Validation AUC: 0.7775


In [83]:
# Take the best estimator found by the grid search and score the test split.
rf_tuned = rf_grid.best_estimator_
y_prob_rf_tuned = rf_tuned.predict_proba(X_test)[:, 1]
y_pred_rf_tuned = rf_tuned.predict(X_test)

In [85]:
# Test-set metrics of the tuned random forest.
print("\nTuned Random Forest Performance:")
tuned_label_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
)
for metric_label, metric_fn in tuned_label_metrics:
    print(f"    - {metric_label}: {metric_fn(y_test, y_pred_rf_tuned):.4f}")
print(f"    - ROC-AUC: {roc_auc_score(y_test, y_prob_rf_tuned):.4f}")


Tuned Random Forest Performance:
    - Accuracy: 0.8237
    - Precision: 0.7016
    - Recall: 0.3531
    - F1-Score: 0.4698
    - ROC-AUC: 0.7826


In [87]:
# Side-by-side test metrics for all four models. Each entry holds
# (display name, hard predictions, positive-class probabilities).
model_outputs = [
    ('Logistic Regression', y_pred_log, y_prob_log),
    ('Logistic Reg. (Tuned)', y_pred_log_tuned, y_prob_log_tuned),
    ('Random Forest', y_pred_rf, y_prob_rf),
    ('Random Forest (Tuned)', y_pred_rf_tuned, y_prob_rf_tuned),
]
label_scorers = [
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
]

comparison_cols = {'Model': [entry[0] for entry in model_outputs]}
# Label-based metrics are computed from the hard predictions ...
for col_name, scorer in label_scorers:
    comparison_cols[col_name] = [scorer(y_test, entry[1]) for entry in model_outputs]
# ... while ROC-AUC needs the predicted probabilities.
comparison_cols['ROC-AUC'] = [roc_auc_score(y_test, entry[2]) for entry in model_outputs]

comparison_df = pd.DataFrame(comparison_cols)

In [89]:
# Plain-text rendering of the comparison table, rounded to four decimals.
comparison_table = comparison_df.round(4).to_string(index=False)
print(f"\n{comparison_table}")


                Model  Accuracy  Precision  Recall  F1-Score  ROC-AUC
  Logistic Regression    0.8134     0.7566  0.2310    0.3540   0.7319
Logistic Reg. (Tuned)    0.8132     0.7566  0.2295    0.3522   0.7317
        Random Forest    0.8182     0.6601  0.3677    0.4723   0.7652
Random Forest (Tuned)    0.8237     0.7016  0.3531    0.4698   0.7826


In [91]:
# The model with the highest test ROC-AUC is reported as the winner.
best_model_idx = comparison_df['ROC-AUC'].idxmax()
best_model_name = comparison_df.loc[best_model_idx, 'Model']
print(f"\nBest Model (by ROC-AUC): {best_model_name}")


Best Model (by ROC-AUC): Random Forest (Tuned)


In [99]:
# Figure 1: bar chart of the target classes with count and share annotations.
#
# value_counts() orders by frequency, so the hard-coded colours and tick labels
# below only matched the bars while class 0 was the majority. Sorting by class
# label pins the order to 0, 1 regardless of the class balance.
class_counts = default_counts.sort_index()

fig1, ax1 = plt.subplots(figsize=(8, 6))
colors = ['green', 'red']
class_counts.plot(kind='bar', color=colors, ax=ax1, edgecolor='black')
ax1.set_title('Target Variable Distribution', fontsize=14, fontweight='bold')
ax1.set_xlabel('Default Status')
ax1.set_ylabel('Count')
ax1.set_xticklabels(['No Default (0)', 'Default (1)'], rotation=0)
# Annotate each bar slightly above its top with "count (share of all rows)".
for i, v in enumerate(class_counts):
    ax1.text(i, v + 500, f'{v:,}\n({v/len(df)*100:.1f}%)', ha='center', fontsize=10)
fig1.tight_layout()
fig1.savefig('01_target_distribution.png', dpi=150, bbox_inches='tight')
# Close this specific figure rather than whichever one happens to be current.
plt.close(fig1)
print("\nFigure 1: Target distribution saved.")


Figure 1: Target distribution saved.


In [95]:
# Inline preview of the ROC comparison for the two tuned models.
# This cell used to attach its axes to a `fig` object that no visible cell
# creates, so it raised NameError on a fresh kernel (Restart & Run All).
# It now builds its own figure and can run on its own.
fig, ax2 = plt.subplots(figsize=(10, 8))

# Calculate ROC curves
fpr_log, tpr_log, _ = roc_curve(y_test, y_prob_log_tuned)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf_tuned)

ax2.plot(fpr_log, tpr_log, 'b-', linewidth=2, 
         label=f'Logistic Regression (AUC = {roc_auc_score(y_test, y_prob_log_tuned):.3f})')
ax2.plot(fpr_rf, tpr_rf, 'r-', linewidth=2,
         label=f'Random Forest (AUC = {roc_auc_score(y_test, y_prob_rf_tuned):.3f})')
# Diagonal = performance of a classifier that guesses at random.
ax2.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
ax2.set_xlabel('False Positive Rate (1 - Specificity)')
ax2.set_ylabel('True Positive Rate (Sensitivity)')
ax2.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
ax2.legend(loc='lower right')
ax2.grid(True, alpha=0.3)
plt.show()

In [101]:
# Figure 2: ROC curves of the two tuned models against the chance diagonal.
fpr_log, tpr_log = roc_curve(y_test, y_prob_log_tuned)[:2]
fpr_rf, tpr_rf = roc_curve(y_test, y_prob_rf_tuned)[:2]
auc_log_roc = roc_auc_score(y_test, y_prob_log_tuned)
auc_rf_roc = roc_auc_score(y_test, y_prob_rf_tuned)

fig2, ax2 = plt.subplots(figsize=(10, 8))
roc_series = (
    (fpr_log, tpr_log, 'b-', f'Logistic Regression (AUC = {auc_log_roc:.3f})'),
    (fpr_rf, tpr_rf, 'r-', f'Random Forest (AUC = {auc_rf_roc:.3f})'),
)
for roc_fpr, roc_tpr, line_fmt, curve_label in roc_series:
    ax2.plot(roc_fpr, roc_tpr, line_fmt, linewidth=2, label=curve_label)
# Diagonal = performance of a classifier that guesses at random.
ax2.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
ax2.set(
    xlabel='False Positive Rate (1 - Specificity)',
    ylabel='True Positive Rate (Sensitivity)',
)
ax2.set_title('ROC Curves Comparison', fontsize=14, fontweight='bold')
ax2.legend(loc='lower right')
ax2.grid(True, alpha=0.3)
fig2.tight_layout()
fig2.savefig('02_roc_curves.png', dpi=150, bbox_inches='tight')
plt.close(fig2)
print("Figure 2: ROC curves saved.")

Figure 2: ROC curves saved.


In [103]:
# Figure 3: confusion-matrix heatmap for the tuned logistic regression.
cm_log_tuned = confusion_matrix(y_test, y_pred_log_tuned)
cm_log_labels = ['No Default', 'Default']

fig3, ax3 = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_log_tuned,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=cm_log_labels,
    yticklabels=cm_log_labels,
    ax=ax3,
)
ax3.set_title('Confusion Matrix: Logistic Regression', fontsize=14, fontweight='bold')
ax3.set(xlabel='Predicted', ylabel='Actual')
fig3.tight_layout()
fig3.savefig('03_confusion_matrix_logistic.png', dpi=150, bbox_inches='tight')
plt.close(fig3)
print("Figure 3: Logistic Regression confusion matrix saved.")

Figure 3: Logistic Regression confusion matrix saved.


In [105]:
# Figure 4: confusion-matrix heatmap for the tuned random forest.
cm_rf_tuned = confusion_matrix(y_test, y_pred_rf_tuned)
cm_rf_labels = ['No Default', 'Default']

fig4, ax4 = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_rf_tuned,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=cm_rf_labels,
    yticklabels=cm_rf_labels,
    ax=ax4,
)
ax4.set_title('Confusion Matrix: Random Forest', fontsize=14, fontweight='bold')
ax4.set(xlabel='Predicted', ylabel='Actual')
fig4.tight_layout()
fig4.savefig('04_confusion_matrix_random_forest.png', dpi=150, bbox_inches='tight')
plt.close(fig4)
print("Figure 4: Random Forest confusion matrix saved.")

Figure 4: Random Forest confusion matrix saved.


In [107]:
# Figure 5: horizontal bar chart of the ten largest random-forest importances.
top_features = feat_importance.head(10)
# Ten colours sampled from the RdYlGn colormap, reversed so the top bar takes
# the colour from the 0.8 end of the sampled range.
colors_feat = plt.cm.RdYlGn(np.linspace(0.2, 0.8, 10))[::-1]

fig5, ax5 = plt.subplots(figsize=(10, 8))
bars = ax5.barh(top_features['Feature'], top_features['Importance'], color=colors_feat)
ax5.set_title('Top 10 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
ax5.set_xlabel('Importance')
# Largest importance at the top of the chart.
ax5.invert_yaxis()
for rect, importance in zip(bars, top_features['Importance']):
    rect_mid_y = rect.get_y() + rect.get_height()/2
    ax5.text(importance + 0.005, rect_mid_y, f'{importance:.3f}', va='center', fontsize=9)
fig5.tight_layout()
fig5.savefig('05_feature_importance_rf.png', dpi=150, bbox_inches='tight')
plt.close(fig5)
print("Figure 5: Random Forest feature importance saved.")

Figure 5: Random Forest feature importance saved.


In [109]:
# Figure 6: the ten largest logistic-regression coefficients by magnitude.
top_coefs = coef_df.head(10)
# Red bars for positive coefficients, green for zero or negative ones.
colors_coef = (
    top_coefs['Coefficient']
    .gt(0)
    .map({True: '#e74c3c', False: '#2ecc71'})
    .tolist()
)

fig6, ax6 = plt.subplots(figsize=(10, 8))
bars = ax6.barh(top_coefs['Feature'], top_coefs['Coefficient'], color=colors_coef)
ax6.set_title('Top 10 Logistic Regression Coefficients', fontsize=14, fontweight='bold')
ax6.set_xlabel('Coefficient')
ax6.axvline(x=0, color='black', linewidth=0.5)
# Largest magnitude at the top of the chart.
ax6.invert_yaxis()
fig6.tight_layout()
fig6.savefig('06_coefficients_logistic.png', dpi=150, bbox_inches='tight')
plt.close(fig6)
print("Figure 6: Logistic Regression coefficients saved.")

Figure 6: Logistic Regression coefficients saved.


In [111]:
# Figure 7: grouped bars comparing the two tuned models on every metric.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
x = np.arange(len(metrics))
width = 0.35

# Pull each tuned model's row out of the comparison table by model name.
scores_by_model = comparison_df.set_index('Model')[metrics]
log_scores = scores_by_model.loc['Logistic Reg. (Tuned)'].to_numpy()
rf_scores = scores_by_model.loc['Random Forest (Tuned)'].to_numpy()

fig7, ax = plt.subplots(figsize=(12, 6))
# The two bar groups sit half a bar-width either side of each tick.
bars1 = ax.bar(x - width/2, log_scores, width, label='Logistic Regression (Tuned)', color='#3498db')
bars2 = ax.bar(x + width/2, rf_scores, width, label='Random Forest (Tuned)', color='#27ae60')

ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
ax.set_ylim(0, 1)

# Print each score just above its bar.
for rect in list(bars1) + list(bars2):
    bar_top = rect.get_height()
    ax.annotate(
        f'{bar_top:.3f}',
        xy=(rect.get_x() + rect.get_width()/2, bar_top),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center',
        va='bottom',
        fontsize=9,
    )

fig7.tight_layout()
fig7.savefig('07_performance_comparison.png', dpi=150, bbox_inches='tight')
plt.close(fig7)
print("Figure 7: Performance comparison bar chart saved.")

Figure 7: Performance comparison bar chart saved.


In [113]:
# Figure 8: precision-recall curves of the tuned models, with the test-set
# default rate drawn as the baseline.
precision_log, recall_log = precision_recall_curve(y_test, y_prob_log_tuned)[:2]
precision_rf, recall_rf = precision_recall_curve(y_test, y_prob_rf_tuned)[:2]
ap_log = average_precision_score(y_test, y_prob_log_tuned)
ap_rf = average_precision_score(y_test, y_prob_rf_tuned)
prevalence = y_test.mean()

fig8, ax = plt.subplots(figsize=(10, 8))
pr_series = (
    (recall_log, precision_log, 'b-', f'Logistic Regression (AP = {ap_log:.3f})'),
    (recall_rf, precision_rf, 'r-', f'Random Forest (AP = {ap_rf:.3f})'),
)
for pr_recall, pr_precision, line_fmt, curve_label in pr_series:
    ax.plot(pr_recall, pr_precision, line_fmt, linewidth=2, label=curve_label)
ax.axhline(y=prevalence, color='gray', linestyle='--', label=f'Baseline (Prevalence = {prevalence:.3f})')
ax.set(xlabel='Recall', ylabel='Precision')
ax.set_title('Precision-Recall Curves', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3)

fig8.tight_layout()
fig8.savefig('08_precision_recall_curve.png', dpi=150, bbox_inches='tight')
plt.close(fig8)
print("Figure 8: Precision-Recall curve saved.")

Figure 8: Precision-Recall curve saved.
