In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score,
    confusion_matrix, roc_curve, precision_recall_curve,
    classification_report
)

# Plot defaults shared by every figure in this notebook.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (8, 5)})

# Provider-level feature table (same file used in Notebook 02).
provider_features_path = "data/Provider_Level_Features.csv"
df = pd.read_csv(provider_features_path)

# Preview the first rows as the cell output.
df.head()


In [None]:
# Target selection: prefer the pre-encoded binary label when it is present;
# otherwise encode the "No"/"Yes" strings of PotentialFraud as 0/1.
if "PotentialFraud_Binary" in df.columns:
    y = df["PotentialFraud_Binary"]
else:
    y = df["PotentialFraud"].map({"No": 0, "Yes": 1})

# Fail fast on missing or unmapped labels. Series.map leaves NaN for any value
# that is not exactly "No" or "Yes", and a NaN target would otherwise only
# surface later (during splitting/fitting) as a much less obvious error.
n_missing_labels = int(y.isna().sum())
if n_missing_labels:
    raise ValueError(
        "Target contains {} missing or unmapped label(s); expected a fully "
        "populated 0/1 (or 'No'/'Yes') fraud label.".format(n_missing_labels)
    )

# Remove leakage columns: every encoding of the label, plus the Provider column
# (presumably an identifier — confirm). Columns absent from df are skipped.
leak_cols = ["Provider", "PotentialFraud", "FraudLabel", "PotentialFraud_Binary"]
X = df.drop(columns=leak_cols, errors="ignore")


In [None]:
# Stage 1: hold out 20% of all rows as the test set (stratified on the label).
stage_one_parts = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)
X_train_temp, X_test, y_train_temp, y_test = stage_one_parts

# Stage 2: split the remaining 80% into 75% / 25%, which gives
# 60% train + 20% validation of the full dataset.
stage_two_parts = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_train_temp,
)
X_train, X_val, y_train, y_val = stage_two_parts

# Report how many rows ended up in each partition.
for split_label, split_frame in (
    ("Train size:", X_train),
    ("Validation size:", X_val),
    ("Test size:", X_test),
):
    print(split_label, len(split_frame))


### Why Logistic Regression Is Our Final Model

In Notebook 02, we compared multiple algorithms:

- Logistic Regression
- Random Forest
- Gradient Boosting
- Decision Tree
- SVM (optional)

Among all models, **Logistic Regression achieved the best performance on the 20% validation set**, especially in:

- Precision
- Recall
- F1-score
- PR-AUC (most important for imbalanced datasets)

Because it had the best validation performance (our evidence that it generalizes best) and remains highly interpretable,
**Logistic Regression was selected as our final model for evaluation on the test set**.


In [None]:
# Class-weighted logistic regression: class_weight="balanced" reweights the
# fraud vs non-fraud classes to compensate for class imbalance.
# random_state is pinned to the same seed as the data splits so that a
# Restart & Run All reproduces the same fit; scikit-learn documents
# random_state as being used by the liblinear solver.
model = LogisticRegression(
    max_iter=500,
    class_weight="balanced",
    solver="liblinear",
    random_state=42,
)

model.fit(X_train, y_train)
print("Logistic Regression model trained.")


In [None]:
# Hard class labels and the score of the second class (column 1 of
# predict_proba) for every row of the held-out test set.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Label-based metrics are computed from the hard predictions ...
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
# ... while the ranking metrics are computed from the predicted scores.
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print("Classification Report:")
print(classification_report(y_test, y_pred, digits=4))

# Headline metrics, one per line.
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1:", f1),
    ("ROC-AUC:", roc_auc),
    ("PR-AUC:", pr_auc),
):
    print(metric_label, metric_value)


In [None]:
# Confusion matrix on the test set: rows are actual classes, columns are
# predicted classes (matching the axis labels set below).
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap using the explicit Axes interface.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Confusion Matrix - Logistic Regression")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Also show the raw counts as the cell output.
cm


### Cost-Based Interpretation

- **False Positives (FP)** = legitimate providers incorrectly flagged
  → leads to unnecessary investigations and administrative cost
- **False Negatives (FN)** = fraudulent providers missed
  → MOST EXPENSIVE: continuing fraud, financial loss, patient risk

Because false negatives are more harmful than false positives, **Recall** and **PR-AUC** matter more than Accuracy.

Logistic Regression gives a strong balance between detecting fraud (Recall) and minimizing false alarms (Precision).
