In [1]:
# 1) Imports
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# 2) Load a public "fraud-like" dataset (credit card default as proxy for risk)
# Dataset: "credit-g" (German Credit). We'll treat "bad" as higher risk.
data = fetch_openml("credit-g", version=1, as_frame=True)
df = data.frame.copy()

# Target engineering: good/bad credit -> 0/1 risk label
y = (df["class"] == "bad").astype(int)
X = df.drop(columns=["class"])

# 3) One-hot encode categoricals
X = pd.get_dummies(X, drop_first=True)

# 4) Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# 5) Model: audit-friendly baseline
pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),  # sparse-friendly
    ("lr", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

pipe.fit(X_train, y_train)

# 6) Evaluate
proba = pipe.predict_proba(X_test)[:, 1]
pred = (proba >= 0.50).astype(int)

print("ROC-AUC:", round(roc_auc_score(y_test, proba), 4))
print("\nClassification report:\n", classification_report(y_test, pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, pred))

# 7) Create a risk score output (0–100)
risk_score = (proba * 100).round(1)
out = X_test.copy()
out["risk_score_0_100"] = risk_score
out["actual_label"] = y_test.values
out.sort_values("risk_score_0_100", ascending=False).head(10)


ROC-AUC: 0.8046

Classification report:
               precision    recall  f1-score   support

           0       0.89      0.73      0.80       175
           1       0.56      0.80      0.66        75

    accuracy                           0.75       250
   macro avg       0.72      0.76      0.73       250
weighted avg       0.79      0.75      0.76       250


Confusion matrix:
 [[127  48]
 [ 15  60]]


Unnamed: 0,duration,credit_amount,installment_commitment,residence_since,age,existing_credits,num_dependents,checking_status_<0,checking_status_>=200,checking_status_no checking,...,other_payment_plans_stores,housing_own,housing_rent,job_unemp/unskilled non res,job_unskilled resident,job_skilled,own_telephone_yes,foreign_worker_yes,risk_score_0_100,actual_label
938,60,6288,4,4,42,1,1,False,False,False,...,False,False,False,False,False,True,False,True,96.8,1
832,45,11816,2,4,29,2,1,True,False,False,...,False,False,True,False,False,True,False,True,96.7,1
602,24,1837,4,4,34,1,1,False,False,False,...,False,False,False,False,True,False,False,True,96.4,1
788,48,6224,4,4,50,1,1,False,False,False,...,False,False,False,False,False,True,False,True,96.1,1
522,48,7119,3,4,53,2,2,True,False,False,...,False,False,False,False,False,True,False,True,95.7,1
332,60,7408,4,2,24,1,1,False,False,False,...,False,True,False,False,False,False,False,True,94.8,1
395,39,11760,2,3,32,1,1,False,False,False,...,False,False,True,False,False,True,True,True,94.7,0
818,36,15857,2,3,43,1,1,True,False,False,...,False,True,False,False,False,False,False,True,94.6,0
917,6,14896,1,4,68,1,1,True,False,False,...,False,True,False,False,False,False,True,True,94.2,1
11,48,4308,3,4,24,1,1,True,False,False,...,False,False,True,False,False,True,False,True,93.3,1


In [2]:
# 8) Explainability: Top drivers of risk (logistic regression coefficients)

feature_names = X_train.columns
coefs = pipe.named_steps["lr"].coef_.ravel()

imp = pd.DataFrame({
    "feature": feature_names,
    "coef": coefs,
    "abs_coef": np.abs(coefs)
}).sort_values("abs_coef", ascending=False)

print("Top positive drivers (increase risk):")
display(imp.sort_values("coef", ascending=False).head(10)[["feature","coef"]])

print("Top negative drivers (decrease risk):")
display(imp.sort_values("coef", ascending=True).head(10)[["feature","coef"]])


Top positive drivers (increase risk):


Unnamed: 0,feature,coef
1,credit_amount,0.393649
2,installment_commitment,0.37549
42,housing_rent,0.37315
36,property_magnitude_no known property,0.336088
17,purpose_education,0.286652
0,duration,0.283259
47,foreign_worker_yes,0.251227
41,housing_own,0.243207
24,savings_status_<100,0.224321
14,purpose_new car,0.212063


Top negative drivers (decrease risk):


Unnamed: 0,feature,coef
9,checking_status_no checking,-0.650748
10,credit_history_critical/other existing credit,-0.531019
39,other_payment_plans_none,-0.360751
33,personal_status_male single,-0.304317
15,purpose_used car,-0.287363
26,savings_status_no known savings,-0.255323
25,savings_status_>=1000,-0.234388
27,employment_4<=X<7,-0.222873
34,other_parties_guarantor,-0.175781
12,credit_history_existing paid,-0.16892


In [3]:
# 9) Case Narrative: explain the highest-risk case in plain language

# Identify the highest-risk case
top_idx = out["risk_score_0_100"].idxmax()
case = out.loc[top_idx]

# Retrieve coefficients
feature_names = X_train.columns
coefs = pipe.named_steps["lr"].coef_.ravel()
coef_series = pd.Series(coefs, index=feature_names)

# Directional contributions (approximate, for narrative)
contrib = coef_series * case[feature_names]

top_pos = contrib.sort_values(ascending=False).head(5)
top_neg = contrib.sort_values(ascending=True).head(5)

print("=== Highest-risk case summary ===")
print("Record index:", top_idx)
print("Risk score (0–100):", float(case["risk_score_0_100"]))
print("Actual label (0=lower risk, 1=higher risk):", int(case["actual_label"]))

print("\nTop factors INCREASING risk:")
display(top_pos.reset_index().rename(columns={"index":"feature", 0:"contribution"}))

print("\nTop factors DECREASING risk:")
display(top_neg.reset_index().rename(columns={"index":"feature", 0:"contribution"}))

# Plain-language narrative (interview-ready)
narrative_text = f"""
This case is flagged as higher risk with a score of {float(case['risk_score_0_100'])}/100.
Risk is primarily driven by: {', '.join(top_pos.index.tolist())}.
Risk is partially mitigated by: {', '.join(top_neg.index.tolist())}.
This output is intended for analyst review and decision support, not automated rejection.
"""
print(narrative_text)


=== Highest-risk case summary ===
Record index: 938
Risk score (0–100): 96.8
Actual label (0=lower risk, 1=higher risk): 1

Top factors INCREASING risk:


Unnamed: 0,feature,contribution
0,credit_amount,2475.268023
1,duration,16.995558
2,installment_commitment,1.501958
3,property_magnitude_no known property,0.336088
4,purpose_education,0.286652



Top factors DECREASING risk:


Unnamed: 0,feature,contribution
0,age,-5.255842
1,other_payment_plans_none,-0.360751
2,personal_status_male single,-0.304317
3,credit_history_existing paid,-0.16892
4,other_parties_none,-0.075474



This case is flagged as higher risk with a score of 96.8/100.
Risk is primarily driven by: credit_amount, duration, installment_commitment, property_magnitude_no known property, purpose_education.
Risk is partially mitigated by: age, other_payment_plans_none, personal_status_male single, credit_history_existing paid, other_parties_none.
This output is intended for analyst review and decision support, not automated rejection.

