# Social Security Claim Fraud Detection

This notebook builds a complete **fraud detection pipeline** for synthetic Social Security Administration (SSA)-style claims data.

**Goals:**
- Predict which claims are likely to be fraudulent (`is_fraud`)
- Handle class imbalance (fraud is rare)
- Evaluate models with fraud-focused metrics (recall, precision, PR AUC)
- Use SHAP for explainability
- Export fraud risk scores for inspection / BI dashboards


## 1. Setup & Imports

In [None]:

# If you're in Google Colab, install the extra dependencies first (ideally in a separate top cell):
# %pip install -q xgboost imbalanced-learn shap

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns
import shap
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Plot defaults for every figure in this notebook: 8x5 figure size and the
# seaborn "whitegrid" style.
plt.rcParams.update({"figure.figsize": (8, 5)})
sns.set(style="whitegrid")


## 2. Load Data

In [None]:

# Location of the claims CSV, relative to the notebook's working directory.
data_path = os.path.join("..", "data", "ssa_claim_fraud_dataset.csv")
df = pd.read_csv(data_path)

# Report the shape as text and leave the frame preview as the cell's last
# expression, so Jupyter renders it as a rich table rather than as the
# plain-text repr of a (shape, DataFrame) tuple.
print(f"Loaded {df.shape[0]:,} rows x {df.shape[1]} columns from {data_path}")
df.head()


## 3. Basic EDA & Target Distribution

In [None]:

# Relative frequency of each `is_fraud` value (fractions that sum to 1).
df["is_fraud"].value_counts(normalize=True)


In [None]:

# Bar chart of the number of claims per `is_fraud` value.
fig, ax = plt.subplots()
sns.countplot(x="is_fraud", data=df, ax=ax)
ax.set_title("Fraud vs Non-Fraud Claims")
plt.show()


## 4. Feature Preparation

In [None]:

target_col = "is_fraud"
y = df[target_col]
# Features are every column except the label and the two identifier columns.
X = df.drop(columns=[target_col, "claim_id", "person_id"])

# Numeric and boolean columns are passed through to the models unchanged.
# Every other dtype (object, pandas string, category, ...) is treated as
# categorical so it is one-hot encoded; selecting only "object" would let
# string/category-typed columns slip into the numeric passthrough group.
num_cols = X.select_dtypes(include=["number", "bool"]).columns.tolist()
cat_cols = [col for col in X.columns if col not in num_cols]

cat_cols, num_cols


In [None]:

# Column-wise preprocessing: categorical columns are one-hot encoded
# (handle_unknown="ignore" so categories not seen during fit do not raise at
# transform time); numeric columns are forwarded untouched.
onehot_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
numeric_step = ("num", "passthrough", num_cols)
preprocessor = ColumnTransformer(transformers=[onehot_step, numeric_step])


## 5. Train/Test Split & SMOTE for Imbalance

In [None]:

# Hold out 20% of the rows for testing. Stratifying on y keeps the label ratio
# in both partitions, and the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

X_train.shape, X_test.shape


In [None]:

# Fit the preprocessor on the training split only, then apply the fitted
# transform to both splits (the test split is never used for fitting).
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Oversample the training split with SMOTE; the test split is left untouched
# so evaluation reflects the original class distribution.
# NOTE(review): SMOTE runs on the one-hot-encoded matrix here, so synthetic
# rows may receive fractional values in indicator columns — consider SMOTENC
# applied before encoding; confirm against the imbalanced-learn docs.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_proc, y_train)

# Cell output: matrix shapes and label counts before and after resampling.
X_train_proc.shape, X_train_res.shape, y_train.value_counts(), y_train_res.value_counts()


## 6. Train Models

In [None]:

# All three models are fitted on the SMOTE-resampled training data
# (X_train_res / y_train_res), not on the original imbalanced split.

# Logistic Regression (baseline)
log_model = LogisticRegression(max_iter=2000)
log_model.fit(X_train_res, y_train_res)

# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    class_weight="balanced_subsample",
    random_state=42
)
rf_model.fit(X_train_res, y_train_res)

# XGBoost
# Derive the positive-class weight from the labels the model is actually
# fitted on (negatives / positives). The training labels have already been
# resampled by SMOTE, so a fixed weight above 1 would compensate for the
# imbalance a second time and inflate the predicted fraud probabilities that
# are later thresholded and exported. Computing the ratio keeps the weight
# correct even if the resampling step is changed or removed.
# Assumes labels are encoded as 0 (non-fraud) / 1 (fraud) — TODO confirm.
n_negative = int((y_train_res == 0).sum())
n_positive = int((y_train_res == 1).sum())
xgb_pos_weight = n_negative / n_positive

xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    scale_pos_weight=xgb_pos_weight
)
xgb_model.fit(X_train_res, y_train_res)


## 7. Evaluation (Fraud-Focused Metrics)

In [None]:

def eval_model(model, X_test_proc, y_test, name="Model", threshold=0.5):
    """Print fraud-focused evaluation metrics for a fitted binary classifier.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict_proba``; column 1 of its output
        is used as the positive-class score.
    X_test_proc
        Feature matrix handed to ``model.predict_proba``.
    y_test
        True labels aligned with the rows of ``X_test_proc``.
    name : str, default "Model"
        Label used in the printed metrics header.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted positive.
        Only affects the classification report and confusion matrix; the
        ROC AUC and PR AUC are computed from the raw scores.

    Returns
    -------
    None
        Everything is printed: ROC AUC, PR AUC (average precision), the
        classification report, and the confusion matrix.

    Raises
    ------
    ValueError
        If ``threshold`` is outside the interval [0, 1].
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")

    y_prob = model.predict_proba(X_test_proc)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)

    # Threshold-independent ranking metrics, computed on the raw scores.
    roc = roc_auc_score(y_test, y_prob)
    pr_auc = average_precision_score(y_test, y_prob)
    print(f"{name} - ROC AUC: {roc:.4f}, PR AUC: {pr_auc:.4f}\n")

    # Threshold-dependent metrics, computed on the hard predictions.
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

# Evaluate each fitted model on the untouched test split, in training order.
fitted_models = [
    ("Logistic Regression", log_model),
    ("Random Forest", rf_model),
    ("XGBoost", xgb_model),
]
for position, (label, fitted) in enumerate(fitted_models):
    # A blank line separates each report from the previous one (not the first).
    header = f"{label}:" if position == 0 else f"\n{label}:"
    print(header)
    eval_model(fitted, X_test_proc, y_test, label)


## 8. SHAP Explainability (XGBoost)

In [None]:

# SHAP is given a dense feature matrix plus matching feature names.
# The preprocessor output may be sparse, so only the explained rows (the first
# 1000 test rows at most) are converted to a dense array.
X_test_sample = X_test_proc[:1000]
if sp.issparse(X_test_sample):
    X_test_sample = X_test_sample.toarray()

# Rebuild the column names in the order the ColumnTransformer emits them:
# one-hot columns from the "cat" step first, then the passthrough numerics.
ohe = preprocessor.named_transformers_['cat']
# With no categorical columns there are no one-hot names to ask the encoder for.
cat_feature_names = ohe.get_feature_names_out(cat_cols) if cat_cols else []
feature_names = list(cat_feature_names) + num_cols

# Fail fast if names and matrix columns are out of step; otherwise the plots
# below would attribute SHAP values to the wrong features.
assert len(feature_names) == X_test_sample.shape[1], (
    f"{len(feature_names)} feature names for {X_test_sample.shape[1]} columns"
)

explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test_sample)

# show=False keeps the figure open so the title and layout can be set first.
shap.summary_plot(shap_values, X_test_sample, feature_names=feature_names, show=False)
plt.title("SHAP Summary - Fraud Model")
plt.tight_layout()
plt.show()


In [None]:

# Bar-style SHAP summary for the same sample and feature names as above.
# show=False defers rendering until the title and layout have been applied.
shap.summary_plot(
    shap_values,
    X_test_sample,
    feature_names=feature_names,
    plot_type="bar",
    show=False,
)
plt.gca().set_title("SHAP Feature Importance - Fraud Model")
plt.tight_layout()
plt.show()


## 9. Export Fraud Risk Scores for BI / Case Review

In [None]:

# Encode every claim (train and test rows alike) with the already-fitted preprocessor.
X_full_proc = preprocessor.transform(X)

# Column 1 of predict_proba is taken as the fraud score; scores of 0.5 or more
# are flagged as predicted fraud.
# NOTE(review): X includes the rows the model was trained on, so the scores
# for those rows are in-sample and should not be read as held-out performance.
class_scores = xgb_model.predict_proba(X_full_proc)
full_prob = class_scores[:, 1]
full_pred = (full_prob >= 0.5).astype(int)

def risk_level(p, high=0.8, medium=0.5):
    """Map a fraud probability to a coarse risk bucket.

    Parameters
    ----------
    p : float
        Predicted fraud probability.
    high : float, default 0.8
        Inclusive lower bound of the "High" bucket.
    medium : float, default 0.5
        Inclusive lower bound of the "Medium" bucket.

    Returns
    -------
    str
        "High" when ``p >= high``, "Medium" when ``medium <= p < high``,
        otherwise "Low". A NaN probability fails both comparisons and is
        therefore reported as "Low".

    Raises
    ------
    ValueError
        If ``medium`` is greater than ``high``, which would leave the
        "Medium" bucket empty.
    """
    if medium > high:
        raise ValueError(
            f"medium threshold ({medium!r}) must not exceed high threshold ({high!r})"
        )
    if p >= high:
        return "High"
    if p >= medium:
        return "Medium"
    return "Low"

# Bucket every probability into a risk label, one per claim.
risk_levels = list(map(risk_level, full_prob))

# Attach the model outputs to the claim/person identifiers from the source frame.
score_columns = {
    "fraud_flag_pred": full_pred,
    "fraud_probability": full_prob,
    "fraud_risk_level": risk_levels,
}
pred_df = df[["claim_id", "person_id"]].assign(**score_columns)

pred_df.head()


In [None]:

# Write the scored claims into the data folder; the row index is left out of the file.
output_dir = os.path.join("..", "data")
pred_csv_path = os.path.join(output_dir, "fraud_predictions.csv")
pred_df.to_csv(pred_csv_path, index=False)
print(f"Saved fraud predictions to {pred_csv_path}")
