Setup & Imports

In [0]:
%pip install xgboost

import pandas as pd
import numpy as np

from pyspark.sql.functions import *
from pyspark.sql.types import *

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import mlflow
import mlflow.xgboost

print("✅ Setup complete")

Load Gold Features

In [0]:
# Read the three gold tables into Spark DataFrames, in the same order as the
# names on the left-hand side.
risk_df, risk_features, leakage_flags = (
    spark.table(gold_table)
    for gold_table in (
        "claims_leakage.gold.gold_claim_risk_summary",
        "claims_leakage.gold.gold_claim_risk_features",
        "claims_leakage.gold.gold_claim_leakage_flags",
    )
)


In [0]:
# Left join on claim_id: every row of risk_features is kept, and rows with no
# match in leakage_flags get nulls for the columns coming from that table.
features_aliased = risk_features.alias("r")
flags_aliased = leakage_flags.alias("l")
training_base = features_aliased.join(flags_aliased, on="claim_id", how="left")


Create Target Label (POC Fraud Label)

In [0]:
# Rule-based POC label: a claim is labelled 1 when at least one of the three
# flags below equals 1, otherwise 0 (a null flag does not satisfy `== 1`).
paid_gt_approved = col("paid_gt_approved_flag") == 1
paid_gt_coverage = col("paid_gt_coverage_flag") == 1
after_policy_expiry = col("claim_after_policy_expiry_flag") == 1

any_flag_raised = paid_gt_approved | paid_gt_coverage | after_policy_expiry

training_base = training_base.withColumn(
    "fraud_label", when(any_flag_raised, 1).otherwise(0)
)


ML feature columns

In [0]:
# Columns fed to the model (and selected again when scoring).
#
# "paid_gt_approved_flag" and "paid_gt_coverage_flag" are deliberately NOT
# listed here: fraud_label is computed directly from them (label = 1 whenever
# either flag is 1), so using them as inputs leaks the target. The model would
# simply reproduce the labelling rule and the held-out AUC would be meaningless.
#
# Side effect: the scoring step selects `feature_columns`, so these two flags
# no longer appear in the scored output either.
#
# NOTE(review): "leakage_amount" may be derived from the same paid-vs-approved
# comparison -- confirm against the gold table definition and drop it too if so.
feature_columns = [
    "days_to_report",
    "late_reporting_flag",
    "high_fnol_amount_flag",
    "risky_loss_type_flag",
    "risky_geo_flag",
    "digital_channel_flag",
    "leakage_amount",
]


Convert to Pandas

In [0]:
# Collect only the model inputs plus the label to the driver as a pandas frame.
training_columns = [*feature_columns, "fraud_label"]
pdf = training_base.select(training_columns).toPandas()

# Feature matrix with missing values replaced by 0, and the label vector.
X = pdf.loc[:, feature_columns].fillna(0)
y = pdf.loc[:, "fraud_label"]


Train/Test Split

In [0]:
# 80/20 split with a fixed seed; stratifying on y keeps the label ratio the
# same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


Train XGBoost Model

In [0]:
# Fixed hyperparameters for the classifier trained below.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    max_depth=4,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)

with mlflow.start_run() as run:
    # Fit on the training split.
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class;
    # it is scored against the held-out labels with ROC AUC.
    test_probabilities = model.predict_proba(X_test)
    y_pred_proba = test_probabilities[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    # Record the metric and the fitted model under artifact path "model".
    mlflow.log_metric("auc", auc)
    mlflow.xgboost.log_model(model, "model")

    # Kept as a notebook-level name so later cells can build a runs:/ URI.
    run_id = run.info.run_id

    print(f"✅ AUC: {auc:.4f}")
    print(f"Run ID: {run_id}")


Register Model

In [0]:


# Build the MLflow "runs:/" URI for the "model" artifact logged under run_id.
# NOTE: nothing is registered in a model registry here -- the URI is only
# constructed and printed.
model_uri = f"runs:/{run_id}/model"

print(f"Model URI: {model_uri}")


Create Scoring Function

In [0]:
# Load the XGBoost model back from MLflow using the runs:/ URI in model_uri.
loaded_model = mlflow.xgboost.load_model(model_uri)

def score_batch(pdf):
    """Return the positive-class probability for every row of ``pdf``.

    Only the columns named in the notebook-level ``feature_columns`` list are
    passed to the model, with missing values replaced by 0. Predictions come
    from the notebook-level ``loaded_model``.

    Args:
        pdf: pandas DataFrame containing at least the ``feature_columns``.

    Returns:
        Column 1 of ``loaded_model.predict_proba(...)``, one value per row.
    """
    model_inputs = pdf[feature_columns].fillna(0)
    class_probabilities = loaded_model.predict_proba(model_inputs)
    return class_probabilities[:, 1]


Score All Claims

In [0]:
# Score every row of training_base (this includes the rows the model was
# trained on), keeping claim_id alongside the model inputs.
full_pdf = training_base.select(
    "claim_id", *feature_columns
).toPandas()

full_pdf["fraud_probability"] = score_batch(full_pdf)

# Bucket the probability into LOW (<= 0.3), MEDIUM (<= 0.7) and HIGH (<= 1).
# pd.cut bins are right-closed and, by default, open at the lowest edge, so a
# probability of exactly 0.0 would fall outside every bin and become NaN.
# include_lowest=True closes the first bin at 0 so such rows are labelled LOW.
full_pdf["fraud_ml_risk_level"] = pd.cut(
    full_pdf["fraud_probability"],
    bins=[0, 0.3, 0.7, 1],
    labels=["LOW", "MEDIUM", "HIGH"],
    include_lowest=True,
)

# Hand the scored pandas frame back to Spark for writing.
scored_df = spark.createDataFrame(full_pdf)


Write Final Intelligence Table

In [0]:
# Replace the contents of the gold intelligence table with the scored claims,
# stored in Delta format.
intelligence_writer = scored_df.write.format("delta").mode("overwrite")
intelligence_writer.saveAsTable("claims_leakage.gold.gold_claim_intelligence")


In [0]:
%sql
-- Show the scored claims whose ML risk level is LOW.
SELECT *
FROM claims_leakage.gold.gold_claim_intelligence
WHERE fraud_ml_risk_level = 'LOW'