In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims_risk_leakage/config"




In [0]:
import time
import logging
import sys
import traceback
import builtins
import pyspark.sql.functions as F
from pyspark.sql.window import Window


In [0]:
# Logger setup.
# Logger name and verbosity are read from the CONFIG mapping (not defined in
# this notebook; presumably provided by the `%run .../config` cell — confirm).
logger_name = CONFIG["logging"]["logger_name"]
log_level = CONFIG["logging"]["log_level"]

logger = logging.getLogger(logger_name)

# Attach the stdout handler only when the logger has none yet, so re-running
# this cell does not stack handlers and duplicate every log line.
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)s | %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Resolve the configured level to its numeric value.
# A plain getattr(logging, log_level) is case-sensitive: "info" would resolve
# to the function logging.info and make setLevel fail with a confusing
# TypeError. Normalise the name first, accept numeric levels as-is, and fail
# with an explicit message for anything that is not a real logging level.
if isinstance(log_level, int):
    resolved_level = log_level
else:
    resolved_level = getattr(logging, str(log_level).strip().upper(), None)

if not isinstance(resolved_level, int):
    raise ValueError(f"Unsupported log level in CONFIG: {log_level!r}")

logger.setLevel(resolved_level)

In [0]:
# Set the session's current catalog, then its current schema (the gold
# schema), using the names held in CONFIG.
target_catalog = CONFIG['catalog']
spark.sql(f"USE CATALOG {target_catalog}")

target_schema = CONFIG['schemas']['gold']
spark.sql(f"USE SCHEMA {target_schema}")

In [0]:
# Announce the start of the run, then capture a wall-clock timestamp
# (time.time(), seconds since the epoch). The completion step inside the
# try-block below subtracts this value to report the run duration.
logger.info("Starting GOLD Layer Processing")

start_time = time.time()

try:

    # ----------------------------------------------------------
    # Silver inputs
    # ----------------------------------------------------------
    # Each table is addressed as "<catalog>.silver.<name>", where the
    # catalog and the table name both come from CONFIG.

    logger.info("Loading silver tables")

    fnol_table_name = (
        f"{CONFIG['catalog']}.silver.{CONFIG['tables']['silver_clean']['fnol']}"
    )
    fnol = spark.table(fnol_table_name)

    policy_table_name = (
        f"{CONFIG['catalog']}.silver.{CONFIG['tables']['silver_clean']['policy']}"
    )
    policy = spark.table(policy_table_name)

    claims_table_name = (
        f"{CONFIG['catalog']}.silver.{CONFIG['tables']['silver_clean']['claims']}"
    )
    claims = spark.table(claims_table_name)

    # ----------------------------------------------------------
    # Risk features: FNOL records enriched with policy data
    # ----------------------------------------------------------

    logger.info("Building risk feature base")

    # Left join keeps every FNOL row; policy columns are NULL when no policy
    # row matches the FNOL's canonical policy id.
    fnol_matches_policy = F.col("f.policy_id_canonical") == F.col("p.policy_id")
    risk_base = fnol.alias("f").join(policy.alias("p"), fnol_matches_policy, "left")

    risk_rules = CONFIG["risk"]["rules"]

    # Days between the loss date and the date it was reported.
    reporting_lag_days = F.datediff(F.col("reported_date_ts"), F.col("loss_date_ts"))

    # Conditions behind each 0/1 flag. Thresholds and value lists come from
    # CONFIG["risk"]["rules"]. A condition that evaluates to NULL (missing
    # input) is not "true", so the flag falls through to otherwise(0).
    reported_late = F.col("days_to_report") > risk_rules["late_reporting_days"]
    amount_is_high = (
        F.col("claim_amount_num")
        > risk_rules["high_amount_ratio"] * F.col("coverage_limit_num")
    )
    loss_type_is_risky = F.col("loss_type_std").isin(*risk_rules["risky_loss_types"])
    state_is_risky = F.col("incident_state_std").isin(*risk_rules["risky_states"])
    channel_is_digital = F.col("reporting_channel_std").isin(
        *risk_rules["digital_channels"]
    )

    # Columns persisted to the gold risk-features table, in output order.
    risk_feature_columns = [
        "fnol_id",
        "policy_id_canonical",
        "days_to_report",
        "late_reporting_flag",
        "high_fnol_amount_flag",
        "risky_loss_type_flag",
        "risky_geo_flag",
        "digital_channel_flag",
    ]

    gold_claim_risk_features = (
        risk_base
        # days_to_report must be added first: the late-reporting condition
        # refers to it by name.
        .withColumn("days_to_report", reporting_lag_days)
        .withColumn("late_reporting_flag", F.when(reported_late, 1).otherwise(0))
        .withColumn("high_fnol_amount_flag", F.when(amount_is_high, 1).otherwise(0))
        .withColumn("risky_loss_type_flag", F.when(loss_type_is_risky, 1).otherwise(0))
        .withColumn("risky_geo_flag", F.when(state_is_risky, 1).otherwise(0))
        .withColumn("digital_channel_flag", F.when(channel_is_digital, 1).otherwise(0))
        .select(*risk_feature_columns)
    )

    logger.info("Writing gold risk features")

    # Full overwrite of the Delta table, replacing its schema as well.
    risk_features_writer = (
        gold_claim_risk_features.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    risk_features_writer.saveAsTable(CONFIG["tables"]["gold"]["risk_features"])

    # ==========================================================
    # ================= LEAKAGE FLAGS ==========================
    # ==========================================================
    # Claim-level payment leakage indicators. Claims drive both left joins,
    # so every claim row is kept even when its policy or FNOL is missing.

    logger.info("Building leakage flags")

    leakage_base = (
        claims.alias("c")
        .join(policy.alias("p"),
              F.col("c.policy_id") == F.col("p.policy_id"),
              "left")
        .join(fnol.alias("f"),
              F.col("c.fnol_id") == F.col("f.fnol_id"),
              "left")
    )

    gold_claim_leakage_flags = (
        leakage_base
        # 1 when more was paid than was approved. A NULL comparison (either
        # amount missing) is not true and therefore yields 0.
        .withColumn(
            "paid_gt_approved_flag",
            F.when(
                F.col("paid_amount_num") >
                F.col("approved_amount_num"),
                1
            ).otherwise(0)
        )
        # 1 when more was paid than the coverage limit; 0 when either side
        # is NULL (e.g. no matching policy row).
        .withColumn(
            "paid_gt_coverage_flag",
            F.when(
                F.col("paid_amount_num") >
                F.col("coverage_limit_num"),
                1
            ).otherwise(0)
        )
        # 1 when the FNOL loss date lies outside [policy start, policy end].
        # NULL dates (missing FNOL or policy) yield 0.
        # NOTE(review): this also flags losses dated BEFORE the policy start,
        # although the column name only mentions expiry — confirm intent.
        .withColumn(
            "claim_after_policy_expiry_flag",
            F.when(
                ~F.col("f.loss_date_ts").between(
                    F.col("p.policy_start_date_ts"),
                    F.col("p.policy_end_date_ts")
                ),
                1
            ).otherwise(0)
        )
        # Overpayment relative to the approved amount, floored at 0.
        # greatest() skips NULL arguments, so a NULL difference becomes 0.
        .withColumn(
            "leakage_amount",
            F.greatest(
                F.col("paid_amount_num") -
                F.col("approved_amount_num"),
                F.lit(0)
            )
        )
        .select(
            F.col("c.claim_id").alias("claim_id"),
            F.col("c.fnol_id").alias("fnol_id"),
            # Take policy_id from the claim (left) side. The policy (right)
            # side is NULL for claims with no matching policy row, which would
            # drop the claim's own policy reference from the gold table. For
            # matched rows both sides are equal by the join condition.
            F.col("c.policy_id").alias("policy_id"),
            "paid_gt_approved_flag",
            "paid_gt_coverage_flag",
            "claim_after_policy_expiry_flag",
            "leakage_amount"
        )
    )

    logger.info("Writing gold leakage flags")

    # Full overwrite of the Delta table, replacing its schema as well.
    gold_claim_leakage_flags.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(CONFIG["tables"]["gold"]["leakage_flags"])

    # ----------------------------------------------------------
    # Risk summary: weighted score, level, leakage and reasons
    # ----------------------------------------------------------

    logger.info("Building final risk summary")

    # Read back the two gold tables by their configured names.
    risk = spark.table(CONFIG["tables"]["gold"]["risk_features"])
    leakage = spark.table(CONFIG["tables"]["gold"]["leakage_flags"])

    # Risk features drive the join: rows without a leakage record keep NULL
    # leakage columns, which are defaulted below.
    summary_base = risk.alias("r").join(leakage.alias("l"), "fnol_id", "left")

    weights = CONFIG["risk"]["weights"]
    levels = CONFIG["risk"]["levels"]

    # One entry per scored flag:
    # (flag column, key into CONFIG["risk"]["weights"], reason text).
    scored_flags = [
        ("late_reporting_flag", "late_reporting", "Late reporting"),
        ("high_fnol_amount_flag", "high_amount", "High FNOL amount"),
        ("risky_loss_type_flag", "risky_loss_type", "Risky loss type"),
        ("risky_geo_flag", "risky_geo", "Risky geography"),
        ("paid_gt_approved_flag", "paid_gt_approved", "Paid > approved"),
        ("paid_gt_coverage_flag", "paid_gt_coverage", "Paid > coverage"),
    ]

    # risk_score = sum(flag * weight), with NULL flags counted as 0.
    risk_score_expr = None
    for flag_column, weight_key, _reason_text in scored_flags:
        weighted_flag = F.coalesce(F.col(flag_column), F.lit(0)) * weights[weight_key]
        if risk_score_expr is None:
            risk_score_expr = weighted_flag
        else:
            risk_score_expr = risk_score_expr + weighted_flag

    # HIGH is tested before MEDIUM; anything below both thresholds is LOW.
    risk_level_expr = (
        F.when(F.col("risk_score") >= levels["high_threshold"], "HIGH")
        .when(F.col("risk_score") >= levels["medium_threshold"], "MEDIUM")
        .otherwise("LOW")
    )

    # A when() without otherwise() is NULL when the flag is not 1, and
    # concat_ws ignores NULLs, so only the triggered reasons are joined.
    reason_exprs = [
        F.when(F.col(flag_column) == 1, reason_text)
        for flag_column, _weight_key, reason_text in scored_flags
    ]

    summary_columns = [
        "claim_id",
        "fnol_id",
        "policy_id_canonical",
        "risk_score",
        "risk_level",
        "leakage_flag",
        "leakage_amount",
        "risk_reasons",
    ]

    gold_claim_risk_summary = (
        summary_base
        .withColumn("risk_score", risk_score_expr)
        .withColumn("risk_level", risk_level_expr)
        # Default missing leakage to 0 before deriving the YES/NO flag.
        .withColumn("leakage_amount", F.coalesce(F.col("leakage_amount"), F.lit(0)))
        .withColumn(
            "leakage_flag",
            F.when(F.col("leakage_amount") > 0, "YES").otherwise("NO"),
        )
        .withColumn("risk_reasons", F.concat_ws(", ", *reason_exprs))
        .select(*summary_columns)
    )

    logger.info("Writing gold risk summary")

    # Full overwrite of the Delta table, replacing its schema as well.
    risk_summary_writer = (
        gold_claim_risk_summary.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    risk_summary_writer.saveAsTable(CONFIG["tables"]["gold"]["risk_summary"])

    # Wall-clock duration of the run in seconds, rounded to two decimals.
    # builtins.round is spelled out explicitly — presumably to avoid a
    # shadowed `round` (e.g. from a pyspark functions star-import); confirm.
    end_time = time.time()
    duration = builtins.round(end_time - start_time, 2)

    completion_message = (
        f"GOLD layer completed successfully | duration = {duration} sec"
    )
    logger.info(completion_message)

except Exception as e:

    logger.error("Gold Layer Failed")
    logger.error(traceback.format_exc())
    raise e

In [0]:
%sql
-- Risk-summary rows for which leakage was detected.
-- The string literal uses standard single quotes: a double-quoted "YES" is
-- parsed as an identifier when double-quoted identifiers are enabled.
SELECT *
FROM claims_leakage.gold.gold_claim_risk_summary
WHERE leakage_flag = 'YES'