In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims_risk_leakage/config"


In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims_risk_leakage/notebooks/silver/utilities"

In [0]:
import time
import traceback
from pyspark.sql.functions import*
from pyspark.sql.types import*
from pyspark.sql.window import*



In [0]:
import logging
import sys

# ----------------------------------------------------------
# Logger Setup
# ----------------------------------------------------------

# Logger name and level come from the shared CONFIG mapping
# (presumably defined by the config notebook loaded via %run — confirm).
logger_name = CONFIG["logging"]["logger_name"]
log_level = CONFIG["logging"]["log_level"]

logger = logging.getLogger(logger_name)

# Re-running this cell returns the same named logger, so attach the stdout
# handler only once; otherwise every message would be printed repeatedly.
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)s | %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Accept level names in any case ("info", " INFO ") as well as numeric levels.
# Logger.setLevel validates the value itself and raises ValueError for an
# unknown name, instead of the opaque TypeError/AttributeError that a
# getattr(logging, ...) lookup produces for e.g. "info" or 20.
logger.setLevel(
    log_level.strip().upper() if isinstance(log_level, str) else log_level
)


In [0]:
# Catalog that holds both the bronze and silver schemas used below.
catalog = CONFIG["catalog"]

# Schema name per layer, looked up in bronze -> silver order.
bronze_schema, silver_schema = (
    CONFIG["schemas"][layer] for layer in ("bronze", "silver")
)

# FNOL table names: bronze source, silver clean target, silver quarantine target.
bronze_fnol_table, silver_fnol_clean_table, silver_fnol_quarantine_table = (
    CONFIG["tables"][group]["fnol"]
    for group in ("bronze", "silver_clean", "silver_quarantine")
)

In [0]:
# ----------------------------------------------------------
# Silver FNOL pipeline
#
# Reads the Bronze FNOL table, derives typed / normalized columns, tags each
# row with a data-quality status, then writes:
#   * passing rows (one per fnol_id, latest snapshot) -> silver clean table
#   * failing rows (all of them, not deduplicated)     -> silver quarantine table
# Both targets are written with mode="overwrite".
#
# parse_date, parse_amount, normalize and write_batch are not defined in this
# notebook; presumably they come from the utilities notebook loaded via %run.
# ----------------------------------------------------------

pipeline_start = time.time()
logger.info("[START] Silver FNOL processing")

try:

    # ------------------------------------------------------
    # Read Bronze
    # ------------------------------------------------------

    logger.info("Reading Bronze FNOL table")

    fnol_bronze = spark.table(
        f"{catalog}.{bronze_schema}.{bronze_fnol_table}"
    )

    # ------------------------------------------------------
    # Typing and Normalization
    # ------------------------------------------------------

    logger.info("Applying typing and normalization")

    fnol_silver_typed = (
        fnol_bronze

        # Canonical policy id: prefer policy_id, fall back to the legacy number
        .withColumn(
            "policy_id_canonical",
            coalesce(col("policy_id"), col("policy_number_legacy"))
        )

        # Parsed dates
        .withColumn("loss_date_ts", parse_date("loss_date"))
        .withColumn("reported_date_ts", parse_date("reported_date"))

        # Parsed amount
        .withColumn("claim_amount_num", parse_amount("claim_amount"))

        # Normalized enums
        .withColumn("loss_type_std", normalize("loss_type"))
        .withColumn("reporting_channel_std", normalize("reporting_channel"))
        .withColumn("incident_state_std", normalize("incident_state"))

        # Light cleanup (overwrites the columns in place with trimmed values)
        .withColumn("incident_city", trim(col("incident_city")))
        .withColumn("incident_zip", trim(col("incident_zip")))
        .withColumn("agent_id", trim(col("agent_id")))
    )

    # ------------------------------------------------------
    # Data Quality Validation
    # ------------------------------------------------------

    logger.info("Applying DQ validation")

    # Rules are evaluated in order and only the FIRST failing rule is recorded.
    # The date/amount rules assume the parse helpers yield NULL for values they
    # cannot parse — TODO confirm against the utilities notebook.
    fnol_validated = (
        fnol_silver_typed
        .withColumn(
            "dq_status",
            when(col("fnol_id").isNull(), "FAIL_FNOL_ID")
             .when(col("policy_id_canonical").isNull(), "FAIL_POLICY_ID")
             .when(col("loss_date_ts").isNull(), "FAIL_LOSS_DATE")
             .when(col("claim_amount_num").isNull(), "FAIL_CLAIM_AMOUNT")
             .otherwise("PASS")
        )
        # dq_reason mirrors the failure code and is NULL for passing rows
        # (a `when` without `otherwise` yields NULL when the condition is false).
        .withColumn(
            "dq_reason",
            when(col("dq_status") != "PASS", col("dq_status"))
        )
    )

    # dq_status is never NULL (the chain above ends in otherwise), so these two
    # filters partition the validated rows without dropping any.
    fnol_clean = fnol_validated.filter(col("dq_status") == "PASS")
    fnol_quarantine = fnol_validated.filter(col("dq_status") != "PASS")

    # ------------------------------------------------------
    # Deduplication
    # ------------------------------------------------------

    logger.info("Applying deduplication (latest snapshot)")

    # Keep the newest snapshot per fnol_id. start_ts alone can tie, in which
    # case row_number() would pick an arbitrary row and the overwrite output
    # could differ between runs; ingest_date and source_file break ties so the
    # selection is repeatable.
    # NOTE(review): dedup runs AFTER the DQ filter, so if the newest snapshot
    # of a claim is quarantined, an older passing snapshot is kept here —
    # confirm this is intended.
    window_fnol = (
        Window
        .partitionBy("fnol_id")
        .orderBy(
            col("start_ts").desc(),
            col("ingest_date").desc(),
            col("source_file").desc(),
        )
    )

    fnol_clean_deduped = (
        fnol_clean
        .withColumn("rn", row_number().over(window_fnol))
        .filter(col("rn") == 1)
        .drop("rn")
    )

    # ------------------------------------------------------
    # Final Projection
    # ------------------------------------------------------

    fnol_silver_final = fnol_clean_deduped.select(
        # Identifiers
        "fnol_id",
        "policy_id_canonical",

        # Event timing
        "loss_date_ts",
        "reported_date_ts",

        # Financial
        "claim_amount_num",

        # Classification
        "loss_type_std",
        "reporting_channel_std",

        # Geography
        "incident_state_std",
        "incident_city",
        "incident_zip",

        # Ops
        "agent_id",

        # SCD columns
        "start_ts",
        "end_ts",
        "is_current",

        # Lineage
        "source_file",
        "ingest_date",

        # DQ
        "dq_status",
        "dq_reason"
    )

    # ------------------------------------------------------
    # Write Clean Table
    # ------------------------------------------------------

    write_batch(
        fnol_silver_final,
        f"{catalog}.{silver_schema}.{silver_fnol_clean_table}",
        mode="overwrite"
    )

    # ------------------------------------------------------
    # Write Quarantine Table
    # ------------------------------------------------------

    # Quarantine keeps every failing row with all bronze + derived columns
    # (no projection, no deduplication).
    write_batch(
        fnol_quarantine,
        f"{catalog}.{silver_schema}.{silver_fnol_quarantine_table}",
        mode="overwrite"
    )

except Exception:
    # logger.exception logs at ERROR level and appends the active traceback,
    # keeping the message and stack trace together in a single record.
    logger.exception("[ERROR] Silver FNOL processing failed")
    raise

# ----------------------------------------------------------
# End Pipeline
# ----------------------------------------------------------

# Only reached on success: the except branch above re-raises.
pipeline_end = time.time()

logger.info(
    f"[END] Silver FNOL processing | duration={(pipeline_end - pipeline_start):.2f} sec"
)