In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims_risk_leakage/config"




In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims_risk_leakage/notebooks/silver/utilities"

In [0]:
import time
import logging
import sys
import traceback
import builtins
import pyspark.sql.functions as F
from pyspark.sql.window import Window


In [0]:
# ----------------------------------------------------------------------
# Logger setup.
#
# Reads the logger name and level from CONFIG["logging"] (CONFIG is not
# defined in this notebook; presumably it comes from the %run config cell
# at the top -- confirm) and makes sure the logger writes to stdout.
# ----------------------------------------------------------------------
logger_name = CONFIG["logging"]["logger_name"]
log_level = CONFIG["logging"]["log_level"]

logger = logging.getLogger(logger_name)

# logging.getLogger returns the same object for the same name, so guard the
# handler registration: re-running this cell must not add a second handler
# (which would print every message twice).
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        "%(asctime)s | %(levelname)s | %(message)s"
    )
    handler.setFormatter(formatter)
    logger.addHandler(handler)

# Logger.setLevel accepts either an int or an upper-case level name and raises
# ValueError for an unknown name. Normalising the string here means a config
# value such as "info" or " Info " works; the previous getattr(logging, ...)
# lookup resolved "info" to the logging.info *function* and then failed with
# a TypeError inside setLevel.
logger.setLevel(
    log_level.strip().upper() if isinstance(log_level, str) else log_level
)

In [0]:
# Point the Spark session at the configured catalog first ...
target_catalog = CONFIG["catalog"]
spark.sql(f"USE CATALOG {target_catalog}")

# ... then at the silver schema, so unqualified table names used later in the
# session resolve against <catalog>.<silver schema>.
silver_schema = CONFIG["schemas"]["silver"]
spark.sql(f"USE SCHEMA {silver_schema}")

In [0]:
# ----------------------------------------------------------------------
# Claims silver pipeline.
#
# Steps: read the bronze claims table -> add typed/standardised columns ->
# tag each row with a data-quality status -> keep one row per claim_id for
# the passing rows -> overwrite the clean and quarantine Delta tables.
#
# Depends on names defined outside this cell: CONFIG, logger, spark and the
# helpers parse_date / parse_amount / normalize (not defined in this
# notebook; presumably provided by the %run cells at the top -- confirm).
# Any exception is logged with its traceback and re-raised.
# ----------------------------------------------------------------------
logger.info("Starting CLAIMS Silver Layer")

start_time = time.time()

try:

    # ==========================================================
    # READ BRONZE
    # ==========================================================

    logger.info("Reading claims bronze table")

    # NOTE(review): the schema name "bronze" is hard-coded here, whereas the
    # silver schema is read from CONFIG["schemas"] elsewhere in the notebook.
    claims_bronze = spark.table(
        f"{CONFIG['catalog']}.bronze.{CONFIG['tables']['bronze']['claims']}"
    )

    # ==========================================================
    # TYPING / TRANSFORMATION
    # ==========================================================

    logger.info("Applying typing and transformations")

    # New columns are added next to the raw ones; the original bronze
    # columns are kept as-is.
    claims_typed = (
        claims_bronze
        .withColumn("settlement_date_ts", parse_date("settlement_date"))
        .withColumn("approved_amount_num", parse_amount("approved_amount"))
        .withColumn("paid_amount_num", parse_amount("paid_amount"))
        .withColumn("claim_status_std", normalize("claim_status"))
    )

    # ==========================================================
    # VALIDATION
    # ==========================================================

    logger.info("Running data quality validation")

    # The when-chain is evaluated top to bottom, so a row that breaks several
    # rules is labelled with the first failing rule only.
    claims_validated = claims_typed.withColumn(
        "dq_status",
        F.when(F.col("claim_id").isNull(), "FAIL_CLAIM_ID")
        .when(F.col("policy_id").isNull(), "FAIL_POLICY_ID")
        .when(F.col("approved_amount_num").isNull(), "FAIL_APPROVED_AMOUNT")
        .otherwise("PASS")
    )

    # Every row lands in exactly one of the two frames (dq_status is never
    # null because of the otherwise("PASS") default).
    claims_quarantine = claims_validated.filter(F.col("dq_status") != "PASS")
    claims_clean = claims_validated.filter(F.col("dq_status") == "PASS")

    # ==========================================================
    # DEDUP (LATEST BY START_TS)
    # ==========================================================

    logger.info("Running deduplication")

    # NOTE(review): rows that share both claim_id and start_ts are ordered
    # arbitrarily by row_number, so which one survives is not deterministic.
    # Add a secondary sort key if a stable result is required.
    # NOTE(review): dedup runs after validation, so when the newest row of a
    # claim is quarantined an older passing row becomes the snapshot --
    # confirm this is intended.
    window_claim = Window.partitionBy("claim_id") \
                         .orderBy(F.col("start_ts").desc())

    claims_snapshot = (
        claims_clean
        .withColumn("rn", F.row_number().over(window_claim))
        .filter(F.col("rn") == 1)
        .drop("rn")
    )

    # ==========================================================
    # WRITE CLEAN TABLE
    # ==========================================================

    logger.info("Writing claims_clean table")

    # Full overwrite, including the schema. The table name comes from CONFIG;
    # if it is unqualified it resolves against the session's current
    # catalog/schema.
    claims_snapshot.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(CONFIG["tables"]["silver_clean"]["claims"])

    # ==========================================================
    # WRITE QUARANTINE
    # ==========================================================

    logger.info("Writing claims_quarantine table")

    # Quarantined rows are written as-is (no deduplication is applied).
    claims_quarantine.write \
        .format("delta") \
        .mode("overwrite") \
        .option("overwriteSchema", "true") \
        .saveAsTable(CONFIG["tables"]["silver_quarantine"]["claims"])

    # ==========================================================
    # SUCCESS LOG
    # ==========================================================

    end_time = time.time()
    # builtins.round is called explicitly -- presumably because a bare
    # `round` may be shadowed by a Spark function brought in by the
    # utilities notebook; confirm before simplifying.
    duration = builtins.round(end_time - start_time, 2)

    logger.info(
        f"Claims Silver completed successfully | "
        f"duration ={duration} sec"
    )

except Exception:

    # logger.exception logs at ERROR level and appends the active traceback,
    # so the message and the stack trace stay together in one log record.
    logger.exception("Claims Silver failed")
    # Bare raise re-raises the in-flight exception unchanged so the notebook
    # run is still marked as failed.
    raise

In [0]:
%sql
-- Ad-hoc inspection: return every column and row of the silver claims_clean table.
-- NOTE(review): catalog and schema are hard-coded here rather than read from CONFIG.
SELECT *
FROM claims_leakage.silver.claims_clean