In [0]:
from pyspark.sql.functions import*
from pyspark.sql.types import*
from pyspark.sql.window import*

In [0]:
%sql
-- Make claims_leakage.silver the session default so that unqualified table
-- names used later in this notebook (e.g. by saveAsTable) resolve there.
USE CATALOG claims_leakage;
USE SCHEMA silver;

In [0]:
%run "/Workspace/Users/shoyofromconcrete@gmail.com/claims risk and leakage/silver/utilities"

In [0]:
# Load the FNOL bronze table and print its schema so the raw column names
# and types are visible before any cleaning is applied.
fnol_bronze_table = "claims_leakage.bronze.fnol_bronze"
fnol_bronze = spark.table(fnol_bronze_table)
fnol_bronze.printSchema()

In [0]:
# Render the raw bronze FNOL rows for a visual check before cleaning.
display(fnol_bronze)

In [0]:
# Load the policy bronze table and print its schema.
# NOTE(review): policy_bronze is not referenced again in the visible part of
# this notebook — confirm it is still needed here.
policy_bronze_table = "claims_leakage.bronze.policy_bronze"
policy_bronze = spark.table(policy_bronze_table)
policy_bronze.printSchema()

In [0]:
# Load the claims bronze table and print its schema.
# NOTE(review): claims_bronze is not referenced again in the visible part of
# this notebook — confirm it is still needed here.
claims_bronze_table = "claims_leakage.bronze.claims_bronze"
claims_bronze = spark.table(claims_bronze_table)
claims_bronze.printSchema()

Cleaning Layer

In [0]:
# Build the typed / standardised FNOL frame from the bronze rows.
# parse_date, parse_amount and normalize are not defined in this notebook
# (presumably they come from the %run'd utilities notebook — confirm); each
# is called with a column *name*. Derived values go to new, suffixed columns;
# only the three free-text fields at the end are overwritten in place.

# Canonical policy id: policy_id when it is non-null, else the legacy number.
canonical_policy_id = coalesce(col("policy_id"), col("policy_number_legacy"))
fnol_with_policy = fnol_bronze.withColumn("policy_id_canonical", canonical_policy_id)

# Parsed dates.
fnol_with_dates = (
    fnol_with_policy
    .withColumn("loss_date_ts", parse_date("loss_date"))
    .withColumn("reported_date_ts", parse_date("reported_date"))
)

# Parsed amount.
fnol_with_amount = fnol_with_dates.withColumn(
    "claim_amount_num", parse_amount("claim_amount")
)

# Normalised enum-like fields.
fnol_with_enums = (
    fnol_with_amount
    .withColumn("loss_type_std", normalize("loss_type"))
    .withColumn("reporting_channel_std", normalize("reporting_channel"))
    .withColumn("incident_state_std", normalize("incident_state"))
)

# Light cleanup: strip surrounding whitespace, keeping the column names.
fnol_silver_typed = fnol_with_enums
for text_column in ("incident_city", "incident_zip", "agent_id"):
    fnol_silver_typed = fnol_silver_typed.withColumn(
        text_column, trim(col(text_column))
    )

display(fnol_silver_typed)

In [0]:
# Data-quality status: each rule is (column that must be non-null, failure
# code). Rules are evaluated in the order listed and the first failing rule
# determines dq_status; rows that pass every rule get "PASS".
dq_rules = [
    ("fnol_id", "FAIL_FNOL_ID"),
    ("policy_id_canonical", "FAIL_POLICY_ID"),
    ("loss_date_ts", "FAIL_LOSS_DATE"),
    ("claim_amount_num", "FAIL_CLAIM_AMOUNT"),
]

first_column, first_code = dq_rules[0]
dq_status_expr = when(col(first_column).isNull(), first_code)
for checked_column, failure_code in dq_rules[1:]:
    dq_status_expr = dq_status_expr.when(col(checked_column).isNull(), failure_code)
dq_status_expr = dq_status_expr.otherwise("PASS")

fnol_validated = fnol_silver_typed.withColumn("dq_status", dq_status_expr)
display(fnol_validated)

In [0]:
# dq_reason mirrors dq_status for failing rows and is NULL for passing rows.
# A `when` without `otherwise` yields NULL when the condition is not met.
dq_reason_expr = when(col("dq_status") != "PASS", col("dq_status"))
fnol_validated = fnol_validated.withColumn("dq_reason", dq_reason_expr)


In [0]:
# Split the validated rows: PASS rows continue down the pipeline, every
# other status goes to quarantine.
dq_status_col = col("dq_status")
fnol_clean = fnol_validated.where(dq_status_col == "PASS")
fnol_quarantine = fnol_validated.where(dq_status_col != "PASS")

for split_frame in (fnol_clean, fnol_quarantine):
    display(split_frame)

In [0]:
# De-duplicate the clean rows: keep exactly one row per fnol_id, the one
# with the greatest start_ts.
#
# row_number() over a window whose ordering has ties picks an arbitrary row
# among the tied ones, and that choice may differ between runs. ingest_date
# and source_file (both selected downstream from this frame) are therefore
# added as tie-breakers so the surviving row is stable whenever two rows for
# the same fnol_id share a start_ts.
# NOTE(review): rows that tie on all three columns are still ordered
# arbitrarily — confirm whether such duplicates can occur in bronze.
window_fnol = (
    Window
    .partitionBy("fnol_id")
    .orderBy(
        col("start_ts").desc(),
        col("ingest_date").desc(),
        col("source_file").desc(),
    )
)

fnol_clean_deduped = (
    fnol_clean
    .withColumn("rn", row_number().over(window_fnol))
    .filter(col("rn") == 1)   # rn == 1 is the top-ranked row per fnol_id
    .drop("rn")               # helper column is not part of the output
)
display(fnol_clean_deduped)

In [0]:
# Project the de-duplicated rows down to the silver output columns, grouped
# by purpose. The concatenation order below is the output column order.
identifier_cols = ["fnol_id", "policy_id_canonical"]
event_timing_cols = ["loss_date_ts", "reported_date_ts"]
financial_cols = ["claim_amount_num"]
classification_cols = ["loss_type_std", "reporting_channel_std"]
geography_cols = ["incident_state_std", "incident_city", "incident_zip"]
ops_cols = ["agent_id"]
scd_cols = ["start_ts", "end_ts", "is_current"]
lineage_cols = ["source_file", "ingest_date"]
dq_cols = ["dq_status", "dq_reason"]

silver_columns = (
    identifier_cols
    + event_timing_cols
    + financial_cols
    + classification_cols
    + geography_cols
    + ops_cols
    + scd_cols
    + lineage_cols
    + dq_cols
)

fnol_silver_final = fnol_clean_deduped.select(*silver_columns)
display(fnol_silver_final)

In [0]:
# Persist the final silver projection as Delta table "fnol_clean".
# The name is unqualified, so it resolves against the session's current
# catalog/schema. The table contents are overwritten on every run, and
# overwriteSchema lets the stored schema be replaced as well.
(
    fnol_silver_final.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("fnol_clean")
)


In [0]:
# Persist the rows that failed validation as Delta table "fnol_quarantine"
# (unqualified name, resolved against the session's current catalog/schema).
# The frame is written as-is, i.e. with every column of fnol_validated, and
# the table contents and schema are overwritten on every run.
(
    fnol_quarantine.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("fnol_quarantine")
)
