In [0]:
from pyspark.sql.functions import*
from pyspark.sql.types import*
from pyspark.sql.window import*

In [0]:
%sql
-- Make claims_leakage.bronze the session default so the unqualified and
-- schema-qualified table names used later in this notebook resolve there.
USE CATALOG claims_leakage;
USE SCHEMA bronze;

In [0]:
# --- Raw landing locations (Auto Loader sources) -----------------------------
fnol_loc = "s3://claims-risk-leakage/raw/fnol_events/"
policy_loc = "s3://claims-risk-leakage/raw/policy_master/"
claims_his_loc = "s3://claims-risk-leakage/raw/claims_history/"

# --- Main pipeline: streaming checkpoints ------------------------------------
main_cp_fnol = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/main_cp/fnol_cp_main/"
main_cp_claims = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/main_cp/claims_cp_main/"
main_cp_policy = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/main_cp/policy_cp_main/"

# --- Main pipeline: Auto Loader schema-tracking locations --------------------
main_schema_fnol = "/Volumes/claims_leakage/bronze/bronze_vol/schema/main_schema/fnol_main_schema/"
main_schema_claims = "/Volumes/claims_leakage/bronze/bronze_vol/schema/main_schema/claims_main_schema/"
main_schema_policy = "/Volumes/claims_leakage/bronze/bronze_vol/schema/main_schema/policy_main_schema/"

# --- Debug checkpoints --------------------------------------------------------
# In the cells visible in this notebook these are only used as the base path
# for the per-file metrics checkpoints (debug_cp_* + "_metrics/").
debug_cp_fnol = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/debug_cp/fnol_debug/"
debug_cp_policy = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/debug_cp/policy_cp/"
debug_cp_claims = "/Volumes/claims_leakage/bronze/bronze_vol/checkpoint/debug_cp/claims_cp/"

# --- Debug schema-tracking locations ------------------------------------------
# Not referenced by any cell visible in this notebook.
# NOTE(review): the claims and policy entries have no trailing "/" while every
# other path here does; harmless today, but confirm before concatenating.
debug_schema_fnol = "/Volumes/claims_leakage/bronze/bronze_vol/schema/debug_schema/fnol_debug_schema/"
debug_schema_claims = "/Volumes/claims_leakage/bronze/bronze_vol/schema/debug_schema/claims_debug_schema"
debug_schema_policy = "/Volumes/claims_leakage/bronze/bronze_vol/schema/debug_schema/policy_debug_schema"

In [0]:
# Streaming Auto Loader ("cloudFiles") reader over the raw FNOL JSON files.
# Column type inference is switched off, the schema evolution mode is
# "addNewColumns", and the tracked schema is stored at main_schema_fnol.
fnol_df_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "false",
        "cloudFiles.schemaEvolutionMode": "addNewColumns",
        "cloudFiles.schemaLocation": main_schema_fnol,
    })
    .load(fnol_loc)
)


In [0]:
# Streaming Auto Loader ("cloudFiles") reader over the raw policy-master JSON
# files. Column type inference is switched off, the schema evolution mode is
# "addNewColumns", and the tracked schema is stored at main_schema_policy.
policy_df_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "false",
        "cloudFiles.schemaEvolutionMode": "addNewColumns",
        "cloudFiles.schemaLocation": main_schema_policy,
    })
    .load(policy_loc)
)


In [0]:
# Streaming Auto Loader ("cloudFiles") reader over the raw claims-history JSON
# files. Column type inference is switched off, the schema evolution mode is
# "addNewColumns", and the tracked schema is stored at main_schema_claims.
claims_df_raw = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.inferColumnTypes": "false",
        "cloudFiles.schemaEvolutionMode": "addNewColumns",
        "cloudFiles.schemaLocation": main_schema_claims,
    })
    .load(claims_his_loc)
)


In [0]:
# Bronze shape for FNOL events: rename the incoming claim_id to fnol_id and
# append the audit columns shared by all three bronze feeds.
fnol_df_bronze = (
    fnol_df_raw
    .withColumnRenamed("claim_id", "fnol_id")
    # Path of the file each row was read from.
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("ingest_date", current_date())
    # Validity-window style columns: open-ended (end_ts is NULL) and flagged
    # current with the integer literal 1.
    .withColumn("start_ts", current_timestamp())
    .withColumn("end_ts", lit(None).cast("timestamp"))
    .withColumn("is_current", lit(1))
)


In [0]:
# Bronze shape for claims history: the raw columns plus the audit columns
# shared by all three bronze feeds.
claims_df_bronze = (
    claims_df_raw
    # Path of the file each row was read from.
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("ingest_date", current_date())
    # Validity-window style columns: open-ended (end_ts is NULL) and flagged
    # current with the integer literal 1.
    .withColumn("start_ts", current_timestamp())
    .withColumn("end_ts", lit(None).cast("timestamp"))
    .withColumn("is_current", lit(1))
)

In [0]:
# Bronze shape for policy master: the raw columns plus the audit columns
# shared by all three bronze feeds.
policy_df_bronze = (
    policy_df_raw
    # Path of the file each row was read from.
    .withColumn("source_file", col("_metadata.file_path"))
    .withColumn("ingest_date", current_date())
    # Validity-window style columns: open-ended (end_ts is NULL) and flagged
    # current with the integer literal 1.
    .withColumn("start_ts", current_timestamp())
    .withColumn("end_ts", lit(None).cast("timestamp"))
    .withColumn("is_current", lit(1))
)

Metrics: per-source-file data-quality aggregates


In [0]:
# Per-file data-quality metrics for the FNOL feed, one output row per
# source_file: earliest start_ts, row count, a NULL-or-blank count for every
# column, and the number of rows with a non-NULL _rescued_data.
#
# `min`, `sum` and `count` below are the pyspark.sql.functions versions brought
# in by the wildcard import at the top of the notebook, not the Python builtins.
#
# NOTE(review): the column list is taken from fnol_df_bronze.columns, so it also
# covers the grouping key (source_file) and the audit columns added in the
# bronze step; end_ts is always NULL there, so end_ts_null_count will equal
# record_count. Confirm whether those columns are wanted in the metrics table.
fnol_null_count_cols = [
    sum(
        when(col(name).isNull() | (trim(col(name)) == ""), 1).otherwise(0)
    ).alias(f"{name}_null_count")
    for name in fnol_df_bronze.columns
]

fnol_rescued_count_col = sum(
    when(col("_rescued_data").isNotNull(), 1).otherwise(0)
).alias("rescued_record_count")

fnol_metrics_df = fnol_df_bronze.groupBy("source_file").agg(
    min("start_ts").alias("file_ingestion_time"),
    count("*").alias("record_count"),
    *fnol_null_count_cols,
    fnol_rescued_count_col,
)



In [0]:
# Per-file data-quality metrics for the claims-history feed, one output row per
# source_file: earliest start_ts, row count, a NULL-or-blank count for every
# column, and the number of rows with a non-NULL _rescued_data.
#
# `min`, `sum` and `count` below are the pyspark.sql.functions versions brought
# in by the wildcard import at the top of the notebook, not the Python builtins.
#
# NOTE(review): the column list is taken from claims_df_bronze.columns, so it
# also covers the grouping key (source_file) and the audit columns added in the
# bronze step; end_ts is always NULL there, so end_ts_null_count will equal
# record_count. Confirm whether those columns are wanted in the metrics table.
claims_null_count_cols = [
    sum(
        when(col(name).isNull() | (trim(col(name)) == ""), 1).otherwise(0)
    ).alias(f"{name}_null_count")
    for name in claims_df_bronze.columns
]

claims_rescued_count_col = sum(
    when(col("_rescued_data").isNotNull(), 1).otherwise(0)
).alias("rescued_record_count")

claims_metrics_df = claims_df_bronze.groupBy("source_file").agg(
    min("start_ts").alias("file_ingestion_time"),
    count("*").alias("record_count"),
    *claims_null_count_cols,
    claims_rescued_count_col,
)



In [0]:
# Per-file data-quality metrics for the policy-master feed, one output row per
# source_file: earliest start_ts, row count, a NULL-or-blank count for every
# column, and the number of rows with a non-NULL _rescued_data.
#
# `min`, `sum` and `count` below are the pyspark.sql.functions versions brought
# in by the wildcard import at the top of the notebook, not the Python builtins.
#
# NOTE(review): the column list is taken from policy_df_bronze.columns, so it
# also covers the grouping key (source_file) and the audit columns added in the
# bronze step; end_ts is always NULL there, so end_ts_null_count will equal
# record_count. Confirm whether those columns are wanted in the metrics table.
policy_null_count_cols = [
    sum(
        when(col(name).isNull() | (trim(col(name)) == ""), 1).otherwise(0)
    ).alias(f"{name}_null_count")
    for name in policy_df_bronze.columns
]

policy_rescued_count_col = sum(
    when(col("_rescued_data").isNotNull(), 1).otherwise(0)
).alias("rescued_record_count")

policy_metrics_df = policy_df_bronze.groupBy("source_file").agg(
    min("start_ts").alias("file_ingestion_time"),
    count("*").alias("record_count"),
    *policy_null_count_cols,
    policy_rescued_count_col,
)



Queries: streaming writes to the bronze and file-metrics tables

In [0]:
# Start the FNOL bronze write: append-mode Delta stream into fnol_bronze with
# mergeSchema enabled, checkpointed at main_cp_fnol, using the availableNow
# trigger. toTable() returns the StreamingQuery handle without waiting for it.
fnol_query = (
    fnol_df_bronze.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=main_cp_fnol, mergeSchema="true")
    .trigger(availableNow=True)
    .toTable("fnol_bronze")
)

In [0]:
# Start the claims bronze write: append-mode Delta stream into claims_bronze
# with mergeSchema enabled, checkpointed at main_cp_claims, using the
# availableNow trigger. toTable() returns the StreamingQuery handle without
# waiting for it.
claims_query = (
    claims_df_bronze.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=main_cp_claims, mergeSchema="true")
    .trigger(availableNow=True)
    .toTable("claims_bronze")
)
                         

In [0]:
# Start the policy bronze write: append-mode Delta stream into policy_bronze
# with mergeSchema enabled, checkpointed at main_cp_policy, using the
# availableNow trigger. toTable() returns the StreamingQuery handle without
# waiting for it.
policy_query = (
    policy_df_bronze.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=main_cp_policy, mergeSchema="true")
    .trigger(availableNow=True)
    .toTable("policy_bronze")
)
                         

In [0]:
# Checkpoint for the FNOL metrics stream.
# NOTE(review): this is debug_cp_fnol with "_metrics/" appended, i.e. a
# directory underneath the debug checkpoint path rather than under main_cp.
# Confirm that is intended before debug_cp_fnol is used as a checkpoint itself.
fnol_metrics_cp = debug_cp_fnol + "_metrics/"

# Complete-mode write: every trigger rewrites fnol_file_metrics with the full
# per-file aggregate; overwriteSchema is enabled for the write.
fnol_metrics_query = (
    fnol_metrics_df.writeStream
    .format("delta")
    .outputMode("complete")
    .options(checkpointLocation=fnol_metrics_cp, overwriteSchema="true")
    .trigger(availableNow=True)
    .toTable("fnol_file_metrics")
)


In [0]:
# Checkpoint for the claims metrics stream.
# NOTE(review): this is debug_cp_claims with "_metrics/" appended, i.e. a
# directory underneath the debug checkpoint path rather than under main_cp.
# Confirm that is intended before debug_cp_claims is used as a checkpoint itself.
claims_metrics_cp = debug_cp_claims + "_metrics/"

# Complete-mode write: every trigger rewrites claims_file_metrics with the full
# per-file aggregate; overwriteSchema is enabled for the write.
claims_metrics_query = (
    claims_metrics_df.writeStream
    .format("delta")
    .outputMode("complete")
    .options(checkpointLocation=claims_metrics_cp, overwriteSchema="true")
    .trigger(availableNow=True)
    .toTable("claims_file_metrics")
)


In [0]:
# Complete-mode write: every trigger rewrites policy_file_metrics with the full
# per-file aggregate, checkpointed at debug_cp_policy + "_metrics/".
#
# overwriteSchema is set to match the FNOL and claims metrics writers. The
# metrics columns are derived from policy_df_bronze.columns, so the output
# schema changes whenever the source gains a column; without this option the
# policy metrics write is the only one of the three that cannot replace the
# table schema.
#
# NOTE(review): the checkpoint sits underneath the debug checkpoint path rather
# than under main_cp; confirm that is intended.
policy_metrics_query = (
    policy_metrics_df.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", debug_cp_policy + "_metrics/")
        .option("overwriteSchema","true")
        .trigger(availableNow=True)
        .toTable("policy_file_metrics")
)

# This is the last stream started by the notebook. toTable() returns as soon as
# a query is launched, so wait here for every still-running availableNow query
# to finish; otherwise the %sql cells below can run before the bronze tables
# have been created or fully written on a fresh "Run All". awaitTermination()
# re-raises if a query fails while being awaited.
for active_query in spark.streams.active:
    active_query.awaitTermination()


In [0]:
%sql
-- Ad-hoc inspection of the policy bronze table (all columns, all rows).
SELECT *
FROM claims_leakage.bronze.policy_bronze

In [0]:
%sql
-- FNOL rows whose start_ts is at or after 12:13:00 today, Asia/Kolkata wall-clock.
-- The cutoff date is derived from the current time converted to Asia/Kolkata,
-- so that "today" is the IST calendar day on both sides of the comparison.
-- Using current_date() directly takes the session time zone's date, which is
-- the previous day between 00:00 and 05:30 IST when the session runs in UTC.
-- NOTE(review): from_utc_timestamp assumes the session time zone is UTC; confirm.
SELECT *
FROM bronze.fnol_bronze
WHERE from_utc_timestamp(start_ts, 'Asia/Kolkata')
      >= timestamp(concat(
             to_date(from_utc_timestamp(current_timestamp(), 'Asia/Kolkata')),
             ' 12:13:00'));



In [0]:
%sql
-- Number of claims_bronze rows that carry a non-NULL fnol_id.
-- NOTE(review): the bronze step only renames claim_id to fnol_id for the FNOL
-- feed; this query presumes the claims-history source already has its own
-- fnol_id column. Verify against the raw data.
SELECT count(*)
FROM bronze.claims_bronze
WHERE fnol_id IS NOT NULL;


