In [0]:
from pyspark.sql import functions as F
from config.config import *
from config.schemas import *

In [0]:
# Settings for the claims stream. The right-hand names are not defined in this
# notebook; presumably they come from the star-imports of config.config and
# config.schemas -- verify there.
schema = schema_claims_stream  # NOTE(review): not referenced by the other cells shown here -- confirm it is still needed
target_table = bt_fact_claims_stream  # table the stream is written to

# Landing folder that is read, and the checkpoint folder; a trailing "/" is
# appended to each configured location.
input_path, checkpoint_path = (
    f"{path_vol_landing_fact_claims_stream}/",
    f"{path_vol_landing_fact_claims_stream_checkpoint}/",
)

In [0]:
# Auto Loader ("cloudFiles") source: load the JSON files found under
# input_path as a streaming DataFrame.
autoloader_options = {
    "cloudFiles.format": "json",
    # Points at the same directory that the write side uses as its checkpoint.
    "cloudFiles.schemaLocation": checkpoint_path,
    "multiLine": "true",
    "escape": '"',
}
iotstream = (
    spark.readStream
        .format("cloudFiles")
        .options(**autoloader_options)
        .load(input_path)
)
print("Source stream created...")

In [0]:
# Enrich every streamed row with two audit columns:
#   meta_timestamp   - value of current_timestamp() for the row
#   meta_source_file - the file_name field of the _metadata column
meta_timestamp_expr = F.current_timestamp()
meta_source_file_expr = F.col("_metadata.file_name")

iotstream_with_meta = iotstream.withColumn(
    "meta_timestamp", meta_timestamp_expr
).withColumn(
    "meta_source_file", meta_source_file_expr
)

In [0]:
# Write the enriched stream to the Delta target table.
# - checkpointLocation: where the query keeps its progress between runs.
# - mergeSchema: allow new columns in the stream to be added to the table.
deltastream = (
    iotstream_with_meta.writeStream
        .format("delta")
        .option("checkpointLocation", checkpoint_path)
        .option("mergeSchema", "true")
        .trigger(availableNow=True) # Currently using the Databricks free version with Serverless so continuous processing is not supported
        .toTable(target_table)
)
print("Streaming to delta sink...")

# toTable() only starts the query and returns right away. With an availableNow
# trigger the query stops by itself once the pending files are processed, so
# block here until it has finished; otherwise later cells could read the target
# table before the data (or, on a first run, the table itself) exists.
# awaitTermination() also re-raises any failure of the streaming query.
deltastream.awaitTermination()

In [0]:
# Batch-read the target table into a DataFrame.
df = spark.table(target_table)

# Render only a 10-row preview rather than the whole table.
preview_rows = df.limit(10)
preview_rows.display()