In [0]:
# --- Cloud storage: raw source plus the stream's bookkeeping locations ---
base_bucket = "s3://nsw-fuel-api-raw"
source_path = f"{base_bucket}/fuel-raw/"  # directory the read stream loads from
schema_loc_all = f"{base_bucket}/_autoloader/schema/fuel_raw_all"  # passed as cloudFiles.schemaLocation
checkpoint_all = f"{base_bucket}/_autoloader/checkpoints/fuel_raw_all"  # passed as checkpointLocation

# --- Bronze Delta tables, addressed as <catalog>.<schema>.<table> ---
catalog = "fuel"
schema = "bronze"
price_table = f"{catalog}.{schema}.nsw_fuel_prices_bronze"
station_table = f"{catalog}.{schema}.nsw_fuel_stations_bronze"

In [0]:
from pyspark.sql.functions import current_timestamp, col
# Streaming read of the JSON files under source_path via the cloudFiles source.
# Column types are inferred, and schema_loc_all is supplied as the schema location.
stream_df = (
    spark.readStream
        .format("cloudFiles")
        .options(**{
            "cloudFiles.format": "json",
            "cloudFiles.schemaLocation": schema_loc_all,
            "cloudFiles.inferColumnTypes": "true",
        })
        .load(source_path)
)

In [0]:
# Tag every incoming row with ingest metadata, one step at a time.
# Step 1: expose the source's `_metadata` column as a regular column.
tranformed_df = stream_df.withColumn("_metadata", col("_metadata"))
# Step 2: record the timestamp at which the row was ingested.
tranformed_df = tranformed_df.withColumn("_ingest_ts", current_timestamp())
# Step 3: record which file the row came from.
tranformed_df = tranformed_df.withColumn("_ingest_file", col("_metadata.file_path"))

In [0]:
from pyspark.sql.functions import explode, col

def _explode_struct_array(batch_df, array_col: str, alias: str):
    """Return one row per element of ``array_col``, with the element's fields as columns.

    The ``_ingest_ts`` and ``_ingest_file`` columns of the parent row are
    carried onto every produced row.
    """
    return (
        batch_df
            .select(
                explode(col(array_col)).alias(alias),
                "_ingest_ts",
                "_ingest_file",
            )
            .select(f"{alias}.*", "_ingest_ts", "_ingest_file")
    )


def _append_to_bronze(rows_df, table_name: str, txn_app_id: str, batch_id: int):
    """Append ``rows_df`` to the Delta table ``table_name`` idempotently.

    ``txnAppId`` / ``txnVersion`` let Delta skip a write whose (app id, version)
    pair it has already committed for that table, so a replayed micro-batch
    does not append the same rows twice.
    """
    (
        rows_df.write
            .format("delta")
            .mode("append")
            .option("mergeSchema", "true")
            .option("txnAppId", txn_app_id)
            .option("txnVersion", batch_id)
            .saveAsTable(table_name)
    )


def write_to_bronze_tables(batch_df, batch_id: int):
    """Split one micro-batch into station and price rows and append each to its Bronze table.

    Written for use as a ``foreachBatch`` sink. Each element of the ``stations``
    and ``prices`` arrays becomes one output row whose struct fields are
    flattened into columns, alongside ``_ingest_ts`` and ``_ingest_file``.

    Args:
        batch_df: Micro-batch DataFrame containing the ``stations``, ``prices``,
            ``_ingest_ts`` and ``_ingest_file`` columns.
        batch_id: Micro-batch id supplied by Structured Streaming; used as the
            Delta transaction version so retried batches are written only once.
    """
    # The two appends are not atomic: if the second one fails, the whole batch
    # is replayed. Tagging each write with (txnAppId, batch_id) makes the replay
    # skip the table that already committed instead of duplicating its rows.
    # The id is derived from the checkpoint path because batch ids are only
    # meaningful relative to a checkpoint.
    # NOTE: if the checkpoint is deleted and recreated at the SAME path, batch
    # ids restart from 0 and would be treated as already applied -- use a new
    # checkpoint path (or change this id) when resetting the stream.
    txn_app_id = f"write_to_bronze_tables:{checkpoint_all}"

    # Both branches read batch_df; without caching, each write would recompute
    # the micro-batch, including re-reading the source files.
    batch_df.persist()
    try:
        # ---- stations ----
        stations_df = _explode_struct_array(batch_df, "stations", "station")

        # ---- prices ----
        prices_df = _explode_struct_array(batch_df, "prices", "price")

        # Append to prices Bronze table
        _append_to_bronze(prices_df, price_table, txn_app_id, batch_id)

        # Append to stations Bronze table
        _append_to_bronze(stations_df, station_table, txn_app_id, batch_id)
    finally:
        # Always release the cached batch, even when a write fails.
        batch_df.unpersist()

    

In [0]:

# Start the stream: every micro-batch is handed to write_to_bronze_tables,
# with progress tracked under checkpoint_all. The availableNow trigger is set.
query = (
    tranformed_df.writeStream
        .foreachBatch(write_to_bronze_tables)
        .trigger(availableNow=True)
        .option("checkpointLocation", checkpoint_all)
        .start()
)

# Block this cell until the streaming query terminates.
query.awaitTermination()