In [0]:
# Use the same catalog and schema as the other notebooks.
# Disabled alternative setup (kept for reference):
#   USE CATALOG workspace / USE SCHEMA default
# Plain string literals are used here: the statements contain no placeholders,
# so an f-string prefix would be misleading.
spark.sql("USE CATALOG cscie103_catalog_final")
# Make sure the volume that stores the streaming checkpoints exists.
spark.sql("CREATE VOLUME if not exists cscie103_catalog_final.silver.checkpoints")


DataFrame[]

In [0]:
"""
SILVER LAYER (STREAMING VERSION)
- Reads from Bronze Delta tables as a streaming source
- Joins with the static mapping table (bronze.weather_mapping) to add 'county'
- Writes to Silver Delta tables using structured streaming with trigger=once
- This satisfies the 'stream from one Delta table to another incrementally using trigger once' requirement.
"""

from pyspark.sql.functions import col

# Base checkpoint root: directory inside the `checkpoints` volume under which
# each stream gets its own subdirectory. Note that the value already ends
# with a trailing "/".
checkpoint_root = "/Volumes/cscie103_catalog_final/silver/checkpoints/"
# Disabled alternative location (kept for reference):
#checkpoint_root = "/Volumes/workspace/default/cscie103_final_project/checkpoints/silver"

def process_silver_streaming(
    source_table: str,
    target_table: str,
    checkpoint_subdir: str,
    mapping_table: str = "bronze.weather_mapping",
) -> None:
    """
    Streaming version of the Silver processing step.

    Reads ``source_table`` as a stream, left-joins it with the static mapping
    table on (latitude, longitude), appends the result to ``target_table``
    with a one-shot trigger, and blocks until that run has finished.

    Parameters
    ----------
    source_table : str
        Name of the Bronze Delta table to read as a streaming source.
    target_table : str
        Name of the Silver Delta table to write.
    checkpoint_subdir : str
        Subdirectory name (under checkpoint_root) for this stream's checkpoints.
    mapping_table : str, optional
        Name of the static table joined on latitude/longitude.
        Defaults to "bronze.weather_mapping".

    Raises
    ------
    Exception
        Any error raised by the streaming query is propagated by
        ``awaitTermination()`` instead of being lost in the background.
    """

    print(f"Streaming processing from {source_table} -> {target_table}")

    # Static mapping table - read once as a regular batch DataFrame.
    # Its ingestion-audit columns are dropped so the join does not add a second
    # copy of `_ingestion_time` / `_source_file` next to the weather rows' own.
    mapping_df = (
        spark.read.table(mapping_table)
             .drop("_ingestion_time", "_source_file")
    )

    # Streaming source: read from the Bronze table.
    weather_stream_df = spark.readStream.table(source_table)

    # Stream-static left join: every weather row is kept, mapping columns are
    # null where no (latitude, longitude) match exists.
    enriched_stream_df = weather_stream_df.join(
        mapping_df,
        on=["latitude", "longitude"],
        how="left",
    )

    # Build the checkpoint path for this specific stream. The root may or may
    # not end with "/", so normalise it to avoid a "//" in the resulting path.
    checkpoint_base = checkpoint_root.rstrip("/")
    checkpoint_path = f"{checkpoint_base}/{checkpoint_subdir}"

    query = (
        enriched_stream_df.writeStream
            .format("delta")
            .outputMode("append")            # we're appending new rows
            .trigger(once=True)              # process all available data once, then stop
            .option("checkpointLocation", checkpoint_path)
            .option("mergeSchema", "true")   # allow schema evolution if needed
            .toTable(target_table)
    )

    print(f"Streaming write started for {target_table}. "
          "This run will finish when all available data is processed.")

    # toTable() only *starts* the query. Block until the one-shot run completes
    # so that later cells read a fully written table and any stream failure
    # surfaces here as an exception.
    query.awaitTermination()

    print(f"Streaming write finished for {target_table}.")

# Run streaming jobs for both historical and forecast weather
# (source table, target table, checkpoint subdirectory) for each weather feed:
# historical first, then forecast.
weather_stream_jobs = [
    ("bronze.weather_hist", "silver.weather_hist_stream", "silver.weather_hist"),
    ("bronze.weather_forecast", "silver.weather_forecast_stream", "silver.weather_forecast"),
]

for bronze_name, silver_name, checkpoint_dir in weather_stream_jobs:
    process_silver_streaming(
        source_table=bronze_name,
        target_table=silver_name,
        checkpoint_subdir=checkpoint_dir,
    )


Streaming processing from bronze.weather_hist -> silver.weather_hist_stream
Streaming write started for silver.weather_hist_stream. This run will finish when all available data is processed.
Streaming processing from bronze.weather_forecast -> silver.weather_forecast_stream
Streaming write started for silver.weather_forecast_stream. This run will finish when all available data is processed.


In [0]:
# Print the first five rows of each Silver table written by the streaming jobs.
for silver_table_name in ("silver.weather_hist_stream", "silver.weather_forecast_stream"):
    spark.table(silver_table_name).show(5)

+--------+---------+-------------------+-----------+--------+----+--------+----------------+----------------+--------------+--------------+---------------+-----------------+-----------------+-------------------+----------------------+-----------------+-------------+--------------------+--------------------+-----------+------+
|latitude|longitude|           datetime|temperature|dewpoint|rain|snowfall|surface_pressure|cloudcover_total|cloudcover_low|cloudcover_mid|cloudcover_high|    windspeed_10m|winddirection_10m|shortwave_radiation|direct_solar_radiation|diffuse_radiation|data_block_id|        _source_file|     _ingestion_time|county_name|county|
+--------+---------+-------------------+-----------+--------+----+--------+----------------+----------------+--------------+--------------+---------------+-----------------+-----------------+-------------------+----------------------+-----------------+-------------+--------------------+--------------------+-----------+------+
|    58.5|     2