In [0]:
# Sanity check: list the landing volume to confirm the raw source files are visible.
landing_volume_path = "/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/"
landing_listing = dbutils.fs.ls(landing_volume_path)
display(landing_listing)


path,name,size,modificationTime
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/client.csv,client.csv,1368122,1765040941000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/county_id_to_name_map.json,county_id_to_name_map.json,301,1765040941000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/electricity_prices.csv,electricity_prices.csv,769348,1765040941000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/enefit/,enefit/,0,1765041222214
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/example_test_files/,example_test_files/,0,1765041222214
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/forecast_weather.csv,forecast_weather.csv,781119880,1765040966000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/gas_prices.csv,gas_prices.csv,23898,1765040941000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/historical_weather.csv,historical_weather.csv,180530222,1765040967000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/public_timeseries_testing_util.py,public_timeseries_testing_util.py,3677,1765040941000
dbfs:/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers/train.csv,train.csv,98978994,1765040966000


In [0]:
# Make cscie103_catalog_final the session's current catalog so that two-part
# names such as bronze.<table> resolve inside it.
spark.sql('USE CATALOG cscie103_catalog_final')

DataFrame[]

In [0]:
"""
Bronze Layer Ingestion
- Ingests raw CSV files from Volumes into Bronze Delta Tables.
- Uses "Overwrite" mode to handle full daily refreshes 
  and "overwriteSchema" to handle structural changes in source data.
- Outputting as Delta allows the downstream Silver layer
  to use .readStream.
"""

from pyspark.sql.functions import current_timestamp, input_file_name

def ingest_bronze(
    table_name,
    file_name,
    landing_dir="/Volumes/cscie103_catalog_final/bronze/landing/predict-energy-behavior-of-prosumers",
):
    """
    Read one CSV file from the landing volume and overwrite its Bronze Delta table.

    Args:
        table_name: Name of the target table; the data is written to
            ``bronze.<table_name>`` (resolved against the session's current catalog).
        file_name: Name of the CSV file, relative to ``landing_dir``.
        landing_dir: Directory that holds the raw files. Defaults to the project's
            landing volume, so existing two-argument calls behave exactly as before.

    Two audit columns are appended to every row:
        _source_file:    path of the file the row was read from.
        _ingestion_time: timestamp at which this batch was processed.
    """
    # Tolerate a trailing slash on the directory so the joined path has exactly one.
    source_path = f"{landing_dir.rstrip('/')}/{file_name}"
    target_table_name = f"bronze.{table_name}"

    print(f"Processing {table_name}")

    # Read the CSV (Bronze: first row is the header, column types are inferred).
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(source_path)
    )

    # Alias the metadata field directly instead of selecting it and then renaming
    # "file_path" by name: a rename by bare name would also hit (and clash with)
    # a source column that happens to be called "file_path".
    df_enriched = df.selectExpr(
        "*", "_metadata.file_path AS _source_file"
    ).withColumn("_ingestion_time", current_timestamp())

    # mode("overwrite") replaces the table contents on every run, so re-running the
    # ingestion does not duplicate rows; overwriteSchema lets the table schema
    # follow structural changes in the source file.
    (
        df_enriched.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table_name)
    )

    print(f"Success: refreshed table: {target_table_name}")

# Pin the session to the project catalog before writing any bronze.<table>.
spark.sql("USE CATALOG cscie103_catalog_final")

# (target table, source file) pairs for every dataset the pipeline needs,
# ingested in the order listed.
bronze_sources = [
    ("client", "client.csv"),
    ("train", "train.csv"),
    ("gas_prices", "gas_prices.csv"),
    ("electricity_prices", "electricity_prices.csv"),
    ("weather_hist", "historical_weather.csv"),
    ("weather_forecast", "forecast_weather.csv"),
    ("weather_mapping", "weather_station_to_county_mapping.csv"),
]

for bronze_table, csv_name in bronze_sources:
    ingest_bronze(bronze_table, csv_name)

print("\nBronze Layer Ingestion Complete")

Processing client
Success: refreshed table: bronze.client
Processing train
Success: refreshed table: bronze.train
Processing gas_prices
Success: refreshed table: bronze.gas_prices
Processing electricity_prices
Success: refreshed table: bronze.electricity_prices
Processing weather_hist
Success: refreshed table: bronze.weather_hist
Processing weather_forecast
Success: refreshed table: bronze.weather_forecast
Processing weather_mapping
Success: refreshed table: bronze.weather_mapping

Bronze Layer Ingestion Complete
