In [0]:
# Sanity check: list the project volume so the source files can be confirmed
# as visible before ingestion runs.
display(
    dbutils.fs.ls("/Volumes/workspace/default/cscie103_final_project/")
)

In [0]:
"""
Bronze Layer Ingestion
- Ingests raw CSV files from Volumes into Bronze Delta Tables.
- Uses "Overwrite" mode to handle full daily refreshes 
  and "overwriteSchema" to handle structural changes in source data.
- Outputting as Delta allows the downstream
  Silver layer to use .readStream
"""

from pyspark.sql.functions import col, current_timestamp, input_file_name

def ingest_bronze(table_name: str, file_name: str) -> None:
    """
    Load one CSV file from the project volume into a Bronze Delta table.

    The CSV is read with a header row and an inferred schema, tagged with
    lineage columns, and written with a full overwrite (data and schema) to
    the table ``bronze_<table_name>``.

    Args:
        table_name: Suffix of the target table; the table written is
            ``bronze_<table_name>``.
        file_name: Name of the CSV file inside the project volume directory.
    """
    source_path = f"/Volumes/workspace/default/cscie103_final_project/{file_name}"
    target_table_name = f"bronze_{table_name}"

    print(f"Processing {table_name}")

    # Read the CSV (Bronze, schema is inferred)
    df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(source_path)
    )

    # Lineage columns:
    #   _source_file    - path of the file each row was read from
    #   _ingestion_time - when this batch was processed
    # The metadata field is aliased directly rather than selected and then
    # renamed: withColumnRenamed("file_path", ...) would also rename a source
    # column that happens to be called "file_path", producing duplicate
    # "_source_file" columns.
    df_enriched = df.select(
        "*",
        col("_metadata.file_path").alias("_source_file"),
    ).withColumn("_ingestion_time", current_timestamp())

    # mode("overwrite") + overwriteSchema: every run fully replaces the table
    # contents and schema, so the function can be re-run safely.
    (
        df_enriched.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(target_table_name)
    )

    print(f"Success: refreshed table: {target_table_name}")

# Every source the pipeline needs, as (Bronze table suffix, CSV file name)
# pairs, listed in the order the tables are refreshed.
bronze_sources = (
    ("client", "client.csv"),
    ("train", "train.csv"),
    ("gas_prices", "gas_prices.csv"),
    ("electricity_prices", "electricity_prices.csv"),
    ("weather_hist", "historical_weather.csv"),
    ("weather_forecast", "forecast_weather.csv"),
    ("weather_mapping", "weather_station_to_county_mapping.csv"),
)

for table_suffix, csv_name in bronze_sources:
    ingest_bronze(table_suffix, csv_name)

print("\nBronze Layer Ingestion Complete")