In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType
from functools import reduce

In [0]:
# ---- CONFIG ----

# Calendar years to keep; rows from any other year are filtered out downstream.
YEARS = [2016]

# Identifier stamped onto every row as the `site_id` column.
SITE_ID = "CA_SOLAR_2014_2016"

# Catalog / schema / volume names used to locate the input and the output.
CATALOG = "energy_usage_data_platform"
RAW_SCHEMA = "raw"
BRONZE_SCHEMA = "bronze"
SOLAR_VOLUME = "solar_data"
SOLAR_FILE = "folsom_irradiance.csv"

# Input: /Volumes/<catalog>/<raw schema>/<volume>/<file>
SOLAR_PATH = "/".join(["/Volumes", CATALOG, RAW_SCHEMA, SOLAR_VOLUME, SOLAR_FILE])

# Output: <catalog>.<bronze schema>.<table>
BRONZE_TABLE = ".".join([CATALOG, BRONZE_SCHEMA, "ca_solar_irradiance_raw"])

In [0]:
"""
Read a single solar irradiance CSV, normalize to Bronze schema,
and attach Bronze metadata (site_id, source_file, ingest_ts).
Expected columns in the CSV:
    timeStamp (datetime), ghi, dni, dhi (numeric).
"""
raw_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(SOLAR_PATH)
)

In [0]:
# Derived columns, applied one after another in this exact order. Later entries
# may refer to columns created by earlier ones (obs_date/year use obs_time).
derived_columns = [
    # canonical time columns parsed from the retained raw timestamp
    ("obs_time", F.to_timestamp(F.col("raw_timestamp"))),
    ("obs_date", F.to_date(F.col("obs_time"))),
    ("year", F.year(F.col("obs_time"))),
    # identity / site
    ("site_id", F.lit(SITE_ID).cast("string")),
    # solar irradiance in W/m^2, cast to double
    ("ghi_w_m2", F.col("ghi").cast("double")),
    ("dni_w_m2", F.col("dni").cast("double")),
    ("dhi_w_m2", F.col("dhi").cast("double")),
    # ingest metadata
    ("source_file", F.lit(SOLAR_PATH)),
    ("ingest_ts", F.current_timestamp()),
]

# The source timestamp column is kept under the name raw_timestamp for
# traceability; everything else is layered on top of it.
bronze_df = raw_df.withColumnRenamed("timeStamp", "raw_timestamp")
for column_name, column_expr in derived_columns:
    bronze_df = bronze_df.withColumn(column_name, column_expr)

In [0]:
# Keep only the observations whose `year` appears in the YEARS config list.
is_requested_year = F.col("year").isin(YEARS)
bronze_df = bronze_df.filter(is_requested_year)


In [0]:
# Fix the Bronze column layout: identity, time columns, typed measurements,
# ingest metadata, then the columns as read from the CSV (kept for
# lineage/debugging).
bronze_column_order = [
    "site_id",
    "obs_time",
    "obs_date",
    "year",
    "ghi_w_m2",
    "dni_w_m2",
    "dhi_w_m2",
    "source_file",
    "ingest_ts",
    "raw_timestamp",
    "ghi",
    "dni",
    "dhi",
]
bronze_df = bronze_df.select(*bronze_column_order)

In [0]:
# ---- Write Bronze Delta & register table ----

# bronze_df only holds the years listed in YEARS. A plain overwrite would
# replace the whole table and silently drop every other year that was loaded
# by an earlier run, so the overwrite is restricted to the requested years.
if not YEARS:
    # An empty list would otherwise produce an invalid predicate (and, with a
    # plain overwrite, would have emptied the table).
    raise ValueError("YEARS must contain at least one year to write.")

# int() guarantees the predicate only ever contains numeric literals.
replace_predicate = "year IN ({})".format(
    ", ".join(str(int(year_value)) for year_value in YEARS)
)

(
    bronze_df
    .write
    .format("delta")
    .mode("overwrite")
    # Delta selective overwrite: only rows matching the predicate are replaced.
    .option("replaceWhere", replace_predicate)
    .partitionBy("year")
    .saveAsTable(BRONZE_TABLE)
)

# Quick sanity check of what landed in the table.
display(spark.table(BRONZE_TABLE).limit(5))