In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType
from functools import reduce

In [0]:
# ---- CONFIG ----

# Root S3 folder that is listed for raw solar irradiance CSV files.
SOLAR_ROOT = "s3://energy-data-platform-project-bucket/unity-catalog/4159464430132590/"

# The whole dataset is treated as one logical site for now;
# revisit this if the data is later split by location.
SITE_ID = "CA_SOLAR_2014_2016"

# Target Bronze table, addressed as <catalog>.<schema>.<table>.
CATALOG, SCHEMA = "energy_data_platform_project", "bronze"
BRONZE_TABLE = f"{CATALOG}.{SCHEMA}.ca_solar_irradiance_raw"

# Request dynamic partition overwrite: an "overwrite" write should then replace
# only the partitions that receive data rather than every partition.
# NOTE(review): confirm the Delta runtime in use honours this session setting.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

In [0]:
# ---- 1. Load raw solar irradiance CSV ----
# Dataset columns:
#   timeStamp (datetime)
#   ghi, dni, dhi (numeric)
#
# Quick look at the raw data: reads every *.csv file directly under SOLAR_ROOT
# into one DataFrame. The previous version read from SOLAR_PATH, a name that is
# never defined in this notebook, so the cell failed on a fresh kernel.
# The glob filter mirrors the ".csv" suffix filter used when the files are
# enumerated one by one further down.

raw_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")  # or schema() if you want it fixed
    .option("pathGlobFilter", "*.csv")  # skip non-CSV objects in the folder
    .csv(SOLAR_ROOT)
)

In [0]:


# ---- 1. Per-file loader (analogous to load_single_parquet) ----

def load_single_solar_csv(path: str, site_id: str):
    """
    Load one solar irradiance CSV and shape it into the Bronze layout.

    The file is read with a header row and inferred column types. It is
    expected to provide a ``timeStamp`` column plus the numeric columns
    ``ghi``, ``dni`` and ``dhi``.

    Args:
        path: Location of the CSV file to read; also stored in ``source_file``.
        site_id: Site identifier stamped onto every row.

    Returns:
        A DataFrame with, in order: site_id, obs_time, obs_date, year,
        ghi_w_m2, dni_w_m2, dhi_w_m2, source_file, ingest_ts, followed by
        the untouched source fields raw_timestamp, ghi, dni, dhi.
    """
    csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
    csv_df = csv_reader.csv(path)

    # Keep the source timestamp under a raw_ name, then derive the
    # canonical time columns from it.
    timed_df = (
        csv_df
        .withColumnRenamed("timeStamp", "raw_timestamp")
        .withColumn("obs_time", F.to_timestamp("raw_timestamp"))
        .withColumn("obs_date", F.to_date("obs_time"))
        .withColumn("year", F.year("obs_time"))
    )

    # Tag every row with the site it belongs to.
    shaped_df = timed_df.withColumn("site_id", F.lit(site_id).cast(StringType()))

    # Irradiance measurements (W/m^2) as doubles, next to the raw columns.
    irradiance_columns = {
        "ghi_w_m2": "ghi",
        "dni_w_m2": "dni",
        "dhi_w_m2": "dhi",
    }
    for bronze_name, raw_name in irradiance_columns.items():
        shaped_df = shaped_df.withColumn(bronze_name, F.col(raw_name).cast(DoubleType()))

    # Ingestion metadata: which file the row came from and when it was loaded.
    shaped_df = (
        shaped_df
        .withColumn("source_file", F.lit(path))
        .withColumn("ingest_ts", F.current_timestamp())
    )

    # Fixed column order: identity, time, measurements, metadata, and finally
    # the raw fields retained for lineage/debugging.
    bronze_columns = [
        "site_id",
        "obs_time", "obs_date", "year",
        "ghi_w_m2", "dni_w_m2", "dhi_w_m2",
        "source_file", "ingest_ts",
        "raw_timestamp", "ghi", "dni", "dhi",
    ]
    return shaped_df.select(*bronze_columns)

In [0]:


# ---- 2. Enumerate solar files (SMART-DS style) ----

print(f"Listing solar files under {SOLAR_ROOT}")
files = dbutils.fs.ls(SOLAR_ROOT)

# Only entries whose name ends in ".csv" are ingested; anything else is skipped.
csv_files = list(filter(lambda entry: entry.name.endswith(".csv"), files))
print(f"Found {len(csv_files)} CSV files")

# One Bronze-shaped DataFrame per input file, all tagged with the same site.
all_dfs = [load_single_solar_csv(entry.path, SITE_ID) for entry in csv_files]

# Stop here when there is nothing to ingest, so later cells never see an empty list.
if len(all_dfs) == 0:
    raise Exception(f"No solar CSV files found under {SOLAR_ROOT}")

In [0]:

# ---- 3. Union per-file DataFrames into one Bronze DataFrame ----

def _union_by_name(left, right):
    """Stack two DataFrames by column name, tolerating columns missing on either side."""
    return left.unionByName(right, allowMissingColumns=True)


# Fold the per-file DataFrames pairwise, left to right, into a single one.
bronze_df = reduce(_union_by_name, all_dfs)

# Sanity check: total row count plus a small preview of the combined data.
bronze_row_count = bronze_df.count()
print("Solar Bronze row count:", bronze_row_count)
display(bronze_df.limit(5))

In [0]:

# ---- 4. Write Bronze Delta & register table ----

# Delta writer in overwrite mode, partitioned by observation year
# ("site_id" could be added as a second partition column later).
bronze_writer = (
    bronze_df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year")
)

# Persist the data and register it under the Bronze table name.
bronze_writer.saveAsTable(BRONZE_TABLE)

# Read the table back and preview a few rows to confirm the write.
display(spark.table(BRONZE_TABLE).limit(5))