In [0]:
from pyspark.sql import functions as F

# Catalog name; every table reference in this notebook is built on top of it.
CATALOG = "energy_data_platform_project"
# Two-part `<catalog>.gold` prefix used for the tables this notebook reads and writes.
GOLD_SCHEMA = ".".join([CATALOG, "gold"])

In [0]:
# Source table whose `obs_date` range drives the date dimension built below.
daily_table_name = f"{GOLD_SCHEMA}.daily_region_usage"
daily = spark.table(daily_table_name)

In [0]:
# Earliest and latest `obs_date`, computed together in one aggregation.
# `.first()` executes the query and brings the single result Row to the driver.
earliest_obs_date = F.min("obs_date").alias("min_date")
latest_obs_date = F.max("obs_date").alias("max_date")
bounds = daily.agg(earliest_obs_date, latest_obs_date).first()

In [0]:
# `bounds` is the single Row (min_date, max_date) already collected from `daily`
# in the previous cell. Reuse it here instead of re-running the identical
# aggregation, which would launch a second Spark job for the same answer.
min_date = bounds["min_date"]
max_date = bounds["max_date"]

# Both values are None when there is no non-null `obs_date` to aggregate
# (e.g. an empty table); fail fast rather than building an empty dimension.
if min_date is None or max_date is None:
    raise ValueError("No dates found in daily_region_usage; cannot build dim_date.")

# String forms are interpolated into the SQL `to_date(...)` calls in the next cell.
# NOTE(review): assumes str(obs_date) is parseable by to_date (date or
# ISO-formatted string column) - confirm against the table schema.
min_date_str = str(min_date)
max_date_str = str(max_date)

In [0]:
# One-row frame whose `date_seq` column is an array with one entry per day,
# stepping from min_date_str to max_date_str in 1-day increments.
date_seq_query = f"SELECT sequence(to_date('{min_date_str}'), to_date('{max_date_str}'), interval 1 day) as date_seq"
date_seq_df = spark.sql(date_seq_query)

# Explode the array so each calendar day becomes its own row, keyed by `date_key`.
date_key_col = F.explode("date_seq").alias("date_key")
dim_date = date_seq_df.select(date_key_col)

In [0]:
# ISO-style day number (1=Mon..7=Sun) derived from F.dayofweek, which numbers
# days 1=Sun..7=Sat. The previous date_format(..., "u") relied on a week-based
# pattern letter that Spark 3.x rejects under the default time parser policy,
# so the shift is done arithmetically instead:
#   Sun: (1 + 5) % 7 + 1 = 7,  Mon: (2 + 5) % 7 + 1 = 1,  Sat: (7 + 5) % 7 + 1 = 6
iso_day_of_week = (((F.dayofweek("date_key") + 5) % 7) + 1).cast("int")

# Attach the calendar attributes to each `date_key` row.
dim_date = (
    dim_date
    .withColumn("year", F.year("date_key"))
    .withColumn("quarter", F.quarter("date_key"))
    .withColumn("month", F.month("date_key"))
    .withColumn("day_of_month", F.dayofmonth("date_key"))
    .withColumn("day_of_week", iso_day_of_week)  # 1=Mon..7=Sun
    # NOTE(review): weekofyear follows ISO-8601 week numbering, so days around
    # New Year can carry a week number belonging to the adjacent calendar year.
    .withColumn("week_of_year", F.weekofyear("date_key"))
    .withColumn("is_weekend", F.col("day_of_week").isin(6, 7))  # Sat or Sun
    .withColumn("month_name", F.date_format("date_key", "MMM"))  # abbreviated name
)

In [0]:
# Persist the dimension as a managed table, replacing any previous version.
# The `overwriteSchema` option is set so the stored schema is replaced along
# with the data when the column set changes.
dim_date_table_name = f"{GOLD_SCHEMA}.dim_date"
dim_date_writer = dim_date.write.mode("overwrite").option("overwriteSchema", "true")
dim_date_writer.saveAsTable(dim_date_table_name)

In [0]:
# Read the table back and show a 5-row sample as a quick check of the write.
dim_date_preview = spark.table(f"{GOLD_SCHEMA}.dim_date").limit(5)
display(dim_date_preview)