In [0]:
from pyspark.sql import functions as F

In [0]:
# Catalog and schema to export from, plus the S3 prefix the export is written under.
CATALOG = "energy_usage_data_platform"
GOLD_SCHEMA = f"{CATALOG}.gold"
EXPORT_BASE = "s3://energy-data-platform-project/export"

# Select the catalog first, then the schema inside it, as the session defaults.
for use_statement in (f"USE CATALOG {CATALOG}", f"USE {GOLD_SCHEMA}"):
    spark.sql(use_statement)

In [0]:
# List every entry registered in the gold schema.
tables_df = spark.sql(f"SHOW TABLES IN {GOLD_SCHEMA}")

# Keep only non-temporary entries. This drops session-scoped temporary views
# only; permanent views, if SHOW TABLES lists them, are NOT filtered out here.
table_names = [
    row["tableName"]
    for row in tables_df.collect()
    if not row["isTemporary"]
]

print("Gold tables to export:", table_names)

In [0]:
def get_export_config(table_name: str):
    """
    Look up where and how a table should be exported.

    Returns a 2-tuple ``(export_path, partition_cols)``:
      * export_path    -- ``EXPORT_BASE`` joined with the table name.
      * partition_cols -- list of column names to partition on; an empty
                          list means the table is exported unpartitioned.
    """
    destination = f"{EXPORT_BASE}/{table_name}"

    # Time-series fact tables, which are split by observation date.
    date_partitioned_tables = (
        "daily_region_usage",
        "hourly_region_usage",
        "data_coverage_summary",
        "extreme_event_days",
    )

    # Anything not listed above (dimensions, small analytic tables) gets no
    # partition columns. A fresh list is built on every call.
    partition_cols = ["obs_date"] if table_name in date_partitioned_tables else []
    return destination, partition_cols


In [0]:
# Export every discovered gold table to Parquet under its configured path.
for tbl in table_names:
    full_table_name = f"{GOLD_SCHEMA}.{tbl}"
    export_path, partition_cols = get_export_config(tbl)

    print(f"Exporting {full_table_name} -> {export_path}")

    df = spark.table(full_table_name)

    if partition_cols:
        # Shuffle by the partition columns so rows with the same partition
        # values land in the same task, keeping the number of files per output
        # directory small. No explicit partition count is passed: the previous
        # code passed len(partition_cols) (i.e. 1) as the count, which forced
        # the whole table through a single task.
        df = df.repartition(*partition_cols)
    else:
        # Unpartitioned tables are written as a single file.
        df = df.coalesce(1)

    # "header" is a CSV-only option and has no meaning for Parquet, so it is
    # not set here.
    writer = df.write.mode("overwrite").option("compression", "snappy")

    if partition_cols:
        # Actually lay the output out as <col>=<value>/ directories. Note that
        # partition column values are then encoded in the directory names
        # rather than stored inside the data files.
        writer = writer.partitionBy(*partition_cols)

    writer.parquet(export_path)

print("Export completed.")
