In [None]:
# -------------------------------------------------------------------
# BLOCK 1 — GOLD TABLE SCHEMA METADATA
# -------------------------------------------------------------------
# Single source of truth for all Gold-layer table definitions.
# Each key is a table name, and each value is an ordered mapping
# of column names to Spark SQL data types.
# -------------------------------------------------------------------

# Gold-layer table definitions: table name -> {column name -> Spark SQL type}.
# Insertion order of the columns is preserved and is the order in which the
# CREATE TABLE column list is rendered and the schema is later validated.
#
# NOTE(review): monetary columns are declared as "float" and order_date as
# "string" -- presumably intentional for this layer; confirm with downstream
# consumers before switching to decimal/date types.
tables = dict(
    # Product dimension
    dim_product=dict(
        product_sk="bigint",
        product_id="string",
        product_name="string",
        product_number="string",
        color="string",
        standard_cost="float",
        list_price="float",
        size="string",
        weight="float",
        category="string",
        subcategory="string",
    ),
    # Sales fact, keyed to dimensions through surrogate keys (*_sk)
    fact_sales=dict(
        sales_sk="bigint",
        order_id="string",
        order_date="string",
        customer_sk="bigint",
        product_sk="bigint",
        quantity="int",
        unit_price="float",
        discount="float",
        line_total="float",
    ),
    # Sales rows set aside as exceptions; carries the source ids
    # (customer_id / product_id) plus a free-text reason
    exceptions_fact_sales=dict(
        order_id="string",
        order_date="string",
        customer_id="string",
        product_id="string",
        quantity="int",
        unit_price="float",
        discount="float",
        line_total="float",
        reason="string",
    ),
)

In [None]:
# -------------------------------------------------------------------
# BLOCK 2 — GOLD TABLE GENERATOR (DROP + CREATE)
# -------------------------------------------------------------------
# Kill-and-fill pattern:
# - DROP TABLE IF EXISTS <table>
# - CREATE TABLE <table> (...) from metadata
# Guarantees deterministic schema and wipes drift.
# -------------------------------------------------------------------

for table_name, columns in tables.items():

    # Kill-and-fill step 1: remove whatever definition (and data) exists today
    drop_stmt = f"DROP TABLE IF EXISTS {table_name}"
    spark.sql(drop_stmt)

    # Render one "<column> <type>" entry per metadata item, in metadata order
    col_defs = ",\n    ".join(f"{col} {dtype}" for col, dtype in columns.items())

    # Kill-and-fill step 2: recreate the table from the rendered column list
    create_stmt = f"""
        CREATE TABLE {table_name} (
            {col_defs}
        )
    """
    spark.sql(create_stmt)

In [None]:
# -------------------------------------------------------------------
# BLOCK 3 — CONFIRMATION OUTPUT
# -------------------------------------------------------------------
# Simple console confirmation for notebook and pipeline runs.
# -------------------------------------------------------------------

# One confirmation line per table named in the metadata, in metadata order.
for table_name in tables:
    confirmation = f"[INIT] Created table: {table_name}"
    print(confirmation)

In [None]:
# -------------------------------------------------------------------
# BLOCK 4 — MAINTENANCE (OPTIONAL OPTIMIZE + ZORDER)
# -------------------------------------------------------------------
# Lightweight performance maintenance for Gold tables.
# - OPTIMIZE compacts small files.
# - ZORDER improves data skipping on common filter columns.
#
# NOTE:
# - Adjust ZORDER columns per table as needed.
# - Safe to no-op if OPTIMIZE/ZORDER not supported in environment.
# -------------------------------------------------------------------

# Columns to Z-order by, per table. A table that is absent from this mapping
# (or mapped to an empty list) is compacted without Z-ordering.
zorder_columns = {
    "dim_product": ["product_id", "category"],
    "fact_sales": ["order_date", "customer_sk", "product_sk"],
    "exceptions_fact_sales": ["order_date", "customer_id", "product_id"]
}

for table_name in tables:
    # Issue exactly one OPTIMIZE per table. OPTIMIZE ... ZORDER BY rewrites
    # the table's files itself, so running a plain OPTIMIZE immediately before
    # it only rewrites the same data twice.
    optimize_stmt = f"OPTIMIZE {table_name}"
    zorder_cols = zorder_columns.get(table_name)
    if zorder_cols:
        optimize_stmt += f" ZORDER BY ({', '.join(zorder_cols)})"

    try:
        spark.sql(optimize_stmt)
        print(f"[MAINT] Optimized table: {table_name}")

    except Exception as e:
        # Deliberately non-fatal: maintenance is best-effort (e.g. the
        # environment may not support OPTIMIZE/ZORDER), so log and move on
        # to the next table.
        print(f"[MAINT][WARN] Maintenance skipped for {table_name}: {e}")

In [None]:
# -------------------------------------------------------------------
# BLOCK 5 — SCHEMA VALIDATION
# -------------------------------------------------------------------
# Validates that the physical table schema matches the metadata:
# - Same columns
# - Same data types (case-insensitive)
# - Same column order
#
# Any mismatch is printed as a warning. You can choose to:
# - raise an Exception to fail the pipeline, or
# - log and continue.
# -------------------------------------------------------------------

from pyspark.sql.types import StructType

def get_table_schema(table_name: str) -> StructType:
    """Return the schema Spark reports for the table named ``table_name``.

    Uses the notebook-level ``spark`` session; any error raised while
    resolving the table propagates to the caller.
    """
    table_df = spark.table(table_name)
    return table_df.schema

# Alternative Spark SQL spellings of simple types, mapped to the spelling
# used by DataType.simpleString(). Only whole-string matches are folded.
_SPARK_TYPE_ALIASES = {
    "byte": "tinyint",
    "short": "smallint",
    "integer": "int",
    "long": "bigint",
    "real": "float",
}


def normalize_dtype(dtype_str: str) -> str:
    """Return a canonical, comparison-friendly form of a Spark type string.

    The result is trimmed and lower-cased. Whitespace next to the
    punctuation characters ``( ) , < > :`` is removed, so ``decimal(10, 2)``
    and ``decimal(10,2)`` compare equal. A value that is exactly one of the
    simple aliases in ``_SPARK_TYPE_ALIASES`` (for example ``integer`` or
    ``long``) is replaced by its canonical name; aliases nested inside
    complex types such as ``array<integer>`` are left untouched.

    Args:
        dtype_str: Type name as written in the metadata or as reported by
            ``DataType.simpleString()``.

    Returns:
        The normalized type string.
    """
    import re  # local import keeps this helper self-contained within its cell

    cleaned = dtype_str.strip().lower()
    # Tighten "decimal(10, 2)" / "map<string, int>" style spacing.
    cleaned = re.sub(r"\s*([(),<>:])\s*", r"\1", cleaned)
    return _SPARK_TYPE_ALIASES.get(cleaned, cleaned)

# One entry per failing table:
#   (table_name, physical_cols, expected_cols)  -> schema mismatch
#   (table_name, error_message, None)           -> table could not be inspected
validation_errors = []

for table_name, columns in tables.items():
    try:
        schema = get_table_schema(table_name)
        # Lists of (column, normalized type); list equality therefore checks
        # names, types and column order in one comparison.
        physical_cols = [(f.name, normalize_dtype(f.dataType.simpleString())) for f in schema]
        expected_cols = [(col, normalize_dtype(dtype)) for col, dtype in columns.items()]

    except Exception as e:
        validation_errors.append((table_name, str(e), None))
        print(f"[VALIDATION][ERROR] Could not validate {table_name}: {e}")
        continue

    if physical_cols == expected_cols:
        print(f"[VALIDATION] Schema OK for {table_name}")
        continue

    validation_errors.append((table_name, physical_cols, expected_cols))
    print(f"[VALIDATION][ERROR] Schema mismatch for {table_name}")
    # Show both sides so the log says what differs, not only that something does.
    print(f"[VALIDATION][ERROR]   expected: {expected_cols}")
    print(f"[VALIDATION][ERROR]   actual:   {physical_cols}")

# Optional: fail hard if any validation errors
if validation_errors:
    print("[VALIDATION][SUMMARY] One or more tables failed schema validation.")
    # Uncomment to fail the notebook/pipeline:
    # raise Exception("Gold table schema validation failed. See logs above.")
else:
    print("[VALIDATION][SUMMARY] All Gold tables passed schema validation.")

In [None]:
# -------------------------------------------------------------------
# BLOCK 6 — LOGGING (GOLD INIT LOG TABLE)
# -------------------------------------------------------------------
# Writes a simple log entry to a Gold log table:
# - run_timestamp
# - status
# - table_count
# - validation_error_count
#
# This can be extended later with:
# - pipeline_run_id
# - user
# - environment
# -------------------------------------------------------------------

from datetime import datetime, timezone

log_table = "gold_pipeline_log"

# Column layout of the log table as a Spark DDL string. It is used both to
# create the table and to type the DataFrame appended to it, so the two cannot
# drift apart. insertInto() matches columns by position, and without an
# explicit schema the Python ints below would be inferred as bigint rather
# than the int the table declares.
log_schema_ddl = (
    "run_timestamp string, "
    "process string, "
    "status string, "
    "table_count int, "
    "validation_error_count int"
)

# Ensure log table exists (idempotent)
spark.sql(f"CREATE TABLE IF NOT EXISTS {log_table} ({log_schema_ddl})")

# datetime.utcnow() is deprecated (Python 3.12+). Take an aware UTC "now" and
# drop the tzinfo so the stored string keeps the same naive ISO-8601 format as
# rows written by earlier runs.
run_timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
process_name = "gold_table_init"
# The run counts as failed as soon as any table failed schema validation.
status = "SUCCESS" if not validation_errors else "FAILED"
table_count = len(tables)
validation_error_count = len(validation_errors)

log_df = spark.createDataFrame(
    [
        (run_timestamp, process_name, status, table_count, validation_error_count)
    ],
    schema=log_schema_ddl,
)

log_df.write.mode("append").insertInto(log_table)

print(f"[LOG] Wrote Gold init log entry: status={status}, tables={table_count}, validation_errors={validation_error_count}")