In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime
import json

In [0]:
# Notebook parameters: a JSON config string (produced by 01_config) and a
# plain run_mode widget that is only consulted when no JSON is supplied.
dbutils.widgets.text("pipeline_config_json", "", "Pipeline Config JSON (from 01_config)")
dbutils.widgets.text("run_mode", "full", "Run Mode")  # fallback

pipeline_config_json = dbutils.widgets.get("pipeline_config_json").strip()

if not pipeline_config_json:
    # No JSON config passed in: use the hard-coded table names and the widget.
    CATALOG = "ironman"
    SOURCE_TABLE = f"{CATALOG}.silver.ironman_results"
    TARGET_TABLE = f"{CATALOG}.gold.dim_divisions"
    run_mode = dbutils.widgets.get("run_mode")
    process_year = None
else:
    # JSON config wins: table names are required keys, the rest are optional.
    pipeline_config = json.loads(pipeline_config_json)

    SOURCE_TABLE = pipeline_config["silver_table"]
    TARGET_TABLE = pipeline_config["gold_dim_divisions"]
    run_mode = pipeline_config.get("run_mode", "full")
    process_year = pipeline_config.get("process_year")

# Echo the resolved settings so the run log shows what was actually used.
print(f"Source: {SOURCE_TABLE}")
print(f"Target: {TARGET_TABLE}")
print(f"Run Mode: {run_mode}")
print(f"Process Year: {process_year or 'ALL'}")

In [0]:
# Read the source table resolved in the configuration cell (SOURCE_TABLE).
silver_df = spark.table(SOURCE_TABLE)

In [0]:
# Optionally restrict the source rows to a single year; a falsy process_year
# means "all years" and leaves silver_df untouched.
if process_year:
    year_matches = F.col("year") == int(process_year)
    silver_df = silver_df.filter(year_matches)
    print(f"Filtered to year: {process_year}")

silver_row_count = silver_df.count()
print(f"Silver rows: {silver_row_count:,}")

In [0]:
# One row per distinct, non-null division code found in the source data.
division_code = F.col("division")
divisions_df = silver_df.select("division").where(division_code.isNotNull()).distinct()

print(f"Unique divisions: {divisions_df.count()}")
print("\nAll divisions:")
display(divisions_df.orderBy("division"))

# Gender comes from the first letter of the code; anything else is UNKNOWN.
gender_expr = (
    F.when(division_code.startswith("M"), "M")
    .when(division_code.startswith("F"), "F")
    .otherwise("UNKNOWN")
)

# A division is flagged professional when its code contains "PRO".
is_professional_expr = F.when(division_code.contains("PRO"), True).otherwise(False)

divisions_df = (
    divisions_df
    .withColumn("gender", gender_expr)
    .withColumn("is_professional", is_professional_expr)
)

print("After parsing gender and pro status:")
display(divisions_df)

In [0]:
# Pull an "NN-NN" age range out of the division code. regexp_extract yields an
# empty string when the pattern does not match, which maps to NULL bounds below.
age_range_expr = F.regexp_extract(F.col("division"), r"(\d+\-\d+)", 1)
has_age_range = age_range_expr != ""
age_bounds = F.split(age_range_expr, "-")

divisions_df = (
    divisions_df
    # Lower bound of the age group (NULL when the code carries no range).
    .withColumn(
        "age_group_start",
        F.when(has_age_range, age_bounds.getItem(0).cast("integer")),
    )
    # Upper bound of the age group (NULL when the code carries no range).
    .withColumn(
        "age_group_end",
        F.when(has_age_range, age_bounds.getItem(1).cast("integer")),
    )
)

print("After parsing age range:")
display(divisions_df)

In [0]:
# Human-readable label for the gender code. Only "M" and "F" are mapped; any
# other value (e.g. "UNKNOWN") yields NULL so it is never mislabelled as Female.
gender_label = (
    F.when(F.col("gender") == "M", F.lit("Male"))
    .when(F.col("gender") == "F", F.lit("Female"))
)
has_gender_label = gender_label.isNotNull()

# Build the description:
#   * professional with a known gender -> "<Gender> Professional"
#   * age group with a known gender    -> "<Gender> Age <start> to <end>"
#   * everything else (unknown gender, or neither pro nor an age range)
#     -> the raw division code, unchanged
divisions_df = divisions_df.withColumn(
    "division_description",
    F.when(
        (F.col("is_professional") == True) & has_gender_label,
        F.concat(gender_label, F.lit(" Professional"))
    ).when(
        F.col("age_group_start").isNotNull() & has_gender_label,
        F.concat(
            gender_label,
            F.lit(" Age "),
            # Explicit string casts so the concatenation does not rely on implicit casting.
            F.col("age_group_start").cast("string"),
            F.lit(" to "),
            F.col("age_group_end").cast("string")
        )
    ).otherwise(F.col("division"))
)

print("With descriptions:")
display(divisions_df)

In [0]:
# Surrogate key: absolute value of Spark's hash of the division code.
# The key expression is intentionally unchanged so existing keys stay stable.
divisions_df = divisions_df.withColumn(
    "division_key",
    F.abs(F.hash(F.col("division")))
)

# A hash-derived key is not guaranteed to be unique. divisions_df holds one row
# per distinct division, so any key that appears more than once is a collision.
# Fail fast here rather than writing a dimension with duplicate keys.
colliding_keys = (
    divisions_df
    .groupBy("division_key")
    .count()
    .filter(F.col("count") > 1)
    .limit(5)
    .collect()
)
if colliding_keys:
    sample_keys = [row["division_key"] for row in colliding_keys]
    raise ValueError(
        f"division_key collision: more than one division maps to key(s) {sample_keys}"
    )

print("With surrogate key:")
display(divisions_df.select("division_key", "division", "division_description").limit(10))

In [0]:
# Stamp every row with the load time in both audit columns.
load_timestamp = F.current_timestamp()
divisions_df = divisions_df.withColumn("created_at", load_timestamp).withColumn(
    "updated_at", load_timestamp
)

print("Final schema:")
divisions_df.printSchema()

In [0]:
# Final column order of the dimension table.
dim_division_columns = [
    "division_key",
    "division",
    "division_description",
    "gender",
    "is_professional",
    "age_group_start",
    "age_group_end",
    "created_at",
    "updated_at",
]
dim_divisions = divisions_df.select(*dim_division_columns)

final_row_count = dim_divisions.count()
print(f"Final row count: {final_row_count}")
display(dim_divisions.orderBy("gender", "age_group_start"))

In [0]:
# Write the dimension: overwrite when the table is missing or run_mode is
# "full", otherwise merge on the natural key (division).
table_exists = spark.catalog.tableExists(TARGET_TABLE)

if (not table_exists) or (run_mode == "full"):
    print(f"Full load to {TARGET_TABLE}")
    (
        dim_divisions.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(TARGET_TABLE)
    )
else:
    print(f"Incremental merge to {TARGET_TABLE}")

    # Descriptive attributes that are refreshed from the source on a match.
    tracked_columns = [
        "division_description",
        "gender",
        "is_professional",
        "age_group_start",
        "age_group_end",
    ]

    # Only touch a matched row when at least one tracked attribute differs.
    # `<=>` is null-safe equality, so NULL vs NULL counts as "unchanged".
    # Without this condition every run would rewrite all matched rows and
    # bump updated_at even though nothing changed.
    row_changed = " OR ".join(
        f"NOT (target.{column} <=> source.{column})" for column in tracked_columns
    )

    update_assignments = {column: f"source.{column}" for column in tracked_columns}
    update_assignments["updated_at"] = "source.updated_at"

    delta_table = DeltaTable.forName(spark, TARGET_TABLE)
    (
        delta_table.alias("target")
        .merge(
            dim_divisions.alias("source"),
            "target.division = source.division"
        )
        .whenMatchedUpdate(condition=row_changed, set=update_assignments)
        # New divisions are inserted with all source columns.
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Write complete")

In [0]:
# Read the target back to verify what was written.
result_df = spark.table(TARGET_TABLE)

total_divisions = result_df.count()
print(f"Table: {TARGET_TABLE}")
print(f"Total divisions: {total_divisions}")

# Row counts per (gender, is_professional) combination.
print("\nDivisions by gender:")
gender_breakdown = result_df.groupBy("gender", "is_professional").count()
display(gender_breakdown.orderBy("gender", "is_professional"))

print("\nAll divisions:")
display(result_df.orderBy("gender", "age_group_start"))

In [0]:
# Closing banner with the target table name, its current row count and the time.
banner = "=" * 50

print("\n" + banner)
print("DIMENSION TABLE COMPLETE: dim_divisions")
print(banner)
print(f"Table: {TARGET_TABLE}")
print(f"Rows: {spark.table(TARGET_TABLE).count()}")
print(f"Timestamp: {datetime.now()}")
print(banner)

In [0]:
# End the notebook run, passing "SUCCESS" as the exit value.
dbutils.notebook.exit("SUCCESS")