In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from delta.tables import DeltaTable
from datetime import datetime
import json

In [0]:
# Notebook parameters: a JSON config string (the widget label says it comes
# from 01_config) plus a plain run-mode widget that is only read when no JSON
# config is supplied.
dbutils.widgets.text("pipeline_config_json", "", "Pipeline Config JSON (from 01_config)")
dbutils.widgets.text("run_mode", "full", "Run Mode")  # fallback

pipeline_config_json = dbutils.widgets.get("pipeline_config_json").strip()

if pipeline_config_json:
    # Config-driven run: table names are mandatory keys (KeyError if absent),
    # run mode and year are optional.
    pipeline_config = json.loads(pipeline_config_json)

    SOURCE_TABLE = pipeline_config["silver_table"]
    TARGET_TABLE = pipeline_config["gold_dim_athletes"]
    run_mode = pipeline_config.get("run_mode", "full")
    process_year = pipeline_config.get("process_year", None)
else:
    # Stand-alone run: fall back to hard-coded table names and the widget.
    CATALOG = "ironman"
    SOURCE_TABLE = f"{CATALOG}.silver.ironman_results"
    TARGET_TABLE = f"{CATALOG}.gold.dim_athletes"
    run_mode = dbutils.widgets.get("run_mode")
    process_year = None

# The write cell compares run_mode to the literal "full". Both sources are
# free text, so normalise case and surrounding whitespace; otherwise a value
# such as "Full" or "full " would silently select the incremental merge path.
# Non-string values are left untouched.
if isinstance(run_mode, str):
    run_mode = run_mode.strip().lower()

print(f"Source: {SOURCE_TABLE}")
print(f"Target: {TARGET_TABLE}")
print(f"Run Mode: {run_mode}")
print(f"Process Year: {process_year if process_year else 'ALL'}")

In [0]:
# Read the source table named by SOURCE_TABLE as a DataFrame.
silver_df = spark.read.table(SOURCE_TABLE)

In [0]:
# When a process year was configured, keep only that year's rows; a falsy
# process_year (None, empty string, 0) leaves the frame unfiltered.
if process_year:
    target_year = int(process_year)
    silver_df = silver_df.where(F.col("year") == target_year)
    print(f"Filtered to year: {process_year}")

# Row count of the (possibly filtered) input; this triggers a Spark action.
silver_row_count = silver_df.count()
print(f"Silver rows: {silver_row_count:,}")

In [0]:
# Natural key = lower("<name>_<country>"), where the name keeps only ASCII
# letters and digits (everything else is removed by the regex) and a NULL
# country is replaced with the literal "UNKNOWN".
silver_name_alnum = F.regexp_replace(F.col("athlete_name"), "[^a-zA-Z0-9]", "")
silver_country_filled = F.coalesce(F.col("country"), F.lit("UNKNOWN"))
silver_natural_key = F.lower(F.concat_ws("_", silver_name_alnum, silver_country_filled))

silver_with_key = silver_df.withColumn("athlete_natural_key", silver_natural_key)

In [0]:
# Keep one row per athlete_natural_key: latest year first, then latest
# load_timestamp (NULL timestamps last).
#
# The trailing columns are tie-breakers. Without them, rows sharing the same
# year and load_timestamp would be ranked arbitrarily by row_number(), and
# since this frame is evaluated by several separate actions, a different
# winner could be picked each time. Ordering by every column that is selected
# below makes the surviving values deterministic.
w = Window.partitionBy("athlete_natural_key").orderBy(
    F.col("year").desc(),
    F.col("load_timestamp").desc_nulls_last(),
    F.col("athlete_name"),
    F.col("first_name"),
    F.col("last_name"),
    F.col("country"),
)

athletes_df = (
    silver_with_key
    .withColumn("rn", F.row_number().over(w))
    .filter(F.col("rn") == 1)  # rank 1 = the representative row per athlete
    .select("athlete_natural_key", "athlete_name", "first_name", "last_name", "country")
)

print(f"Unique athletes: {athletes_df.count():,}")

In [0]:
# Surrogate key: absolute value of Spark's hash over "<athlete_name>_<country>".
# NOTE(review): this hashes the raw athlete_name/country values, not
# athlete_natural_key, so the two keys are derived from different inputs —
# confirm this is intended.
raw_name_country = F.concat_ws("_", F.col("athlete_name"), F.col("country"))
athlete_key_expr = F.abs(F.hash(raw_name_country))

athletes_df = athletes_df.withColumn("athlete_key", athlete_key_expr)

In [0]:
# Rebuild athlete_natural_key on the de-duplicated frame with the same
# expression that was applied to the silver rows: lower-cased
# "<name reduced to ASCII letters/digits>_<country or UNKNOWN>".
# withColumn replaces the existing column of the same name, which the
# de-duplication select already carried over.
dim_name_alnum = F.regexp_replace(F.col("athlete_name"), "[^a-zA-Z0-9]", "")
dim_country_filled = F.coalesce(F.col("country"), F.lit("UNKNOWN"))
dim_natural_key = F.lower(F.concat_ws("_", dim_name_alnum, dim_country_filled))

athletes_df = athletes_df.withColumn("athlete_natural_key", dim_natural_key)

In [0]:
# Audit columns: both are stamped with the current timestamp for this load.
load_ts = F.current_timestamp()
athletes_df = athletes_df.withColumn("created_at", load_ts)
athletes_df = athletes_df.withColumn("updated_at", load_ts)

In [0]:
# Final column order of the dimension: keys, descriptive attributes, audit.
dim_athlete_columns = [
    "athlete_key",
    "athlete_natural_key",
    "athlete_name",
    "first_name",
    "last_name",
    "country",
    "created_at",
    "updated_at",
]
dim_athletes = athletes_df.select(*dim_athlete_columns)

# Counting triggers a Spark action over the full transformation chain.
final_row_count = dim_athletes.count()
print(f"Final row count: {final_row_count:,}")

In [0]:
# Persist the dimension. A missing target table or run_mode == "full" causes
# a complete overwrite; any other run mode merges into the existing table.
table_exists = spark.catalog.tableExists(TARGET_TABLE)

if (not table_exists) or (run_mode == "full"):
    # NOTE(review): when process_year is set, dim_athletes only contains that
    # year's athletes, so this overwrite replaces the whole table with that
    # subset — confirm that combination is intended.
    print(f"Full load to {TARGET_TABLE}")
    (
        dim_athletes.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(TARGET_TABLE)
    )
else:
    print(f"Incremental merge to {TARGET_TABLE}")
    delta_table = DeltaTable.forName(spark, TARGET_TABLE)

    # Descriptive attributes that are refreshed from the source on a match.
    tracked_columns = ["athlete_name", "first_name", "last_name", "country"]

    # Only rewrite a matched row when at least one tracked attribute really
    # differs. `<=>` is the null-safe equality operator, so NULL vs NULL counts
    # as "unchanged". Without this condition every matched row would be
    # rewritten and its updated_at bumped on every run.
    row_changed = " OR ".join(
        f"NOT (target.{col_name} <=> source.{col_name})"
        for col_name in tracked_columns
    )

    # athlete_key and created_at are deliberately absent from the update set,
    # so matched rows keep their existing values for those columns.
    update_assignments = {
        col_name: f"source.{col_name}" for col_name in tracked_columns
    }
    update_assignments["updated_at"] = "source.updated_at"

    (
        delta_table.alias("target")
        .merge(
            dim_athletes.alias("source"),
            "target.athlete_natural_key = source.athlete_natural_key"
        )
        .whenMatchedUpdate(condition=row_changed, set=update_assignments)
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Write complete")

In [0]:
# Post-write verification: read the target table back and summarise it.
result_df = spark.table(TARGET_TABLE)

# Count once and reuse the value in both the header and the closing banner,
# instead of scanning the table a second time for the same number.
total_athletes = result_df.count()

print(f"Table: {TARGET_TABLE}")
print(f"Total athletes: {total_athletes:,}")

print("\nAthletes by country (top 10):")
top_countries = (
    result_df
    .groupBy("country")
    .count()
    .orderBy(F.col("count").desc())
    .limit(10)
)
display(top_countries)

# Closing banner for the run log.
banner = "=" * 50
print("\n" + banner)
print("DIMENSION TABLE COMPLETE: dim_athletes")
print(banner)
print(f"Table: {TARGET_TABLE}")
print(f"Rows: {total_athletes:,}")
print(f"Timestamp: {datetime.now()}")
print(banner)