In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from datetime import datetime
import json


In [0]:
# Notebook parameters: an optional JSON config (label says it comes from
# 01_config) plus a stand-alone run-mode widget.
dbutils.widgets.text("pipeline_config_json", "", "Pipeline Config JSON (from 01_config)")
dbutils.widgets.text("run_mode", "full", "Run Mode")  # only read when no JSON config is supplied

pipeline_config_json = dbutils.widgets.get("pipeline_config_json").strip()

if not pipeline_config_json:
    # No config passed in: fall back to the hard-coded catalog layout and
    # take the run mode from its own widget.
    CATALOG = "ironman"
    SOURCE_TABLE = f"{CATALOG}.silver.ironman_results"

    DIM_ATHLETES = f"{CATALOG}.gold.dim_athletes"
    DIM_DIVISIONS = f"{CATALOG}.gold.dim_divisions"
    DIM_COUNTRIES = f"{CATALOG}.gold.dim_countries"

    TARGET_TABLE = f"{CATALOG}.gold.fact_race_results"

    run_mode = dbutils.widgets.get("run_mode")
    process_year = None
    merge_key_cols = ["row_key"]
else:
    # Config supplied: table names are mandatory keys (a missing one raises
    # KeyError); run mode, year filter and merge keys are optional.
    pipeline_config = json.loads(pipeline_config_json)

    SOURCE_TABLE = pipeline_config["silver_table"]

    DIM_ATHLETES = pipeline_config["gold_dim_athletes"]
    DIM_DIVISIONS = pipeline_config["gold_dim_divisions"]
    DIM_COUNTRIES = pipeline_config["gold_dim_countries"]

    TARGET_TABLE = pipeline_config["gold_fact_results"]

    run_mode = pipeline_config.get("run_mode", "full")
    process_year = pipeline_config.get("process_year")

    incr_cfg = pipeline_config.get("incremental", {})
    merge_key_cols = incr_cfg.get("merge_key_cols", ["row_key"])

# Echo the resolved settings so the run log shows what was actually used.
for summary_line in (
    f"Source: {SOURCE_TABLE}",
    f"Target: {TARGET_TABLE}",
    f"Run Mode: {run_mode}",
    f"Process Year: {process_year or 'ALL'}",
    "\nDimensions:",
    f"  Athletes: {DIM_ATHLETES}",
    f"  Divisions: {DIM_DIVISIONS}",
    f"  Countries: {DIM_COUNTRIES}",
):
    print(summary_line)

In [0]:
def _load_dimension(label, table_name):
    """Read a dimension table, print its row count, and return the DataFrame."""
    frame = spark.table(table_name)
    print(f"Dim {label}: {frame.count():,}")
    return frame


# Silver source rows, optionally restricted to a single year.
silver_df = spark.table(SOURCE_TABLE)

if process_year:
    year_value = int(process_year)
    silver_df = silver_df.where(F.col("year") == year_value)
    print(f"Filtered to year: {process_year}")

silver_row_count = silver_df.count()
print(f"Silver rows: {silver_row_count:,}")

# Gold dimensions used for the surrogate-key lookups.
dim_athletes = _load_dimension("athletes", DIM_ATHLETES)

dim_divisions = _load_dimension("divisions", DIM_DIVISIONS)

dim_countries = _load_dimension("countries", DIM_COUNTRIES)

In [0]:
# Build the athlete natural key: the name with every non-alphanumeric character
# removed, joined by "_" to the country (or "UNKNOWN" when the country is NULL),
# all lower-cased.
name_part = F.regexp_replace(F.col("athlete_name"), "[^a-zA-Z0-9]", "")
country_part = F.coalesce(F.col("country"), F.lit("UNKNOWN"))
natural_key_expr = F.lower(F.concat_ws("_", name_part, country_part))

fact_df = silver_df.withColumn("athlete_natural_key", natural_key_expr)

print("Sample athlete keys:")
athlete_key_sample = fact_df.select("athlete_name", "country", "athlete_natural_key").limit(5)
display(athlete_key_sample)

In [0]:
def _attach_dimension_key(facts, dimension, key_col, match_col, label, skip_null_source=True):
    """Left-join a dimension's surrogate key onto the fact rows and report misses.

    Args:
        facts: DataFrame holding the fact rows; must contain ``match_col``.
        dimension: Dimension DataFrame; must contain ``key_col`` and ``match_col``.
        key_col: Surrogate-key column copied from the dimension onto the facts.
        match_col: Column name, present on both sides, that the join compares.
        label: Plural noun used in the printed messages (e.g. "athletes").
        skip_null_source: When True, fact rows whose ``match_col`` is NULL are
            not counted as unmatched.

    Returns:
        Tuple ``(joined, unmatched)``: the facts with ``key_col`` added (NULL
        where no dimension row matched) and the number of unmatched rows.
    """
    # Alias the dimension's match column so it cannot clash with the fact column.
    lookup_col = f"dim_{match_col}"
    lookup = dimension.select(F.col(key_col), F.col(match_col).alias(lookup_col))

    # A match value that occurs more than once in the dimension makes the left
    # join emit one fact row per occurrence, silently inflating the fact table.
    has_duplicates = (
        lookup
        .filter(F.col(lookup_col).isNotNull())
        .groupBy(lookup_col)
        .count()
        .filter(F.col("count") > 1)
        .limit(1)
        .count()
    )
    if has_duplicates:
        print(
            f"WARNING: duplicate {match_col} values in the {label} dimension; "
            "matching fact rows will be duplicated by the join"
        )

    joined = facts.join(lookup, facts[match_col] == lookup[lookup_col], "left").drop(lookup_col)

    missing_key = F.col(key_col).isNull()
    if skip_null_source:
        missing_key = F.col(match_col).isNotNull() & missing_key
    unmatched = joined.filter(missing_key).count()
    print(f"Unmatched {label}: {unmatched}")
    return joined, unmatched


# Athletes: every row with no athlete_key is counted, including NULL natural keys.
fact_df, unmatched_athletes = _attach_dimension_key(
    fact_df, dim_athletes, "athlete_key", "athlete_natural_key", "athletes",
    skip_null_source=False,
)

# Divisions and countries: a NULL source value is not reported as a miss.
fact_df, unmatched_divisions = _attach_dimension_key(
    fact_df, dim_divisions, "division_key", "division", "divisions"
)

fact_df, unmatched_countries = _attach_dimension_key(
    fact_df, dim_countries, "country_key", "country", "countries"
)

# Surrogate key for the fact row, derived from row_key.
# NOTE(review): F.hash returns a 32-bit int and abs() folds it into the
# non-negative half, so distinct row_keys can share a fact_key on large tables,
# and abs() of the minimum int value cannot be represented. A wider hash
# (e.g. xxhash64) would avoid this but would change every existing key, so the
# expression is deliberately left as-is here.
fact_df = fact_df.withColumn("fact_key", F.abs(F.hash(F.col("row_key"))))

print("Sample fact keys:")
display(
    fact_df
    .select("fact_key", "row_key", "athlete_name", "year")
    .limit(5)
)

In [0]:
# Final fact-table projection. The list order is the output column order.
fact_columns = [
    # surrogate / dimension keys
    "fact_key", "athlete_key", "division_key", "country_key",
    # descriptive attributes
    "year", "source_gender", "designation", "bib",
    # status flags
    "is_finisher", "is_dnf", "is_dns", "is_dq", "has_data_issue",
    # overall placement
    "rank", "div_rank", "gender_rank", "overall_rank", "points",
    # swim leg and first transition
    "swim_time_seconds", "swim_div_rank", "swim_gender_rank", "swim_overall_rank",
    "transition_1_seconds",
    # bike leg and second transition
    "bike_time_seconds", "bike_div_rank", "bike_gender_rank", "bike_overall_rank",
    "transition_2_seconds",
    # run leg and finish
    "run_time_seconds", "run_div_rank", "run_gender_rank", "run_overall_rank",
    "finish_time_seconds",
    # source row identifier (default merge key)
    "row_key",
]

fact_race_results = fact_df.select(*fact_columns)

final_column_count = len(fact_race_results.columns)
print(f"Final column count: {final_column_count}")
final_row_count = fact_race_results.count()
print(f"Final row count: {final_row_count:,}")



In [0]:
# Persist fact_race_results: full (re)load, or insert-only merge on the merge keys.
table_exists = spark.catalog.tableExists(TARGET_TABLE)

# Accept a single column name as well as a list; iterating a bare string would
# otherwise yield one "key" per character.
key_cols = [merge_key_cols] if isinstance(merge_key_cols, str) else list(merge_key_cols)

# Null-safe equality (<=>): with plain "=", a row whose key is NULL never
# matches its earlier copy and is inserted again on every incremental run.
merge_condition = " AND ".join(f"target.{c} <=> source.{c}" for c in key_cols)

if (not table_exists) or (run_mode == "full"):
    writer = fact_race_results.write.format("delta").mode("overwrite")
    if table_exists and process_year:
        # The source was filtered to one year, so a plain overwrite would
        # delete every other year already in the table. replaceWhere limits
        # the overwrite to the rows of that year. The existing table schema is
        # kept in this path (no overwriteSchema).
        year_predicate = f"year = {int(process_year)}"
        print(f"Full load to {TARGET_TABLE} (replacing only rows where {year_predicate})")
        writer = writer.option("replaceWhere", year_predicate)
    else:
        print(f"Full load to {TARGET_TABLE}")
        writer = writer.option("overwriteSchema", "true")
    writer.saveAsTable(TARGET_TABLE)
else:
    # Fail early with a readable message instead of an analysis error from
    # inside the merge.
    missing_key_cols = [c for c in key_cols if c not in fact_race_results.columns]
    if not key_cols or missing_key_cols:
        raise ValueError(
            f"Invalid merge_key_cols {merge_key_cols!r}: "
            f"columns not in fact_race_results: {missing_key_cols}"
        )

    print(f"Incremental merge (insert-only) to {TARGET_TABLE}")
    delta_table = DeltaTable.forName(spark, TARGET_TABLE)
    (
        delta_table.alias("target")
        .merge(
            fact_race_results.alias("source"),
            merge_condition
        )
        # Insert-only: rows already present in the target are left untouched.
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Write complete")

In [0]:
# Read the written table back and summarise it by year and gender.
result_df = spark.table(TARGET_TABLE)

print(f"Table: {TARGET_TABLE}")
target_row_count = result_df.count()
print(f"Total rows: {target_row_count:,}")

# 1 for finishers, 0 otherwise (including NULL flags), so the sum is a finisher count.
finisher_indicator = F.when(F.col("is_finisher"), 1).otherwise(0)

rows_by_year_gender = (
    result_df
    .groupBy("year", "source_gender")
    .agg(
        F.count("*").alias("total"),
        F.sum(finisher_indicator).alias("finishers"),
    )
    .orderBy("year", "source_gender")
)

print("\nRows by year and gender:")
display(rows_by_year_gender)

In [0]:
# Report how many fact rows lack each dimension key.
print("Dimension key coverage:")

# One aggregation pass instead of four separate scans of the table.
coverage = result_df.agg(
    F.count("*").alias("total"),
    F.count(F.when(F.col("athlete_key").isNull(), 1)).alias("athlete_nulls"),
    F.count(F.when(F.col("division_key").isNull(), 1)).alias("division_nulls"),
    F.count(F.when(F.col("country_key").isNull(), 1)).alias("country_nulls"),
).first()

total = coverage["total"]
athlete_nulls = coverage["athlete_nulls"]
division_nulls = coverage["division_nulls"]
country_nulls = coverage["country_nulls"]

for key_name, null_count in (
    ("athlete_key", athlete_nulls),
    ("division_key", division_nulls),
    ("country_key", country_nulls),
):
    # An empty target table (e.g. a year filter with no rows) would otherwise
    # raise ZeroDivisionError here.
    null_pct = (null_count / total * 100) if total else 0.0
    print(f"  {key_name} nulls: {null_count} ({null_pct:.1f}%)")

# Break the rows without a division key down by designation.
display(
    result_df
    .filter(F.col("division_key").isNull())
    .groupBy("designation")
    .count()
    .orderBy(F.col("count").desc())
)

In [0]:
# NULL-country share in the unfiltered silver table.
silver_full_df = spark.table(SOURCE_TABLE)

print("Country analysis from silver:")

# One aggregation pass yields both the total and the NULL-country count.
country_stats = silver_full_df.agg(
    F.count("*").alias("total_rows"),
    F.count(F.when(F.col("country").isNull(), 1)).alias("null_country_rows"),
).first()

null_country_count = country_stats["null_country_rows"]
print(f"  Athletes with NULL country in silver: {null_country_count}")

total_athletes = country_stats["total_rows"]
print(f"  Total athletes: {total_athletes}")

# An empty silver table would otherwise raise ZeroDivisionError.
null_country_pct = (null_country_count / total_athletes * 100) if total_athletes else 0.0
print(f"  Percentage with NULL country: {null_country_pct:.1f}%")

In [0]:
# Smoke test of the star schema: join the fact table to its three dimensions
# and list the first ten finishers ordered by year and rank.
star_schema_sql = f"""
    SELECT 
        f.year,
        a.athlete_name,
        c.country_name,
        c.continent,
        d.division_description,
        f.rank,
        f.finish_time_seconds,
        ROUND(f.finish_time_seconds / 3600, 2) as finish_hours
    FROM {TARGET_TABLE} f
    LEFT JOIN {DIM_ATHLETES} a ON f.athlete_key = a.athlete_key
    LEFT JOIN {DIM_COUNTRIES} c ON f.country_key = c.country_key
    LEFT JOIN {DIM_DIVISIONS} d ON f.division_key = d.division_key
    WHERE f.is_finisher = true
    ORDER BY f.year, f.rank
    LIMIT 10
"""

test_query = spark.sql(star_schema_sql)

print("Star schema query result (Top 10 finishers):")
display(test_query)

In [0]:
# Compact the target table, then print a closing summary banner.
optimize_statement = f"OPTIMIZE {TARGET_TABLE}"
spark.sql(optimize_statement)
print("Table optimized")

banner = "=" * 50

print("\n" + banner)
print("FACT TABLE COMPLETE: fact_race_results")
print(banner)
print(f"Table: {TARGET_TABLE}")
# Re-read the table so the reported count reflects what is stored after the write.
stored_row_count = spark.table(TARGET_TABLE).count()
print(f"Rows: {stored_row_count:,}")
print(f"Timestamp: {datetime.now()}")
print(banner)