In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType, StringType
from delta.tables import DeltaTable
from datetime import datetime
import json

In [0]:
# Notebook parameters. `pipeline_config_json` is the primary input; the
# `run_mode` and `process_year` widgets are only read when it is empty.
dbutils.widgets.text("pipeline_config_json", "", "Pipeline Config JSON (from 01_config)")
dbutils.widgets.text("run_mode", "incremental", "Run Mode")   # fallback
dbutils.widgets.text("process_year", "", "Year to Process")   # fallback


def _normalize_process_year(raw):
    """Return the requested year as an int, or None when no year was given.

    Args:
        raw: None, an int, or a string. Blank / whitespace-only strings are
            treated the same as None.

    Returns:
        int or None.

    Raises:
        ValueError: if `raw` is non-blank text that `int()` cannot parse.
    """
    if raw is None:
        return None
    if isinstance(raw, str):
        raw = raw.strip()
        if not raw:
            return None
    return int(raw)


pipeline_config_json = dbutils.widgets.get("pipeline_config_json").strip()

if pipeline_config_json:
    pipeline_config = json.loads(pipeline_config_json)

    CATALOG = pipeline_config.get("catalog", "ironman")
    SOURCE_TABLE = pipeline_config["bronze_table"]
    TARGET_TABLE = pipeline_config["silver_table"]

    run_mode = pipeline_config.get("run_mode", "full")
    # Normalise here so both branches hand downstream cells an int or None,
    # regardless of whether the JSON carried 2023, "2023", "" or null.
    process_year = _normalize_process_year(pipeline_config.get("process_year"))

    # `or {}` tolerates an explicit `"incremental": null` in the JSON.
    incr_cfg = pipeline_config.get("incremental") or {}
    merge_key_cols = incr_cfg.get("merge_key_cols", ["row_key"])
else:
    CATALOG = "ironman"
    SOURCE_TABLE = f"{CATALOG}.bronze.ironman_results"
    TARGET_TABLE = f"{CATALOG}.silver.ironman_results"

    run_mode = dbutils.widgets.get("run_mode")
    process_year_raw = dbutils.widgets.get("process_year").strip()
    process_year = _normalize_process_year(process_year_raw)

    merge_key_cols = ["row_key"]

# A bare string would otherwise be iterated character by character when the
# merge condition is built, so wrap it in a one-element list.
if isinstance(merge_key_cols, str):
    merge_key_cols = [merge_key_cols]

print(f"Source: {SOURCE_TABLE}")
print(f"Target: {TARGET_TABLE}")
print(f"Run Mode: {run_mode}")
print(f"Process Year: {process_year if process_year else 'ALL'}")
print(f"Merge Keys: {merge_key_cols}")

In [0]:
# Load the bronze source table named by the configuration cell.
bronze_df = spark.table(SOURCE_TABLE)

# Optional restriction to a single year; a falsy process_year means "all years".
if process_year:
    year_to_keep = int(process_year)
    bronze_df = bronze_df.where(F.col("year") == year_to_keep)
    print(f"Filtered to year: {process_year}")

bronze_row_count = bronze_df.count()
print(f"Bronze rows to process: {bronze_row_count:,}")

print("Bronze Schema (all strings from bronze):")
bronze_df.printSchema()

# Small preview of a few key columns for a visual sanity check.
print("\nSample data:")
preview_columns = ["rank", "athlete_name", "country", "swim_time", "finish_time"]
display(bronze_df.select(*preview_columns).limit(5))

In [0]:
def parse_time_to_seconds(time_col):
    """Build a Column expression converting a colon-separated time to seconds.

    The value is split on ":"; piece 0 is weighted as hours (x3600), piece 1
    as minutes (x60) and piece 2 as seconds. Any piece for which the integer
    cast yields null (for example an absent piece) contributes 0. Note that a
    two-piece value is therefore read as hours:minutes.

    Args:
        time_col: Column holding the time text.

    Returns:
        Column that is null when `time_col` is null or the computed total is
        0, and the total number of seconds otherwise.
    """
    pieces = F.split(time_col, ":")

    def _piece_as_int(index):
        # Null-safe integer value of one split piece.
        return F.coalesce(pieces.getItem(index).cast(IntegerType()), F.lit(0))

    total_seconds = _piece_as_int(0) * 3600 + _piece_as_int(1) * 60 + _piece_as_int(2)

    # A zero total is treated the same as a missing value.
    missing_or_zero = time_col.isNull() | (total_seconds == 0)
    return F.when(missing_or_zero, None).otherwise(total_seconds)

In [0]:
# Smoke-test the time parser on five bronze rows before using it on the full frame.
sample_time_columns = ["athlete_name", "swim_time", "bike_time", "run_time", "finish_time"]

test_df = (
    bronze_df
    .select(*sample_time_columns)
    .limit(5)
    .withColumn("swim_seconds_test", parse_time_to_seconds(F.col("swim_time")))
    .withColumn("finish_seconds_test", parse_time_to_seconds(F.col("finish_time")))
)

print("Time parsing test:")
display(test_df)

In [0]:
# The silver frame starts from the (optionally year-filtered) bronze frame.
silver_df = bronze_df

# Columns that should be stored as integers in silver.
integer_columns = [
    "rank",
    "div_rank",
    "gender_rank",
    "overall_rank",
    "bib",
    "points",
    "swim_div_rank",
    "swim_gender_rank",
    "swim_overall_rank",
    "bike_div_rank",
    "bike_gender_rank",
    "bike_overall_rank",
    "run_div_rank",
    "run_gender_rank",
    "run_overall_rank"
]

# Only cast the columns that are actually present in this frame.
present_integer_columns = [c for c in integer_columns if c in silver_df.columns]

# One withColumns call adds a single projection; calling withColumn once per
# column in a loop adds one projection per column and inflates the query plan.
if present_integer_columns:
    silver_df = silver_df.withColumns(
        {c: F.col(c).cast(IntegerType()) for c in present_integer_columns}
    )

for col_name in present_integer_columns:
    print(f"Cast to integer: {col_name}")

In [0]:
# (source text column, derived seconds column) pairs.
time_columns = [
    ("swim_time", "swim_time_seconds"),
    ("bike_time", "bike_time_seconds"),
    ("run_time", "run_time_seconds"),
    ("finish_time", "finish_time_seconds"),
    ("transition_1", "transition_1_seconds"),
    ("transition_2", "transition_2_seconds")
]

# Parse every time column that exists, in one projection rather than one
# withColumn call per column.
present_time_columns = [
    (src, dst) for src, dst in time_columns if src in silver_df.columns
]
if present_time_columns:
    silver_df = silver_df.withColumns(
        {dst: parse_time_to_seconds(F.col(src)) for src, dst in present_time_columns}
    )
for source_col, target_col in present_time_columns:
    print(f"Parsed: {source_col} -> {target_col}")

# Normalise free-text columns: trim everywhere, upper-case the categorical ones.
silver_df = silver_df.withColumns({
    "country": F.upper(F.trim(F.col("country"))),
    "athlete_name": F.trim(F.col("athlete_name")),
    "designation": F.upper(F.trim(F.col("designation"))),
    "division": F.upper(F.trim(F.col("division"))),
})

# Boolean status flags. `designation` was upper-cased just above, so it is
# compared directly; a null designation yields False for every flag.
status_flags = {
    "is_finisher": "FINISHER",
    "is_dnf": "DNF",
    "is_dns": "DNS",
    "is_dq": "DQ",
}
silver_df = silver_df.withColumns({
    flag: F.when(F.col("designation") == label, True).otherwise(False)
    for flag, label in status_flags.items()
})

# Sum of the five leg/transition durations. The result is null whenever any
# component is null; all five *_seconds columns must exist at this point.
silver_df = silver_df.withColumn(
    "calculated_total_seconds",
    F.col("swim_time_seconds") +
    F.col("transition_1_seconds") +
    F.col("bike_time_seconds") +
    F.col("transition_2_seconds") +
    F.col("run_time_seconds")
)

In [0]:
# Absolute gap between the reported finish time and the sum of the legs;
# null when either side is unavailable.
finish_seconds = F.col("finish_time_seconds")
summed_seconds = F.col("calculated_total_seconds")
either_missing = finish_seconds.isNull() | summed_seconds.isNull()

silver_df = silver_df.withColumn(
    "time_difference",
    F.when(either_missing, None).otherwise(F.abs(finish_seconds - summed_seconds)),
)

# Split the name on single spaces: the first token becomes first_name, the
# last token becomes last_name (only when there is more than one token).
name_tokens = F.split(F.col("athlete_name"), " ")
has_multiple_tokens = F.size(name_tokens) > 1

silver_df = (
    silver_df
    .withColumn("first_name", name_tokens.getItem(0))
    .withColumn(
        "last_name",
        F.when(has_multiple_tokens, F.element_at(name_tokens, -1)).otherwise(None),
    )
)

name_preview = silver_df.select("athlete_name", "first_name", "last_name").limit(10)
display(name_preview)

print("Data Validation Report:")
print("=" * 50)

# total_rows is reused by the status breakdown that follows.
total_rows = silver_df.count()
print(f"Total rows: {total_rows:,}")

In [0]:
# Fail fast on empty input before launching any further Spark jobs.
if total_rows == 0:
    raise ValueError("Silver has 0 rows to process. Check process_year filter and that Bronze has data for that year.")

# Count all four statuses in a single aggregation instead of four separate
# filter().count() jobs. F.when without otherwise yields null for
# non-matching rows, and F.count only counts non-null values.
status_counts = silver_df.agg(
    F.count(F.when(F.col("is_finisher"), True)).alias("finishers"),
    F.count(F.when(F.col("is_dnf"), True)).alias("dnf"),
    F.count(F.when(F.col("is_dns"), True)).alias("dns"),
    F.count(F.when(F.col("is_dq"), True)).alias("dq"),
).first()

finishers = status_counts["finishers"]
dnf = status_counts["dnf"]
dns = status_counts["dns"]
dq = status_counts["dq"]


def _share_of_total(count):
    """Format `count` as a parenthesised percentage of total_rows."""
    return f" ({count/total_rows*100:.1f}%)"


print(f"\nBy Status:")
print(f"  Finishers: {finishers:,}" + _share_of_total(finishers))
print(f"  DNF: {dnf:,}" + _share_of_total(dnf))
print(f"  DNS: {dns:,}" + _share_of_total(dns))
print(f"  DQ: {dq:,}" + _share_of_total(dq))

In [0]:
# Null counts for the key result columns, restricted to finishers.
finisher_df = silver_df.filter(F.col("is_finisher") == True)
print(f"\nNull checks (finishers only):")

null_check_columns = [
    "rank",
    "finish_time_seconds",
    "swim_time_seconds",
    "bike_time_seconds",
    "run_time_seconds",
]

# One aggregation for all columns instead of one filter().count() job each.
# An aggregation without grouping always returns exactly one row, so this is
# safe even when there are no finishers (every count is then 0).
null_counts = finisher_df.agg(
    *[
        F.count(F.when(F.col(c).isNull(), True)).alias(c)
        for c in null_check_columns
    ]
).first()

for column_name in null_check_columns:
    null_count = null_counts[column_name]
    print(f"  {column_name}: {null_count} nulls")

In [0]:
# Finisher rows with a null rank, listed for manual inspection.
print("Finishers with null rank:")
finisher_without_rank = (F.col("is_finisher") == True) & (F.col("rank").isNull())
rank_inspection_columns = [
    "athlete_name", "country", "division", "designation",
    "rank", "finish_time", "finish_time_seconds",
]
display(silver_df.where(finisher_without_rank).select(*rank_inspection_columns))

In [0]:
# Finisher rows whose parsed swim time is null, shown next to the raw text.
print("\nFinishers with null swim_time_seconds:")
finisher_without_swim = (F.col("is_finisher") == True) & (F.col("swim_time_seconds").isNull())
swim_inspection_columns = ["athlete_name", "swim_time", "swim_time_seconds", "finish_time"]
display(silver_df.where(finisher_without_swim).select(*swim_inspection_columns))

In [0]:
# Finisher rows whose parsed bike time is null, shown next to the raw text.
print("\nFinishers with null bike_time_seconds:")
finisher_without_bike = (F.col("is_finisher") == True) & (F.col("bike_time_seconds").isNull())
bike_inspection_columns = ["athlete_name", "bike_time", "bike_time_seconds", "finish_time"]
display(silver_df.where(finisher_without_bike).select(*bike_inspection_columns))

In [0]:
# A finisher row is flagged when any of these result columns is null.
required_for_finishers = [
    "rank",
    "swim_time_seconds",
    "bike_time_seconds",
    "run_time_seconds",
    "finish_time_seconds",
]

# OR the individual null checks together, left to right.
any_required_missing = F.col(required_for_finishers[0]).isNull()
for required_col in required_for_finishers[1:]:
    any_required_missing = any_required_missing | F.col(required_col).isNull()

finisher_with_gap = (F.col("is_finisher") == True) & any_required_missing

silver_df = silver_df.withColumn(
    "has_data_issue",
    F.when(finisher_with_gap, True).otherwise(False),
)

In [0]:
# Number of rows flagged by has_data_issue.
rows_with_issues = silver_df.where(F.col("has_data_issue") == True)
issue_count = rows_with_issues.count()
print(f"Records with data issues: {issue_count}")

In [0]:
# Finishers whose reported finish time differs from the summed legs by more
# than 60 seconds. Rows with a null time_difference never match.
is_finisher_row = F.col("is_finisher") == True
exceeds_tolerance = F.col("time_difference") > 60

discrepancies = silver_df.where(is_finisher_row & exceeds_tolerance)

discrepancy_count = discrepancies.count()
print(f"Records with time discrepancy > 60 seconds: {discrepancy_count}")

In [0]:
# Show a few offending rows when any discrepancy was found.
if discrepancy_count > 0:
    print("\nSample discrepancies:")
    discrepancy_preview = discrepancies.select(
        "athlete_name",
        "finish_time_seconds",
        "calculated_total_seconds",
        "time_difference",
    ).limit(5)
    display(discrepancy_preview)

# time_difference was only needed for the check above; remove it from silver_df.
silver_df = silver_df.drop("time_difference")

In [0]:
# Desired silver column order, grouped by theme.
final_columns = [
    # keys
    "row_key",
    "year",
    # athlete
    "athlete_name",
    "first_name",
    "last_name",
    "country",
    "bib",
    "division",
    "source_gender",
    # status
    "designation",
    "is_finisher",
    "is_dnf",
    "is_dns",
    "is_dq",
    "has_data_issue",
    # overall placing
    "rank",
    "div_rank",
    "gender_rank",
    "overall_rank",
    "points",
    # swim
    "swim_time",
    "swim_time_seconds",
    "swim_div_rank",
    "swim_gender_rank",
    "swim_overall_rank",
    # first transition
    "transition_1",
    "transition_1_seconds",
    # bike
    "bike_time",
    "bike_time_seconds",
    "bike_div_rank",
    "bike_gender_rank",
    "bike_overall_rank",
    # second transition
    "transition_2",
    "transition_2_seconds",
    # run
    "run_time",
    "run_time_seconds",
    "run_div_rank",
    "run_gender_rank",
    "run_overall_rank",
    # finish
    "finish_time",
    "finish_time_seconds",
    "calculated_total_seconds",
    # load metadata
    "source_file",
    "load_timestamp",
    "load_date",
]

In [0]:
# Keep only the desired columns that are present, in final_columns order.
available_columns = set(silver_df.columns)
existing_columns = [name for name in final_columns if name in available_columns]
silver_df = silver_df.select(*existing_columns)

print(f"Final column count: {len(existing_columns)}")
print("Final Schema:")
silver_df.printSchema()

In [0]:
# Write silver_df to TARGET_TABLE: full overwrite when the table is missing or
# run_mode is "full", otherwise an insert-only Delta merge on the merge keys.
table_exists = spark.catalog.tableExists(TARGET_TABLE)

# Accept a single column name as well as a list; iterating a bare string would
# build a condition from its individual characters.
if isinstance(merge_key_cols, str):
    merge_keys = [merge_key_cols]
else:
    merge_keys = list(merge_key_cols or [])

merge_condition = " AND ".join(f"target.{c} = source.{c}" for c in merge_keys)

if (not table_exists) or (run_mode == "full"):
    # NOTE(review): when process_year is set, silver_df only holds that year,
    # so this overwrite replaces the whole table with a single year of rows.
    # Confirm that "full" together with a year filter is intended.
    print(f"Full load to {TARGET_TABLE}")
    (
        silver_df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .saveAsTable(TARGET_TABLE)
    )
else:
    # Validate the merge keys up front so a bad configuration fails with a
    # clear message instead of an opaque Spark parse/analysis error.
    if not merge_keys:
        raise ValueError("merge_key_cols is empty; at least one merge key column is required for an incremental merge.")
    missing_keys = [c for c in merge_keys if c not in silver_df.columns]
    if missing_keys:
        raise ValueError(f"Merge key column(s) not present in the silver DataFrame: {missing_keys}")

    print(f"Incremental merge (insert-only) to {TARGET_TABLE}")
    delta_table = DeltaTable.forName(spark, TARGET_TABLE)
    # Only a not-matched clause is registered: rows whose keys already exist in
    # the target are left untouched.
    (
        delta_table.alias("target")
        .merge(silver_df.alias("source"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )

print("Write complete")

In [0]:
# Read the target table back and summarise what it now contains.
result_df = spark.table(TARGET_TABLE)

print(f"Table: {TARGET_TABLE}")
print(f"Total rows: {result_df.count():,}")

print("\nRow counts by year and gender:")

# One conditional sum per status flag: 1 for flagged rows, 0 otherwise.
status_totals = [
    F.sum(F.when(F.col(flag), 1).otherwise(0)).alias(label)
    for flag, label in (
        ("is_finisher", "finishers"),
        ("is_dnf", "dnf"),
        ("is_dns", "dns"),
    )
]

year_gender_summary = (
    result_df
    .groupBy("year", "source_gender")
    .agg(F.count("*").alias("total"), *status_totals)
    .orderBy("year", "source_gender")
)
display(year_gender_summary)

In [0]:
# Run OPTIMIZE on the target table, then print a closing summary.
spark.sql(f"OPTIMIZE {TARGET_TABLE}")
print("Table optimized")

banner = "=" * 50

print("\n" + banner)
print("SILVER LAYER COMPLETE")
print(banner)
for label, table_name in (("Source", SOURCE_TABLE), ("Target", TARGET_TABLE)):
    print(f"{label}: {table_name}")
final_row_count = spark.table(TARGET_TABLE).count()
print(f"Rows: {final_row_count:,}")
print(f"Timestamp: {datetime.now()}")
print(banner)