In [0]:
from pyspark.sql import functions as F
from datetime import datetime
import json

In [0]:
# Declare the notebook's string input widgets as (name, default, label) triples.
for widget_name, widget_default, widget_label in (
    ("run_mode", "full", "Run Mode (full/incremental)"),
    ("process_year", "", "Process Year (empty=all)"),
    ("triggered_by", "manual", "Triggered By"),
    ("execution_date", "", "Execution Date"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
# Text widget holding the comma-separated merge key column list; defaults to "row_key".
dbutils.widgets.text("merge_key_cols", "row_key", "Merge key columns (comma-separated)")

In [0]:
def _read_widget(name, lowercase=False):
    """Return the value of widget `name` with surrounding whitespace removed.

    When `lowercase` is true the value is lower-cased before it is stripped.
    """
    raw_value = dbutils.widgets.get(name)
    if lowercase:
        raw_value = raw_value.lower()
    return raw_value.strip()


# Scalar run parameters; only run_mode is case-normalised.
run_mode = _read_widget("run_mode", lowercase=True)
process_year = _read_widget("process_year")
triggered_by = _read_widget("triggered_by")
execution_date = _read_widget("execution_date")

# Split the comma-separated key list, trimming each entry and dropping empty ones.
merge_key_cols_raw = _read_widget("merge_key_cols")
merge_key_cols = list(filter(None, map(str.strip, merge_key_cols_raw.split(","))))

# Echo the effective parameters so they appear in the run output.
banner = "=" * 60
process_year_label = process_year or 'ALL'
execution_date_label = execution_date or '(not provided)'
print(
    "\n".join(
        [
            banner,
            "PIPELINE CONFIGURATION",
            banner,
            f"Run Mode: {run_mode}",
            f"Process Year: {process_year_label}",
            f"Triggered By: {triggered_by}",
            f"Execution Date: {execution_date_label}",
            f"Merge Keys: {merge_key_cols}",
        ]
    )
)

In [0]:
# Reject any run mode other than the two supported ones.
valid_run_modes = ["full", "incremental"]
run_mode_is_valid = run_mode in valid_run_modes
if not run_mode_is_valid:
    raise ValueError(f"Invalid run_mode: {run_mode}. Must be one of {valid_run_modes}")

In [0]:
# Incremental runs must name the year to process.
if run_mode == "incremental":
    if not process_year:
        raise ValueError("Incremental mode requires process_year parameter")

In [0]:
# At least one merge key column must be configured.
if len(merge_key_cols) == 0:
    raise ValueError("merge_key_cols cannot be empty. Example: row_key or row_key,year")

In [0]:
# Catalog that all pipeline tables are qualified with.
CATALOG = "ironman"

# One schema per bronze / silver / gold layer.
BRONZE_SCHEMA, SILVER_SCHEMA, GOLD_SCHEMA = "bronze", "silver", "gold"

In [0]:
def _qualified_name(schema, table):
    """Return the three-part `catalog.schema.table` name for `table` inside CATALOG."""
    return ".".join((CATALOG, schema, table))


# Bronze and silver both hold a table named ironman_results, in their own schema.
BRONZE_TABLE = _qualified_name(BRONZE_SCHEMA, "ironman_results")
SILVER_TABLE = _qualified_name(SILVER_SCHEMA, "ironman_results")

# Gold-schema dimension tables and the fact table.
GOLD_DIM_ATHLETES = _qualified_name(GOLD_SCHEMA, "dim_athletes")
GOLD_DIM_DIVISIONS = _qualified_name(GOLD_SCHEMA, "dim_divisions")
GOLD_DIM_COUNTRIES = _qualified_name(GOLD_SCHEMA, "dim_countries")
GOLD_FACT_RESULTS = _qualified_name(GOLD_SCHEMA, "fact_race_results")

In [0]:
# Root of the landing volume; source files are addressed as <root>/year=<year>/<filename>.
VOLUME_PATH = "/Volumes/ironman/default/landing"

In [0]:
# Every known source file: one men's and one women's CSV per year, ordered by
# year and then men before women. Each entry records the filename, its year
# and the gender code ("M" / "F").
ALL_FILES_CONFIG = [
    {"filename": f"{year}_{label}.csv", "year": year, "gender": gender_code}
    for year in (2023, 2024, 2025)
    for label, gender_code in (("men", "M"), ("women", "F"))
]

In [0]:
# Select the source files for this run: a single year when process_year is set,
# otherwise every configured file.
if process_year:
    try:
        process_year_int = int(process_year)
    except ValueError:
        # int() on its own reports "invalid literal ..." without naming the parameter.
        raise ValueError(
            f"Invalid process_year: {process_year!r}. Must be an integer year such as 2024"
        ) from None

    FILES_TO_PROCESS = [
        file_config for file_config in ALL_FILES_CONFIG if file_config["year"] == process_year_int
    ]

    if not FILES_TO_PROCESS:
        # Fail fast: an unknown year would otherwise give an empty file list and
        # the notebook would finish "successfully" with nothing to process.
        known_years = sorted({file_config["year"] for file_config in ALL_FILES_CONFIG})
        raise ValueError(
            f"No source files configured for process_year={process_year_int}. "
            f"Configured years: {known_years}"
        )
else:
    FILES_TO_PROCESS = ALL_FILES_CONFIG

# Summarise the resolved tables and the exact file paths that will be read.
print("\n" + "=" * 60)
print("CONFIGURATION SUMMARY")
print("=" * 60)
print("\nTables:")
print(f"  Bronze: {BRONZE_TABLE}")
print(f"  Silver: {SILVER_TABLE}")
print(f"  Gold Fact: {GOLD_FACT_RESULTS}")
print(f"\nVolume Path: {VOLUME_PATH}")
print(f"\nFiles to Process ({len(FILES_TO_PROCESS)}):")
for file_config in FILES_TO_PROCESS:
    print(f"  - {VOLUME_PATH}/year={file_config['year']}/{file_config['filename']}")

In [0]:
# Run-level parameters; empty widget values are stored as None.
run_settings = {
    "run_mode": run_mode,
    "process_year": None if not process_year else int(process_year),
    "triggered_by": triggered_by,
    "execution_date": execution_date or None,
    "pipeline_start_time": datetime.now().isoformat(),
}

# Incremental-load settings: fixed "merge" strategy plus the configured key columns.
incremental_settings = {
    "incremental": {
        "strategy": "merge",
        "merge_key_cols": merge_key_cols,
    },
}

# Catalog and schema names.
namespace_settings = {
    "catalog": CATALOG,
    "bronze_schema": BRONZE_SCHEMA,
    "silver_schema": SILVER_SCHEMA,
    "gold_schema": GOLD_SCHEMA,
}

# Fully qualified table names.
table_settings = {
    "bronze_table": BRONZE_TABLE,
    "silver_table": SILVER_TABLE,
    "gold_dim_athletes": GOLD_DIM_ATHLETES,
    "gold_dim_divisions": GOLD_DIM_DIVISIONS,
    "gold_dim_countries": GOLD_DIM_COUNTRIES,
    "gold_fact_results": GOLD_FACT_RESULTS,
}

# Source location and the files selected for this run.
source_settings = {
    "volume_path": VOLUME_PATH,
    "files_to_process": FILES_TO_PROCESS,
}

# Merge the sections in a fixed order so the key order of the config is stable.
pipeline_config = {
    **run_settings,
    **incremental_settings,
    **namespace_settings,
    **table_settings,
    **source_settings,
}

In [0]:
# Serialise once up front; this also proves the config is JSON-serialisable.
pipeline_config_json = json.dumps(pipeline_config)
print("\nConfiguration JSON created successfully")

# Probe every selected source file and collect all failures before raising,
# so a single run reports the complete list of missing files.
print("\nVerifying source files exist in volume:")
missing_files = []

for file_config in FILES_TO_PROCESS:
    filename = file_config["filename"]
    file_path = f"{VOLUME_PATH}/year={file_config['year']}/{filename}"
    try:
        dbutils.fs.ls(file_path)
    except Exception as exc:
        # Any listing failure counts as "not found", but the probed path and the
        # first line of the underlying error are printed so that permission or
        # transient problems are not mistaken for an absent file.
        detail = str(exc).strip()
        reason = detail.splitlines()[0] if detail else type(exc).__name__
        print(f"  {filename} - NOT FOUND ({file_path}): {reason}")
        missing_files.append(filename)
    else:
        print(f"  {filename}")

if missing_files:
    raise FileNotFoundError(f"Missing source files: {missing_files}")

print("\nAll source files verified")

[0;31m---------------------------------------------------------------------------[0m
[0;31mFileNotFoundError[0m                         Traceback (most recent call last)
File [0;32m<command-7418642836127763>, line 19[0m
[1;32m     16[0m         missing_files[38;5;241m.[39mappend(file_config[[38;5;124m'[39m[38;5;124mfilename[39m[38;5;124m'[39m])
[1;32m     18[0m [38;5;28;01mif[39;00m missing_files:
[0;32m---> 19[0m     [38;5;28;01mraise[39;00m [38;5;167;01mFileNotFoundError[39;00m([38;5;124mf[39m[38;5;124m"[39m[38;5;124mMissing source files: [39m[38;5;132;01m{[39;00mmissing_files[38;5;132;01m}[39;00m[38;5;124m"[39m)
[1;32m     21[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;130;01m\n[39;00m[38;5;124m✅ All source files verified[39m[38;5;124m"[39m)

[0;31mFileNotFoundError[0m: Missing source files: ['2024_men.csv', '2024_women.csv']

In [0]:
# Take a single timestamp so the run ID and the printed start time agree.
run_started_at = datetime.now()
pipeline_run_id = f"{triggered_by}_{run_started_at.strftime('%Y%m%d_%H%M%S')}"

# Record the run ID in the config so it is included when the config is
# serialised and published; previously the ID was only printed and then dropped.
pipeline_config["pipeline_run_id"] = pipeline_run_id

print("\n" + "=" * 60)
print("PIPELINE INITIALIZED")
print("=" * 60)
print(f"Pipeline Run ID: {pipeline_run_id}")
print(f"Start Time: {run_started_at}")
print(f"Mode: {run_mode.upper()}")
print(f"Merge Keys: {merge_key_cols}")
if process_year:
    print(f"Processing Year: {process_year}")
else:
    print("Processing: ALL YEARS")
print("=" * 60)

In [0]:
# Serialise the pipeline configuration to the JSON string that is published below.
pipeline_config_json = json.dumps(pipeline_config)

In [0]:
# Store the config JSON as a job task value under the key "pipeline_config_json".
dbutils.jobs.taskValues.set(key="pipeline_config_json", value=pipeline_config_json)


In [0]:
# End the notebook run, passing the config JSON as the notebook's exit value.
dbutils.notebook.exit(pipeline_config_json)