In [0]:
# Auto-reload: load the IPython autoreload extension and switch it to mode 2,
# so edits to imported .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
# Config Summary: import the config module and call its print_mode_summary().
# NOTE(review): this cell imports top-level `config`, while the runner cell
# further down imports `scripts.config as config` — confirm both resolve to
# the same module on this workspace's sys.path.
import config
config.print_mode_summary()


In [0]:
# ================================================================
#  Pipeline_Runner.py — v3.5 (Databricks / Mount-less / Config-Driven)
#  ---------------------------------------------------------------
#  Purpose : Execute full ETL pipeline (Steps 01–05)
#  Runtime : Databricks 16.4 LTS (Unity Catalog)
#  Author  : M. Holahan
# ================================================================

# COMMAND ----------
# ✅ Environment bootstrap
# Quietly installs adlfs, fsspec and rapidfuzz through a shell-escaped pip call.
# NOTE(review): versions are unpinned, and Databricks documents `%pip` as the
# way to install notebook-scoped libraries — consider pinning and switching;
# confirm against this workspace's runtime before changing.
!pip install -q adlfs fsspec rapidfuzz

import sys
import inspect
import time
import json
import pandas as pd
import importlib

# Project configuration. This rebinds the name `config` (an earlier cell ran
# `import config`) to the `scripts.config` module for the rest of the notebook.
import scripts.config as config
# NOTE(review): `spark` is imported but not referenced anywhere in this cell —
# presumably kept for interactive use; confirm before removing.
from scripts.config import spark, print_mode_summary

# Call the config module's print_mode_summary() before any step runs.
print_mode_summary()

# ================================================================
#  Import pipeline steps
# ================================================================
from scripts.extract_spark_tmdb import Step01ExtractSparkTMDB
from scripts.extract_spark_discogs import Step02ExtractSparkDiscogs
from scripts.prepare_tmdb_discogs_candidates import Step03PrepareTMDBDiscogsCandidates
from scripts.validate_schema_alignment import Step04ValidateSchemaAlignment
from scripts.match_and_enrich import Step05MatchAndEnrichDBX

# Registry (ordered): maps step number -> step class, numbered from 1 in the
# order the classes are listed.
PIPELINE_STEPS = dict(
    enumerate(
        (
            Step01ExtractSparkTMDB,
            Step02ExtractSparkDiscogs,
            Step03PrepareTMDBDiscogsCandidates,
            Step04ValidateSchemaAlignment,
            Step05MatchAndEnrichDBX,
        ),
        start=1,
    )
)

# ================================================================
#  Parameter block
# ================================================================
# Step numbers (keys of PIPELINE_STEPS) to execute, in the order listed.
ACTIVE_STEPS = [2]   # Adjust for partial runs
# Forwarded as `limit=` to any step whose run() declares that parameter.
ROW_LIMIT = None     # Optional limit for debug mode

# Echo the effective run parameters (emoji restored from mis-encoded bytes).
print(f"\n🧩 Active Steps : {ACTIVE_STEPS}")
print(f"🔗 Intermediate  : {config.INTERMEDIATE_DIR}")
print(f"🧾 Metrics Path  : {config.METRICS_DIR}\n")

# ================================================================
#  Execute pipeline with structured logging
# ================================================================
# One record per executed step: step, name, duration_sec, status.
results = []

for step_no in ACTIVE_STEPS:
    StepClass = PIPELINE_STEPS[step_no]

    # Reload the step's module (if already imported) so code edits are picked
    # up. importlib.reload() does not rebind references held elsewhere, so the
    # class is re-fetched from the reloaded module; otherwise the stale class
    # captured in PIPELINE_STEPS would still be the one instantiated below.
    module_name = StepClass.__module__
    if module_name in sys.modules:
        reloaded_module = importlib.reload(sys.modules[module_name])
        # Fall back to the previous class if the name vanished from the module.
        StepClass = getattr(reloaded_module, StepClass.__name__, StepClass)

    step_name = StepClass.__name__

    print(f"\n🚀 Running Step {step_no}: {step_name}")
    t0 = time.time()
    status = "success"

    try:
        # Instantiate step (config-driven)
        step = StepClass()

        # Forward ROW_LIMIT only when run() actually declares a `limit` parameter
        sig = inspect.signature(step.run)
        kwargs = {"limit": ROW_LIMIT} if "limit" in sig.parameters else {}
        # df_out stays in the notebook namespace (last step's result)
        df_out = step.run(**kwargs)

    except Exception as e:
        # Best-effort: record the failure and carry on with the next step.
        status = f"failed: {type(e).__name__}"
        print(f"⚠️ Step {step_no} ({step_name}) failed: {e}")
        df_out = None

    duration = round(time.time() - t0, 2)
    results.append({
        "step": step_no,
        "name": step_name,
        "duration_sec": duration,
        "status": status
    })
    # Use a failure marker for failed steps instead of an unconditional check mark.
    status_icon = "✅" if status == "success" else "❌"
    print(f"{status_icon} Step {step_no} completed → {status.upper()} in {duration}s")

# ================================================================
#  Summary logging
# ================================================================
# Tabular view of the per-step records held in `results`.
summary_df = pd.DataFrame(results)
display(summary_df)

# Same records serialised as a JSON array, echoed to the cell output.
summary_json = summary_df.to_json(orient="records", indent=2)
print(f"\n📊 Pipeline Summary:\n{summary_json}")

# Write to ADLS metrics (mount-less safe)
summary_output = f"{config.METRICS_DIR}/pipeline_summary.json"

try:
    # Importing inside the try means an ImportError is also reported by the
    # handler below instead of aborting the cell.
    import fsspec
    fs = fsspec.filesystem("abfss", account_name=config.STORAGE_ACCOUNT, anon=False)
    with fs.open(summary_output, "w") as f:
        f.write(summary_json)
    print(f"📤 Summary uploaded → {summary_output}")
except Exception as e:
    # Best-effort upload: the summary has already been displayed and printed.
    print(f"⚠️ Could not upload summary to ADLS: {e}")

print("\n🏁 Pipeline execution complete.\n")

In [0]:
# Individual run: Step 1
# Runs Step01ExtractSparkTMDB on its own; None is passed as run()'s first
# positional argument.
from scripts.extract_spark_tmdb import Step01ExtractSparkTMDB
Step01ExtractSparkTMDB().run(None)


In [0]:
# Individual run: Step 2
# Runs Step02ExtractSparkDiscogs on its own; None is passed as run()'s first
# positional argument.
from scripts.extract_spark_discogs import Step02ExtractSparkDiscogs
Step02ExtractSparkDiscogs().run(None)

In [0]:
# Individual run: Step 3
import importlib
import scripts.prepare_tmdb_discogs_candidates

# Reload to ensure latest code version
importlib.reload(scripts.prepare_tmdb_discogs_candidates)

# Import after the reload so the name binds to the freshly loaded class.
from scripts.prepare_tmdb_discogs_candidates import Step03PrepareTMDBDiscogsCandidates
Step03PrepareTMDBDiscogsCandidates().run(None)

In [0]:
# Individual run: Step 4
import importlib
import scripts.validate_schema_alignment

# Reload to ensure latest code version
importlib.reload(scripts.validate_schema_alignment)

# Import after the reload so the name binds to the freshly loaded class.
from scripts.validate_schema_alignment import Step04ValidateSchemaAlignment
Step04ValidateSchemaAlignment().run()


In [0]:
# Individual run: Step 5
import importlib
import scripts.match_and_enrich

# Reload to ensure latest code version
importlib.reload(scripts.match_and_enrich)

# Import after the reload so the name binds to the freshly loaded class.
from scripts.match_and_enrich import Step05MatchAndEnrichDBX
Step05MatchAndEnrichDBX().run()
