In [0]:
# Select the pipeline environment for this session. The %env magic sets the
# ENV environment variable, which the next cell reads with os.getenv("ENV").
# To target prod, comment out the "test" line and uncomment the "prod" line.
#%env ENV=prod
%env ENV=test

In [0]:
# ================================================================
# Force a fresh import of scripts.config and its sibling modules
# ---------------------------------------------------------------
# Re-run this cell after editing anything under scripts/ or after
# changing ENV, so that later cells see the new configuration.
# ================================================================
import importlib
import os
import sys

# Ensure workspace root is on sys.path so the `scripts` package resolves
workspace_root = "/Workspace/Users/markholahan@pm.me/unguided-capstone-project"
if workspace_root not in sys.path:
    sys.path.append(workspace_root)

# Drop cached `scripts` modules so the import below re-executes them.
# Match the package itself or its submodules only: a bare prefix test would
# also evict unrelated top-level modules whose names merely start with
# "scripts" (for example "scripts_helper").
for module_name in list(sys.modules):
    if module_name == "scripts" or module_name.startswith("scripts."):
        del sys.modules[module_name]

# Let the import finders notice modules added since the last import
importlib.invalidate_caches()

# Normalise ENV before config reads it; unset or blank falls back to "test"
os.environ["ENV"] = os.getenv("ENV", "test").strip().lower() or "test"
print(f"üåç Reinitializing config in {os.environ['ENV'].upper()} mode")

# Import the package fresh. The cache purge above already guarantees that
# config.py executes here, so no importlib.reload() follows: a reload would
# run config.py a second time and repeat any import-time side effects.
import scripts.config as config

print("‚úÖ config.py successfully reloaded.")
print(f"üîß Active environment: {config.ENV.upper()} | RUN_ID={config.RUN_ID}")


In [0]:
# ================================================================
# Mission Control - Capstone Step 9: "Release the Kraken"
# ---------------------------------------------------------------
# Purpose  : Print the active environment and configuration so they can be
#            checked by eye before a full pipeline run. The statements in
#            this section only read attributes of scripts.config and print.
# Placement: Run this before the pipeline runner cell in
#            Pipeline_Runner_Notebook.
# ================================================================

import scripts.config as config
from pprint import pprint

print("\n===============================================================")
print("üöÄ CAPSTONE PIPELINE ‚Äî MISSION CONTROL")
print("===============================================================")

# ---------------------------------------------------------------
# Environment overview: mode, Unity Catalog flag, run id, storage targets
# ---------------------------------------------------------------
# Each line reads one config attribute; a missing attribute raises
# AttributeError at that line, after the earlier lines have printed.
print(f"üåé Environment Mode   : {config.ENV.upper()}")
print(f"üß± Unity Catalog Mode : {'ENABLED' if config.UC_MODE else 'DISABLED'}")
print(f"üè∑Ô∏è  Run ID            : {config.RUN_ID}")
print(f"üí° Storage Account    : {config.STORAGE_ACCOUNT}")
print(f"üì¶ Containers         : bronze={config.CONTAINER_BRONZE}, silver={config.CONTAINER_SILVER}, gold={config.CONTAINER_GOLD}, metrics={config.CONTAINER_METRICS}")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# Medallion layer paths (bronze / silver / gold / metrics)
# ---------------------------------------------------------------
print("üóÇÔ∏è  Medallion Directories")
print(f"ü•â Bronze Layer  ‚Üí {config.BRONZE_DIR}")
print(f"ü•à Silver Layer  ‚Üí {config.SILVER_DIR}")
print(f"ü•á Gold Layer    ‚Üí {config.GOLD_DIR}")
print(f"üìä Metrics Layer ‚Üí {config.METRICS_DIR}")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# TMDB extraction parameters
# ---------------------------------------------------------------
print("üé¨ TMDB Extraction Parameters")
print(f"   TMDB_PAGE_LIMIT        = {config.TMDB_PAGE_LIMIT}")
print(f"   TMDB_MAX_RESULTS       = {config.TMDB_MAX_RESULTS}")
print(f"   TMDB_REQUEST_DELAY_SEC = {config.TMDB_REQUEST_DELAY_SEC}")
# NOTE: the URL below is a literal written in this cell, not a value read from
# config, so it does not change if the extraction step targets another URL.
print(f"   TMDB_API_URL           = https://api.themoviedb.org/3/movie/popular")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# Discogs extraction parameters
# ---------------------------------------------------------------
print("üéß DISCOGS Extraction Parameters")
print(f"   DISCOGS_PAGE_CAP    = {config.DISCOGS_PAGE_CAP}")
print(f"   DISCOGS_PER_PAGE    = {config.DISCOGS_PER_PAGE}")
print(f"   DISCOGS_SLEEP_SEC   = {config.DISCOGS_SLEEP_SEC}")
print(f"   DISCOGS_MAX_TITLES  = {config.DISCOGS_MAX_TITLES}")
print(f"   DISCOGS_USER_AGENT  = {config.DISCOGS_USER_AGENT}")
# NOTE: as with the TMDB URL above, this URL is a literal, not a config value.
print(f"   DISCOGS_API_URL     = https://api.discogs.com/database/search")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# Network reliability controls (timeout, retries, backoff, pagination cap)
# ---------------------------------------------------------------
print("üåê Network & API Controls")
print(f"   API_TIMEOUT          = {config.API_TIMEOUT}")
print(f"   API_MAX_RETRIES      = {config.API_MAX_RETRIES}")
print(f"   RETRY_BACKOFF        = {config.RETRY_BACKOFF}")
print(f"   MAX_PAGINATION_WARN  = {config.MAX_PAGINATION_WARN}")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# Configuration validation
# ---------------------------------------------------------------
# Compare each pagination setting against config.MAX_PAGINATION_WARN and
# report the ones that exceed it. Advisory only: nothing is raised.
# The result list is called config_warnings rather than `warnings` because
# notebook globals are shared across cells, and a global named `warnings`
# would shadow the standard-library module of the same name.
PAGINATION_SETTINGS = ("TMDB_PAGE_LIMIT", "DISCOGS_PAGE_CAP")

config_warnings = [
    f"‚ö†Ô∏è {setting} ({getattr(config, setting)}) exceeds MAX_PAGINATION_WARN ({config.MAX_PAGINATION_WARN})"
    for setting in PAGINATION_SETTINGS
    if getattr(config, setting) > config.MAX_PAGINATION_WARN
]

if config_warnings:
    print("üö® CONFIG WARNINGS DETECTED:")
    for message in config_warnings:
        print(f"   {message}")
else:
    print("‚úÖ Configuration check passed ‚Äî all limits within safety bounds.")
print("---------------------------------------------------------------")

# ---------------------------------------------------------------
# Step preview
# ---------------------------------------------------------------
# NOTE(review): this list is display-only. The runner cell assigns its own
# ACTIVE_STEPS before executing anything and registers only steps 1-4 in
# PIPELINE_STEPS, so the steps printed here can differ from what actually runs.
ACTIVE_STEPS = [1, 2, 3, 4, 5]
print(f"üß© Steps to Run : {ACTIVE_STEPS}")
print(f"üßæ Output Format: Parquet (Spark ‚Üí ADLS / UC passthrough)\n")

# ---------------------------------------------------------------
# Debug context snapshot
# ---------------------------------------------------------------
# Pretty-print whatever config.get_paths_dict() returns.
print("üß† Current Path Configuration:")
pprint(config.get_paths_dict())
print("---------------------------------------------------------------")

print("‚úÖ Mission Control initialized ‚Äî ready for launch.")
print("===============================================================\n")


In [0]:
# ================================================================
#  Pipeline_Runner.py - v4.0 (Databricks / Mount-less / Config-Driven)
#  ---------------------------------------------------------------
#  Purpose : Execute the ETL pipeline steps selected in ACTIVE_STEPS
#            (Steps 01-04 are registered below; Step 05 is currently
#            commented out of the registry)
#  Runtime : Databricks 16.4 LTS (Unity Catalog)
#  Author  : M. Holahan
# ================================================================

# COMMAND ----------
# Environment bootstrap: install the extra Python packages for this notebook.
# fsspec is imported later in this cell for the summary upload.
# NOTE(review): versions are unpinned, so a rerun may pick up newer releases;
# consider pinning them.
!pip install -q adlfs fsspec rapidfuzz

import sys
import inspect
import time
import json
import pandas as pd
import importlib
import os

import scripts.config as config
from pyspark.sql import SparkSession

# Prefer the Spark session that scripts.config exposes; fall back to the
# builder only when config has no `spark` attribute (Databricks sometimes
# resets the session).
if hasattr(config, "spark"):
    spark = config.spark
else:
    spark = SparkSession.builder.getOrCreate()

# ================================================================
# Environment diagnostics
# ================================================================
# Echo the configuration this run will use (mode, Unity Catalog flag, run id,
# storage account and the four layer directories), all read from scripts.config.
print("\n===============================================================")
print("üöÄ CAPSTONE PIPELINE ‚Äî RUNNER (v4.0)")
print("===============================================================")
print(f"üåé Environment Mode     : {config.ENV.upper()}")
print(f"üß± Unity Catalog Mode   : {'ENABLED' if config.UC_MODE else 'DISABLED'}")
print(f"üè∑Ô∏è  Run ID              : {config.RUN_ID}")
print(f"üíæ Storage Account      : {config.STORAGE_ACCOUNT}")
print("---------------------------------------------------------------")
print(f"ü•â Bronze Layer  ‚Üí {config.BRONZE_DIR}")
print(f"ü•à Silver Layer  ‚Üí {config.SILVER_DIR}")
print(f"ü•á Gold Layer    ‚Üí {config.GOLD_DIR}")
print(f"üìä Metrics Layer ‚Üí {config.METRICS_DIR}")
print("===============================================================\n")

# ================================================================
# Pipeline step registry
# ================================================================
from scripts.extract_spark_tmdb import Step01ExtractSparkTMDB
from scripts.extract_spark_discogs import Step02ExtractSparkDiscogs
from scripts.prepare_tmdb_discogs_candidates import Step03PrepareTMDBDiscogsCandidates
from scripts.validate_schema_alignment import Step04ValidateSchemaAlignment
#from scripts.match_and_enrich import Step05MatchAndEnrichDBX

# Step number -> step class. Step 5 stays out of the registry while its
# import above is commented out.
PIPELINE_STEPS = dict(
    [
        (1, Step01ExtractSparkTMDB),
        (2, Step02ExtractSparkDiscogs),
        (3, Step03PrepareTMDBDiscogsCandidates),
        (4, Step04ValidateSchemaAlignment),
        #(5, Step05MatchAndEnrichDBX),
    ]
)

# ================================================================
# Run parameters
# ================================================================
ROW_LIMIT = None       # optional debugging limit
ACTIVE_STEPS = [2]     # adjust as needed for partial runs

print(f"üß© Active Steps  : {ACTIVE_STEPS}")
metrics_summary_path = config.layer_path("metrics", "pipeline_summary")
print(f"üìä Metrics Path  : {metrics_summary_path}\n")

# ================================================================
# Execute pipeline with structured logging
# ================================================================
# For every requested step: look up its class, refresh its code from disk,
# run it, and append one record (step, name, duration_sec, status) to
# `results`. A failing step is recorded and the loop moves on to the next one.
results = []

for step_no in ACTIVE_STEPS:
    StepClass = PIPELINE_STEPS.get(step_no)

    # A step number that is not registered (for example one commented out of
    # PIPELINE_STEPS) is recorded as skipped instead of aborting the whole
    # run with a KeyError.
    if StepClass is None:
        print(f"‚ö†Ô∏è Step {step_no} is not registered in PIPELINE_STEPS; skipping.")
        results.append({
            "step": step_no,
            "name": None,
            "duration_sec": 0.0,
            "status": "skipped: not registered"
        })
        continue

    module_name = StepClass.__module__
    step_name = StepClass.__name__
    print(f"\nüöÄ Running Step {step_no}: {step_name}")
    t0 = time.time()
    status = "success"

    try:
        # Reload the step's module (helps during development / notebook runs)
        # and then re-resolve the class from the reloaded module. Without the
        # re-resolve, StepClass would still be the class object created by the
        # earlier import, so edited code would never execute. Doing this inside
        # the try means a module that fails to reload is logged as a failed
        # step rather than stopping the loop.
        if module_name in sys.modules:
            reloaded_module = importlib.reload(sys.modules[module_name])
            StepClass = getattr(reloaded_module, step_name)

        step = StepClass()
        # Pass `limit` only to steps whose run() declares that parameter
        sig = inspect.signature(step.run)
        kwargs = {"limit": ROW_LIMIT} if "limit" in sig.parameters else {}
        df_out = step.run(**kwargs)
    except Exception as e:
        status = f"failed: {type(e).__name__}"
        print(f"‚ö†Ô∏è Step {step_no} ({step_name}) failed: {e}")
        df_out = None

    duration = round(time.time() - t0, 2)
    results.append({
        "step": step_no,
        "name": step_name,
        "duration_sec": duration,
        "status": status
    })
    # Use the success marker only for successful steps; failures get the
    # warning marker so the log line does not read as a success.
    outcome_icon = "‚úÖ" if status == "success" else "‚ö†Ô∏è"
    print(f"{outcome_icon} Step {step_no} completed ‚Üí {status.upper()} in {duration}s")

# ================================================================
# Summary logging
# ================================================================
# Tabulate the per-step records, show them in the notebook, and echo the
# same data as JSON in the cell output.
summary_df = pd.DataFrame(results)
display(summary_df)

summary_json = summary_df.to_json(orient="records", indent=2)
print(f"\nüìä Pipeline Summary:\n{summary_json}")

# Destination directory in the metrics layer, resolved by the config helper
summary_output = config.layer_path("metrics", "pipeline_summary")

# Best-effort upload: any failure is reported and the cell still finishes.
try:
    import fsspec

    fs = fsspec.filesystem("abfss", account_name=config.STORAGE_ACCOUNT, anon=False)
    summary_file = f"{summary_output}/summary_{config.RUN_ID}.json"
    with fs.open(summary_file, "w") as handle:
        handle.write(summary_json)
except Exception as upload_error:
    print(f"‚ö†Ô∏è Could not upload summary to ADLS: {upload_error}")
else:
    print(f"üì§ Summary uploaded ‚Üí {summary_file}")

print(f"\nüèÅ Pipeline execution complete in {config.ENV.upper()} mode.\n")
