In [0]:
# Restart the Python environment.
# `dbutils` is not imported anywhere in this notebook; it is presumably the
# global injected by the Databricks notebook runtime (this cell will raise
# NameError outside Databricks -- TODO confirm local runs skip it).
# Per the Databricks utilities documentation, restartPython() restarts the
# Python process of the current notebook session, so Python state created
# before this call is discarded; that is why the following cells re-import
# everything they need.
dbutils.library.restartPython()


In [0]:
# ------------------------------------------------------------------
# Databricks-safe path override (keeps local compatibility)
#
# Resolution order for each directory in scripts.config:
#   1. A non-empty environment variable (DBFS_INTERMEDIATE / DBFS_METRICS)
#      always wins, even if that directory has not been created yet.
#   2. Otherwise, if the configured path does not exist but the known
#      /dbfs mount does, switch to the mount.
#   3. Otherwise the value from scripts.config is left untouched.
# ------------------------------------------------------------------
import os
from pathlib import Path
from scripts import config


def _resolve_dir(current, env_var, mount):
    """Pick the directory to use for one configured path.

    Args:
        current: Value currently held by scripts.config (str or Path).
        env_var: Name of the environment variable that may override it.
        mount: Path of the /dbfs mount to fall back to.

    Returns:
        ``Path(env value)`` when the variable is set to a non-blank string;
        ``mount`` when the variable is unset/blank, ``current`` does not exist
        and ``mount`` does; otherwise ``current`` itself, unchanged.
    """
    # Blank values are treated as "not set": Path("") is "." and always
    # exists, which would silently redirect output to the working directory.
    env_value = os.environ.get(env_var, "").strip()
    if env_value:
        return Path(env_value)
    # Path(...) tolerates config values stored as plain strings.
    if not Path(current).exists() and mount.exists():
        return mount
    return current


config.INTERMEDIATE_DIR = _resolve_dir(
    config.INTERMEDIATE_DIR, "DBFS_INTERMEDIATE", Path("/dbfs/mnt/raw/intermediate")
)
config.METRICS_DIR = _resolve_dir(
    config.METRICS_DIR, "DBFS_METRICS", Path("/dbfs/mnt/raw/metrics")
)

# Confirm effective paths (helpful in logs)
print(f"[Step05] INTERMEDIATE_DIR → {config.INTERMEDIATE_DIR}")
print(f"[Step05] METRICS_DIR      → {config.METRICS_DIR}")
# ------------------------------------------------------------------


In [0]:
# === Verify I/O Access for INTERMEDIATE_DIR and METRICS_DIR ===
# Round-trips a tiny dataset through Spark (CSV) and pandas (Parquet) inside
# config.INTERMEDIATE_DIR, makes sure config.METRICS_DIR exists, and always
# removes the test artefacts afterwards. Any failed check raises, so the final
# success message is only printed when every step actually worked.
import os
import shutil
from pathlib import Path

import pandas as pd
from pyspark.sql import SparkSession

from scripts import config


def _spark_path(local_path):
    """Return the path string Spark should be given for ``local_path``.

    On Databricks the local file APIs (pandas, pathlib, shutil) reach DBFS
    through the ``/dbfs/...`` mount, whereas Spark resolves a bare path against
    the DBFS root, so ``/dbfs/mnt/x`` would be read as ``dbfs:/dbfs/mnt/x``.
    Paths under ``/dbfs`` are therefore rewritten to the ``dbfs:/`` URI that
    names the same location; every other path is passed through as ``str``.
    """
    posix = Path(local_path).as_posix()
    if posix == "/dbfs" or posix.startswith("/dbfs/"):
        return "dbfs:" + (posix[len("/dbfs"):] or "/")
    return str(local_path)


spark = SparkSession.builder.getOrCreate()

intermediate_dir = Path(config.INTERMEDIATE_DIR)
metrics_path = Path(config.METRICS_DIR)
test_csv = intermediate_dir / "io_test_spark.csv"
test_pq = intermediate_dir / "io_test_pandas.parquet"

print("🔍 Testing write/read in:", config.INTERMEDIATE_DIR)

try:
    # pandas does not create missing parent directories.
    intermediate_dir.mkdir(parents=True, exist_ok=True)

    # 1) Spark write/read test
    test_df = spark.createDataFrame([(1, "ok"), (2, "success")], ["id", "status"])
    test_df.write.mode("overwrite").option("header", True).csv(_spark_path(test_csv))
    read_back = spark.read.option("header", True).csv(_spark_path(test_csv))
    spark_rows = read_back.count()
    print("Spark read_back count:", spark_rows)
    assert spark_rows == 2, f"Spark round-trip returned {spark_rows} rows, expected 2"

    # 2) Pandas write/read test
    pdf = pd.DataFrame({"check": ["ok", "passed"], "rows": [2, 2]})
    pdf.to_parquet(test_pq)
    pdf_back = pd.read_parquet(test_pq)
    print("Pandas read_back shape:", pdf_back.shape)
    assert pdf_back.shape == pdf.shape, (
        f"pandas round-trip returned shape {pdf_back.shape}, expected {pdf.shape}"
    )

    # 3) Metrics directory existence check
    metrics_path.mkdir(exist_ok=True, parents=True)
    print("Metrics dir exists:", metrics_path.exists())
    assert metrics_path.is_dir(), f"{metrics_path} is not a usable directory"
finally:
    # 4) Clean up test files, even when one of the checks above failed.
    if test_csv.exists():
        shutil.rmtree(test_csv)  # Spark CSV write makes a folder
    if test_pq.exists():
        test_pq.unlink()

print("✅ Verification complete — paths are fully functional.")
