In [None]:
# reset_dir.ipynb
# Full clean slate for KardiaFlow:
# 1. Drop the databases and Delta tables from the metastore
# 2. Delete all underlying storage for bronze/silver/raw/source/enriched including checkpoints
# 3. Clear any in-memory Spark catalog cache to avoid stale metadata
# This leaves nothing persistent that would affect a fresh bootstrap/run

from kflow.config import bronze_paths, silver_paths
from kflow.auth_adls import ensure_adls_oauth

# Make hive_metastore the current catalog so the two-part names used below
# (database.table, e.g. kardia_bronze.bronze_claims) resolve against it.
spark.sql("USE CATALOG hive_metastore")

# Configure ABFS OAuth once for this session, before any storage path is touched.
# NOTE(review): validate_path="" presumably skips the helper's path validation
# step -- confirm against kflow.auth_adls.ensure_adls_oauth.
ensure_adls_oauth(validate_path="")

def safe_rm(path: str, description: str) -> None:
    """
    Best-effort recursive delete of a storage path via dbutils.fs.rm.

    Never raises: a failure is printed and swallowed so that one bad path
    does not abort the rest of the reset.

    Args:
        path: Storage location (file or directory) to delete recursively.
        description: Human-readable label, used only in the log lines.
    """
    try:
        # Only the call that can actually fail lives inside the try.
        removed = dbutils.fs.rm(path, recurse=True)
    except Exception as e:
        print(f"  (ignore) failed removing {description} {path}: {e}")
        return

    # dbutils.fs.rm signals via its boolean result whether anything was
    # deleted; a path that does not exist yields False instead of raising.
    # Only an explicit False is reported as a skip, so any other return value
    # keeps the original "Removed" message.
    if removed is False:
        print(f"  (skip) nothing to remove for {description}: {path}")
    else:
        print(f"Removed {description}: {path}")


def safe_drop_table(full_name: str) -> None:
    """
    Drop a table, preferring PURGE. If that fails, fall back to a normal drop.

    Spark's in-memory cache is cleared first; the original author's intent is
    to minimize stale-metadata errors during the drop. Every step is
    best-effort: failures are printed and swallowed, so this never raises.

    Args:
        full_name: Table identifier as used in SQL (here "database.table").
            It is interpolated into the statement unquoted, so it must come
            from trusted, hard-coded values only.
    """
    try:
        spark.catalog.clearCache()
    except Exception as e:
        # Still attempt the drop, but say why the cache clear failed instead
        # of hiding it (same message style as the final cache clear below).
        print(f"  (ignore) failed clearing cache before dropping {full_name}: {e}")

    print(f"Dropping table {full_name}")

    # First attempt: DROP ... PURGE.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {full_name} PURGE")
    except Exception as e:
        print(f"  (ignore) PURGE failed for {full_name}: {e}")
    else:
        print(f"Dropped table {full_name} with PURGE")
        return

    # Fallback: plain DROP when the PURGE variant raised.
    try:
        spark.sql(f"DROP TABLE IF EXISTS {full_name}")
    except Exception as e2:
        print(f"  (ignore) failed dropping {full_name}: {e2}")
    else:
        print(f"Dropped table {full_name} without PURGE fallback")


# 1. Drop the Delta tables from the metastore (PURGE first, plain DROP as fallback).
# Every core entity has one table per layer, named kardia_<layer>.<layer>_<entity>;
# bronze tables are listed before silver ones.
to_drop = [
    f"kardia_{layer}.{layer}_{entity}"
    for layer in ("bronze", "silver")
    for entity in ("encounters", "claims", "patients", "providers", "feedback")
]
# Silver additionally carries the enriched encounters table.
to_drop.append("kardia_silver.silver_encounters_enriched")

for table_name in to_drop:
    safe_drop_table(table_name)


# 2. Delete the storage behind every dataset: bronze data + checkpoint,
#    the raw/source landing area, then silver data + checkpoint.
datasets = ("encounters", "claims", "patients", "providers", "feedback")
for dataset in datasets:
    # --- bronze ---
    bronze_layout = bronze_paths(dataset)
    print(f"\nCleaning bronze layer for '{dataset}':")
    safe_rm(bronze_layout.bronze, f"bronze data for {dataset}")
    safe_rm(bronze_layout.checkpoint, f"bronze checkpoint for {dataset}")

    # --- raw / source input (its location comes from the bronze path bundle) ---
    print(f"Cleaning raw/source layer for '{dataset}':")
    safe_rm(bronze_layout.raw, f"raw/source data for {dataset}")

    # --- silver ---
    silver_layout = silver_paths(dataset)
    print(f"Cleaning silver layer for '{dataset}':")
    safe_rm(silver_layout.path, f"silver data for {dataset}")
    safe_rm(silver_layout.checkpoint, f"silver checkpoint for {dataset}")


# 3. The enriched encounters output is not part of the per-dataset loop,
#    so its data and checkpoint locations are removed separately here.
print("\nCleaning enriched target 'encounters_enriched':")
enriched_layout = silver_paths("encounters_enriched")
safe_rm(enriched_layout.path, "enriched path")
safe_rm(enriched_layout.checkpoint, "enriched checkpoint")


# 4. Drop the databases themselves (CASCADE also removes anything still
#    registered inside them). Each drop is independent and best-effort:
#    a failure is logged and the next database is still attempted.
print("\nDropping databases (if they exist):")
for db_name in ("kardia_bronze", "kardia_silver", "kardia_gold"):
    try:
        spark.sql(f"DROP DATABASE IF EXISTS {db_name} CASCADE")
        print(f"Dropped database {db_name}")
    except Exception as drop_err:
        print(f"  (ignore) failed dropping database {db_name}: {drop_err}")


# 5. Final step: empty this session's in-memory Spark cache so nothing cached
#    before the wipe is served afterwards. Best-effort, like every step above.
print("\nClearing Spark catalog cache.")
try:
    spark.catalog.clearCache()
except Exception as cache_err:
    print(f"  (ignore) failed clearing cache: {cache_err}")

print("\nWipe complete.")