In [None]:
# raw_bootstrap_filesystem.ipynb
# Bootstrap of raw zones (DBFS + ADLS).
# - Patients/Encounters/Claims -> DBFS
# - Providers/Feedback         -> ADLS `raw` container
# Re-running this notebook will NOT duplicate files.

from kflow.config import (
    raw_path, adls_raw_path,
    BRONZE_DB, SILVER_DB, GOLD_DB, VALIDATION_DB
)
from kflow.adls import set_sas

# Short alias for `dbutils`. It is not imported in this notebook, so it is
# expected to be provided by the notebook runtime. Later cells use it as
# `dbu.fs.*` (ls / mkdirs / put / cp) and `dbu.secrets.get(...)`.
dbu = dbutils

In [None]:
# 1. Spark / Delta defaults
# Every (key, value) pair below is pushed into the session config, top to
# bottom, through spark.conf.set. The first three are Delta table-property
# defaults; the kflow.* keys are project-specific settings (presumably read
# by kflow jobs - confirm against the kflow package).
_SESSION_DEFAULTS = [
    ("spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite", "true"),
    ("spark.databricks.delta.properties.defaults.autoOptimize.autoCompact", "true"),
    ("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true"),
    ("spark.sql.variable.substitute", "true"),
    ("kflow.rapid_fire.threshold", 5),
    ("kflow.claims.hourly.time_col", "_ingest_ts"),
]
for _conf_key, _conf_value in _SESSION_DEFAULTS:
    spark.conf.set(_conf_key, _conf_value)

In [0]:
# 2. Databases
# For each configured database: create it when absent, then set the
# auto-optimize properties on it. Both statements are safe to re-run.
# NOTE(review): these are stored as database properties; whether tables
# created in the database pick them up is not visible here - confirm.
DBS = [BRONZE_DB, SILVER_DB, GOLD_DB, VALIDATION_DB]
DB_PROPS = """
  'delta.autoOptimize.optimizeWrite'='true',
  'delta.autoOptimize.autoCompact' ='true'
"""
for db in DBS:
    for _stmt in (
        f"CREATE DATABASE IF NOT EXISTS {db}",
        f"ALTER DATABASE {db} SET DBPROPERTIES ({DB_PROPS})",
    ):
        spark.sql(_stmt)

In [0]:
# 3. ADLS auth (for providers & feedback)
# The SAS token is fetched from the "kardia" secret scope at run time, so it
# never appears in the notebook source. set_sas comes from kflow.adls and
# presumably registers the token for the storage account - confirm there.
ADLS_ACCOUNT = "kardiaadlsdemo"
SAS_TOKEN = dbu.secrets.get(scope="kardia", key="adls_raw_sas")
set_sas(ADLS_ACCOUNT, SAS_TOKEN)

In [0]:
# 4. Seed raw dirs with first sample files (run once per env)
UPLOADS_DIR = "dbfs:/FileStore/tables/"  # where seed files are uploaded

# One row per dataset: (dataset, seed filename, resolver for the dataset's
# raw destination directory). Row order fixes the order of INITIAL_FILES.
_SEED_SPECS = [
    # DBFS-backed
    ("patients",   "patients_part_1.csv",    raw_path),
    ("encounters", "encounters_part_1.avro", raw_path),
    ("claims",     "claims_part_1.parquet",  raw_path),
    # ADLS-backed
    ("providers",  "providers_part_1.tsv",   adls_raw_path),
    ("feedback",   "feedback_part_1.jsonl",  adls_raw_path),
]

# dataset -> (filename, destination_dir)
INITIAL_FILES = {
    dataset: (filename, resolve_dir(dataset))
    for dataset, filename, resolve_dir in _SEED_SPECS
}

In [0]:
# 5. Helpers
def _join(dir_path: str, fname: str) -> str:
    return dir_path.rstrip("/") + "/" + fname

def list_names(path: str) -> list:
    """Return the entry names directly under `path`, or [] if it cannot be listed.

    Listing is best-effort: any error raised while listing (for example a
    missing or forbidden path) still yields an empty list, so callers can
    treat the location as empty. The error is printed instead of being
    dropped silently, because an auth or network failure would otherwise be
    indistinguishable from a genuinely empty directory in the caller's
    "Skipped" messages.

    Args:
        path: Directory to list, in any form `dbu.fs.ls` accepts.

    Returns:
        The `name` attribute of every listed entry, in listing order, or an
        empty list when listing failed.
    """
    try:
        return [f.name for f in dbu.fs.ls(path)]
    except Exception as e:
        # Keep the best-effort contract (never raise), but leave a trace.
        print(f"[list_names] Could not list {path} – treating as empty: {e}")
        return []

def ensure_dir(path: str):
    """Best-effort creation of the folder at `path`.

    `dbu.fs.mkdirs` is tried first. Only if that raises does the function
    fall back to writing an empty `_KEEP` marker file inside `path` (the
    fallback intended for ABFSS locations). Exceptions from either
    `dbu.fs` call are swallowed, and nothing is returned.
    """
    try:
        dbu.fs.mkdirs(path)
    except Exception:
        mkdirs_ok = False
    else:
        mkdirs_ok = True

    if not mkdirs_ok:
        keep_file = _join(path, "_KEEP")
        try:
            # overwrite=False: never replace a marker that is already there
            dbu.fs.put(keep_file, "", overwrite=False)
        except Exception:
            pass

def bootstrap(upload_dir: str, files: dict):
    """Copy each seed file from `upload_dir` into its destination, at most once.

    A file is copied only when it is present in `upload_dir` and a file of
    the same name is not already in its destination directory, which is
    what makes re-running safe. Copy errors are caught and reported per
    dataset rather than raised, so one bad file does not stop the others.

    Args:
        upload_dir: Directory the seed files were uploaded to.
        files: Mapping of dataset name -> (filename, destination_dir).

    Returns:
        dict mapping every dataset name in `files` to one of
        "copied", "exists", "missing_source" or "failed". This lets a caller
        detect a partial bootstrap programmatically instead of reading the
        printed log (earlier versions returned nothing). Printed output is
        unchanged.
    """
    outcome = {}

    # 1. Ensure all destination dirs exist
    for _, dest_dir in files.values():
        ensure_dir(dest_dir)

    # 2. Snapshot of what's already uploaded & what's already in each dest.
    #    Taken once up front so each location is listed a single time.
    uploaded = set(list_names(upload_dir))
    dest_cache = {dest: set(list_names(dest)) for _, dest in files.values()}

    # 3. Copy only if missing
    for ds, (fname, dest_dir) in files.items():
        if fname not in uploaded:
            print(f"[{ds}] Skipped – {fname} not in {upload_dir}")
            outcome[ds] = "missing_source"
            continue
        if fname in dest_cache[dest_dir]:
            print(f"[{ds}] Skipped (exists): {_join(dest_dir, fname)}")
            outcome[ds] = "exists"
            continue
        src = _join(upload_dir, fname)
        dst = _join(dest_dir, fname)
        try:
            dbu.fs.cp(src, dst)
            print(f"[{ds}] Copied: {fname} → {dst}")
            outcome[ds] = "copied"
        except Exception as e:
            # Deliberately non-fatal: record the failure and keep going.
            print(f"[{ds}] FAILED to copy {fname} → {dest_dir}: {e}")
            outcome[ds] = "failed"

    return outcome

In [0]:
# 6. Execute
# Seed every dataset listed in INITIAL_FILES from UPLOADS_DIR.
bootstrap(upload_dir=UPLOADS_DIR, files=INITIAL_FILES)
# Printed unconditionally: bootstrap() reports per-file copy failures itself
# and does not raise for them.
print("Bootstrap complete")