In [0]:
# Databricks Notebook: 99_utils/setup_kardia_autoloader_env.ipynb
# --------------------------------------------------------------
# Purpose: one-time workspace bootstrap
#   • Creates DBFS folder structure expected by Auto Loader
#   • Optionally seeds tiny CSVs for smoke testing
# Run only when you spin up a brand-new Databricks workspace.

# ───────────────────────────────────────────────────────────────
# USER CONFIG – tweak paths or skip seed files
# Seed switches: set a flag to False to skip copying that entity's sample CSV.
seed_patients = True
seed_encounters = True

# Base path the seed CSVs are looked up under; the seeding step prefixes it
# with "file:" and appends "/ehr/<name>_10.csv".
repo_base = (
    "/Workspace/Users/matthew.databrickslab2@outlook.com"
    "/kardiaflow/data/raw"
)

# ───────────────────────────────────────────────────────────────
from pyspark.sql import SparkSession, functions as F

# 1. Core folder tree (idempotent)
# Folders this bootstrap guarantees to exist: the two raw landing folders plus
# the _schemas and _checkpoints folders (presumably the Auto Loader schema and
# checkpoint locations — confirm against the ingestion notebooks).
required_dirs = (
    "dbfs:/kardia/raw/patients/",
    "dbfs:/kardia/raw/encounters/",
    "dbfs:/kardia/_schemas/",
    "dbfs:/kardia/_checkpoints/",
)
for folder in required_dirs:
    dbutils.fs.mkdirs(folder)

# 2. Seed smoke-test files (≲ 10 rows each)
def safe_copy(src: str, dst: str) -> None:
    """Copy ``src`` to ``dst`` via ``dbutils.fs.cp`` without ever raising.

    The copy is best-effort: any exception raised while copying is caught
    and reported as a "Skipped" line instead of propagating to the caller.

    Args:
        src: Source path, passed to ``dbutils.fs.cp`` unchanged.
        dst: Destination path, passed to ``dbutils.fs.cp`` unchanged.

    Returns:
        None. The outcome is only printed.
    """
    try:
        dbutils.fs.cp(src, dst, recurse=True)
        print(f"Copied {src} → {dst}")
    except Exception as err:  # deliberate catch-all: a failed copy is only reported
        print(f"Skipped {src}: {err}")

# One row per optional seed file: (enabled flag, source CSV, landing folder).
seed_jobs = (
    (
        seed_patients,
        f"file:{repo_base}/ehr/patients_10.csv",
        "dbfs:/kardia/raw/patients/",
    ),
    (
        seed_encounters,
        f"file:{repo_base}/ehr/encounters_10.csv",
        "dbfs:/kardia/raw/encounters/",
    ),
)
for enabled, seed_src, landing_dir in seed_jobs:
    if enabled:
        safe_copy(seed_src, landing_dir)

# 3. Quick inventory (non-interactive; prints only counts)
# Print, per entity, how many entries dbutils.fs.ls returns for its raw
# landing folder (every listed entry is counted).
for entity in ("patients", "encounters"):
    landing_entries = dbutils.fs.ls(f"dbfs:/kardia/raw/{entity}/")
    cnt = len(landing_entries)
    print(f"{entity.capitalize()} landing files: {cnt}")

# Final marker so the run log shows the bootstrap reached the end.
print("\nEnvironment bootstrap complete")
