In [None]:
# bootstrap_dir.ipynb
# Goal: Ensure medallion + raw folders exist in ADLS and seed one sample file per dataset.

# Install kflow wheel
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow
dbutils.library.restartPython()

from pathlib import Path
from pyspark.sql import functions as F

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import LAKE_ROOT  # uses abfss://.../kardia

# 1) Set up ADLS OAuth for this session and capture the two root locations
#    that the remaining steps build their paths from.
#    NOTE(review): validate_path="" is handed straight to kflow's helper; what
#    an empty value validates is defined in kflow.auth_adls — confirm there.
lake_root = LAKE_ROOT  # imported from kflow.config
abfss_base = ensure_adls_oauth(validate_path="")  # expected form: "abfss://lake@...dfs.core.windows.net"
print(
    f"ABFSS base       : {abfss_base}",
    f"Lake root        : {lake_root}",
    sep="\n",
)

# 2) Make sure every medallion-layer folder exists under the lake root.
#    Intended to be safe to re-run: mkdirs is called unconditionally for each
#    layer and the resulting path is echoed for the run log.
medallion_layers = ("bronze", "silver", "gold", "_schemas", "_checkpoints", "_quarantine")
for layer_name in medallion_layers:
    layer_path = f"{lake_root}/{layer_name}"
    dbutils.fs.mkdirs(layer_path)
    print("Ensured", layer_path)

# 3) Make sure a raw/source folder exists for each dataset.
#    These live directly under the ABFSS base (not under the lake root) and,
#    like the step above, are created unconditionally so re-runs are harmless.
raw_datasets = ("encounters", "claims", "patients", "providers", "feedback")
for dataset in raw_datasets:
    dataset_dir = f"{abfss_base}/{dataset}"
    dbutils.fs.mkdirs(dataset_dir)
    print("Ensured", dataset_dir)

# 4) Seed one local sample into each raw folder.
#    Assumes the notebook lives in a Repo with the data/<ds>/... files committed
#    and that the working directory of the run is the repo root (TODO confirm
#    for job runs).
#
#    Each sample is read with Spark and re-written in the same format, so the
#    target folder ends up with Spark part files rather than a byte-for-byte
#    copy of the sample.
#    NOTE(review): mode("overwrite") replaces whatever is already in the raw
#    folder on every run — confirm that is intended for a bootstrap step.
def _seed_sample(src_uri, target_dir, fmt):
    """Read one sample file with Spark and write it to ``target_dir``.

    Args:
        src_uri: Fully qualified URI of the sample file (e.g. ``file:/...``).
        target_dir: Destination directory; its contents are overwritten.
        fmt: One of ``"parquet"``, ``"avro"``, ``"json"``, ``"csv"``, ``"tsv"``.

    Raises:
        ValueError: If ``fmt`` is not one of the supported formats.
    """
    if fmt == "parquet":
        (spark.read.parquet(src_uri)
         .write.mode("overwrite").parquet(target_dir))

    elif fmt == "avro":
        (spark.read.format("avro").load(src_uri)
         .write.mode("overwrite").format("avro").save(target_dir))

    elif fmt == "json":
        # The sample is JSON Lines: one record per line, hence multiLine=False.
        (spark.read.option("multiLine", False).json(src_uri)
         .write.mode("overwrite").json(target_dir))

    elif fmt == "csv":
        (spark.read.option("header", True).csv(src_uri)
         .write.mode("overwrite").option("header", True).csv(target_dir))

    elif fmt == "tsv":
        # TSV is handled by the CSV reader/writer with a tab separator.
        (spark.read.option("header", True).option("sep", "\t").csv(src_uri)
         .write.mode("overwrite").option("header", True).option("sep", "\t").csv(target_dir))

    else:
        # Fail loudly instead of reporting success for a sample that was never written.
        raise ValueError(f"Unsupported sample format {fmt!r} for {src_uri}")


repo_root = Path.cwd()
samples = {
    "encounters": ("data/encounters/encounters_part_1.avro", "avro"),
    "claims"    : ("data/claims/claims_part_1.parquet", "parquet"),
    "feedback"  : ("data/feedback/feedback_part_1.jsonl", "json"),
    "patients"  : ("data/patients/patients_part_1.csv", "csv"),
    "providers" : ("data/providers/providers_part_1.tsv", "tsv"),
}

for ds, (rel_path, fmt) in samples.items():
    src_file = repo_root / rel_path
    target_dir = f"{abfss_base}/{ds}"

    # Missing samples are tolerated: the folders from the earlier steps are
    # still in place, the dataset just is not seeded.
    if not src_file.exists():
        print(f"SKIP {ds}: {src_file} not found (OK if you haven’t added samples locally).")
        continue

    # Path.exists() above checked the driver's local filesystem. Spark resolves
    # scheme-less paths against the cluster's default filesystem instead, so the
    # same path must be passed with an explicit "file:" scheme to be found.
    src_uri = f"file:{src_file}"

    print(f"→ {ds}: loading {src_file.name} and writing to {target_dir}")
    _seed_sample(src_uri, target_dir, fmt)

print("\nBootstrap complete – medallion & raw folders ready.")