In [None]:
# bootstrap_dir.ipynb
# Purpose:
# - Create medallion folders (bronze/silver/gold/etc.) in ADLS Gen2
# - Create raw input folders for each dataset
# - Load one sample file per dataset to its corresponding raw folder

# Install kflow and restart Python kernel.
# pip flags: -q keeps output quiet; --no-index with --find-links makes pip
# resolve the package only from the local directory /dbfs/Shared/libs instead
# of a package index; --no-deps skips installing kflow's own dependencies.
%pip install -q --no-deps --no-index --find-links=/dbfs/Shared/libs kflow
# Restart the Python process; presumably so the kflow imports below pick up
# the package that was just installed.
# NOTE(review): a restart discards Python state - confirm the install/restart
# lives in its own notebook cell, ahead of the imports that follow.
dbutils.library.restartPython()

from pathlib import Path
from pyspark.sql import functions as F

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import LAKE_ROOT  # uses abfss://.../kardia

# 1) Set up ADLS OAuth, capture the two root locations, and echo them.
#    abfss_base is whatever ensure_adls_oauth returns for an empty
#    validate_path; lake_root is the LAKE_ROOT constant from kflow.config.
abfss_base = ensure_adls_oauth(validate_path="")
lake_root = LAKE_ROOT
for label, location in (("ABFSS base", abfss_base), ("Lake root", lake_root)):
    print(f"{label}: {location}")

# 2) Make sure each medallion / support folder exists under the lake root.
medallion_layers = [
    "bronze",
    "silver",
    "gold",
    "_schemas",
    "_checkpoints",
    "_quarantine",
]
for layer_name in medallion_layers:
    layer_path = f"{lake_root}/{layer_name}"
    dbutils.fs.mkdirs(layer_path)
    print("Ensured", layer_path)

# 3) Make sure a raw landing folder exists for every dataset, directly
#    under the ABFSS base location.
raw_datasets = ["encounters", "claims", "patients", "providers", "feedback"]
for dataset_name in raw_datasets:
    dataset_dir = f"{abfss_base}/{dataset_name}"
    dbutils.fs.mkdirs(dataset_dir)
    print("Ensured", dataset_dir)

# 4) Load one local sample file per dataset and write it to its raw folder.
#    Each sample is read with Spark and re-written in the same format, so the
#    raw folder receives Spark output files rather than a byte-for-byte copy.
repo_root = Path.cwd()
samples = {
    "encounters": ("data/encounters/encounters_part_1.avro", "avro"),
    "claims":     ("data/claims/claims_part_1.parquet", "parquet"),
    "feedback":   ("data/feedback/feedback_part_1.jsonl", "json"),
    "patients":   ("data/patients/patients_part_1.csv", "csv"),
    "providers":  ("data/providers/providers_part_1.tsv", "tsv"),
}

# Per-format Spark settings: (data source name, read options, write options).
# "tsv" is the csv data source with a tab separator.
format_specs = {
    "parquet": ("parquet", {}, {}),
    "avro":    ("avro", {}, {}),
    "json":    ("json", {"multiLine": False}, {}),
    "csv":     ("csv", {"header": True}, {"header": True}),
    "tsv":     ("csv", {"header": True, "sep": "\t"}, {"header": True, "sep": "\t"}),
}

for ds, (rel_path, fmt) in samples.items():
    # Fail loudly on a format with no reader/writer mapping instead of
    # reporting a load that never happens.
    if fmt not in format_specs:
        raise ValueError(f"Unsupported sample format {fmt!r} for dataset {ds!r}")

    src_file = repo_root / rel_path
    target_dir = f"{abfss_base}/{ds}"

    # Missing samples are tolerated: report and move on to the next dataset.
    if not src_file.exists():
        print(f"SKIP {ds}: {src_file} not found.")
        continue

    print(f"→ {ds}: loading {src_file.name} and writing to {target_dir}")

    # The existence check above ran against the driver's local filesystem, so
    # hand Spark an explicit file: URI; a scheme-less path would be resolved
    # against the cluster's default filesystem instead of the local disk.
    src_uri = f"file:{src_file}"

    source_name, read_opts, write_opts = format_specs[fmt]
    sample_df = spark.read.format(source_name).options(**read_opts).load(src_uri)
    (sample_df.write.mode("overwrite")
     .format(source_name).options(**write_opts).save(target_dir))

print("\nBootstrap complete – medallion and raw folders ready.")