In [None]:
%md
# Kardiaflow bootstrap

Authenticates to ADLS, creates medallion and dataset folders,
and copies one local sample file per dataset into the source zone.

In [None]:
from pathlib import Path

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import CONTAINER_URI, LAKE_ROOT

# 1. Configure Spark with ADLS OAuth credentials
ensure_adls_oauth()

# Lake (medallion) root and the container root that holds raw/source inputs
lake_root, container_root = LAKE_ROOT, CONTAINER_URI

# Source zone lives directly under the container root; create it up front
source_root = "{}/source".format(container_root)
dbutils.fs.mkdirs(source_root)

# Report the resolved locations, one per line
print(
    f"Ensured {source_root}",
    f"LAKE_ROOT:      {lake_root}",
    f"CONTAINER_ROOT: {container_root}",
    sep="\n",
)

# 2. Create medallion and system folders under lake root
lake_layers = ("bronze", "silver", "gold", "_schemas", "_checkpoints", "_quarantine")
for lake_dir in (f"{lake_root}/{layer_name}" for layer_name in lake_layers):
    dbutils.fs.mkdirs(lake_dir)
    print("Ensured ", lake_dir)

# 3. Create dataset folders in source zone
datasets = ("encounters", "claims", "patients", "providers", "feedback")
for dataset_dir in (f"{source_root}/{dataset_name}" for dataset_name in datasets):
    dbutils.fs.mkdirs(dataset_dir)
    print("Ensured ", dataset_dir)


# 4. Find the repo root (by locating top-level 'data/') and copy one sample per dataset
def find_repo_root_with_data(start: "Path | None" = None) -> Path:
    """Return the nearest directory, at or above `start`, that contains 'data/'.

    Args:
        start: Directory to begin the search from. When omitted, the current
            working directory is used, looked up at call time (a `Path.cwd()`
            default in the signature would be frozen when the function is
            defined). Relative paths are resolved first so the walk can climb
            above the working directory.

    Returns:
        The first candidate -- `start` itself, then each parent in turn --
        that has a 'data' subdirectory. The returned path is absolute.

    Raises:
        FileNotFoundError: If neither `start` nor any of its parents
            contains a 'data' directory.
    """
    # Resolve so that `.parents` covers the full chain up to the filesystem
    # root; a relative path such as Path(".") has no parents at all.
    start = (Path.cwd() if start is None else Path(start)).resolve()
    for candidate in (start, *start.parents):
        if (candidate / "data").is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'data/' under {start} or its parents.")


repo_root = find_repo_root_with_data()
print("Detected project root:", repo_root)

# Seed file for each dataset, given relative to the detected project root.
# The file keeps its original name when it is copied into the source zone.
samples = dict(
    encounters="data/encounters/encounters_part_1.avro",
    claims="data/claims/claims_part_1.parquet",
    feedback="data/feedback/feedback_part_1.jsonl",
    patients="data/patients/patients_part_1.csv",
    providers="data/providers/providers_part_1.tsv",
)

# 5. Copy single file per dataset, skip if already present.
# The loop is best-effort (one failing dataset does not stop the others), so
# outcomes are recorded and the closing message reports what actually happened
# instead of announcing success unconditionally.
missing_local = []  # datasets whose local sample file was not found
failed_copies = []  # datasets whose copy to the source zone raised

for ds, rel in samples.items():
    src_local = (repo_root / rel).resolve()
    if not src_local.exists():
        print(f"SKIP {ds}: {src_local} not found.")
        missing_local.append(ds)
        continue

    dst_dir = f"{source_root}/{ds}"
    dst_path = f"{dst_dir}/{src_local.name}"

    # Ensure destination exists
    dbutils.fs.mkdirs(dst_dir)

    # Skip copy if file is already there. A listing failure is only warned
    # about; the copy is then attempted as if the folder were empty.
    try:
        existing = {p.path.split("/")[-1] for p in dbutils.fs.ls(dst_dir)}
    except Exception as e:
        print(f"WARNING listing {dst_dir}: {e}")
        existing = set()

    if src_local.name in existing:
        print(f"SKIP {ds}: {dst_path} already exists.")
        continue

    # Copy from local to ABFS; the `file:` scheme marks the source as a
    # local filesystem path for dbutils.
    src_uri = f"file:{src_local.as_posix()}"
    try:
        dbutils.fs.cp(src_uri, dst_path)
        print(f"{ds}: copied {src_local.name} -> {dst_path}")
    except Exception as e:
        print(f"ERROR copying {ds} from {src_uri} to {dst_path}: {e}")
        failed_copies.append(ds)

if missing_local or failed_copies:
    print("\nBootstrap finished with problems – not every source folder was populated.")
    if missing_local:
        print(f"  Local sample not found for: {', '.join(missing_local)}")
    if failed_copies:
        print(f"  Copy failed for: {', '.join(failed_copies)}")
else:
    print("\nBootstrap complete – source folders populated with sample files.")