In [None]:
# bootstrap_dir.ipynb
# - Create medallion folders (optional) under LAKE_ROOT
# - Create dataset folders under <container root>/source (…/source/encounters, …/source/claims, …)
# - Copy one local sample file per dataset (preserve original filename; skip if exists)

# Install kflow only if running on a jobs cluster;
# for interactive testing, install via cluster libraries instead.

# Optional library bootstrap for ephemeral jobs clusters
# %run ./bootstrap_kflow

from pathlib import Path
from typing import Optional

from kflow.auth_adls import ensure_adls_oauth
from kflow.config import LAKE_ROOT

# 1. Authenticate (kflow helper) and derive the storage roots from LAKE_ROOT
ensure_adls_oauth(validate_path="")

lake_root = LAKE_ROOT.rstrip("/")
# Keep only the first three "/"-separated pieces of the lake root. For a URI
# shaped like "scheme://authority/dir/..." that yields "scheme://authority".
# NOTE(review): assumes LAKE_ROOT has that URI shape — confirm in kflow.config.
root_pieces = lake_root.split("/")[:3]
container_root = "/".join(root_pieces)
source_root = container_root + "/source"
dbutils.fs.mkdirs(source_root)

print("Ensured", source_root)
print(f"LAKE_ROOT:      {lake_root}")
print(f"CONTAINER_ROOT: {container_root}")

# 2. Create the directory structure (mkdirs is issued for every path on each run)
# Layer and bookkeeping folders that sit directly under the lake root
layer_names = ("bronze", "silver", "gold", "_schemas", "_checkpoints", "_quarantine")
# Dataset folders that sit under <container_root>/source
datasets = ("encounters", "claims", "patients", "providers", "feedback")

# Same treatment for both groups: build the path, create it, report it.
for folder_base, folder_names in ((lake_root, layer_names), (source_root, datasets)):
    for folder in folder_names:
        path = f"{folder_base}/{folder}"
        dbutils.fs.mkdirs(path)
        print("Ensured", path)


# 3. Resolve project root by locating the top-level `data/` directory
def find_repo_root_with_data(start: Optional[Path] = None) -> Path:
    """Return the nearest directory at or above *start* that contains ``data/``.

    Args:
        start: Directory to begin the upward search from. When omitted, the
            current working directory at call time is used.

    Returns:
        ``start`` itself, or its closest parent, that has a ``data``
        subdirectory.

    Raises:
        FileNotFoundError: If neither ``start`` nor any of its parents
            contains a ``data`` directory.
    """
    # Resolve the default here rather than in the signature: a `Path.cwd()`
    # default is evaluated once when the function is defined and would go
    # stale if the working directory changes afterwards.
    if start is None:
        start = Path.cwd()
    for candidate in (start, *start.parents):
        if (candidate / "data").is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find 'data/' under {start} or its parents.")

repo_root = find_repo_root_with_data()
print(f"Detected project root: {repo_root}")

# 4. Local sample definitions: dataset name -> file path relative to the project root
samples = {
    "encounters": "data/encounters/encounters_part_1.avro",
    "claims":     "data/claims/claims_part_1.parquet",
    "feedback":   "data/feedback/feedback_part_1.jsonl",
    "patients":   "data/patients/patients_part_1.csv",
    "providers":  "data/providers/providers_part_1.tsv",
}

# 5. Copy a single file per dataset, skipping files that are already present.
# Outcomes are tallied per dataset so the closing message reflects what
# actually happened instead of reporting success unconditionally.
copied, already_present, missing, failed = [], [], [], []

for ds, rel in samples.items():
    src_local = (repo_root / rel).resolve()
    if not src_local.exists():
        print(f"SKIP {ds}: {src_local} not found.")
        missing.append(ds)
        continue

    dst_dir = f"{source_root}/{ds}"
    dst_path = f"{dst_dir}/{src_local.name}"

    # Ensure destination folder exists (safe/redundant)
    dbutils.fs.mkdirs(dst_dir)

    # Look for a file with the same name to avoid overwriting it.
    # Best effort: if the listing fails we warn and still attempt the copy.
    try:
        existing = {p.name for p in dbutils.fs.ls(dst_dir)}
    except Exception as e:
        print(f"WARNING listing {dst_dir}: {e}")
        existing = set()

    if src_local.name in existing:
        print(f"SKIP {ds}: {dst_path} already exists.")
        already_present.append(ds)
        continue

    # Copy from local filesystem to ABFS using file: URI
    src_uri = f"file:{src_local.as_posix()}"
    try:
        dbutils.fs.cp(src_uri, dst_path)
    except Exception as e:
        # Keep going so one bad dataset does not block the others.
        print(f"ERROR copying {ds} from {src_uri} to {dst_path}: {e}")
        failed.append(ds)
    else:
        print(f"{ds}: copied {src_local.name} -> {dst_path}")
        copied.append(ds)

if missing or failed:
    # At least one dataset folder was left without its sample file.
    print(
        "\nBootstrap finished with problems – "
        f"copied: {len(copied)}, already present: {len(already_present)}, "
        f"missing locally: {', '.join(missing) or 'none'}, "
        f"failed: {', '.join(failed) or 'none'}."
    )
else:
    print("\nBootstrap complete – container-root dataset folders populated with sample files.")