In [None]:
# raw_file_ops/bootstrap_raw.ipynb
# 1. Install helpers once per cluster/session (optional if on repo cluster)
%pip install -q git+https://github.com/okv627/KardiaFlow@main#subdirectory=src

from kflow.config import raw_path, adls_raw_path
from kflow.display_utils import banner

In [None]:
# 2. Session-level Spark settings, applied in the order listed below.
_session_conf = {
    # Delta defaults (picked up by *new* Delta tables)
    "spark.databricks.delta.properties.defaults.autoOptimize.optimizeWrite": "true",
    "spark.databricks.delta.properties.defaults.autoOptimize.autoCompact": "true",
    "spark.databricks.delta.properties.defaults.enableChangeDataFeed": "true",
    # Turn on ${var} substitution for pure SQL cells
    "spark.sql.variable.substitute": "true",
    # Business knobs ("magic numbers") kept in one place
    "kflow.rapid_fire.threshold": 5,
    "kflow.claims.hourly.time_col": "_ingest_ts",
}

for _conf_key, _conf_value in _session_conf.items():
    spark.conf.set(_conf_key, _conf_value)

In [None]:
# 3. Databases – create each one if absent, then tag it with DBPROPERTIES
db_props = """
  'delta.autoOptimize.optimizeWrite'='true',
  'delta.autoOptimize.autoCompact' ='true'
"""

# Databases to create, in this order.
_kardia_databases = ("kardia_bronze", "kardia_silver", "kardia_gold", "kardia_validation")

for db in _kardia_databases:
    # CREATE is guarded by IF NOT EXISTS; the ALTER runs on every execution.
    _ddl_statements = (
        f"CREATE DATABASE IF NOT EXISTS {db}",
        f"ALTER DATABASE {db} SET DBPROPERTIES ({db_props})",
    )
    for _ddl in _ddl_statements:
        spark.sql(_ddl)

In [None]:
# 4. ADLS auth (providers & feedback). Run once per cluster/session.
ADLS_ACCOUNT = "kardiaadlsdemo"
SUFFIX       = "core.windows.net"

# Read the SAS token from the "kardia" secret scope and drop any leading '?'.
sas_token = dbutils.secrets.get("kardia", "adls_raw_sas").lstrip('?')

# Every setting below is keyed by the storage account's dfs endpoint.
_adls_host = f"{ADLS_ACCOUNT}.dfs.{SUFFIX}"
_adls_sas_conf = {
    f"fs.azure.account.auth.type.{_adls_host}": "SAS",
    f"fs.azure.sas.token.provider.type.{_adls_host}":
        "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
    f"fs.azure.sas.fixed.token.{_adls_host}": sas_token,
}

for _conf_key, _conf_value in _adls_sas_conf.items():
    spark.conf.set(_conf_key, _conf_value)

In [None]:
# 5. Seed raw dirs with first sample files (run once per env)
UPLOADS_DIR = "dbfs:/FileStore/tables/"   # where you manually uploaded seed files

# dataset -> seed file name, for datasets whose raw directory is DBFS-backed
_dbfs_seed_files = {
    "patients":   "patients_part_1.csv",
    "encounters": "encounters_part_1.avro",
    "claims":     "claims_part_1.parquet",
}
# dataset -> seed file name, for datasets whose raw directory is ADLS-backed
_adls_seed_files = {
    "providers": "providers_part_1.tsv",
    "feedback":  "feedback_part_1.jsonl",
}

# dataset -> (seed file name, destination raw directory).
# DBFS-backed entries come first, then ADLS-backed ones.
INITIAL_FILES = {
    **{ds: (fname, raw_path(ds)) for ds, fname in _dbfs_seed_files.items()},
    **{ds: (fname, adls_raw_path(ds)) for ds, fname in _adls_seed_files.items()},
}
# Helpers
def list_names(path: str):
    """Return the ``name`` of every entry that ``dbutils.fs.ls(path)`` yields.

    Best-effort: if listing raises for any reason, an empty list is
    returned instead of propagating the error.
    """
    names = []
    try:
        for entry in dbutils.fs.ls(path):
            names.append(entry.name)
    except Exception:
        return []
    return names

def ensure_dir(path: str):
    """Create ``path`` via ``dbutils.fs.mkdirs`` when it is a ``dbfs:/`` path.

    Paths with any other scheme (e.g. abfss) are left alone; they do not
    need the directory created up front.

    Best-effort: a failing ``mkdirs`` never raises, but the failure is
    printed so that it is not silently lost.
    """
    if not path.startswith("dbfs:/"):
        return
    try:
        dbutils.fs.mkdirs(path)
    except Exception as e:
        # Keep going (a later copy will surface a real problem), but say so.
        print(f"Failed to create directory {path}: {e}")

def copy_if_missing(src_dir: str, dest_dir: str, fname: str):
    """Copy ``fname`` from ``src_dir`` to ``dest_dir`` unless it is already there.

    Args:
        src_dir: Directory holding the file; a trailing ``/`` is optional.
        dest_dir: Directory to copy into; a trailing ``/`` is optional.
        fname: Bare file name, used for both the source and the destination.

    The destination is checked with ``list_names(dest_dir)``. Nothing is
    returned and nothing is raised: the outcome (skipped, bootstrapped or
    failed) is printed.
    """
    def _join(directory: str, name: str) -> str:
        # Plain concatenation silently builds a wrong path ("dirname" + "file")
        # when the directory lacks its trailing slash, so add one if needed.
        if not directory or directory.endswith("/"):
            return directory + name
        return f"{directory}/{name}"

    src, dst = _join(src_dir, fname), _join(dest_dir, fname)
    if fname in list_names(dest_dir):
        print(f"Skipped (already exists): {dst}")
        return
    try:
        dbutils.fs.cp(src, dst)
    except Exception as e:
        print(f"Failed to copy {fname}: {e}")
    else:
        print(f"Bootstrapped: {fname} → {dst}")

def bootstrap_initial_files(upload_dir: str, files: dict[str, tuple[str, str]]):
    """Seed each dataset's destination directory from ``upload_dir``.

    Args:
        upload_dir: Directory that holds the uploaded seed files.
        files: Maps a dataset label to ``(file name, destination directory)``.

    Every destination is first passed to ``ensure_dir``. Then, for each
    dataset, the file is handed to ``copy_if_missing`` when it appears in
    the listing of ``upload_dir``; otherwise a "Skipped" line is printed.
    """
    # Prepare every destination before any copy is attempted.
    for _seed_name, target_dir in files.values():
        ensure_dir(target_dir)

    # List the upload directory once and reuse the result for all datasets.
    available = set(list_names(upload_dir))

    for dataset, (seed_name, target_dir) in files.items():
        if seed_name in available:
            copy_if_missing(upload_dir, target_dir, seed_name)
        else:
            print(f"[{dataset}] Skipped – {seed_name} not found in {upload_dir}")

bootstrap_initial_files(UPLOADS_DIR, INITIAL_FILES)

# The helpers only print copy failures and missing uploads, so check what
# actually landed in each destination before announcing success.
_unseeded = [
    ds
    for ds, (fname, dest_dir) in INITIAL_FILES.items()
    if fname not in list_names(dest_dir)
]
if _unseeded:
    # NOTE(review): assumes banner(..., ok=False) renders a failure banner – confirm in kflow.display_utils.
    banner(f"Bootstrap incomplete – not seeded: {', '.join(_unseeded)}", ok=False)
else:
    banner("Bootstrap complete", ok=True)