In [0]:
# raw_file_ops/move_new_files.ipynb
# Copies new `*_part_*` seed files into raw zones (DBFS + ADLS); source files are left in place.
#
# - Supports ad hoc uploads via DBFS or ADLS container root.
# - Respects file prefixes and formats:
#    - Patients / Encounters / Claims: DBFS
#    - Providers / Feedback:           ADLS
#
# Skips any file already present at the destination.

from kflow.config import raw_path, adls_raw_path
from kflow.adls import set_sas

# Short alias for `dbutils`, which is not imported anywhere in this file —
# presumably the utilities object Databricks injects into notebooks (TODO confirm).
dbu = dbutils

In [0]:
# 1. Auth for ADLS (providers/feedback live there)
# The SAS token is read from the "kardia" secret scope rather than being
# hard-coded, then handed to kflow's set_sas together with the account name.
ACCOUNT = "kardiaadlsdemo"
SAS_TOKEN = dbu.secrets.get(scope="kardia", key="adls_raw_sas")
set_sas(ACCOUNT, SAS_TOKEN)

In [0]:
# 2. Config
# Upload locations in which ad hoc seed files may be dropped.
# ADLS side: root of raw container (e.g., abfss://raw@acct/)
ADLS_UPLOADS_DIR = adls_raw_path("")
# DBFS side: the FileStore tables folder
DBFS_UPLOADS_DIR = "dbfs:/FileStore/tables/"

In [0]:
# Routing table — filename prefix: (raw zone path, allowed extensions).
# Patients / encounters / claims destinations come from raw_path;
# providers / feedback destinations come from adls_raw_path.
PREFIX_MAP = {
    prefix: (dest, exts)
    for prefix, dest, exts in [
        ("patients_part_",   raw_path("patients"),        (".csv",)),
        ("encounters_part_", raw_path("encounters"),      (".avro",)),
        ("claims_part_",     raw_path("claims"),          (".parquet",)),
        ("providers_part_",  adls_raw_path("providers"),  (".tsv", ".avro")),
        ("feedback_part_",   adls_raw_path("feedback"),   (".jsonl",)),
    ]
}

# Upload locations searched for new files, in scan order
SCAN_DIRS = [DBFS_UPLOADS_DIR, ADLS_UPLOADS_DIR]

In [0]:
# 3. Helpers
def _join(d, f):
    """Join directory `d` and file name `f` with exactly one '/' after `d`."""
    return "/".join((d.rstrip("/"), f))


def ensure_dir(path: str):
    """Best-effort creation of `path`.

    Tries `dbu.fs.mkdirs` first. If that raises, falls back to writing an
    empty `_KEEP` marker file inside `path` (intended for ABFSS). A failure
    of the fallback is ignored as well, so this function never raises for
    filesystem errors and always returns None.
    """
    try:
        dbu.fs.mkdirs(path)
        return
    except Exception:
        # mkdirs failed: fall through to the marker-file fallback below.
        pass

    try:
        # overwrite=False: never replace a marker that is already there.
        dbu.fs.put(_join(path, "_KEEP"), "", overwrite=False)
    except Exception:
        pass

def existing_names(path: str):
    """Return the set of entry names listed directly under `path`.

    Any error while listing (or while reading an entry's name) yields an
    empty set instead of propagating.
    """
    names = set()
    try:
        for entry in dbu.fs.ls(path):
            names.add(entry.name)
    except Exception:
        return set()
    return names

def iter_files(path: str):
    """Generate every file entry found under `path`, descending into subfolders.

    Entries are produced in listing order; a subfolder's contents appear at
    the position of the subfolder itself. An entry whose name ends with "/"
    is treated as a folder. A location that cannot be listed contributes
    nothing (the listing error is swallowed).
    """
    try:
        listing = dbu.fs.ls(path)
    except Exception:
        listing = []

    for entry in listing:
        if not entry.name.endswith("/"):
            yield entry
            continue
        # Folder: walk it with the same routine and pass its files through.
        for nested in iter_files(entry.path):
            yield nested

In [0]:
# 4. Execution
# Walk every upload location and copy each recognised file that has not yet
# landed into its raw-zone folder. Files are copied with dbu.fs.cp; the
# source is never deleted.
moved = skipped = failed = 0
moved_files, skipped_files, failed_files = [], [], []

# Ensure destination folders exist & cache their contents, so each
# destination is listed once instead of once per candidate file.
dest_cache = {}
for dest, _exts in PREFIX_MAP.values():
    ensure_dir(dest)
    dest_cache[dest] = existing_names(dest)

for scan_dir in SCAN_DIRS:
    for fi in iter_files(scan_dir):
        fname = fi.name
        low   = fname.lower()  # prefix/extension matching is case-insensitive

        # find target dir: first PREFIX_MAP entry whose prefix AND extension match
        target = None
        for prefix, (dest, exts) in PREFIX_MAP.items():
            if low.startswith(prefix) and low.endswith(exts):
                target = dest
                break
        if target is None:
            continue  # unrecognized file

        # The scan is recursive, so when an upload location contains a
        # raw-zone folder the walk re-discovers files that have already
        # landed. Such a file IS its own destination: it is neither a new
        # upload nor a duplicate, so ignore it instead of reporting it as
        # skipped (or copying it onto itself if the cache listing failed).
        if fi.path == _join(target, fname):
            continue

        if fname in dest_cache[target]:
            skipped += 1
            skipped_files.append(fname)
            print(f"Skipped (exists): {fname}")
            continue

        try:
            dbu.fs.cp(fi.path, _join(target, fname))
            # Record the new name so a same-named file found later in the
            # scan is treated as already present.
            dest_cache[target].add(fname)
            moved += 1
            moved_files.append(fname)
            print(f"Copied: {fname} → {target}")
        except Exception as e:
            # Keep going with the remaining files, but remember the failure
            # so it is not lost among the per-file log lines.
            failed += 1
            failed_files.append(fname)
            print(f"FAILED to copy {fname}: {e}")

if failed:
    print(f"WARNING: {failed:,} file(s) failed to copy:\n  " + "\n  ".join(failed_files))

In [0]:
# 5. Summary
# Totals first, then one indented listing per non-empty category.
print(f"Move complete. Moved: {moved:,}, Skipped: {skipped:,}")

for header, names in (
    ("Moved files:\n  ", moved_files),
    ("Skipped files:\n  ", skipped_files),
):
    if names:
        print(header + "\n  ".join(names))