In [1]:
# -*- coding: utf-8 -*-
"""
Compute Engagnition movement_intensity for ACC with a progress bar.
- RAW: median SVM (read if present; else compute from X/Y/Z)
- Z:   robust z with cascade: (pid, condition) -> condition -> global
- BIN: 1 if z >= 0 else 0
"""

import os
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

# ---------- small helpers ----------

def norm(s: str) -> str:
    """Normalize a column label: stringify, trim, lowercase, drop spaces and underscores."""
    lowered = str(s).strip().lower()
    # Only the literal space and underscore are removed; other characters are kept.
    return "".join(ch for ch in lowered if ch not in " _")

def find_col(cols, predicate):
    """Return the first item of ``cols`` whose normalized name satisfies ``predicate``.

    ``predicate`` receives ``norm(item)``; the original (un-normalized) item is
    returned. ``None`` is returned when nothing matches.
    """
    return next((name for name in cols if predicate(norm(name))), None)

def read_acc_csv(csv_path: str) -> pd.DataFrame:
    """Load an ACC CSV into a DataFrame using pandas' pure-Python parsing engine."""
    frame = pd.read_csv(csv_path, engine="python")
    return frame

def get_svm_from_frame(df: pd.DataFrame) -> pd.Series:
    """Return the per-row signal vector magnitude (SVM) of an ACC frame.

    Column names are matched on their normalized form (see ``find_col``).
    A ready-made SVM / vector-magnitude column is used when one exists;
    otherwise the magnitude is computed as sqrt(x^2 + y^2 + z^2) from the
    axis columns. Values that cannot be parsed as numbers become NaN.

    Raises:
        ValueError: if there is no SVM-like column and any of the three
            axis columns cannot be located.
    """
    columns = list(df.columns)

    def numeric(col_name):
        return pd.to_numeric(df[col_name], errors="coerce")

    def locate_axis(axis):
        # Strict match first ("x", "accx", "...accx"); otherwise fall back to
        # the first column whose normalized name merely ends with the axis letter.
        strict = find_col(
            columns,
            lambda n: n in (axis, "acc" + axis) or n.endswith("acc" + axis),
        )
        return strict if strict else find_col(columns, lambda n: n.endswith(axis))

    # A precomputed magnitude column takes priority over the raw axes.
    magnitude_col = find_col(
        columns,
        lambda n: "svm" in n or "vectormagnitude" in n or n.endswith("magnitude"),
    )
    if magnitude_col:
        return numeric(magnitude_col)

    axis_cols = [locate_axis(axis) for axis in "xyz"]
    if not all(axis_cols):
        raise ValueError("ACC CSV has no SVM and cannot locate X/Y/Z columns.")

    x, y, z = (numeric(col) for col in axis_cols)
    return np.sqrt(x**2 + y**2 + z**2)

def robust_center_scale(series: pd.Series):
    """Return ``(median, IQR)`` of the numeric, non-missing values in ``series``.

    Entries that cannot be parsed as numbers are coerced to NaN and dropped.
    When nothing remains, ``(nan, nan)`` is returned.
    """
    values = pd.to_numeric(series, errors="coerce").dropna().to_numpy()
    if values.size == 0:
        return np.nan, np.nan
    lower, upper = np.percentile(values, [25, 75])
    return float(np.median(values)), float(upper - lower)

# ---------- main computation ----------

def main(argv=None):
    """Compute movement_intensity raw/z/bin for Engagnition session-level ACC rows.

    Pipeline:
      1. RAW: median SVM of each session's ACC CSV
         (``--data-root`` joined with the row's ``rel_path_acc``).
      2. Z:   ``(raw - median) / IQR`` using the first level of the cascade
              (participant, condition) -> condition -> global whose IQR is
              finite and positive. If no level qualifies, the unscaled
              deviation from the global median is used.
      3. BIN: 1 if z >= 0 else 0.
      4. With ``--propagate-blocks``: copy the median session z (and the bin
         derived from it) of each (participant, condition) to block rows.

    Only the selected session rows (and, optionally, matching block rows) are
    modified; movement-intensity values already present on other rows of the
    metadata table are left as they are. The table is written to ``--out``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``. Passing an explicit list
            allows calling ``main([...])`` from a notebook, where
            ``sys.argv`` holds the kernel's own arguments.

    Raises:
        ValueError: if the metadata file lacks ``rel_path_acc``,
            ``participant_id`` or ``condition``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--data-root", required=True, help="Root folder containing Engagnition and metadata.")
    ap.add_argument("--meta", required=True, help="Path to metadata_master.csv (input).")
    ap.add_argument("--out", required=True, help="Where to save updated metadata_master.csv.")
    ap.add_argument("--propagate-blocks", action="store_true",
                    help="Copy session z/bin to block rows for same participant+condition (ACC only).")
    args = ap.parse_args(argv)

    # Load meta; every column is read as object dtype.
    df = pd.read_csv(args.meta, dtype="object")

    # Fail fast (before reading any ACC file) when columns used below are absent.
    missing = [c for c in ("rel_path_acc", "participant_id", "condition") if c not in df.columns]
    if missing:
        raise ValueError(f"Metadata file lacks required column(s): {', '.join(missing)}")

    # Must be checked *before* placeholder columns are created: a file without
    # unit_level is treated as all-session, whereas an all-NA placeholder
    # column would match no row at all.
    had_unit_level = "unit_level" in df.columns

    # Ensure output/helper columns exist
    for col in ["movement_intensity_raw", "movement_intensity_z", "movement_intensity_bin",
                "unit_level", "modality"]:
        if col not in df.columns:
            df[col] = pd.NA

    # Filter: Engagnition, ACC, session-level, with path
    has_acc_path = df["rel_path_acc"].notna()
    if "dataset" in df.columns:
        is_engagnition = df["dataset"] == "Engagnition"
    else:
        is_engagnition = pd.Series(False, index=df.index)
    is_acc = (df["modality"] == "ACC") | has_acc_path
    if had_unit_level:
        is_session = df["unit_level"] == "session"
    else:
        is_session = pd.Series(True, index=df.index)

    mask_sess = is_engagnition & is_acc & is_session & has_acc_path
    idx_sess = df.index[mask_sess].tolist()
    if not idx_sess:
        print("[WARN] No Engagnition session×ACC rows found.")
        return

    # --- compute RAW per session row with progress bar ---
    raws = pd.Series(np.nan, index=idx_sess, dtype="float64")
    ok, fail = 0, 0

    print("[INFO] Computing ACC RAW (median SVM) …")
    for i in tqdm(idx_sess, desc="ACC sessions", unit="row"):
        rel = df.at[i, "rel_path_acc"]
        if not isinstance(rel, str) or not rel:
            fail += 1
            continue
        fcsv = os.path.join(args.data_root, rel)
        if not os.path.isfile(fcsv):
            tqdm.write(f"[WARN] ACC file not found: {fcsv}")
            fail += 1
            continue
        try:
            svm = get_svm_from_frame(read_acc_csv(fcsv))
            rv = pd.to_numeric(svm, errors="coerce").dropna().median()
        except Exception as exc:
            # Best effort per file: report the reason and keep going.
            tqdm.write(f"[WARN] Could not process {fcsv}: {exc}")
            fail += 1
            continue
        if pd.notna(rv):
            raws.at[i] = float(rv)
            ok += 1
        else:
            fail += 1

    # Write back to session rows only (failed sessions are reset to NaN).
    df.loc[idx_sess, "movement_intensity_raw"] = raws

    # --- build groups for z-scaling cascade ---
    work = df.loc[idx_sess, ["participant_id", "condition"]].copy()
    work["raw"] = raws
    if not work["raw"].notna().any():
        print("[ERROR] No valid RAW values computed.")
        df.to_csv(args.out, index=False, encoding="utf-8-sig")
        return

    # Global stats (robust_center_scale drops NaN itself)
    g_med, g_iqr = robust_center_scale(work["raw"])

    # Per (pid, condition) stats; keys are (pid, cond) tuples.
    pc_stats = {
        key: robust_center_scale(sub["raw"])
        for key, sub in work.groupby(["participant_id", "condition"])
    }
    # Per condition stats. Group by the bare column name (not a one-element
    # list) so that keys are scalars matching the lookup in the cascade below.
    c_stats = {
        cond: robust_center_scale(sub["raw"])
        for cond, sub in work.groupby("condition")
    }

    def usable(scale):
        return bool(np.isfinite(scale) and scale > 0)

    # --- compute Z/BIN with cascade ---
    z_out = pd.Series(np.nan, index=idx_sess, dtype="float64")
    bin_out = pd.Series(pd.NA, index=idx_sess, dtype="object")

    for i in idx_sess:
        r = raws.at[i]
        if pd.isna(r):
            continue

        pid = df.at[i, "participant_id"]
        cond = df.at[i, "condition"]

        med, iqr = pc_stats.get((pid, cond), (np.nan, np.nan))
        if not usable(iqr):
            med, iqr = c_stats.get(cond, (np.nan, np.nan))
        if not usable(iqr):
            med, iqr = g_med, g_iqr
        if not usable(iqr):
            # last resort: unscaled deviation from median
            z = r - (med if np.isfinite(med) else 0.0)
        else:
            z = (r - med) / iqr

        z_out.at[i] = float(z)
        bin_out.at[i] = int(z >= 0.0)

    df.loc[idx_sess, "movement_intensity_z"] = z_out
    df.loc[idx_sess, "movement_intensity_bin"] = bin_out

    # --- optional: propagate session z/bin to block rows (same pid+cond, ACC) ---
    if args.propagate_blocks:
        mask_block = is_engagnition & is_acc & (df["unit_level"] == "block")
        blocks = df.index[mask_block].tolist()
        if blocks:
            # Lookup: (pid, cond) -> median z of that pair's sessions.
            # Pairs whose median is NaN are dropped so that no block row
            # receives a bin derived from a missing z.
            sess_tab = work[["participant_id", "condition"]].assign(z=z_out)
            z_lookup = (
                sess_tab.groupby(["participant_id", "condition"])["z"]
                .median()
                .dropna()
                .to_dict()
            )
            for j in blocks:
                key = (df.at[j, "participant_id"], df.at[j, "condition"])
                z_val = z_lookup.get(key)
                if z_val is None:
                    continue
                df.at[j, "movement_intensity_z"] = float(z_val)
                df.at[j, "movement_intensity_bin"] = int(float(z_val) >= 0.0)

    # --- save and summary ---
    df.to_csv(args.out, index=False, encoding="utf-8-sig")

    print(f"[OK] RAW computed: {ok}, failed: {fail}")
    eng_rows = df.loc[idx_sess]
    vc = pd.to_numeric(eng_rows["movement_intensity_bin"], errors="coerce").value_counts(dropna=False)
    print("[OK] BIN value counts (session×ACC):")
    print(vc.to_string())
    print("[OK] Z describe (session×ACC):")
    print(pd.to_numeric(eng_rows["movement_intensity_z"], errors="coerce").describe())

# Script entry point: run the command-line workflow only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --data-root DATA_ROOT --meta META --out OUT [--propagate-blocks]
ipykernel_launcher.py: error: the following arguments are required: --data-root, --meta, --out


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
