In [None]:
# =============================================================================
# -*- coding: utf-8 -*-
"""
BioRob Column Guard + Active-Only Export (no overwrite; T0-safe)
"""

# =============================================================================
from __future__ import annotations
import csv, glob, re, shutil
from pathlib import Path
from datetime import datetime
import pandas as pd

# --------- CONFIG ---------
# Dataset root. Subject folders (names matching "sub" + optional "-" + digits,
# case-insensitive) are looked up directly under this directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ROOT_DIR = Path(r"/home/tsultan1/BioRob(Final)/Data")
# Location of the source label CSVs, relative to each subject folder.
SRC_LABEL_SUBPATH  = Path(r"cleaned/synchronized_proper_lite_union_v3/label")
DEST_LABEL_SUBNAME = "labelonly"  # output folder name; created as a sibling of 'label'
CSV_GLOB = "*_icml_consensus_labels.csv"  # filename pattern of the label CSVs to process

COPY_SIDECARS = True       # copy matching *_onsets.json and *.unitconv.json if present
# When True, a non-T0 file left with zero rows after the active==1 filter is
# not exported at all. When False (default) the empty CSV is still written to
# the destination for traceability. Nothing is ever deleted by this flag.
DELETE_IF_EMPTY = False
# NEVER delete source files (we only log).
# NOTE(review): this flag is not referenced by the code in this section.
ALLOW_SOURCE_DELETION = False

# Canonical header order (matches your shown columns).
# Every name listed here is REQUIRED for a file to pass schema validation.
CANONICAL_BASE = [
    "Timestamp_seconds",
    # (optional 'Timestamp_ms' is placed here by _target_order_for if present)
    "EMG_Ch1","EMG_Ch2","EMG_Ch3","EMG_Ch4",
    "EEG_Ch1","EEG_Ch2","EEG_Ch3","EEG_Ch4","EEG_Ch5","EEG_Ch6","EEG_Ch7","EEG_Ch8",
    "ET_GazeLeftx","ET_GazeLefty","ET_GazeRightx","ET_GazeRighty",
    "ET_PupilLeft","ET_PupilRight",
    "ET_ValidityLeftEye","ET_ValidityRightEye",
    "ET_Blink","ET_Fixation","ET_Worn",
    # (optional ET_DistanceLeft/ET_DistanceRight are placed here, right after
    #  'ET_Worn', by _target_order_for if present)
    "ET_GyroX","ET_GyroY","ET_GyroZ",
    "ET_AccX","ET_AccY","ET_AccZ",
    "ET_HeadRotationPitch","ET_HeadRotationYaw","ET_HeadRotationRoll",
    "active_raw","active","active_prob","label_action",
    "subject_id","task","trial","label_11","task_target",
]

# Columns that are allowed in a file but not required by schema validation.
OPTIONAL_COLS = {
    "Timestamp_ms",
    "ET_DistanceLeft","ET_DistanceRight",
}

# Logs (written under ROOT_DIR); each is only written when it has entries.
LOG_INVALID_PATH = ROOT_DIR / "column_invalid_or_skipped.csv"
LOG_FILTER_PATH  = ROOT_DIR / "active_zero_rows_removed.csv"

# --------- HELPERS ---------
def _is_subject_dir(name: str) -> bool:
    """Return True when *name* is 'sub', an optional '-', then digits (any case)."""
    hit = re.match(r"(?i)^sub-?\d+$", name)
    return bool(hit)

def _iter_label_csvs():
    """Yield every label CSV under ROOT_DIR, subject by subject, in sorted order.

    For each subject folder the label directory is ROOT_DIR/<subject>/
    SRC_LABEL_SUBPATH; subjects without that directory are reported and
    skipped. Files are selected with CSV_GLOB.
    """
    subject_dirs = [
        entry
        for entry in ROOT_DIR.iterdir()
        if entry.is_dir() and _is_subject_dir(entry.name)
    ]
    subject_dirs.sort()

    for subject in subject_dirs:
        label_dir = subject / SRC_LABEL_SUBPATH
        if label_dir.exists():
            pattern = str(label_dir / CSV_GLOB)
            yield from sorted(Path(hit) for hit in glob.glob(pattern))
        else:
            print(f"[skip] label folder missing: {label_dir}")

def _clean_header(header):
    """Return the header cells normalised for comparison.

    Falsy cells (None, "") become "", a byte-order mark is removed from the
    first cell only, and surrounding whitespace is stripped from every cell.
    """
    cells = [(cell or "") for cell in header]
    if cells:
        # A UTF-8 BOM can only precede the very first field of the file.
        cells[0] = cells[0].replace("\ufeff", "")
    return [cell.strip() for cell in cells]

def _target_order_for(header: list[str]) -> list[str]:
    """Return the columns of *header* arranged in canonical order.

    The order follows CANONICAL_BASE, with 'Timestamp_ms' slotted right after
    'Timestamp_seconds' and 'ET_DistanceLeft'/'ET_DistanceRight' right after
    'ET_Worn'. Names that are not in *header* are left out of the result.
    """
    # Optional columns, keyed by the canonical column they directly follow.
    followers = {
        "Timestamp_seconds": ("Timestamp_ms",),
        "ET_Worn": ("ET_DistanceLeft", "ET_DistanceRight"),
    }

    ordered = []
    for col in CANONICAL_BASE:
        ordered.append(col)
        ordered.extend(followers.get(col, ()))

    # Only keep the names that actually exist in the file.
    return [col for col in ordered if col in header]

def _is_task0_file(path: Path) -> bool:
    r"""
    Detect Task 0 from filename using the same convention as the labeler:
        - (T|M)(\d{2,})
        - FIRST digit after T/M = task
        - remaining digits = trial

    Only the first T/M token in the file stem (at the start of the stem or
    right after an underscore, case-insensitive) is inspected.

    Examples:
        ..._T02_...   → task=0, trial=2   (REST)
        ..._T016_...  → task=0, trial=16  (REST)
        ..._T114_...  → task=1, trial=14  (NOT REST)
        ..._M305_...  → imagery; ignored here

    Returns:
        True only when that token is a 'T' token whose first digit is 0;
        False for 'M' tokens, other tasks, or when no token is found.
    """
    # NOTE: the docstring is a raw string because it contains the regex
    # text "\d", which is an invalid escape sequence in a normal literal.
    stem = path.stem
    m = re.search(r'(?:^|_)(T|M)(\d{2,})', stem, flags=re.I)
    if not m:
        return False

    kind = m.group(1).upper()
    if kind != 'T':
        # Only physical T-trials get the REST/T0 treatment
        return False

    digits = m.group(2)
    task = int(digits[0])  # FIRST digit only (matches parse_ids_from_stem)
    return task == 0


def _dest_path_for(src_csv: Path) -> Path:
    """Return the export path for *src_csv*, creating the output folder.

    The output folder is named DEST_LABEL_SUBNAME and sits next to the folder
    that contains *src_csv*; the file keeps its original name:

        src: <parent>/label/file.csv
        dst: <parent>/<DEST_LABEL_SUBNAME>/file.csv
    """
    out_dir = src_csv.parent.parent / DEST_LABEL_SUBNAME
    out_dir.mkdir(parents=True, exist_ok=True)
    return out_dir / src_csv.name

def _sidecars_for(src_csv: Path):
    """Return the two candidate sidecar paths that live next to *src_csv*.

    1. the onsets file: the stem with "_icml_consensus_labels" replaced by
       "_onsets", plus ".json";
    2. the unit-conversion file: the CSV suffix replaced by ".unitconv.json".

    The paths are not checked for existence.
    """
    onsets_stem = src_csv.stem.replace("_icml_consensus_labels","_onsets")
    onsets_path = src_csv.with_name(onsets_stem + ".json")
    unitconv_path = src_csv.with_suffix(".unitconv.json")
    return [onsets_path, unitconv_path]

# --------- MAIN ---------
# Per-run accumulators; both lists are flushed to CSV logs after the loop.
invalid_log = []
filter_rows_log = []

n_files = 0
n_exported = 0
n_reordered = 0
n_filtered = 0
n_t0_skipped_filter = 0
n_invalid = 0

ALLOWED_SET = set(CANONICAL_BASE) | OPTIONAL_COLS
REQUIRED_SET = set(CANONICAL_BASE)  # optionals not required


def _log_invalid(src_csv, missing, extra):
    """Append one skipped/invalid-file record to ``invalid_log``."""
    invalid_log.append({
        "file": str(src_csv),
        "when": datetime.now().isoformat(timespec="seconds"),
        "missing": missing,
        "extra": extra,
    })


def _copy_sidecars(src_csv, dest_dir):
    """Copy the sidecars that exist next to *src_csv* into *dest_dir*.

    Does nothing when COPY_SIDECARS is off. Copying is best-effort: a failed
    copy is reported as a warning and never aborts the export of the CSV.
    """
    if not COPY_SIDECARS:
        return
    for sc in _sidecars_for(src_csv):
        if not sc.exists():
            continue
        dest_sc = dest_dir / sc.name
        try:
            shutil.copy2(sc, dest_sc)
        except Exception as e:
            print(f"  [warn] sidecar copy failed {sc} → {dest_sc}: {e}")


for src_csv in _iter_label_csvs():
    n_files += 1
    try:
        # Cheap schema check first: read only the header line.
        # next() raises StopIteration on an empty file (handled below).
        with open(src_csv, "r", newline="", encoding="utf-8") as fh:
            header_raw = next(csv.reader(fh))
        header = _clean_header(header_raw)

        set_cur = set(header)
        extras = sorted(set_cur - ALLOWED_SET)
        missing_required = sorted(REQUIRED_SET - set_cur)

        if extras or missing_required:
            msg = f"invalid schema: missing={missing_required} extras={extras}"
            print(f"[skip-invalid] {src_csv} | {msg}")
            _log_invalid(src_csv, ";".join(missing_required), ";".join(extras))
            n_invalid += 1
            # Do NOT delete source; just skip
            continue

        # Load and reorder
        df = pd.read_csv(src_csv, low_memory=False)
        # Validation above ran on cleaned names, so apply the same cleaning to
        # the DataFrame itself. Otherwise a header with padded names passes
        # the check but then fails on reorder (KeyError), misses the 'active'
        # lookup, or is exported with un-normalised column names.
        df.columns = _clean_header([str(c) for c in df.columns])
        header_now = list(df.columns)
        target_order = _target_order_for(header_now)
        if header_now != target_order:
            print(f"[reorder] {src_csv}")
            df = df[target_order]
            n_reordered += 1

        dest_csv = _dest_path_for(src_csv)

        # T0 safeguard → no filtering, just export reordered
        if _is_task0_file(src_csv):
            print(f"[export T0-no-filter] {dest_csv.name}")
            _copy_sidecars(src_csv, dest_csv.parent)
            df.to_csv(dest_csv, index=False)
            n_t0_skipped_filter += 1
            n_exported += 1
            continue

        # Non-T0: filter active==1
        if "active" not in df.columns:
            # Defensive: validation should already guarantee this column.
            msg = "missing 'active' after load"
            print(f"[skip-invalid] {src_csv} | {msg}")
            _log_invalid(src_csv, "'active'", "")
            n_invalid += 1
            continue

        before = len(df)
        # astype raises on NaN / non-numeric values; that is reported through
        # the generic exception handler below and the file is skipped.
        df = df[df["active"].astype("int64") == 1].copy()
        removed = before - len(df)

        if removed > 0:
            print(f"[filter] {src_csv.name}  kept={len(df)}  removed={removed}")
            n_filtered += 1
            filter_rows_log.append({
                "file": str(src_csv),
                "when": datetime.now().isoformat(timespec="seconds"),
                "rows_before": before,
                "rows_removed": removed,
                "rows_after": len(df),
                "dest": str(dest_csv),
            })

        if len(df) == 0 and DELETE_IF_EMPTY:
            # Nothing left to export and empty outputs are not wanted.
            print(f"[note] empty after filter → not exporting: {src_csv.name}")
            continue

        _copy_sidecars(src_csv, dest_csv.parent)

        df.to_csv(dest_csv, index=False)
        n_exported += 1

    except StopIteration:
        print(f"[skip-invalid] {src_csv} | empty or no header")
        _log_invalid(src_csv, "ALL", "")
        n_invalid += 1
    except Exception as e:
        # Per-file boundary: record the failure and carry on with the rest.
        print(f"[ERROR] {src_csv} | {e}")
        _log_invalid(src_csv, "EXCEPTION", str(e))
        n_invalid += 1

# Write logs (each log is only written when it has at least one entry)
if invalid_log:
    pd.DataFrame(invalid_log).to_csv(LOG_INVALID_PATH, index=False)
    print(f"[log] Invalid/Skipped list → {LOG_INVALID_PATH}")

if filter_rows_log:
    pd.DataFrame(filter_rows_log).to_csv(LOG_FILTER_PATH, index=False)
    print(f"[log] Active-row filter stats → {LOG_FILTER_PATH}")

print(
    f"\n[summary] scanned={n_files}  exported={n_exported}  "
    f"reordered={n_reordered}  filtered_files={n_filtered}  "
    f"T0_passthrough={n_t0_skipped_filter}  invalid_or_skipped={n_invalid}"
)
