# Exported from a Jupyter notebook cell (In [None]).
# -*- coding: utf-8 -*-


import re
from pathlib import Path
import pandas as pd
import numpy as np
from typing import Optional

# ========= SETTINGS =========
# Root folder that is scanned for subject directories matching SUBJECT_GLOB.
ROOT_DIR = Path(r"/home/tsultan1/BioRob(Final)/Data")  # ← change if needed
SUBJECT_GLOB = "Sub-*"

# Accept endings like: _T14.csv, _T106.csv (T = real trials).
# The 2-3 digit group "num" is split by parse_task_trial into task and trial.
FNAME_ANY_RE = re.compile(r"[_\-]T(?P<num>\d{2,3})\.csv$", re.IGNORECASE)

# Neon world-camera FOV (px) per metadata.
# Used as the divisors that normalize the 2D gaze x/y columns to [0, 1].
ET_FOV_W = 1600.0
ET_FOV_H = 1200.0

# ---- EMG policy for REAL trials ----
KEEP_EMG_RAW = True        # keep raw for your own pipeline
DROP_EMG_VENDOR = True     # drop vendor mV to avoid opaque preprocessing

# Add a master seconds clock ("Timestamp_seconds") alongside the original time columns
ADD_TIMESTAMP_SECONDS = True

# Keep distances/pupils in mm (True) or convert both to meters (False)
KEEP_DIST_IN_MM = True

# Drop eyelid features unless you plan to use them
DROP_EYELID_FEATURES = True

# Columns to drop (meta/ops/noisy). We keep SampleNumber & time columns for sync/use.
# Names are matched exactly (after header whitespace has been stripped).
DROP_COLS_EXACT_BASE = {
    "EventSource", "SlideEvent", "StimType", "Duration",
    "CollectionPhase", "SourceStimuliName",
    "Auto 1 active", "Auto 1 instance",
    "Active active", "active active", "active instance",
    # Camera/capture eye positions (often -1 on Neon)
    "ET_CameraLeftX", "ET_CameraLeftY", "ET_CameraRightX", "ET_CameraRightY",
}

# Raw EMG column names (median-centered when they look like ADC counts)
EMG_RAW_COLS = {"Ch1 EMG raw", "Ch2 EMG raw", "Ch3 EMG raw", "Ch4 EMG raw"}

# Vendor EMG (mV) columns
EMG_VENDOR_COLS = {"Ch1 EMG", "Ch2 EMG", "Ch3 EMG", "Ch4 EMG"}

# Possible extras to prune: dropped only when actually present in a file.
DROP_COLS_MAYBE = {"EventSource.1", "EventSource.2"}
# Any column whose name starts with one of these prefixes is dropped.
DROP_PREFIXES = ("Unnamed:",)

# ========= HELPERS =========
def find_header_row_index(path: Path) -> int:
    """
    Return the 0-based index of the first line that starts with 'Row,'.

    Leading whitespace on a line is ignored, so an indented header still
    matches. The file is decoded as UTF-8 (a BOM is stripped, undecodable
    bytes are ignored). When no such line exists, 30 is returned as a
    fallback.
    """
    fallback_row = 30
    with path.open("r", encoding="utf-8-sig", errors="ignore") as handle:
        header_hits = (
            line_no
            for line_no, text in enumerate(handle)
            if text.lstrip().startswith("Row,")
        )
        # The generator is lazy, so reading stops at the first hit.
        return next(header_hits, fallback_row)

def parse_task_trial(name: str):
    """
    Split the digit group after 'T' in a file name into (task, trial).

    The last digit is the trial; every digit before it forms the task:
      118_T14.csv  -> task=1, trial=4
      062_T106.csv -> task=10, trial=6

    Returns (None, None) when the name does not match FNAME_ANY_RE.
    """
    hit = FNAME_ANY_RE.search(name)
    code = hit.group("num") if hit else ""
    # Fewer than two digits cannot be split into a task part and a trial part.
    if len(code) < 2:
        return None, None
    task_digits, trial_digit = code[:-1], code[-1]
    return int(task_digits), int(trial_digit)

def parse_subject_id(folder_name: str):
    """
    Extract the numeric subject id from a folder name such as 'Sub-12'.

    Matching is case-insensitive and allows any run of whitespace, '-' or
    '_' between 'Sub' and the digits. Returns the id as an int, or None
    when the pattern is not found.
    """
    found = re.search(r"Sub[\s\-_]*(\d+)", folder_name, re.IGNORECASE)
    if found is None:
        return None
    return int(found.group(1))

def _coerce_numeric(series: pd.Series) -> pd.Series:
    """Convert *series* to a numeric dtype; unparseable entries become NaN."""
    as_numbers = pd.to_numeric(series, errors="coerce")
    return as_numbers

def _emg_raw_center_inplace(df: pd.DataFrame) -> list:
    """
    Median-center the raw EMG columns of *df* in place.

    For every column named in EMG_RAW_COLS that exists in *df* and holds at
    least one numeric value:
      - if the column median lies in [1e4, 6e4] (looks like ADC counts), the
        median is subtracted;
      - otherwise the values are kept as they are.
    In both cases the column is coerced to numeric and stored as float32.
    Column names are unchanged. Columns that are missing or entirely
    non-numeric are skipped and left untouched.

    Returns a list of (column, description) tuples, one per processed
    column, ordered by column name.
    """
    log = []
    # Iterate in sorted order: EMG_RAW_COLS is a set, whose iteration order
    # can differ between interpreter runs, which would shuffle the log.
    for col in sorted(EMG_RAW_COLS):
        if col not in df.columns:
            continue
        s = _coerce_numeric(df[col])
        if not s.notna().any():
            continue
        med = float(s.median())
        if 1e4 <= med <= 6e4:
            # Subtracting the median centers the signal on zero median
            # (robust to outliers); the mean is not necessarily zero.
            df[col] = (s - med).astype("float32")
            log.append((col, f"centered by median {med:.2f} (ADC counts → zero-mean)"))
        else:
            df[col] = s.astype("float32")  # already small / non-ADC-like
            log.append((col, f"left as-is (median {med:.2f} not in ADC range)"))
    return log

def apply_unit_conversions_inplace(df: pd.DataFrame, keep_dist_in_mm: bool = True) -> list:
    """
    Normalize units and tighten dtypes of known columns, IN PLACE, no renames.

    Only columns that are present in *df* are touched:
    - 2D gaze (px) is divided by the FOV width/height, clipped to [0, 1], float32
    - Distances & pupils: cast to float32; when keep_dist_in_mm is False,
      values >= 0 are additionally divided by 1000 (mm → m), negatives kept
    - Validity/Blink/Fixation/Worn: NaN → 0, then (> 0.5) as uint8
    - IMU + head rotations → float32
    - 3D eyeball / optical axis / eyelids → float32
    - EMG raw: median-centering via _emg_raw_center_inplace → float32
    - EEG Ch1..Ch8 → float32 (no scaling)

    Returns a list of (column, description) tuples for the logged conversions
    (gaze, mm→m, flags and raw EMG; plain float32 casts are not logged).
    """
    changes = []

    def _to_float32(names):
        # Coerce each present column to numeric and store it as float32.
        for name in names:
            if name in df.columns:
                df[name] = _coerce_numeric(df[name]).astype("float32")

    # --- 2D gaze: pixels → fraction of the frame, clipped ---
    gaze_extent = {
        "ET_GazeLeftx": ET_FOV_W, "ET_GazeRightx": ET_FOV_W,
        "ET_GazeLefty": ET_FOV_H, "ET_GazeRighty": ET_FOV_H,
    }
    for name, extent in gaze_extent.items():
        if name not in df.columns:
            continue
        fraction = _coerce_numeric(df[name]) / extent
        df[name] = fraction.clip(lower=0.0, upper=1.0).astype("float32")
        changes.append((name, f"px→[0,1] by {extent} + clip + float32"))

    # --- distances / pupils: one consistent unit ---
    length_cols = ("ET_DistanceLeft", "ET_DistanceRight", "ET_PupilLeft", "ET_PupilRight")
    if keep_dist_in_mm:
        _to_float32(length_cols)
    else:
        for name in length_cols:
            if name not in df.columns:
                continue
            values = _coerce_numeric(df[name])
            non_negative = values >= 0
            # Scale only non-negative entries; negatives and NaN pass through.
            in_meters = values.astype("float64").mask(non_negative, values / 1000.0)
            df[name] = in_meters.astype("float32")
            changes.append((name, "mm→m for values >= 0; kept negatives unchanged; float32"))

    # --- binary flags → {0, 1} ---
    flag_cols = ("ET_ValidityLeftEye", "ET_ValidityRightEye", "ET_Blink", "ET_Fixation", "ET_Worn")
    for name in flag_cols:
        if name not in df.columns:
            continue
        filled = _coerce_numeric(df[name]).fillna(0.0)
        df[name] = filled.gt(0.5).astype("uint8")
        changes.append((name, "thresholded >0.5 → {0,1} uint8"))

    # --- IMU / head rotation ---
    _to_float32((
        "ET_GyroX", "ET_GyroY", "ET_GyroZ", "ET_AccX", "ET_AccY", "ET_AccZ",
        "ET_HeadRotationPitch", "ET_HeadRotationYaw", "ET_HeadRotationRoll",
    ))

    # --- 3D eyeball centers / optical axis / eyelids ---
    _to_float32((
        "ET_Gaze3dEyeballXLeft", "ET_Gaze3dEyeballYLeft", "ET_Gaze3dEyeballZLeft",
        "ET_Gaze3dEyeballXRight", "ET_Gaze3dEyeballYRight", "ET_Gaze3dEyeballZRight",
        "ET_Gaze3dOpticalAxisXLeft", "ET_Gaze3dOpticalAxisYLeft", "ET_Gaze3dOpticalAxisZLeft",
        "ET_Gaze3dOpticalAxisXRight", "ET_Gaze3dOpticalAxisYRight", "ET_Gaze3dOpticalAxisZRight",
        "ET_Gaze3dEyelidAngleTopLeft", "ET_Gaze3dEyelidAngleBottomLeft",
        "ET_Gaze3dEyelidAngleTopRight", "ET_Gaze3dEyelidAngleBottomRight",
        "ET_Gaze3dEyelidApertureLeft", "ET_Gaze3dEyelidApertureRight",
    ))

    # --- raw EMG centering ---
    changes.extend(_emg_raw_center_inplace(df))

    # --- EEG channels Ch1..Ch8, values unscaled ---
    _to_float32(f"Ch{idx}" for idx in range(1, 9))

    return changes

def _make_timestamp_seconds(df: pd.DataFrame) -> Optional[pd.Series]:
    """
    Derive a zero-based clock in seconds from the first usable time column.

    Columns are tried in this order, each with its divisor to seconds:
    'Timestamp' (/1000), 'ET_TimeSignal' (/1000), 'LSL Timestamp' (/1).
    A column is usable when it exists and has at least one numeric value.
    The first non-NaN value becomes t=0 and NaN gaps are forward-filled.
    A one-line diagnostic (median step, estimated rate) is printed.

    Returns a float64 Series aligned with *df*, or None if no column is usable.
    """
    clock_sources = (
        ("Timestamp", 1000.0),       # iMotions global clock (ms)
        ("ET_TimeSignal", 1000.0),   # ET elapsed (ms)
        ("LSL Timestamp", 1.0),      # EEG device clock (s)
    )

    for source, per_second in clock_sources:
        if source not in df.columns:
            continue
        raw = pd.to_numeric(df[source], errors="coerce")
        if raw.isna().all():
            # Nothing numeric here; fall through to the next candidate.
            continue

        seconds = raw / per_second
        origin = seconds.dropna().iloc[0]
        seconds = (seconds - origin).ffill()

        # Diagnostics: typical sample spacing and the implied rate.
        steps = seconds.diff().dropna()
        if len(steps) > 0:
            typical_step = float(steps.median())
            rate = 1.0 / typical_step if typical_step > 0 else np.nan
            print(f"[INFO] {source}→Timestamp_seconds: median Δt≈{typical_step:.6f}s (fs≈{rate:.2f} Hz)")
        return seconds.astype("float64")

    return None

def clean_one_df(
    df: pd.DataFrame,
    keep_emg_raw: bool,
    drop_emg_vendor: bool,
    add_ts_seconds: bool,
    insert_at: int,
    keep_dist_in_mm: bool = True,
    drop_eyelid_features: bool = False,
) -> pd.DataFrame:
    """
    Clean one trial DataFrame: filter rows, add a seconds clock, convert
    units/dtypes and drop unwanted columns.

    Parameters
    ----------
    df : frame as read from the CSV. Its column labels are stripped of
        surrounding whitespace in place. When it has no 'EventSource'
        column, the unit conversions are also applied to it in place.
    keep_emg_raw : when False, the EMG_RAW_COLS columns are dropped.
    drop_emg_vendor : when True, the EMG_VENDOR_COLS columns are dropped.
    add_ts_seconds : when True and no 'Timestamp_seconds' column exists,
        one is built with _make_timestamp_seconds and inserted.
    insert_at : column position at which 'Timestamp_seconds' is inserted.
    keep_dist_in_mm : forwarded to apply_unit_conversions_inplace.
    drop_eyelid_features : when True, the six eyelid angle/aperture
        columns are dropped.

    Returns
    -------
    The cleaned DataFrame. Callers must use the return value: it is a new
    object whenever rows were filtered or columns were dropped.
    """
    # Trim accidental whitespace in headers
    df.columns = [c.strip() for c in df.columns]

    # Remove SlideEvents rows if present. The explicit copy gives an
    # independent frame, so the insert / column assignments below do not
    # raise pandas' SettingWithCopyWarning on a filtered selection.
    if "EventSource" in df.columns:
        df = df[df["EventSource"] != "SlideEvents"].copy()

    # Optional master Timestamp_seconds (keeps originals)
    if add_ts_seconds and "Timestamp_seconds" not in df.columns:
        ts = _make_timestamp_seconds(df)
        if ts is not None:
            df.insert(insert_at, "Timestamp_seconds", ts.astype("float64"))

    # In-place unit conversions + dtype tightening (returned log is not used)
    apply_unit_conversions_inplace(df, keep_dist_in_mm=keep_dist_in_mm)

    # Build final drop list
    to_drop = set(DROP_COLS_EXACT_BASE)
    if not keep_emg_raw:
        to_drop.update(EMG_RAW_COLS)
    if drop_emg_vendor:
        to_drop.update(EMG_VENDOR_COLS)
    if drop_eyelid_features:
        to_drop.update({
            "ET_Gaze3dEyelidAngleTopLeft", "ET_Gaze3dEyelidAngleBottomLeft",
            "ET_Gaze3dEyelidAngleTopRight", "ET_Gaze3dEyelidAngleBottomRight",
            "ET_Gaze3dEyelidApertureLeft", "ET_Gaze3dEyelidApertureRight",
        })
    to_drop.update(DROP_COLS_MAYBE)
    to_drop.update(c for c in df.columns if any(c.startswith(p) for p in DROP_PREFIXES))

    # Only names that actually exist are passed to drop().
    drop_now = [c for c in df.columns if c in to_drop]
    if drop_now:
        df = df.drop(columns=drop_now, errors="ignore")

    # Compact dtypes for ids/indices; non-numeric or missing entries become 0.
    for col in ["subject_id", "task", "trial", "SampleNumber", "Row"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype("int32")

    return df

# ========= MAIN =========
def run():
    """
    Clean every CSV located directly inside each subject folder of ROOT_DIR.

    For each folder matching SUBJECT_GLOB whose name yields a subject id:
    a 'cleaned' sub-folder is created, each '*.csv' is read starting at its
    header row, subject_id (plus task/trial when the file name encodes
    them) is placed in the leading columns, the frame is passed through
    clean_one_df, and the result is written under the same file name into
    'cleaned'. Files that cannot be read or saved are reported and skipped.
    Progress and a final summary are printed.
    """
    subject_folders = sorted(d for d in ROOT_DIR.glob(SUBJECT_GLOB) if d.is_dir())
    if not subject_folders:
        print(f"[WARN] No subject folders found under: {ROOT_DIR}")
        return

    seen_count = 0
    saved_count = 0
    for folder in subject_folders:
        subject = parse_subject_id(folder.name)
        if subject is None:
            print(f"[SKIP] {folder.name}: cannot parse subject id")
            continue

        cleaned_dir = folder / "cleaned"
        cleaned_dir.mkdir(parents=True, exist_ok=True)

        sources = sorted(folder.glob("*.csv"))
        if not sources:
            print(f"[INFO] {folder.name}: no CSVs found")
            continue

        print(f"\n=== Subject {subject} ({folder.name}) → {len(sources)} files ===")

        for source in sources:
            seen_count += 1
            try:
                frame = pd.read_csv(
                    source,
                    skiprows=find_header_row_index(source),
                    header=0,
                    encoding="utf-8-sig",
                    engine="python",
                    on_bad_lines="skip",
                )
            except Exception as err:
                print(f"[WARN] Could not read {source.name}: {err}")
                continue

            # Identifier columns, in the order they should lead the frame.
            task, trial = parse_task_trial(source.name)
            id_values = {"subject_id": subject}
            if task is not None:
                id_values["task"] = task
            if trial is not None:
                id_values["trial"] = trial

            # Inserting at position 0 in reverse keeps the order above;
            # a column that already exists is overwritten where it is.
            for key in reversed(list(id_values)):
                if key in frame.columns:
                    frame[key] = id_values[key]
                else:
                    frame.insert(0, key, id_values[key])

            # Timestamp_seconds goes right after the identifier columns.
            ts_position = sum(key in frame.columns for key in id_values)

            # Clean + convert (no renames), using the policy for REAL trials
            frame = clean_one_df(
                frame,
                keep_emg_raw=KEEP_EMG_RAW,
                drop_emg_vendor=DROP_EMG_VENDOR,
                add_ts_seconds=ADD_TIMESTAMP_SECONDS,
                insert_at=ts_position,
                keep_dist_in_mm=KEEP_DIST_IN_MM,
                drop_eyelid_features=DROP_EYELID_FEATURES,
            )

            target = cleaned_dir / source.name
            try:
                frame.to_csv(target, index=False, encoding="utf-8-sig")
                saved_count += 1
                print(f"[SAVED] {folder.name}/{source.name} "
                      f"({len(frame)} rows, {len(frame.columns)} cols)")
            except Exception as err:
                print(f"[WARN] Could not save {target.name}: {err}")

    print(f"\nDone. Processed {seen_count} files, saved {saved_count} cleaned CSVs.")

# Run the batch cleaning only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run()

