### Creating two combined, standardized feature matrices (kinematic and EMG data), one per phase

In [1]:
"""
Pre-process the multi-modal dataset into phase-specific matrices
with standardization (Z-score) and a positive offset.

Rationale
---------
1. *Standardization (Z-score)*: Every channel is scaled to have a mean
   of 0 and a standard deviation of 1. This ensures that every channel,
   regardless of its original magnitude or variance, has equal weight
   in the MMF/NMF cost function.
2. *Positive Offset*: For each muscle block (OTB and Myo), one constant is
   added to the whole block so that the block's minimum value becomes 0,
   satisfying the non-negativity constraint required for NMF. Kinematic
   channels are not shifted and remain signed.
"""
from __future__ import annotations

import os
from pathlib import Path
import joblib
import numpy as np

# =====================================================================
# Global parameters
# =====================================================================
# Root folder that contains one "P(<id>)" sub-folder per participant.
BASE_DIR: Path | str = (
    r"C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data"
)
# Participant ids to process (1..8 inclusive).
PARTICIPANTS: list[int] = list(range(1, 9))
# Phase labels; they appear in the input file names and in the output names.
PHASES: list[str] = ["phase1", "phase2"]
N_POSE_COLUMNS: int = 6      # Pose columns at the tail of kinematics array
# Working dtype used when stacking the trials of one phase.
DTYPE_WORK: np.dtype = np.float32

# =====================================================================
# Scaling switches
# =====================================================================
# Selects the scaling strategy applied to each stacked phase matrix.
# Mode names checked by the processing loop: "standardize_offset",
# "column_then_block", "feature" and "block".
FEATURE_SCALE_MODE: str = "standardize_offset" # "standardize_offset" | "column_then_block" | ...
# Only consulted when FEATURE_SCALE_MODE == "column_then_block".
BLOCK_EQUALISE: bool = True

# =====================================================================
# Scaling helpers
# =====================================================================
def scale_by_standardize_and_offset(
    X: np.ndarray,
    idx: dict,
    dtype: np.dtype = np.float32,
) -> np.ndarray:
    """Z-score every column, then shift the muscle blocks to be non-negative.

    Steps
    -----
    1. Every column is standardized to mean 0 and standard deviation 1
       (population standard deviation, ``ddof=0``). Constant columns have a
       standard deviation of 0; their divisor is replaced by 1, so they end
       up as all zeros instead of NaN.
    2. For each muscle block listed in *idx* (``"otb_indices"`` and
       ``"myo_indices"``), the minimum over the *whole block* is subtracted
       from the block when that minimum is negative. The offset is therefore
       one constant per block, not one per channel.

    Columns outside those two blocks are not shifted and keep negative
    values.

    Parameters
    ----------
    X
        2-D array of shape ``(samples, features)``. It is not modified.
    idx
        Mapping with optional ``"otb_indices"`` / ``"myo_indices"`` entries,
        each a ``(start, stop)`` column span. Missing or ``None`` entries are
        skipped.
    dtype
        dtype of the working copy that is returned.

    Returns
    -------
    np.ndarray
        Scaled copy of *X*. An input without rows is returned as an empty
        copy.
    """
    Xs = X.astype(dtype, copy=True)

    # Without samples there is nothing to standardize: np.mean would produce
    # NaN (with a RuntimeWarning) and np.min below would raise ValueError.
    if Xs.shape[0] == 0:
        return Xs

    # ---------- Step 1: Standardize all columns -----------------------
    mean = np.mean(Xs, axis=0)
    std = np.std(Xs, axis=0)
    std[std == 0] = 1.0  # constant columns: avoid division by zero
    Xs = (Xs - mean) / std

    # ---------- Step 2: Apply positive offset to muscle data ----------
    for key in ("otb_indices", "myo_indices"):
        span = idx.get(key)
        if span is None:
            continue
        s, e = span
        if e <= s:
            continue

        muscle_block = Xs[:, s:e]  # view into Xs
        # A span that lies outside the available columns selects nothing;
        # np.min cannot reduce a zero-size array, so skip it.
        if muscle_block.size == 0:
            continue

        min_val = np.min(muscle_block)

        # Shift the entire block so the minimum value becomes 0
        if min_val < 0:
            muscle_block -= min_val

    return Xs


def scale_by_column_rms(
    X: np.ndarray,
    idx: dict,
    block_equalise: bool = True,
    dtype: np.dtype = np.float32,
) -> np.ndarray:
    """Scale columns to unit RMS, then optionally equalise block energies.

    Stage 1 divides every column by its root-mean-square value; all-zero
    columns are left untouched (their divisor is set to 1).

    Stage 2 (only when *block_equalise* is true) rescales each block named
    in *idx* (``"otb_indices"``, ``"myo_indices"``, ``"kin_hand_indices"``)
    so that its squared Frobenius norm equals the mean squared Frobenius
    norm over the blocks that are present. Blocks with zero energy are not
    rescaled.

    Returns a scaled copy of *X* with the requested *dtype*.
    """
    scaled = X.astype(dtype, copy=True)

    # ---------- Stage 1: unit RMS per column ----------------------------
    col_rms = np.linalg.norm(scaled, axis=0) / np.sqrt(len(scaled))
    col_rms[col_rms == 0] = 1.0          # all-zero column: divide by 1
    scaled /= col_rms

    if not block_equalise:
        return scaled

    # ---------- Stage 2: bring every block to the same energy -----------
    block_keys = ("otb_indices", "myo_indices", "kin_hand_indices")
    spans = [idx[k] for k in block_keys if k in idx and idx[k] is not None]
    if not spans:
        return scaled

    def block_energy(span) -> float:
        """Squared Frobenius norm of the columns selected by *span*."""
        lo, hi = span
        return np.linalg.norm(scaled[:, lo:hi], ord="fro") ** 2

    target = float(np.mean([block_energy(span) for span in spans]))
    for span in spans:
        # Re-measure instead of reusing the value above so the result stays
        # the same even if two spans overlap.
        current = block_energy(span)
        if current > 0:
            lo, hi = span
            scaled[:, lo:hi] *= np.sqrt(target / current)
    return scaled

# =====================================================================
# I/O helpers
# =====================================================================
def load_full_phase(
    part_dir: Path | str,
    trial: int,
    phase: str,
    n_pose_cols: int,
) -> dict | None:
    """Load one trial/phase and combine OTB, Myo and hand kinematics.

    The three ``.npy`` files are read from the participant's
    "Synchronized Data split in Phases" folder, truncated to the shortest
    of the three recordings, and concatenated column-wise in the order
    OTB | Myo | kinematics. The last *n_pose_cols* kinematics columns are
    dropped.

    Returns a dict with the stacked matrix under ``"combined"`` and the
    ``(start, stop)`` column span of each block, or ``None`` when a file is
    missing, fewer than 5 common rows exist, or the kinematics array has no
    columns left after removing the pose columns.
    """
    folder = Path(part_dir, "Synchronized Data split in Phases")
    kin_path = folder / f"match_{trial:02d}_{phase}_kin_norm.npy"
    myo_path = folder / f"match_{trial:02d}_{phase}_myo.npy"
    otb_path = folder / f"match_{trial:02d}_{phase}_otb.npy"

    if not all(p.exists() for p in (kin_path, myo_path, otb_path)):
        return None

    kin = np.load(kin_path)
    myo = np.load(myo_path)
    otb = np.load(otb_path)

    # Only the rows present in all three recordings can be combined.
    n_rows = min(len(kin), len(myo), len(otb))
    if n_rows < 5:
        return None
    if kin.shape[1] <= n_pose_cols:
        return None

    n_hand_cols = kin.shape[1] - n_pose_cols
    hand_kin = kin[:n_rows, :n_hand_cols]          # pose columns removed
    combined = np.hstack([otb[:n_rows], myo[:n_rows], hand_kin])

    # Column boundaries of the three blocks inside ``combined``.
    otb_stop = otb.shape[1]
    myo_stop = otb_stop + myo.shape[1]
    kin_stop = myo_stop + hand_kin.shape[1]

    result = {"combined": combined}
    result["otb_indices"] = (0, otb_stop)
    result["myo_indices"] = (otb_stop, myo_stop)
    result["kin_hand_indices"] = (myo_stop, kin_stop)
    return result


# =====================================================================
# Main processing loop
# =====================================================================
def _resolve_scaler(mode: str):
    """Return a ``(X, idx) -> X_scaled`` callable for the scaling *mode*.

    Raises
    ------
    ValueError
        If *mode* is not one of the four known mode names.
    NotImplementedError
        If *mode* names a legacy scaler ("feature" / "block") whose
        implementation is not defined in this module.
    """
    if mode == "standardize_offset":
        return scale_by_standardize_and_offset
    if mode == "column_then_block":
        return lambda X, idx: scale_by_column_rms(X, idx, BLOCK_EQUALISE)

    # mode -> (function name, whether the function takes the index mapping)
    legacy_modes = {
        "feature": ("scale_each_feature_rms", False),
        "block": ("scale_blocks_by_rms", True),
    }
    if mode not in legacy_modes:
        raise ValueError(
            f"Invalid FEATURE_SCALE_MODE: '{mode}'. "
            "Must be 'standardize_offset', 'column_then_block', 'feature', or 'block'"
        )

    # The legacy scalers are looked up by name at call time. Referencing
    # them directly would raise NameError whenever they are not defined,
    # even for modes that never use them.
    func_name, takes_idx = legacy_modes[mode]
    func = globals().get(func_name)
    if not callable(func):
        raise NotImplementedError(
            f"FEATURE_SCALE_MODE '{mode}' requires '{func_name}', "
            "which is not defined."
        )
    if takes_idx:
        return func
    return lambda X, idx: func(X)


def preprocess_and_save_all_data(n_trials: int = 24) -> None:
    """Build, scale and save the per-phase feature matrices of every participant.

    For each id in ``PARTICIPANTS`` the function

    1. loads trials ``1..n_trials`` of every phase via ``load_full_phase``
       (trials that return ``None`` are skipped),
    2. stacks the trials of each phase row-wise as ``DTYPE_WORK``,
    3. scales both phase matrices independently according to
       ``FEATURE_SCALE_MODE``,
    4. writes ``P<id>_combined_matrix_phase1.npy`` /
       ``..._phase2.npy`` and ``P<id>_feature_indices.joblib`` into the
       participant's ``Preprocessed_Data_Matrix`` folder.

    The joblib file holds the block column spans, the scaling settings and
    the row count of every stacked trial per phase.

    Participants without a folder, or without valid data for one of the
    phases, are skipped with a warning.

    Parameters
    ----------
    n_trials
        Highest trial number to look for (trials are numbered from 1).

    Raises
    ------
    ValueError
        If ``FEATURE_SCALE_MODE`` is not a known mode.
    NotImplementedError
        If a legacy mode is selected but its scaler is not defined.
    """
    bar = "=" * 72
    print(f"\n{bar}\n PRE-PROCESSING ({FEATURE_SCALE_MODE})\n{bar}")

    # Validate the mode once, before any data is read, so a misconfiguration
    # fails immediately instead of after loading a whole participant.
    scale = _resolve_scaler(FEATURE_SCALE_MODE)

    for pid in PARTICIPANTS:
        p_dir = Path(BASE_DIR, f"P({pid})")
        if not p_dir.is_dir():
            print(f"[WARN] folder missing for P{pid}, skipping.")
            continue

        data_phase: dict[str, list[np.ndarray]] = {ph: [] for ph in PHASES}
        len_phase: dict[str, list[int]] = {ph: [] for ph in PHASES}
        last_idx: dict | None = None

        # --------- load all trials -----------------------------------
        for trial in range(1, n_trials + 1):
            for ph in PHASES:
                out = load_full_phase(p_dir, trial, ph, N_POSE_COLUMNS)
                if out is None:
                    continue
                data_phase[ph].append(out["combined"])
                len_phase[ph].append(out["combined"].shape[0])
                last_idx = out                              # store mapping

        if any(not data_phase[ph] for ph in PHASES):
            print(f"[WARN] P{pid}: no valid data for one phase, skipping.")
            continue
        assert last_idx is not None, "Index mapping not captured."

        # Column spans only; the data matrix is not part of the mapping.
        # Built as a copy so the loader's result dict is left untouched.
        idx_map = {k: v for k, v in last_idx.items() if k != "combined"}

        # --------- stack trials per phase ---------------------------
        X1 = np.vstack(data_phase["phase1"], dtype=DTYPE_WORK)
        X2 = np.vstack(data_phase["phase2"], dtype=DTYPE_WORK)

        # --------- scaling (each phase independently) ---------------
        X1s = scale(X1, idx_map)
        X2s = scale(X2, idx_map)

        # --------- persist ------------------------------------------
        out_dir = Path(p_dir, "Preprocessed_Data_Matrix")
        out_dir.mkdir(exist_ok=True)
        np.save(out_dir / f"P{pid}_combined_matrix_phase1.npy", X1s)
        np.save(out_dir / f"P{pid}_combined_matrix_phase2.npy", X2s)

        meta = dict(idx_map)
        meta["scaling"] = {
            "feature_scale_mode": FEATURE_SCALE_MODE,
            "block_equalise": (
                BLOCK_EQUALISE if "column" in FEATURE_SCALE_MODE else "N/A"
            ),
        }
        meta["phase1_trial_lengths"] = len_phase["phase1"]
        meta["phase2_trial_lengths"] = len_phase["phase2"]
        joblib.dump(meta, out_dir / f"P{pid}_feature_indices.joblib")

        print(f"[DONE] P{pid}: phase1 {X1s.shape}, phase2 {X2s.shape}")

    print("=== PRE-PROCESSING COMPLETE ===")


# =====================================================================
# Entry point
# =====================================================================
# Run the full pre-processing pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    preprocess_and_save_all_data()


 PRE-PROCESSING (standardize_offset)
[DONE] P1: phase1 (168814, 180), phase2 (387674, 180)
[DONE] P2: phase1 (183597, 180), phase2 (356604, 180)
[DONE] P3: phase1 (134040, 180), phase2 (316406, 180)
[DONE] P4: phase1 (138778, 180), phase2 (352780, 180)
[DONE] P5: phase1 (219915, 180), phase2 (362094, 180)
[DONE] P6: phase1 (167439, 180), phase2 (320959, 180)
[DONE] P7: phase1 (134506, 180), phase2 (294965, 180)
[DONE] P8: phase1 (145986, 180), phase2 (392938, 180)
=== PRE-PROCESSING COMPLETE ===
