### Creating two combined and standardized feature matrices (kinematic and EMG data), one for each phase

In [None]:
"""
Pre-process the multi-modal dataset into phase-specific matrices
with block-wise Frobenius norm scaling.

Rationale
---------
* The primary method, "block_frobenius", scales each modality block
    (OTB, Myo, Kinematics) to have a unit Frobenius norm. This ensures
    that each modality contributes equally to the MMF/NMF cost function,
    preventing bias from modalities with more channels or higher variance.

* The alternative method, "column_then_block", is a two-stage process
    for fine-grained control, but block-wise scaling is recommended for
    most synergy extraction tasks.
"""
from __future__ import annotations

import os
from pathlib import Path
import joblib
import numpy as np

# =====================================================================
# Global parameters
# =====================================================================
# Root folder of the experiment; one "P(<id>)" sub-folder per participant
# is looked up below it.
BASE_DIR: Path | str = (
    r"C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data"
)
PARTICIPANTS: list[int] = list(range(1, 9))  # participant IDs 1..8
PHASES: list[str] = ["phase1", "phase2"]
N_POSE_COLUMNS: int = 6      # Pose columns at the tail of kinematics array
DTYPE_WORK: np.dtype = np.float32  # dtype used for stacking and scaling

# =====================================================================
# Scaling switches
# =====================================================================
#  Set to "block_frobenius" for the recommended modality balancing.
FEATURE_SCALE_MODE: str = "column_then_block"  # "block_frobenius" | "column_then_block" | "feature" | "block"
# NOTE: this flag is read by the "column_then_block" mode and by the legacy
# "feature" mode (where it enables the √d heuristic); the "block_frobenius"
# and "block" modes ignore it.
BLOCK_EQUALISE: bool = True

# =====================================================================
# Scaling helpers
# =====================================================================
def scale_blocks_by_frobenius(
    X: np.ndarray,
    idx: dict,
    dtype: np.dtype = np.float32,
) -> np.ndarray:
    """Recommended: normalise every modality block to unit Frobenius norm.

    Each block is divided by its own Frobenius norm so that all modalities
    carry the same total energy in the factorization.

    Parameters
    ----------
    X : array (samples × features)
        Input matrix; it is copied, never modified.
    idx : dict
        May contain 'otb_indices', 'myo_indices', 'kin_hand_indices';
        each value is a (start, end) tuple of column indices or None.
    dtype : working dtype of the returned copy.

    Returns
    -------
    Copy of *X* in *dtype* with every non-empty, non-zero block rescaled.
    Missing, None, empty or all-zero blocks are left as they are.
    """
    scaled = X.astype(dtype, copy=True)

    for name in ("otb_indices", "myo_indices", "kin_hand_indices"):
        span = idx.get(name)
        if span is None:
            continue

        lo, hi = span
        if hi > lo:
            norm = np.linalg.norm(scaled[:, lo:hi], ord="fro")
            # An all-zero block has no scale to normalise; keep it untouched.
            if norm > 0:
                scaled[:, lo:hi] = scaled[:, lo:hi] / norm

    return scaled


def scale_by_column_rms(
    X: np.ndarray,
    idx: dict,
    block_equalise: bool = True,
    dtype: np.dtype = np.float32,
) -> np.ndarray:
    """
    Alternative two-stage scaling:

    1) Divide *every* column by its own RMS (all-zero columns are left as is).
    2) If *block_equalise* is True, rescale each modality block so that
       its Frobenius energy equals the mean energy of all present blocks.

    Parameters
    ----------
    X : array (samples × features)
        Input matrix; it is copied, never modified.
    idx : dict
        May contain 'otb_indices', 'myo_indices', 'kin_hand_indices';
        each value is a (start, end) tuple of column indices or None.
    block_equalise : whether to run step 2.
    dtype : working dtype of the returned copy.

    Returns
    -------
    Scaled copy of *X* in *dtype*.

    Notes
    -----
    A block counts as "present" only if its key exists, is not None and
    spans at least one column (end > start). Empty spans are skipped so
    they cannot pull the reference energy towards zero.
    """
    Xs = X.astype(dtype, copy=True)

    # Nothing to scale without samples; also avoids dividing by sqrt(0).
    if Xs.shape[0] == 0:
        return Xs

    # ---------- Step 1: per-feature RMS ---------------------------------
    rms = np.linalg.norm(Xs, axis=0) / np.sqrt(Xs.shape[0])  # column RMS
    rms[rms == 0] = 1.0                                       # avoid div/0
    Xs /= rms                                                 # broadcasted divide

    if not block_equalise:
        return Xs

    # ---------- Step 2: equalise block energies --------------------------
    # Blocks do not overlap, so each energy is computed once and reused for
    # both the reference mean and the per-block rescaling factor.
    blocks: list[tuple[int, int, float]] = []
    for key in ("otb_indices", "myo_indices", "kin_hand_indices"):
        span = idx.get(key)
        if span is None:
            continue
        s, e = span
        if e <= s:                                            # empty span
            continue
        energy = float(np.linalg.norm(Xs[:, s:e], ord="fro")) ** 2
        blocks.append((s, e, energy))

    if blocks:                                                # at least one block
        e_ref = sum(energy for _, _, energy in blocks) / len(blocks)
        for s, e, energy in blocks:
            if energy > 0:
                Xs[:, s:e] *= float(np.sqrt(e_ref / energy))

    return Xs

# ---------------------------------------------------------------------
# Legacy scalers (kept for reference / comparison)
# ---------------------------------------------------------------------
def scale_each_feature_rms(X: np.ndarray) -> np.ndarray:
    """Old: divide every column by its own RMS (zero columns untouched).

    Works on a DTYPE_WORK copy of *X*; the input is not modified.
    """
    work = X.astype(DTYPE_WORK, copy=True)
    mean_square = (work ** 2).mean(axis=0, keepdims=True)
    col_rms = np.sqrt(mean_square)
    # A zero RMS means an all-zero column; dividing by 1 keeps it at zero.
    col_rms[col_rms == 0] = 1.0
    return work / col_rms


def equalise_block_energy_sqrt_d(X: np.ndarray, idx: dict) -> np.ndarray:
    """Old: after column scaling, divide each block by √d (d = block width).

    Returns a copy of *X*; blocks that are missing, None or empty are
    left unchanged.
    """
    out = X.copy()
    for name in ("otb_indices", "myo_indices", "kin_hand_indices"):
        span = idx.get(name)
        if span is None:
            continue
        lo, hi = span
        width = hi - lo
        if width <= 0:
            continue
        out[:, lo:hi] /= np.sqrt(width, dtype=DTYPE_WORK)
    return out


def scale_blocks_by_rms(X: np.ndarray, idx: dict) -> np.ndarray:
    """Old: one scalar per block, namely its Frobenius norm divided by √d.

    Returns a DTYPE_WORK copy of *X*; blocks that are missing, None, empty
    or all-zero are left unchanged.
    """
    out = X.astype(DTYPE_WORK, copy=True)
    for name in ("otb_indices", "myo_indices", "kin_hand_indices"):
        span = idx.get(name)
        if span is None:
            continue
        lo, hi = span
        if hi <= lo:
            continue
        view = out[:, lo:hi]
        total = np.linalg.norm(view, ord="fro")
        if total > 0:
            divisor = total / np.sqrt(hi - lo, dtype=DTYPE_WORK)
            out[:, lo:hi] = view / divisor
    return out


# =====================================================================
# I/O helpers
# =====================================================================
def load_full_phase(
    part_dir: Path | str,
    trial: int,
    phase: str,
    n_pose_cols: int,
) -> dict | None:
    """Read uncropped OTB, Myo and hand-only kinematics for one trial phase.

    Parameters
    ----------
    part_dir : participant folder containing
        "Synchronized Data split in Phases".
    trial : trial number, used zero-padded to two digits in the file names.
    phase : phase label used in the file names (e.g. "phase1").
    n_pose_cols : number of trailing kinematics columns to drop.

    Returns
    -------
    dict with the combined matrix under 'combined' (columns ordered
    OTB | Myo | hand kinematics, rows cropped to the shortest modality) and
    the (start, end) column spans under 'otb_indices', 'myo_indices' and
    'kin_hand_indices'; or *None* if a file is missing, cannot be read,
    is not 2-D, has fewer than 5 common samples, or the kinematics array
    has no columns left after dropping the pose columns.
    """
    sync_dir = Path(part_dir, "Synchronized Data split in Phases")
    stem = f"match_{trial:02d}_{phase}"
    paths = [sync_dir / f"{stem}_{tag}.npy" for tag in ("kin_norm", "myo", "otb")]
    if not all(p.exists() for p in paths):
        return None

    try:
        kin, myo, otb = (np.load(p) for p in paths)
    except (OSError, ValueError, EOFError) as err:
        # Truncated/corrupt files are "invalid" per the contract above;
        # report them so the skipped trial does not go unnoticed.
        print(f"[WARN] could not read {stem} in {sync_dir}: {err}")
        return None

    # Column slicing below requires (samples × channels) arrays.
    if any(getattr(a, "ndim", None) != 2 for a in (kin, myo, otb)):
        return None

    T = min(len(kin), len(myo), len(otb))
    if T < 5 or kin.shape[1] <= n_pose_cols:
        return None

    kin_h = kin[:T, : kin.shape[1] - n_pose_cols]  # drop pose cols
    X = np.hstack([otb[:T], myo[:T], kin_h])

    otb_end = otb.shape[1]
    myo_end = otb_end + myo.shape[1]
    kin_end = myo_end + kin_h.shape[1]

    return {
        "combined": X,
        "otb_indices": (0, otb_end),
        "myo_indices": (otb_end, myo_end),
        "kin_hand_indices": (myo_end, kin_end),
    }


# =====================================================================
# Main processing loop
# =====================================================================
def _scale_phase_matrix(X: np.ndarray, idx: dict) -> np.ndarray:
    """Apply the scaler selected by FEATURE_SCALE_MODE to one phase matrix."""
    if FEATURE_SCALE_MODE == "block_frobenius":
        return scale_blocks_by_frobenius(X, idx)
    if FEATURE_SCALE_MODE == "column_then_block":
        return scale_by_column_rms(X, idx, BLOCK_EQUALISE)
    if FEATURE_SCALE_MODE == "feature":
        Xs = scale_each_feature_rms(X)
        if BLOCK_EQUALISE:                              # √d heuristic
            Xs = equalise_block_energy_sqrt_d(Xs, idx)
        return Xs
    if FEATURE_SCALE_MODE == "block":
        return scale_blocks_by_rms(X, idx)
    raise ValueError(
        "FEATURE_SCALE_MODE must be 'block_frobenius', 'column_then_block', 'feature', or 'block'"
    )


def preprocess_and_save_all_data(n_trials: int = 24) -> None:
    """Build, scale and save one combined matrix per participant and phase.

    For every participant in PARTICIPANTS, trials 1..*n_trials* of every
    phase in PHASES are loaded with ``load_full_phase``, stacked row-wise,
    scaled according to FEATURE_SCALE_MODE and written to
    ``<participant>/Preprocessed_Data_Matrix/P<id>_combined_matrix_<phase>.npy``.
    The column spans, scaling settings and per-trial row counts are stored
    next to them in ``P<id>_feature_indices.joblib``.

    Participants whose folder is missing, or who lack valid data for any
    phase, are skipped with a warning.

    Parameters
    ----------
    n_trials : highest trial number to look for (default 24).

    Raises
    ------
    ValueError
        If FEATURE_SCALE_MODE is not a known mode, or if the trials of a
        participant disagree on the column layout of the modality blocks.
    """
    # Fail fast: reject a bad mode before any data is loaded and stacked.
    valid_modes = ("block_frobenius", "column_then_block", "feature", "block")
    if FEATURE_SCALE_MODE not in valid_modes:
        raise ValueError(
            "FEATURE_SCALE_MODE must be 'block_frobenius', 'column_then_block', 'feature', or 'block'"
        )

    bar = "=" * 72
    print(f"\n{bar}\n PRE-PROCESSING ({FEATURE_SCALE_MODE})\n{bar}")

    for pid in PARTICIPANTS:
        p_dir = Path(BASE_DIR, f"P({pid})")
        if not p_dir.is_dir():
            print(f"[WARN] folder missing for P{pid}, skipping.")
            continue

        data_phase: dict[str, list[np.ndarray]] = {ph: [] for ph in PHASES}
        len_phase: dict[str, list[int]] = {ph: [] for ph in PHASES}
        idx: dict | None = None

        # --------- load all trials -----------------------------------
        for trial in range(1, n_trials + 1):
            for ph in PHASES:
                out = load_full_phase(p_dir, trial, ph, N_POSE_COLUMNS)
                if out is None:
                    continue
                # Keep only the column spans; the data itself goes into
                # data_phase, so the metadata never references an array.
                layout = {k: v for k, v in out.items() if k != "combined"}
                if idx is None:
                    idx = layout
                elif layout != idx:
                    # One set of spans is applied to the stacked matrix, so
                    # every trial must share the same block boundaries.
                    raise ValueError(
                        f"P{pid}: trial {trial} {ph} has column layout "
                        f"{layout}, expected {idx}"
                    )
                data_phase[ph].append(out["combined"])
                len_phase[ph].append(out["combined"].shape[0])

        if idx is None or any(not data_phase[ph] for ph in PHASES):
            print(f"[WARN] P{pid}: no valid data for one phase, skipping.")
            continue

        # --------- stack, scale and persist one phase at a time ------
        # Handling phases sequentially keeps only one stacked matrix and
        # its scaled copy in memory at any moment.
        out_dir = Path(p_dir, "Preprocessed_Data_Matrix")
        out_dir.mkdir(exist_ok=True)
        shapes: dict[str, tuple] = {}
        for ph in PHASES:
            stacked = np.vstack(data_phase[ph], dtype=DTYPE_WORK)
            data_phase[ph].clear()                      # release raw trials
            scaled = _scale_phase_matrix(stacked, idx)
            del stacked
            np.save(out_dir / f"P{pid}_combined_matrix_{ph}.npy", scaled)
            shapes[ph] = scaled.shape
            del scaled

        # --------- metadata -----------------------------------------
        meta = dict(idx)
        meta["scaling"] = {
            "feature_scale_mode": FEATURE_SCALE_MODE,
            # BLOCK_EQUALISE is applied in both of these modes, so record
            # its value for either; the remaining modes ignore the flag.
            "block_equalise": (
                BLOCK_EQUALISE
                if FEATURE_SCALE_MODE in ("column_then_block", "feature")
                else "N/A"
            ),
        }
        for ph in PHASES:
            meta[f"{ph}_trial_lengths"] = len_phase[ph]
        joblib.dump(meta, out_dir / f"P{pid}_feature_indices.joblib")

        summary = ", ".join(f"{ph} {shapes[ph]}" for ph in PHASES)
        print(f"[DONE] P{pid}: {summary}")

    print("=== PRE-PROCESSING COMPLETE ===")


# =====================================================================
# Entry point
# =====================================================================
# Run the full pre-processing pipeline only when executed as a script
# (or notebook cell), not when the module is imported.
if __name__ == "__main__":
    preprocess_and_save_all_data()


 PRE-PROCESSING (column_then_block)
[DONE] P1: phase1 (168814, 180), phase2 (387674, 180)
[DONE] P2: phase1 (183597, 180), phase2 (356604, 180)


OSError: [Errno 22] Invalid argument: 'C:\\Users\\schmi\\Documents\\Studium\\TUM\\Masterthesis\\Experimental Data\\P(3)\\Preprocessed_Data_Matrix\\P3_combined_matrix_phase1.npy'