### Creating two combined, standardized feature matrices (kinematic and EMG data), one per phase

In [None]:
# =====================================================================
# Preprocessing into Separate Phase Matrices
#
# PURPOSE:
#   - Aggregates all phase1 data into one matrix and all phase2 data
#     into another matrix for each participant.
#   - Fits a SINGLE global scaler on the data from BOTH phases combined.
#   - Transforms each phase matrix separately using the global scaler.
#   - Saves two separate standardized matrices per participant.
# =====================================================================

import os
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------
# 0) Global Parameters
# ---------------------------------------------------------------------
BASE_DIR = r"C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data"
PARTICIPANTS = [1, 2, 3, 4, 5, 6, 7, 8]
PHASES = ["phase1", "phase2"]
N_POSE_COLUMNS = 6

# ---------------------------------------------------------------------
# 1) Data Loading for FULL PHASES
# ---------------------------------------------------------------------
def load_full_phase_data(participant_dir, trial_idx, phase_name):
    """Loads the FULL, uncropped data for a given phase."""
    sync_phase_dir = os.path.join(participant_dir, "Synchronized Data split in Phases")
    kin_file = os.path.join(sync_phase_dir, f"match_{trial_idx:02d}_{phase_name}_kin_norm.npy")
    myo_file = os.path.join(sync_phase_dir, f"match_{trial_idx:02d}_{phase_name}_myo.npy")
    otb_file = os.path.join(sync_phase_dir, f"match_{trial_idx:02d}_{phase_name}_otb.npy")

    if not all(os.path.exists(f) for f in [kin_file, myo_file, otb_file]):
        return None

    kin, myo, otb = np.load(kin_file), np.load(myo_file), np.load(otb_file)
    n_min = min(k.shape[0] for k in [kin, myo, otb])
    if n_min < 5: return None
    kin_c, myo_c, otb_c = kin[:n_min], myo[:n_min], otb[:n_min]

    if kin_c.shape[1] <= N_POSE_COLUMNS: return None
    hand_only = kin_c[:, : (kin_c.shape[1] - N_POSE_COLUMNS)]
    combined = np.hstack([otb_c, myo_c, hand_only])
    
    otb_cols, myo_cols = otb_c.shape[1], myo_c.shape[1]
    otb_end, myo_end = otb_cols, otb_cols + myo_cols
    kin_hand_end = myo_end + hand_only.shape[1]

    return {
        'combined': combined,
        'kin_hand_indices': (myo_end, kin_hand_end),
        'myo_indices': (otb_end, myo_end),
        'otb_indices': (0, otb_end),
    }

# ---------------------------------------------------------------------
# 2) Main Preprocessing Function [MODIFIED]
# ---------------------------------------------------------------------
def preprocess_and_save_all_data():
    """
    Build, standardize and save one aggregated feature matrix per phase for
    every participant listed in PARTICIPANTS.

    For each participant folder "P(<id>)" under BASE_DIR:
      1. Load trials 1..24 of "phase1" and "phase2" via load_full_phase_data
         and stack the valid trials of each phase row-wise.
      2. Fit ONE StandardScaler on the rows of both phases together, so both
         phase matrices share the same mean/variance reference.
      3. Transform each phase matrix separately with that scaler.
      4. Write to "<participant>/Preprocessed_Data_Matrix":
           - P<id>_combined_matrix_phase1.npy
           - P<id>_combined_matrix_phase2.npy
           - P<id>_global_scaler.joblib   (the fitted scaler)
           - P<id>_feature_indices.joblib (dict of (start, stop) column ranges)

    Participants whose folder is missing, or that lack valid data for either
    phase, are skipped with a warning. Returns None.
    """
    print(f"\n{'='*70}\n         RUNNING: Data Aggregation into Separate Phase Matrices\n{'='*70}")
    for pid in PARTICIPANTS:
        participant_str = f"P({pid})"
        participant_dir = os.path.join(BASE_DIR, participant_str)
        if not os.path.isdir(participant_dir):
            print(f"[WARN] Participant folder not found: {participant_str}. Skipping.")
            continue

        print(f"\n--- Processing Participant: {participant_str} ---")

        # Create separate lists for each phase's data
        all_phase1_data = []
        all_phase2_data = []
        feature_indices = None

        print("[INFO] Loading and aggregating trial data into separate phases...")
        for trial_idx in range(1, 25):
            phase1_dict = load_full_phase_data(participant_dir, trial_idx, "phase1")
            if phase1_dict:
                all_phase1_data.append(phase1_dict['combined'])
                # Keep only the column-range metadata of the most recent valid
                # phase1 trial. The raw 'combined' array is deliberately left
                # out so the indices file does not carry a copy of trial data.
                feature_indices = {
                    key: value
                    for key, value in phase1_dict.items()
                    if key != 'combined'
                }

            phase2_dict = load_full_phase_data(participant_dir, trial_idx, "phase2")
            if phase2_dict:
                all_phase2_data.append(phase2_dict['combined'])

        # Both phases are required; this also guarantees feature_indices is set,
        # because it is assigned whenever a phase1 trial is accepted.
        if not all_phase1_data or not all_phase2_data:
            print(f"[WARN] No valid data for one or both phases for participant {pid}. Skipping.")
            continue

        # Create an aggregated matrix for each phase
        aggregated_p1_matrix = np.vstack(all_phase1_data)
        aggregated_p2_matrix = np.vstack(all_phase2_data)
        print(f"[INFO] Shape of Phase 1 aggregated matrix: {aggregated_p1_matrix.shape}")
        print(f"[INFO] Shape of Phase 2 aggregated matrix: {aggregated_p2_matrix.shape}")

        # --- Fit the scaler on BOTH phases combined for global consistency ---
        # The stacked copy is passed as a temporary so it can be released as
        # soon as fitting is done, before the transformed copies are created.
        global_scaler = StandardScaler()
        global_scaler.fit(np.vstack([aggregated_p1_matrix, aggregated_p2_matrix]))
        print("[INFO] Global StandardScaler fitted on all data.")

        # --- Transform each phase matrix SEPARATELY using the global scaler ---
        data_p1_standardized = global_scaler.transform(aggregated_p1_matrix)
        data_p2_standardized = global_scaler.transform(aggregated_p2_matrix)
        print("[INFO] Phase 1 and Phase 2 matrices transformed.")

        # --- Save the results ---
        output_dir = os.path.join(participant_dir, "Preprocessed_Data_Matrix")
        os.makedirs(output_dir, exist_ok=True)

        # Define paths for the two matrices and common files
        matrix_p1_path = os.path.join(output_dir, f"P{pid}_combined_matrix_phase1.npy")
        matrix_p2_path = os.path.join(output_dir, f"P{pid}_combined_matrix_phase2.npy")
        scaler_path = os.path.join(output_dir, f"P{pid}_global_scaler.joblib")
        indices_path = os.path.join(output_dir, f"P{pid}_feature_indices.joblib")

        # Save all files
        np.save(matrix_p1_path, data_p1_standardized)
        np.save(matrix_p2_path, data_p2_standardized)
        joblib.dump(global_scaler, scaler_path)
        joblib.dump(feature_indices, indices_path)

        print(f"[SUCCESS] Saved Phase 1 matrix to: {matrix_p1_path}")
        print(f"[SUCCESS] Saved Phase 2 matrix to: {matrix_p2_path}")
        print(f"[SUCCESS] Saved global scaler and indices to: {output_dir}")

# ---------------------------------------------------------------------
# 3) Script Execution
# ---------------------------------------------------------------------
if __name__ == "__main__":
    # Run the whole pipeline only when executed as a script / notebook cell,
    # then print a closing banner.
    preprocess_and_save_all_data()
    completion_banner = f"\n{'='*70}\n         PREPROCESSING COMPLETE\n{'='*70}"
    print(completion_banner)



         RUNNING: Data Aggregation and Standardization (Full Phases)

--- Processing Participant: P(1) ---
[INFO] Loading and aggregating all trial data...
[INFO] Shape of aggregated raw data matrix: (556488, 180)
[INFO] Global StandardScaler fitted and data transformed.
[SUCCESS] Saved standardized data to: C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data\P(1)\Preprocessed_Data_Matrix\P1_combined_matrix.npy
[SUCCESS] Saved fitted scaler to: C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data\P(1)\Preprocessed_Data_Matrix\P1_global_scaler.joblib
[SUCCESS] Saved feature indices to: C:\Users\schmi\Documents\Studium\TUM\Masterthesis\Experimental Data\P(1)\Preprocessed_Data_Matrix\P1_feature_indices.joblib

--- Processing Participant: P(2) ---
[INFO] Loading and aggregating all trial data...
[INFO] Shape of aggregated raw data matrix: (540201, 180)
[INFO] Global StandardScaler fitted and data transformed.
[SUCCESS] Saved standardized data to: C:\Users\sc