In [None]:
# ============================================================
# DATASET-1 (BMIS) — SINGLE SUBJECT (see SUBJECT_ID) → FINAL TRAINING CSVs (NO PLOTS)
#
# ============================================================

import os
import re
import warnings
import numpy as np
import pandas as pd
from scipy.signal import butter, sosfilt, filtfilt, iirnotch, resample
from sklearn.preprocessing import MinMaxScaler

# ----------------------------
# CONFIG (same preprocessing)
# ----------------------------
SUBJECT_ID = 9   # subject to export; selects subject_<id> folders and names the outputs
NO_GESTURE = 7   # number of gesture classes: files G1..G7 become labels 0..6

EMG_FS = 200     # EMG sampling rate handed to the filters
EEG_FS = 250     # EEG sampling rate handed to the filters

NOTCH_FREQ = 60        # centre frequency of the mains notch filter
QUALITY_FACTOR = 30    # quality factor of the notch filter

EMG_FC, EMG_FH = 5, 50   # EMG band-pass low / high cut-off
EEG_FC, EEG_FH = 5, 50   # EEG band-pass low / high cut-off

ORDER = 4               # Butterworth order passed to the band-pass design
WINDOW_TIME_MS = 1000   # window length in milliseconds
OVERLAP_PERCENT = 80    # overlap between consecutive windows, in percent
TARGET_FS = 200         # common rate both modalities are resampled to

# Dataset-1 roots
DATASET1_ROOT = "/home/tsultan1/paper-2/dataset-1"
EEG_BASE = os.path.join(DATASET1_ROOT, "BMIS_EEG_DATA")
EMG_BASE = os.path.join(DATASET1_ROOT, "BMIS_EMG_DATA")

# Output folder for the selected subject. Derived from SUBJECT_ID so that
# changing the subject can never write into another subject's folder.
OUT_DIR = os.path.join(DATASET1_ROOT, f"final_exports-sub{SUBJECT_ID}")
os.makedirs(OUT_DIR, exist_ok=True)

# Expected channels (match dataset design)
EXPECTED_EEG_CH = 8
EXPECTED_EMG_CH = 8
FORCE_FIXED_CHANNELS = True  # pad / drop channels instead of raising on a mismatch

# ============================================================
# Filtering Functions (same logic as old)
# ============================================================

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a digital Butterworth band-pass filter as second-order sections.

    Args:
        lowcut: Lower cut-off frequency, in the same units as ``fs``.
        highcut: Upper cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of the signal the filter will be applied to.
        order: Order passed to ``scipy.signal.butter``.

    Returns:
        The SOS coefficient array produced by ``scipy.signal.butter``.
    """
    # butter() expects cut-offs normalised by the Nyquist frequency.
    nyquist = fs / 2
    band_edges = [lowcut / nyquist, highcut / nyquist]
    return butter(order, band_edges, btype='bandpass', analog=False, output='sos')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter every column of ``data`` with a Butterworth filter.

    Filtering runs along axis 0, i.e. each column is treated as one
    independent signal, so ``data`` must be laid out as (samples, channels).

    Args:
        data: 2-D array-like, (samples, channels).
        lowcut: Lower cut-off frequency, in the same units as ``fs``.
        highcut: Upper cut-off frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Butterworth order forwarded to ``butter_bandpass``.

    Returns:
        Array with the same shape as ``data``. Floating-point inputs keep
        their dtype; other dtypes (e.g. integers) come back as the float
        result of ``sosfilt`` rather than being truncated to integers.
    """
    sos = butter_bandpass(lowcut, highcut, fs, order=order)
    samples = np.asarray(data)

    # One vectorised call replaces the per-column Python loop.
    filtered = sosfilt(sos, samples, axis=0)

    # Writing the result into a zeros_like(int_array) buffer would silently
    # truncate it, so only cast back when the input was already floating.
    if np.issubdtype(samples.dtype, np.floating):
        filtered = filtered.astype(samples.dtype, copy=False)
    return filtered

def mains_removal(data, fs, notch_freq, quality_factor):
    """Remove mains interference with a zero-phase IIR notch filter.

    The input is transposed and filtered along axis 1 of the transposed
    array, so each column of ``data`` is treated as one signal; ``data`` is
    therefore expected as (samples, channels).

    Filtering is best effort: Gustafsson's method is tried first, then the
    default padded ``filtfilt``. If both fail the data is returned
    un-notched. Every fallback emits a ``RuntimeWarning`` so the degradation
    is visible instead of silent.

    Args:
        data: Array laid out as (samples, channels).
        fs: Sampling frequency of ``data``.
        notch_freq: Frequency to remove, in the same units as ``fs``.
        quality_factor: Quality factor of the notch.

    Returns:
        Array with the same layout as ``data``.
    """
    b, a = iirnotch(notch_freq, quality_factor, fs)
    data_t = data.T
    try:
        filtered_t = filtfilt(b, a, data_t, axis=1, method='gust')
    except Exception as gust_err:
        warnings.warn(
            f"Notch filter (method='gust') failed: {gust_err}; "
            "retrying with padded filtfilt.",
            RuntimeWarning,
            stacklevel=2,
        )
        try:
            filtered_t = filtfilt(b, a, data_t, axis=1)
        except Exception as pad_err:
            # Last resort: keep the pipeline running but say so loudly.
            warnings.warn(
                f"Notch filter failed: {pad_err}; returning un-notched data.",
                RuntimeWarning,
                stacklevel=2,
            )
            filtered_t = data_t
    return filtered_t.T

def preprocess_data(data, fs, notch_freq, quality_factor, lowcut, highcut, order, target_fs=None):
    """Notch-filter, band-pass filter and optionally resample a recording.

    Args:
        data: 2-D array laid out as (channels, samples); the sample count is
            read from ``data.shape[1]`` and resampling runs along axis 1.
        fs: Sampling frequency of ``data``.
        notch_freq: Mains frequency to remove.
        quality_factor: Quality factor of the notch filter.
        lowcut: Band-pass lower cut-off.
        highcut: Band-pass upper cut-off.
        order: Butterworth order of the band-pass filter.
        target_fs: If given and different from ``fs``, the filtered signal is
            resampled to this rate.

    Returns:
        Filtered (and possibly resampled) array laid out as (channels, samples).
    """
    # mains_removal() and butter_bandpass_filter() treat each COLUMN of their
    # input as one signal (they work on (samples, channels)). Passing the
    # (channels, samples) array straight through would filter across the
    # channels at each time instant, so hand them the transposed view and
    # flip the result back afterwards.
    samples_by_channels = data.T
    notched = mains_removal(
        samples_by_channels, fs=fs, notch_freq=notch_freq, quality_factor=quality_factor
    )
    filtered_data = butter_bandpass_filter(
        notched, lowcut=lowcut, highcut=highcut, fs=fs, order=order
    ).T

    # Resample along time if a different target rate is requested.
    if target_fs and fs != target_fs:
        num_samples = int(data.shape[1] * target_fs / fs)
        filtered_data = resample(filtered_data, num=num_samples, axis=1)
    return filtered_data

# ============================================================
# Windowing Function (same behavior; plus safe guards)
# ============================================================

def window_with_overlap(data, sampling_frequency, window_time, overlap, no_channel):
    """Cut a (channels, samples) signal into fixed-length overlapping windows.

    Args:
        data: 2-D array laid out as (channels, samples).
        sampling_frequency: Sampling rate of ``data`` in samples per second.
        window_time: Window length in milliseconds.
        overlap: Overlap between consecutive windows, in percent.
        no_channel: Channel count used for the output's second dimension.

    Returns:
        Array of shape (num_windows, no_channel, samples_per_window) with the
        dtype of ``data``. ``num_windows`` is 0 when the signal is shorter
        than one window.
    """
    # Floor division on the un-normalised products avoids float truncation:
    # int(200 * (1 - 80 / 100)) evaluates to 39, not the intended 40.
    samples_per_window = int(sampling_frequency * window_time // 1000)
    step_size = max(int(samples_per_window * (100 - overlap) // 100), 1)

    empty = np.zeros((0, no_channel, samples_per_window), dtype=data.dtype)
    if data.shape[1] < samples_per_window:
        return empty

    num_windows = (data.shape[1] - samples_per_window) // step_size + 1
    if num_windows <= 0:
        return empty

    windows = np.zeros((num_windows, no_channel, samples_per_window), dtype=data.dtype)
    for i in range(num_windows):
        start = i * step_size
        windows[i] = data[:, start:start + samples_per_window]
    return windows

def truncate_to_min_length(data1, data2):
    """Trim two 2-D arrays along axis 1 to the shorter of their two lengths.

    Returns:
        Tuple ``(data1_trimmed, data2_trimmed)`` whose second dimensions match.
    """
    common_len = min(data1.shape[1], data2.shape[1])
    keep = (slice(None), slice(None, common_len))
    return data1[keep], data2[keep]

# ============================================================
# Robust subject/gesture file discovery
# ============================================================

# Matches directory names such as "subject_1" / "Subject_12" (case-insensitive).
_SUBJ_RE = re.compile(r"subject_(\d+)$", re.IGNORECASE)

def find_subject_root(base_dir: str) -> str:
    """Return the directory that directly contains ``subject_<n>`` folders.

    ``base_dir`` itself is checked first; otherwise the tree below it is
    searched top-down. Sub-directories are visited in sorted order so that,
    when several nested copies of the dataset exist, the same root is
    returned on every run and every filesystem.

    Args:
        base_dir: Directory to start searching from.

    Returns:
        Path of the first directory that has a ``subject_<n>`` child.

    Raises:
        FileNotFoundError: ``base_dir`` is not an existing directory.
        RuntimeError: No ``subject_<n>`` folder exists anywhere below it.
    """
    if not os.path.isdir(base_dir):
        raise FileNotFoundError(f"Not found: {base_dir}")

    # direct hit
    for name in os.listdir(base_dir):
        if _SUBJ_RE.match(name) and os.path.isdir(os.path.join(base_dir, name)):
            return base_dir

    # recursive search
    for root, dirs, _ in os.walk(base_dir):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        if any(_SUBJ_RE.match(d) for d in dirs):
            return root

    raise RuntimeError(f"Could not find any subject_* folders under: {base_dir}")

# Resolve, once at import time, the folders that directly hold the
# subject_* directories for each modality. Either call raises if its base
# folder is missing or contains no subject_* directory.
EEG_ROOT = find_subject_root(EEG_BASE)
EMG_ROOT = find_subject_root(EMG_BASE)

def files_for_subject_gesture(root_dir: str, subject_id: int, gesture_idx_1based: int):
    """List the CSV files of one gesture for one subject.

    A file is selected when its name ends in ``.csv`` (any case) and contains
    ``G<gesture_idx_1based>`` (any case) not followed by another digit, so
    asking for gesture 1 never picks up ``G10``, ``G11``, ...

    Args:
        root_dir: Directory that contains the ``subject_<id>`` folders.
        subject_id: Subject number used to build the folder name.
        gesture_idx_1based: Gesture number as it appears in the file names.

    Returns:
        Sorted list of full file paths; empty if the subject folder does not
        exist or nothing matches.
    """
    subj_dir = os.path.join(root_dir, f"subject_{subject_id}")
    if not os.path.isdir(subj_dir):
        return []

    # (?!\d) stops the gesture number from matching as a prefix of a longer one.
    pat = re.compile(rf"G{gesture_idx_1based}(?!\d)", re.IGNORECASE)
    return sorted(
        os.path.join(subj_dir, f)
        for f in os.listdir(subj_dir)
        if f.lower().endswith(".csv") and pat.search(f)
    )

# ============================================================
# Robust CSV loading + channel fixing (prevents dimension errors)
# ============================================================

def _read_csv_numeric(filepath: str) -> np.ndarray:
    """Load a CSV file as a purely numeric float64 matrix.

    Every cell is coerced to a number (unparseable cells become NaN), columns
    that end up entirely NaN are dropped, and any remaining NaN / +inf / -inf
    is replaced by 0. The result always has at least two dimensions.
    """
    numeric = (
        pd.read_csv(filepath)
        .apply(pd.to_numeric, errors="coerce")
        .dropna(axis=1, how="all")
    )
    values = numeric.to_numpy(dtype=np.float64, copy=False)
    cleaned = np.nan_to_num(values, nan=0.0, posinf=0.0, neginf=0.0)
    # A 1-D result is promoted to a single row, (1, n).
    return np.atleast_2d(cleaned)

def _pick_best_channels(x: np.ndarray, expected_ch: int) -> np.ndarray:
    """Reduce a (channels, samples) array to ``expected_ch`` channels.

    Each channel gets a score made of the absolute correlation with a linear
    ramp (time / index columns score close to 1) plus a small penalty that
    grows as the channel's standard deviation shrinks. The ``expected_ch``
    lowest-scoring channels are kept, in their original order. Arrays that
    already have ``expected_ch`` channels or fewer are returned unchanged.
    """
    n_channels, n_samples = x.shape
    if n_channels <= expected_ch:
        return x

    reference_ramp = np.linspace(-1.0, 1.0, n_samples, dtype=np.float64)

    def _score(channel):
        signal = channel.astype(np.float64, copy=False)
        spread = np.std(signal)
        if spread < 1e-12:
            # Constant channel: correlation is undefined, treat as worst case.
            ramp_likeness = 1.0
        else:
            ramp_likeness = float(np.corrcoef(signal, reference_ramp)[0, 1])
            ramp_likeness = abs(ramp_likeness) if np.isfinite(ramp_likeness) else 1.0
        low_variance_penalty = 1.0 / (spread + 1e-12)
        return ramp_likeness + 0.01 * low_variance_penalty

    scores = np.asarray([_score(row) for row in x])
    kept = np.sort(np.argsort(scores)[:expected_ch])
    return x[kept, :]

def ensure_channels_by_time(arr: np.ndarray, expected_ch: int, force_fixed: bool) -> np.ndarray:
    """Orient ``arr`` as (channels, time) and enforce ``expected_ch`` channels.

    Orientation: an axis whose length equals ``expected_ch`` is taken as the
    channel axis (axis 0 wins if both match); otherwise the axis whose length
    is closer to ``expected_ch`` is used, keeping axis 0 on a tie.

    Channel count: with ``force_fixed`` surplus channels are removed through
    ``_pick_best_channels`` and missing ones are appended as all-zero rows;
    without it a mismatch raises ``ValueError``.
    """
    if arr.shape[0] == expected_ch:
        oriented = arr
    elif arr.shape[1] == expected_ch:
        oriented = arr.T
    else:
        # Neither axis matches: pick the one closer to the expected count.
        row_gap = abs(arr.shape[0] - expected_ch)
        col_gap = abs(arr.shape[1] - expected_ch)
        oriented = arr.T if row_gap > col_gap else arr

    if oriented.ndim != 2:
        oriented = oriented.reshape(oriented.shape[0], -1)

    n_ch, n_samples = oriented.shape
    if n_ch == expected_ch:
        return oriented
    if not force_fixed:
        raise ValueError(f"Channel mismatch: got {n_ch}, expected {expected_ch}")
    if n_ch > expected_ch:
        return _pick_best_channels(oriented, expected_ch)

    zero_rows = np.zeros((expected_ch - n_ch, n_samples), dtype=oriented.dtype)
    return np.vstack([oriented, zero_rows])

def stack_trials_timewise(file_list, expected_ch: int, force_fixed: bool, transpose: bool):
    """Load every trial file and join the trials end-to-end along time.

    Each file is read as a numeric matrix, optionally transposed, and forced
    into (expected_ch, time) layout. Files that fail to load or convert are
    skipped with a warning; trials without samples are ignored.

    Returns:
        One (expected_ch, total_time) array, or ``None`` if no usable trial
        was found.
    """
    trials = []
    for path in file_list:
        try:
            trial = _read_csv_numeric(path)
            trial = trial.T if transpose else trial
            trial = ensure_channels_by_time(trial, expected_ch, force_fixed)
        except Exception as err:
            # Best effort: one unreadable file must not abort the whole subject.
            warnings.warn(f"Skipping file: {path} | reason: {err}")
            continue
        if trial.shape[1] > 0:
            trials.append(trial)

    if len(trials) == 0:
        return None
    # All trials are 2-D with the same channel count, so joining on axis 1
    # appends them one after another in time.
    return np.concatenate(trials, axis=1)

# ============================================================
# Subject pipeline (single subject, selected by SUBJECT_ID)
# ============================================================

def preprocess_subject(subject_id: int):
    """Build aligned EMG / EEG windows and labels for one subject.

    For every gesture G1..G<NO_GESTURE> the subject's EMG and EEG trial files
    are concatenated in time, preprocessed, cut to a common length and
    windowed. Gestures with missing files, unreadable data or too little
    signal for a single window are skipped.

    Returns:
        Tuple ``(X_emg, X_eeg, y, subj, total_windows)``: the two window
        arrays of shape (N, channels, samples) as float32, the 0-based
        gesture label and the subject id of every window (int64), and the
        number of windows accumulated.

    Raises:
        RuntimeError: No gesture produced any window.
    """
    emg_blocks, eeg_blocks = [], []
    labels, subject_tags = [], []
    window_count = 0

    for label in range(NO_GESTURE):       # labels 0..6
        gesture_no = label + 1            # file pattern G1..G7

        emg_paths = files_for_subject_gesture(EMG_ROOT, subject_id, gesture_no)
        eeg_paths = files_for_subject_gesture(EEG_ROOT, subject_id, gesture_no)
        if not (emg_paths and eeg_paths):
            continue

        # IMPORTANT: keep old orientation logic
        #   EMG: transpose=False
        #   EEG: transpose=True
        emg_signal = stack_trials_timewise(
            emg_paths, EXPECTED_EMG_CH, FORCE_FIXED_CHANNELS, transpose=False
        )
        eeg_signal = stack_trials_timewise(
            eeg_paths, EXPECTED_EEG_CH, FORCE_FIXED_CHANNELS, transpose=True
        )
        if emg_signal is None or eeg_signal is None:
            continue

        # Filter both modalities and bring them to the common rate.
        emg_clean = preprocess_data(
            emg_signal, EMG_FS, NOTCH_FREQ, QUALITY_FACTOR, EMG_FC, EMG_FH, ORDER, TARGET_FS
        )
        eeg_clean = preprocess_data(
            eeg_signal, EEG_FS, NOTCH_FREQ, QUALITY_FACTOR, EEG_FC, EEG_FH, ORDER, TARGET_FS
        )

        # Same duration for both before windowing.
        emg_clean, eeg_clean = truncate_to_min_length(emg_clean, eeg_clean)

        emg_windows = window_with_overlap(
            emg_clean, TARGET_FS, WINDOW_TIME_MS, OVERLAP_PERCENT, emg_clean.shape[0]
        )
        eeg_windows = window_with_overlap(
            eeg_clean, TARGET_FS, WINDOW_TIME_MS, OVERLAP_PERCENT, eeg_clean.shape[0]
        )
        if 0 in (emg_windows.shape[0], eeg_windows.shape[0]):
            continue

        # Keep only as many windows as both modalities can provide.
        paired = min(emg_windows.shape[0], eeg_windows.shape[0])
        emg_blocks.append(emg_windows[:paired].astype(np.float32))
        eeg_blocks.append(eeg_windows[:paired].astype(np.float32))
        labels += [label] * paired
        subject_tags += [subject_id] * paired
        window_count += paired

    if not (emg_blocks and eeg_blocks):
        raise RuntimeError(f"No windows produced for subject_{subject_id}. Check subject folder / filenames G1..G7.")

    X_emg = np.vstack(emg_blocks)   # (N, ch, T)
    X_eeg = np.vstack(eeg_blocks)   # (N, ch, T)
    y = np.asarray(labels, dtype=np.int64)
    subj = np.asarray(subject_tags, dtype=np.int64)

    # safety clip: every output must describe the same number of windows
    usable = min(len(y), len(subj), X_emg.shape[0], X_eeg.shape[0])
    return X_emg[:usable], X_eeg[:usable], y[:usable], subj[:usable], window_count

def export_subject_csvs(X_emg, X_eeg, y, subj, out_dir: str, subject_id: int):
    """Flatten, min-max scale and write one subject's windows to CSV files.

    Four files are written into ``out_dir`` (created if it does not exist):
    ``eeg_sub<id>.csv`` and ``emg_sub<id>.csv`` (scaled feature matrices),
    ``labels_sub<id>.csv`` (``subject_id``, ``Label``) and
    ``combined_sub<id>.csv`` (``subject_id``, ``Label``, ``eeg_*``, ``emg_*``).

    Args:
        X_emg: EMG windows; everything after the first axis is flattened.
        X_eeg: EEG windows; everything after the first axis is flattened.
        y: Label of every window.
        subj: Subject id of every window.
        out_dir: Destination directory.
        subject_id: Subject number used in the file names and status message.

    Returns:
        Tuple ``(eeg_path, emg_path, lab_path, comb_path)``.
    """
    # The caller may pass any directory, not only the module-level OUT_DIR.
    os.makedirs(out_dir, exist_ok=True)

    # One row per window.
    Xeeg_2d = X_eeg.reshape(X_eeg.shape[0], -1)
    Xemg_2d = X_emg.reshape(X_emg.shape[0], -1)

    # Each modality gets its own scaler, fitted on that modality's columns.
    Xeeg_norm = MinMaxScaler().fit_transform(Xeeg_2d).astype(np.float32)
    Xemg_norm = MinMaxScaler().fit_transform(Xemg_2d).astype(np.float32)

    # export file paths (same naming style as dataset-2 exports)
    eeg_path = os.path.join(out_dir, f"eeg_sub{subject_id}.csv")
    emg_path = os.path.join(out_dir, f"emg_sub{subject_id}.csv")
    lab_path = os.path.join(out_dir, f"labels_sub{subject_id}.csv")
    comb_path = os.path.join(out_dir, f"combined_sub{subject_id}.csv")

    # save EEG/EMG as numeric-only matrices (no extra cols)
    pd.DataFrame(Xeeg_norm).to_csv(eeg_path, index=False)
    pd.DataFrame(Xemg_norm).to_csv(emg_path, index=False)

    # labels (subject_id + Label)
    pd.DataFrame({"subject_id": subj, "Label": y}).to_csv(lab_path, index=False)

    # combined (subject_id, Label, eeg_*, emg_*)
    eeg_cols = [f"eeg_{i}" for i in range(Xeeg_norm.shape[1])]
    emg_cols = [f"emg_{i}" for i in range(Xemg_norm.shape[1])]

    df_eeg = pd.DataFrame(Xeeg_norm, columns=eeg_cols)
    df_emg = pd.DataFrame(Xemg_norm, columns=emg_cols)

    # Inserting at position 0 twice leaves the order: subject_id, Label, eeg_*.
    df_eeg.insert(0, "Label", y)
    df_eeg.insert(0, "subject_id", subj)

    pd.concat([df_eeg, df_emg], axis=1).to_csv(comb_path, index=False)

    # Report the subject that was actually exported, not a fixed "SUB-1".
    print(f"\n[SAVED DATASET-1 SUB-{subject_id}]")
    for written in (eeg_path, emg_path, lab_path, comb_path):
        print(" ", written)
    print(f"Rows: {len(y)} | labels: {np.unique(y)}")
    return eeg_path, emg_path, lab_path, comb_path

# ============================================================
# MAIN
# ============================================================
# Script entry point: preprocess the configured subject and write its CSVs.
if __name__ == "__main__":
    # Show which data roots were resolved and where the export will go.
    print(f"[INFO] EEG_ROOT = {EEG_ROOT}")
    print(f"[INFO] EMG_ROOT = {EMG_ROOT}")
    print(f"[INFO] Exporting subject_{SUBJECT_ID} to: {OUT_DIR}")

    # Raises RuntimeError if no windows could be produced for the subject.
    X_emg, X_eeg, y, subj, total = preprocess_subject(SUBJECT_ID)
    print(f"[OK] subject_{SUBJECT_ID}: windows={total}")

    # Writes the EEG, EMG, labels and combined CSV files into OUT_DIR.
    export_subject_csvs(X_emg, X_eeg, y, subj, OUT_DIR, SUBJECT_ID)


[INFO] EEG_ROOT = /home/tsultan1/paper-2/dataset-1/BMIS_EEG_DATA/BMIS_EEG_DATA/data/csv_data
[INFO] EMG_ROOT = /home/tsultan1/paper-2/dataset-1/BMIS_EMG_DATA/BMIS_EMG_DATA/data/csv_data
[INFO] Exporting subject_9 to: /home/tsultan1/paper-2/dataset-1/final_exports-sub9
[OK] subject_9: windows=1043

[SAVED DATASET-1 SUB-1]
  /home/tsultan1/paper-2/dataset-1/final_exports-sub9/eeg_sub9.csv
  /home/tsultan1/paper-2/dataset-1/final_exports-sub9/emg_sub9.csv
  /home/tsultan1/paper-2/dataset-1/final_exports-sub9/labels_sub9.csv
  /home/tsultan1/paper-2/dataset-1/final_exports-sub9/combined_sub9.csv
Rows: 1043 | labels: [0 1 2 3 4 5 6]
