In [1]:
# ===============================================
# EEG × Sound 全流程分析管线（Colab 一键运行版）
# 覆盖：加载→QC→声学特征修复→噪声上限→时延→通道ROI→被试级TRF
# 依赖：numpy, tqdm, scikit-learn（Colab自带；若报错可取消下一行注释）
# !pip install -q numpy tqdm scikit-learn
# ===============================================
import os, glob, json, math, csv
import numpy as np
from tqdm import tqdm
from collections import defaultdict, Counter
from sklearn.model_selection import GroupKFold




In [2]:
!pip -q install librosa soundfile

In [3]:
# -----------------------------
# 0) 数据加载（与你之前一致）
# -----------------------------
def load_segments_with_subject_ids(root_dir, eeg_suffix='EEG_aligned.npy', sound_suffix='Sound_aligned.npy', root_suffix='feature_normalized'):
    eeg_segments_list = []
    sound_segments_list = []
    subject_ids_list = []

    print(f"开始从根目录 '{root_dir}' 加载数据段及被试者ID...")
    subject_folders = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]

    for subject in tqdm(subject_folders, desc="处理被试者"):
        subject_path = os.path.join(root_dir, subject, root_suffix)
        eeg_files = glob.glob(os.path.join(subject_path, f'*{eeg_suffix}'))
        for eeg_file_path in eeg_files:
            base_name = os.path.basename(eeg_file_path).replace(eeg_suffix, '')
            sound_file_path = os.path.join(subject_path, base_name + sound_suffix)
            if os.path.exists(sound_file_path):
                try:
                    eeg_segment = np.load(eeg_file_path)
                    sound_segment = np.load(sound_file_path)
                    if eeg_segment.shape[0] == sound_segment.shape[0]:
                        eeg_segments_list.append(eeg_segment)
                        sound_segments_list.append(sound_segment)
                        subject_ids_list.append(subject)
                except Exception as e:
                    print(f"加载文件 {os.path.basename(eeg_file_path)} 时出错: {e}")

    print(f"\n加载完成！总共加载了 {len(eeg_segments_list)} 个数据段。")
    return eeg_segments_list, sound_segments_list, subject_ids_list

In [4]:
# -----------------------------
# 1) 基础 QC / 预处理工具
# -----------------------------
def filter_and_summarize(eegs, snds, sids, min_frames=20):
    kept_e, kept_s, kept_id = [], [], []
    n_emptyT, n_emptyD, n_short = 0, 0, 0
    for E, S, sid in zip(eegs, snds, sids):
        if E.ndim != 2 or S.ndim != 2:
            n_emptyD += 1; continue
        T1, D1 = E.shape; T2, D2 = S.shape
        if T1 == 0 or T2 == 0:
            n_emptyT += 1; continue
        if D1 == 0 or D2 == 0:
            n_emptyD += 1; continue
        if T1 < min_frames or T2 < min_frames:
            n_short += 1; continue
        kept_e.append(E); kept_s.append(S); kept_id.append(sid)
    print(f"过滤结果：总 {len(eegs)} 段 -> 保留 {len(kept_e)} 段 | 空T: {n_emptyT}, 空D: {n_emptyD}, 过短(<{min_frames}帧): {n_short}")
    return kept_e, kept_s, kept_id

def nan_inf_report(eegs, snds):
    def stats(arrs, name):
        n = len(arrs)
        any_nan = sum([np.isnan(a).any() for a in arrs])
        any_inf = sum([np.isinf(a).any() for a in arrs])
        print(f"{name}: 段数={n}, 含NaN段={any_nan} ({any_nan/n:.1%}), 含Inf段={any_inf} ({any_inf/n:.1%})")
    stats(eegs, "EEG"); stats(snds, "Sound")

def nan_breakdown_per_segment(S):
    S = np.asarray(S)
    nan_mask = ~np.isfinite(S)
    overall = nan_mask.mean() if S.size else 1.0
    per_col = nan_mask.mean(axis=0) if S.ndim==2 and S.shape[1]>0 else np.array([])
    return overall, per_col

def summarize_sound_nan(sound_segments):
    overall_rates = []
    percol_accum_sum = None
    percol_accum_cnt = None
    D_max = max(s.shape[1] for s in sound_segments if s.ndim==2) if sound_segments else 0
    for S in sound_segments:
        overall, per_col = nan_breakdown_per_segment(S)
        overall_rates.append(overall)
        if per_col.size:
            if percol_accum_sum is None:
                percol_accum_sum = np.zeros(D_max, dtype=float)
                percol_accum_cnt = np.zeros(D_max, dtype=float)
            d = per_col.size
            percol_accum_sum[:d] += per_col
            percol_accum_cnt[:d] += 1.0
    print("\n=== 声音特征 NaN 概览 ===")
    print(f"- 段级：含NaN的段 = {sum(r>0 for r in overall_rates)}/{len(overall_rates)} ({sum(r>0 for r in overall_rates)/len(overall_rates):.1%}); 中位数NaN率 = {np.median(overall_rates):.2%}")
    if percol_accum_cnt is not None:
        percol_mean = percol_accum_sum / np.maximum(percol_accum_cnt, 1e-9)
        top_bad = np.argsort(-percol_mean)[:10]
        print(f"- 列级（跨段平均）前10个最差列及其NaN率：")
        for j in top_bad:
            print(f"  列{j}: 平均NaN率={percol_mean[j]:.2%}")
        return percol_mean
    else:
        return None

In [5]:
# -----------------------------
# 2) 声音特征修复（丢列+插值+z-score）
# -----------------------------
def interp_nan_1d(x):
    x = np.asarray(x, dtype=np.float64)
    n = len(x)
    if n == 0: return x
    idx = np.arange(n)
    m = np.isfinite(x)
    if m.sum() == 0:
        return np.zeros_like(x)
    y = x.copy()
    y[~m] = np.interp(idx[~m], idx[m], x[m])
    return y

def repair_sound_matrix(S, drop_thr=0.6):
    S = np.asarray(S, dtype=np.float64)
    T, D = S.shape
    col_nan_rate = np.mean(~np.isfinite(S), axis=0)
    keep_mask = col_nan_rate <= drop_thr
    if keep_mask.sum() == 0:
        return np.zeros((T,0), dtype=np.float64), keep_mask
    S2 = S[:, keep_mask].copy()
    for d in range(S2.shape[1]):
        S2[:, d] = interp_nan_1d(S2[:, d])
    mu = S2.mean(axis=0, keepdims=True)
    sd = S2.std(axis=0, keepdims=True); sd[sd==0] = 1.0
    S2 = (S2 - mu) / sd
    return S2, keep_mask

def batch_repair_sound(sound_segments, drop_thr=0.6):
    repaired = []
    dropped_cols_stat = []
    for S in tqdm(sound_segments, desc="修复声音特征"):
        S2, keep_mask = repair_sound_matrix(S, drop_thr=drop_thr)
        repaired.append(S2)
        dropped_cols_stat.append({"D_in": S.shape[1], "D_kept": int(keep_mask.sum())})
    print("修复完成：平均保留列数 =",
          np.mean([r["D_kept"] for r in dropped_cols_stat]),
          " / 平均原始列数 =",
          np.mean([r["D_in"] for r in dropped_cols_stat]))
    return repaired, dropped_cols_stat

In [6]:
# -----------------------------
# 3) 信号处理 & 互相关工具
# -----------------------------
def zscore1d(x):
    x = np.asarray(x, dtype=np.float64)
    m, s = x.mean(), x.std()
    return (x - m)/(s if s>0 and np.isfinite(s) else 1.0), m, (s if s>0 and np.isfinite(s) else 1.0)

def highpass_moving_average(ts, win=15):
    ts = np.asarray(ts, dtype=np.float64)
    if ts.size == 0: return ts
    if ts.size < 2*win:
        return (ts - ts.mean())/(ts.std() if ts.std()>0 else 1.0)
    kernel = np.ones(win, dtype=np.float64)/win
    trend = np.convolve(ts, kernel, mode='same')
    hp = ts - trend
    return (hp - hp.mean())/(hp.std() if hp.std()>0 else 1.0)

def first_pc_time_series(X):
    X = np.asarray(X, dtype=np.float64)
    if X.size == 0 or min(X.shape) < 2:
        return np.array([])
    U,S,VT = np.linalg.svd((X - X.mean(0)) / (X.std(0) + 1e-12), full_matrices=False)
    return (U[:,0]*S[0]).astype(np.float64)

def envelope_from_matrix(S):
    if S.size == 0: return np.array([])
    e = np.sqrt((S**2).sum(axis=1))
    return (e - e.mean())/(e.std() if e.std()>0 else 1.0)

def causal_ma(ts, win=9):
    ts = np.asarray(ts, dtype=np.float64)
    if ts.size==0: return ts
    out = np.empty_like(ts); csum = 0.0
    for i in range(len(ts)):
        csum += ts[i]
        if i >= win: csum -= ts[i-win]
        out[i] = csum / min(i+1, win)
    return (out - out.mean())/(out.std() if out.std()>0 else 1.0)

def normxcorr_1d(x, y, max_lag, min_eff=12):
    x = np.asarray(x, dtype=np.float64); y = np.asarray(y, dtype=np.float64)
    T = min(len(x), len(y)); x, y = x[:T], y[:T]
    if T < max(min_eff + 1, max_lag + 2): return np.array([]), np.array([])
    x = (x - x.mean())/(x.std() if x.std()>0 else 1.0)
    y = (y - y.mean())/(y.std() if y.std()>0 else 1.0)
    lags = np.arange(-max_lag, max_lag+1, dtype=int)
    cs = []
    for lag in lags:
        if lag>=0: xx, yy = x[lag:], y[:T-lag]
        else:      xx, yy = x[:T+lag], y[-lag:]
        if len(xx) < min_eff or len(yy) < min_eff: cs.append(np.nan); continue
        sx, sy = xx.std(), yy.std()
        if sx==0 or sy==0 or not np.isfinite(sx) or not np.isfinite(sy): cs.append(np.nan); continue
        cs.append(np.corrcoef(xx, yy)[0,1])
    return lags, np.array(cs, dtype=np.float64)

def _summarize_lag_rows(rows):
    def med(key):
        vals = [r[key] for r in rows if np.isfinite(r.get(key, np.nan))]
        return float(np.median(vals)) if vals else np.nan
    frac = np.mean([
        (r.get("peak_r", np.nan) > r.get("peak_r_null", np.nan))
        for r in rows
        if np.isfinite(r.get("peak_r", np.nan)) and np.isfinite(r.get("peak_r_null", np.nan))
    ]) if any(np.isfinite(r.get("peak_r", np.nan)) and np.isfinite(r.get("peak_r_null", np.nan)) for r in rows) else np.nan
    return {
        "median_peak_r": med("peak_r"),
        "median_peak_lag_ms": med("peak_lag_ms"),
        "median_peak_r_null": med("peak_r_null"),
        "frac_r_gt_null": float(frac) if np.isfinite(frac) else np.nan
    }

In [7]:

# -----------------------------
# 4) 噪声上限（基于PC1的分块split-half）
# -----------------------------
def split_half_1d(ts, block_len=20):
    ts = np.asarray(ts, dtype=np.float64)
    if ts.size < block_len*2: return np.nan
    # 分块均值
    n_blocks = len(ts)//block_len
    B = ts[:n_blocks*block_len].reshape(n_blocks, block_len).mean(axis=1)
    x, y = B[0::2], B[1::2]
    if len(x)<3 or len(y)<3: return np.nan
    r_half = np.corrcoef((x-x.mean())/(x.std() or 1.0), (y-y.mean())/(y.std() or 1.0))[0,1]
    if not np.isfinite(r_half): return np.nan
    return (2*r_half)/(1+r_half)

def fisher_mean(rs):
    rs = [r for r in rs if np.isfinite(r)]
    if not rs: return np.nan
    z = np.arctanh(np.clip(rs, -0.999999, 0.999999))
    return float(np.tanh(np.mean(z)))

def noise_ceiling_from_pc1(eeg_segments, sound_repaired):
    eeg_r_blk, snd_r_blk = [], []
    for E, S2 in zip(eeg_segments, sound_repaired):
        eeg_pc1 = first_pc_time_series(E)
        snd_pc1 = first_pc_time_series(S2) if S2.size else np.array([])
        eeg_r_blk.append(split_half_1d(eeg_pc1, block_len=20))
        snd_r_blk.append(split_half_1d(snd_pc1, block_len=20))
    r_eeg = fisher_mean(eeg_r_blk)
    r_snd = fisher_mean(snd_r_blk)
    r_max = np.sqrt(r_eeg*r_snd) if (np.isfinite(r_eeg) and np.isfinite(r_snd) and r_eeg>0 and r_snd>0) else np.nan
    print("\n=== 基于 PC1 的分块噪声上限 ===")
    print(json.dumps({"r_eeg_blk_overall_PC1": r_eeg, "r_sound_blk_overall_PC1": r_snd, "r_max_blk_overall_PC1": r_max},
                     ensure_ascii=False, indent=2))
    return r_eeg, r_snd, r_max


In [8]:
# -----------------------------
# 5) 时延：PC1↔PC1 与 PC1↔Envelope
# -----------------------------
def compute_lag_stats(eeg_segments, sound_repaired, frame_ms=11.0, max_lag_ms=300):
    max_lag = int(round(max_lag_ms/frame_ms))
    rows_pc1, rows_env = [], []
    for E, S2 in tqdm(list(zip(eeg_segments, sound_repaired)), desc="Lag PC1 / Envelope"):
        eeg_pc1 = first_pc_time_series(E)
        snd_pc1 = first_pc_time_series(S2) if S2.size else np.array([])
        env_ts  = envelope_from_matrix(S2) if S2.size else np.array([])
        # PC1↔PC1
        lags, cs = normxcorr_1d(eeg_pc1, snd_pc1, max_lag)
        if lags.size and np.any(np.isfinite(cs)):
            i = int(np.nanargmax(cs)); peak_r, lag_ms = float(cs[i]), float(lags[i]*frame_ms)
            # null
            sh = max(1, int(round(len(snd_pc1)*0.33))) if len(snd_pc1) else 1
            snd_sh = np.roll(snd_pc1, sh) if len(snd_pc1) else snd_pc1
            _, cs0 = normxcorr_1d(eeg_pc1, snd_sh, max_lag)
            peak_r_null = float(np.nanmax(cs0)) if np.any(np.isfinite(cs0)) else np.nan
        else:
            peak_r, lag_ms, peak_r_null = np.nan, np.nan, np.nan
        rows_pc1.append({"peak_r": peak_r, "peak_lag_ms": lag_ms, "peak_r_null": peak_r_null})
        # PC1↔Envelope
        lags, cs = normxcorr_1d(eeg_pc1, env_ts, max_lag)
        if lags.size and np.any(np.isfinite(cs)):
            i = int(np.nanargmax(cs)); peak_r, lag_ms = float(cs[i]), float(lags[i]*frame_ms)
            sh = max(1, int(round(len(env_ts)*0.33))) if len(env_ts) else 1
            env_sh = np.roll(env_ts, sh) if len(env_ts) else env_ts
            _, cs0 = normxcorr_1d(eeg_pc1, env_sh, max_lag)
            peak_r_null = float(np.nanmax(cs0)) if np.any(np.isfinite(cs0)) else np.nan
        else:
            peak_r, lag_ms, peak_r_null = np.nan, np.nan, np.nan
        rows_env.append({"peak_r": peak_r, "peak_lag_ms": lag_ms, "peak_r_null": peak_r_null})
    print("\n=== 时延（PC1↔PC1）汇总 ==="); print(json.dumps(_summarize_lag_rows(rows_pc1), ensure_ascii=False, indent=2))
    print("\n=== 时延（PC1↔Envelope）汇总 ==="); print(json.dumps(_summarize_lag_rows(rows_env), ensure_ascii=False, indent=2))
    return rows_pc1, rows_env

In [9]:
# -----------------------------
# 6) 通道级 ROI（逐段最佳通道 + 汇总）
# -----------------------------
def voiced_mask_from_original(S_orig, voicing_cols, fallback_percentile=40):
    T, D = S_orig.shape
    if len(voicing_cols):
        vc = [c for c in voicing_cols if c < D]
        if len(vc):
            m = np.any(np.isfinite(S_orig[:, vc]), axis=1)
            if m.any():
                return m.astype(bool)
    e = np.sqrt((np.nan_to_num(S_orig)**2).sum(axis=1))
    thr = np.percentile(e[~np.isnan(e)], fallback_percentile) if np.any(~np.isnan(e)) else 0.0
    return (e > thr)

def compute_lag_stats_channelwise(eeg_segments, sound_segments, sound_repaired, *,
                                  frame_ms=11.0, max_lag_ms=300, null_shift_ratio=0.33, voicing_cols=None):
    max_lag = int(round(max_lag_ms / frame_ms))
    per_seg = []; ch_counter = Counter()
    for E, S_orig, S_rep in tqdm(list(zip(eeg_segments, sound_segments, sound_repaired)), desc="Channelwise ROI"):
        vm = voiced_mask_from_original(S_orig, voicing_cols or [], 40)
        env = envelope_from_matrix(S_rep)
        env = (env - env.mean())/(env.std() if env.std()>0 else 1.0)
        best = {"peak_r": np.nan, "peak_lag_ms": np.nan, "peak_r_null": np.nan, "best_ch": -1}
        for ch in range(E.shape[1]):
            y = (E[:,ch]-E[:,ch].mean())/(E[:,ch].std() if E[:,ch].std()>0 else 1.0)
            y = highpass_moving_average(y, 15)
            mlen = min(len(y), len(env), len(vm))
            if mlen < 30: continue
            yy, ee, mm = y[:mlen], env[:mlen], vm[:mlen]
            lags, cs = normxcorr_1d(yy[mm], ee[mm], max_lag)
            if cs.size==0 or np.all(~np.isfinite(cs)): continue
            i = int(np.nanargmax(cs)); r, lag_ms = float(cs[i]), float(lags[i]*frame_ms)
            # null
            shift = max(1, int(round(len(ee)*null_shift_ratio)))
            eesh = np.roll(ee, shift)
            _, csn = normxcorr_1d(yy[mm], eesh[mm], max_lag)
            rnull = float(np.nanmax(csn)) if np.any(np.isfinite(csn)) else np.nan
            if not np.isfinite(best["peak_r"]) or r > best["peak_r"]:
                best = {"peak_r": r, "peak_lag_ms": lag_ms, "peak_r_null": rnull, "best_ch": ch}
        per_seg.append(best)
        if best["best_ch"] >= 0:
            ch_counter[best["best_ch"]] += 1
    overall = _summarize_lag_rows(per_seg); overall["top_channels"] = ch_counter.most_common(10)
    print("\n=== 通道级 ROI 汇总 ==="); print(json.dumps(overall, ensure_ascii=False, indent=2))
    return {"per_segment": per_seg, "overall_summary": overall}




In [10]:
# -----------------------------
# 7) TRF（0..200ms，ROI-in-train + 列标准化 + 二阶差分平滑 + 嵌套CV）
# -----------------------------
def build_lagged_design(env, lags_fr, voicing=None):
    T = len(env); X = np.zeros((T, len(lags_fr)), dtype=np.float64)
    for i,L in enumerate(lags_fr):
        if L>=0: X[L:, i] = env[:T-L]
        else:    X[:T+L, i] = env[-L:]
    if voicing is None:
        return X
    v = np.asarray(voicing, dtype=np.float64).reshape(-1,1)
    v = v[:T]
    return np.hstack([X[:T], v])

def diff2_mat(L):
    if L < 3: return np.zeros((L, L), dtype=np.float64)
    D = np.zeros((L-2, L), dtype=np.float64)
    for i in range(L-2):
        D[i, i]   = 1.0
        D[i, i+1] = -2.0
        D[i, i+2] = 1.0
    return D.T @ D

def trf_fit_ridge_smooth(X_tr, y_tr, alpha=10.0, beta=10.0, n_lag_cols=None, alpha_voicing=None):
    N, P = X_tr.shape
    L = int(n_lag_cols if n_lag_cols is not None else P-1)
    XtX = X_tr.T @ X_tr
    XtY = X_tr.T @ y_tr
    A = XtX + alpha * np.eye(P)
    if beta > 0 and L >= 3:
        D2 = diff2_mat(L)
        A[:L, :L] += beta * D2
    if alpha_voicing is not None and P > L:
        A[L, L] += (alpha_voicing - alpha)
    w = np.linalg.solve(A, XtY)
    return w

def fit_standardizer(X):
    mu = X.mean(axis=0); sd = X.std(axis=0)
    sd[(~np.isfinite(sd)) | (sd==0)] = 1.0
    return mu, sd

def apply_standardizer(X, mu, sd):
    return (X - mu) / sd

def select_roi_channels_for_subject(eeg_segments, sound_segments, sound_repaired, train_idx, top_k=3, frame_ms=11.0, max_lag_ms=200, voicing_cols=None):
    max_lag = int(round(max_lag_ms/frame_ms))
    D = eeg_segments[train_idx[0]].shape[1]
    per_ch_rs = [[] for _ in range(D)]
    for i in train_idx:
        E = eeg_segments[i]; S0 = sound_segments[i]; Sr = sound_repaired[i]
        vm = voiced_mask_from_original(S0, voicing_cols or [], 40)
        env = causal_ma(envelope_from_matrix(Sr), win=9)
        yenv = (env - env.mean())/(env.std() if env.std()>0 else 1.0)
        for ch in range(D):
            y = (E[:,ch]-E[:,ch].mean())/(E[:,ch].std() if E[:,ch].std()>0 else 1.0)
            y = highpass_moving_average(y, 15)
            mlen = min(len(y), len(yenv), len(vm))
            lags, cs = normxcorr_1d(y[:mlen][vm[:mlen]], yenv[:mlen][vm[:mlen]], max_lag)
            if cs.size and np.any(np.isfinite(cs)):
                per_ch_rs[ch].append(float(np.nanmax(cs)))
    ch_scores = np.array([np.median(rs) if len(rs) else -np.inf for rs in per_ch_rs])
    roi = np.argsort(-ch_scores)[:top_k]
    return roi.tolist()

def eval_subject_trf(subject_id, eeg_segments, sound_segments, sound_repaired, subject_ids, voicing_cols, frame_ms=11.0):
    idx = [i for i, sid in enumerate(subject_ids) if sid == subject_id]
    n_segs = len(idx)
    if n_segs < 8:
        return {"subject": subject_id, "note": "too few segments"}
    groups = np.arange(n_segs)
    outer = GroupKFold(n_splits=min(5, n_segs))
    lags_ms = np.arange(0, 201, 11)
    lags_fr = (lags_ms / frame_ms).astype(int)
    L = len(lags_fr)
    alpha_grid = [1.0, 10.0, 100.0]
    beta_grid  = [0.0, 10.0, 100.0]
    seg_corrs, seg_corrs_null, peak_lags = [], [], []

    for tr_idx_rel, va_idx_rel in outer.split(np.zeros(n_segs), groups=groups):
        tr_idx = [idx[i] for i in tr_idx_rel]
        va_idx = [idx[i] for i in va_idx_rel]
        roi = select_roi_channels_for_subject(eeg_segments, sound_segments, sound_repaired, tr_idx,
                                              top_k=3, frame_ms=frame_ms, max_lag_ms=200, voicing_cols=voicing_cols)
        def build_dataset(seg_indices):
            X_list, y_list = [], []
            for k in seg_indices:
                E = eeg_segments[k]; S0 = sound_segments[k]; Sr = sound_repaired[k]
                env = causal_ma(envelope_from_matrix(Sr), win=9)
                vm  = voiced_mask_from_original(S0, voicing_cols or [], 40)
                Xseg = build_lagged_design(env, lags_fr, voicing=vm)
                ychs = []
                for ch in roi:
                    y = (E[:,ch]-E[:,ch].mean())/(E[:,ch].std() if E[:,ch].std()>0 else 1.0)
                    y = highpass_moving_average(y, 15)
                    ychs.append(y[:Xseg.shape[0]])
                yseg = np.mean(np.vstack(ychs), axis=0) if len(ychs) else np.zeros(Xseg.shape[0])
                T = min(Xseg.shape[0], len(yseg))
                X_list.append(Xseg[:T]); y_list.append(yseg[:T])
            X = np.vstack(X_list); y = np.concatenate(y_list)
            return X, y

        Xtr_raw, ytr_raw = build_dataset(tr_idx)
        Xva_raw_list, yva_raw_list = [], []
        for k in va_idx:
            Xk, yk = build_dataset([k])
            Xva_raw_list.append(Xk); yva_raw_list.append(yk)

        Xmu, Xsd = fit_standardizer(Xtr_raw); Xtr = apply_standardizer(Xtr_raw, Xmu, Xsd)
        ytr, ymu, ysd = zscore1d(ytr_raw)
        n = len(ytr); ntr = int(0.8*n)
        Xtr_i, ytr_i = Xtr[:ntr], ytr[:ntr]
        Xvl_i, yvl_i = Xtr[ntr:], ytr[ntr:]
        best_score, best_hp = -np.inf, (10.0, 10.0)
        for alpha in alpha_grid:
            for beta in beta_grid:
                w = trf_fit_ridge_smooth(Xtr_i, ytr_i, alpha=alpha, beta=beta, n_lag_cols=L, alpha_voicing=alpha)
                yhat = Xvl_i @ w
                r = np.corrcoef(yvl_i, yhat)[0,1] if yvl_i.std()>0 and yhat.std()>0 else -np.inf
                if np.isfinite(r) and r > best_score:
                    best_score, best_hp = r, (alpha, beta)
        alpha, beta = best_hp
        w = trf_fit_ridge_smooth(Xtr, ytr, alpha=alpha, beta=beta, n_lag_cols=L, alpha_voicing=alpha)
        lag_block = w[:L]
        pk = int(np.argmax(np.abs(lag_block))) if L else 0
        peak_lags.append(float(pk * 11.0))
        for Xk_raw, yk_raw in zip(Xva_raw_list, yva_raw_list):
            Xk = apply_standardizer(Xk_raw, Xmu, Xsd)
            yk = (yk_raw - ymu) / (ysd if ysd>0 else 1.0)
            yhat = Xk @ w
            rk = np.corrcoef(yk, yhat)[0,1] if yk.std()>0 and yhat.std()>0 else np.nan
            shift = max(1, int(0.33*len(yk)))
            yperm = np.roll(yk, shift)
            yhat0 = Xk @ w
            rk0 = np.corrcoef(yperm, yhat0)[0,1] if yperm.std()>0 and yhat0.std()>0 else np.nan
            seg_corrs.append(rk); seg_corrs_null.append(rk0)

    seg_corrs = np.array(seg_corrs, dtype=np.float64)
    seg_corrs_null = np.array(seg_corrs_null, dtype=np.float64)
    valid = np.isfinite(seg_corrs) & np.isfinite(seg_corrs_null)
    res = {
        "subject": subject_id,
        "n_segments_eval": int(valid.sum()),
        "median_pred_r": float(np.median(seg_corrs[valid])) if valid.any() else np.nan,
        "median_pred_r_null": float(np.median(seg_corrs_null[valid])) if valid.any() else np.nan,
        "frac_r_better_than_null": float(np.mean(seg_corrs[valid] > seg_corrs_null[valid])) if valid.any() else np.nan,
        "median_trf_peak_ms_(>=0)": float(np.median(peak_lags)) if len(peak_lags) else np.nan,
        "note": "ROI top-3 | lags 0..200ms | ridge + 2nd-diff smoothing"
    }
    return res

def run_trf_analysis_per_subject(eeg_segments, sound_segments, sound_repaired, subject_ids, *,
                                 voicing_cols, frame_ms=11.0):
    subjects = sorted(set(subject_ids))
    per_subject = [eval_subject_trf(sid, eeg_segments, sound_segments, sound_repaired, subject_ids, voicing_cols, frame_ms=frame_ms) for sid in tqdm(subjects, desc="TRF per subject")]
    vals = [r for r in per_subject if np.isfinite(r.get("median_pred_r", np.nan)) and np.isfinite(r.get("median_pred_r_null", np.nan))]
    if vals:
        med_r  = float(np.median([r["median_pred_r"] for r in vals]))
        med_r0 = float(np.median([r["median_pred_r_null"] for r in vals]))
        frac   = float(np.mean([r["median_pred_r"] > r["median_pred_r_null"] for r in vals]))
        med_pk = float(np.median([r["median_trf_peak_ms_(>=0)"] for r in vals if np.isfinite(r["median_trf_peak_ms_(>=0)"])])) if any(np.isfinite(r["median_trf_peak_ms_(>=0)"]) for r in vals) else np.nan
        overall = {"median_of_subject_medians": med_r, "median_of_subject_nulls": med_r0,
                   "frac_subjects_better_than_null": frac, "median_TRF_peak_ms_(>=0)": med_pk}
    else:
        overall = {"median_of_subject_medians": np.nan, "median_of_subject_nulls": np.nan,
                   "frac_subjects_better_than_null": np.nan, "median_TRF_peak_ms_(>=0)": np.nan}
    print("\n=== TRF（总体）==="); print(json.dumps(overall, ensure_ascii=False, indent=2))
    return {"per_subject": per_subject, "overall_summary": overall}


In [11]:
# -----------------------------
# 8) 总管线（可选落盘）
# -----------------------------
def analyze_eeg_sound_data(root_dir, frame_ms=11.0, max_lag_ms=300, null_shift_ratio=0.33, min_frames=20, drop_thr=0.6, save_dir=None):
    """
    返回字典：
      - noise_ceiling_pc1
      - lag_alignment {"pc1":..., "envelope":...}
      - channel_roi_summary
      - trf_results_per_subject
      - sound_nan_summary
      - sound_repair_stats
    """
    print("\n--- 开始数据分析管线 ---")
    eegs, snds, sids = load_segments_with_subject_ids(root_dir)
    eegs, snds, sids = filter_and_summarize(eegs, snds, sids, min_frames=min_frames)
    nan_inf_report(eegs, snds)
    percol_nan_mean = summarize_sound_nan(snds)
    if percol_nan_mean is None:
        Dmax = max(S.shape[1] for S in snds)
        percol_nan_mean = np.zeros(Dmax)
    snds_rep, drop_stats = batch_repair_sound(snds, drop_thr=drop_thr)
    r_eeg_blk, r_snd_blk, r_max_blk = noise_ceiling_from_pc1(eegs, snds_rep)
    rows_pc1, rows_env = compute_lag_stats(eegs, snds_rep, frame_ms=frame_ms, max_lag_ms=max_lag_ms)
    lag_alignment = {"pc1": _summarize_lag_rows(rows_pc1), "envelope": _summarize_lag_rows(rows_env)}
    voicing_cols = np.where(percol_nan_mean > 0.95)[0].tolist()
    ch_roi = compute_lag_stats_channelwise(eegs, snds, snds_rep, frame_ms=frame_ms, max_lag_ms=max_lag_ms,
                                           null_shift_ratio=null_shift_ratio, voicing_cols=voicing_cols)
    trf_res = run_trf_analysis_per_subject(eegs, snds, snds_rep, sids, voicing_cols=voicing_cols, frame_ms=frame_ms)
    per_segment_nan_rates = [float(nan_breakdown_per_segment(S)[0]) for S in snds]

    results = {
        "noise_ceiling_pc1": {"r_eeg_blk_overall_PC1": r_eeg_blk, "r_sound_blk_overall_PC1": r_snd_blk, "r_max_blk_overall_PC1": r_max_blk},
        "lag_alignment": lag_alignment,
        "channel_roi_summary": ch_roi,
        "trf_results_per_subject": trf_res,
        "sound_nan_summary": {"per_segment_rates": per_segment_nan_rates, "per_col_mean": percol_nan_mean.tolist()},
        "sound_repair_stats": drop_stats
    }

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        # JSON 汇总
        with open(os.path.join(save_dir, "summary.json"), "w") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        # 逐段 ROI
        with open(os.path.join(save_dir, "channel_roi_per_segment.csv"), "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=["peak_r","peak_lag_ms","peak_r_null","best_ch"])
            w.writeheader(); w.writerows(ch_roi["per_segment"])
        # 被试级 TRF
        with open(os.path.join(save_dir, "trf_per_subject.json"), "w") as f:
            json.dump(trf_res["per_subject"], f, ensure_ascii=False, indent=2)
        # 声学列修复统计
        with open(os.path.join(save_dir, "sound_repair_stats.csv"), "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=["D_in","D_kept"])
            w.writeheader(); w.writerows(drop_stats)
        # 段级NaN率
        with open(os.path.join(save_dir, "sound_nan_rate_per_segment.csv"), "w", newline="") as f:
            w = csv.writer(f); w.writerow(["nan_rate"]); w.writerows([[x] for x in per_segment_nan_rates])
        print(f"结果已保存到: {save_dir}")

    print("--- 数据分析管线完成 ---\n")
    return results

In [12]:

root_dir = "/content/drive/MyDrive/data"
res = analyze_eeg_sound_data(root_dir, save_dir="/content/eeg_sound_results")
print(json.dumps({
    "NoiseCeiling": res["noise_ceiling_pc1"],
    "Lag-PC1": res["lag_alignment"]["pc1"],
    "Lag-Env": res["lag_alignment"]["envelope"],
    "ROI-Overall": res["channel_roi_summary"]["overall_summary"],
    "TRF-Overall": res["trf_results_per_subject"]["overall_summary"]
}, ensure_ascii=False, indent=2))


--- 开始数据分析管线 ---
开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者: 100%|██████████| 20/20 [01:26<00:00,  4.32s/it]



加载完成！总共加载了 1794 个数据段。
过滤结果：总 1794 段 -> 保留 1794 段 | 空T: 0, 空D: 0, 过短(<20帧): 0
EEG: 段数=1794, 含NaN段=0 (0.0%), 含Inf段=0 (0.0%)
Sound: 段数=1794, 含NaN段=1794 (100.0%), 含Inf段=0 (0.0%)

=== 声音特征 NaN 概览 ===
- 段级：含NaN的段 = 1794/1794 (100.0%); 中位数NaN率 = 16.67%
- 列级（跨段平均）前10个最差列及其NaN率：
  列52: 平均NaN率=99.69%
  列51: 平均NaN率=99.22%
  列50: 平均NaN率=99.22%
  列48: 平均NaN率=99.22%
  列45: 平均NaN率=99.22%
  列46: 平均NaN率=99.22%
  列47: 平均NaN率=99.22%
  列49: 平均NaN率=99.22%
  列53: 平均NaN率=99.22%
  列0: 平均NaN率=0.00%


修复声音特征: 100%|██████████| 1794/1794 [00:03<00:00, 500.98it/s]


修复完成：平均保留列数 = 45.062430323299886  / 平均原始列数 = 54.0

=== 基于 PC1 的分块噪声上限 ===
{
  "r_eeg_blk_overall_PC1": 0.6935338185221477,
  "r_sound_blk_overall_PC1": 0.27392708407007144,
  "r_max_blk_overall_PC1": 0.4358643098623171
}


Lag PC1 / Envelope: 100%|██████████| 1794/1794 [01:50<00:00, 16.27it/s]



=== 时延（PC1↔PC1）汇总 ===
{
  "median_peak_r": 0.1365618066091414,
  "median_peak_lag_ms": 55.0,
  "median_peak_r_null": 0.14892065585890285,
  "frac_r_gt_null": 0.4793756967670011
}

=== 时延（PC1↔Envelope）汇总 ===
{
  "median_peak_r": 0.12626358925003917,
  "median_peak_lag_ms": 44.0,
  "median_peak_r_null": 0.15050690789462307,
  "frac_r_gt_null": 0.463768115942029
}


Channelwise ROI: 100%|██████████| 1794/1794 [16:11<00:00,  1.85it/s]



=== 通道级 ROI 汇总 ===
{
  "median_peak_r": 0.21005677445726784,
  "median_peak_lag_ms": 11.0,
  "median_peak_r_null": 0.13534070860063452,
  "frac_r_gt_null": 0.8876146788990825,
  "top_channels": [
    [
      10,
      147
    ],
    [
      1,
      126
    ],
    [
      28,
      77
    ],
    [
      23,
      75
    ],
    [
      4,
      74
    ],
    [
      8,
      74
    ],
    [
      5,
      73
    ],
    [
      11,
      70
    ],
    [
      17,
      67
    ],
    [
      14,
      65
    ]
  ]
}


TRF per subject: 100%|██████████| 15/15 [21:33<00:00, 86.25s/it]



=== TRF（总体）===
{
  "median_of_subject_medians": 0.012809939369599645,
  "median_of_subject_nulls": -0.003862342276028379,
  "frac_subjects_better_than_null": 0.8,
  "median_TRF_peak_ms_(>=0)": 121.0
}
结果已保存到: /content/eeg_sound_results
--- 数据分析管线完成 ---

{
  "NoiseCeiling": {
    "r_eeg_blk_overall_PC1": 0.6935338185221477,
    "r_sound_blk_overall_PC1": 0.27392708407007144,
    "r_max_blk_overall_PC1": 0.4358643098623171
  },
  "Lag-PC1": {
    "median_peak_r": 0.1365618066091414,
    "median_peak_lag_ms": 55.0,
    "median_peak_r_null": 0.14892065585890285,
    "frac_r_gt_null": 0.4793756967670011
  },
  "Lag-Env": {
    "median_peak_r": 0.12626358925003917,
    "median_peak_lag_ms": 44.0,
    "median_peak_r_null": 0.15050690789462307,
    "frac_r_gt_null": 0.463768115942029
  },
  "ROI-Overall": {
    "median_peak_r": 0.21005677445726784,
    "median_peak_lag_ms": 11.0,
    "median_peak_r_null": 0.13534070860063452,
    "frac_r_gt_null": 0.8876146788990825,
    "top_channels": [
   

In [13]:
# =======================
# 上游数据与对齐（Colab一键版）
# 目标：在 root/Subject/feature_causal/ 下生成新的 Sound_aligned.npy + Sound_voicing.npy
# 关键：全程“因果/左对齐”，避免零相位；确保与 EEG 的帧数 T 一致
# 依赖：librosa, soundfile, scipy（Colab自带scipy，librosa需要安装）
# =======================
# !pip -q install librosa soundfile

import os, glob, json, math, shutil
import numpy as np
import soundfile as sf
import librosa
from scipy.signal import lfilter
from tqdm import tqdm

# ---------- 基础参数（可按需修改） ----------
FRAME_MS = 11.0         # 与现有管线一致
SR_TARGET = 16000       # 若音频采样率不一致，将重采样至此
N_MELS = 40
N_FFT_MS = 25.0         # STFT窗长（毫秒），与RMS窗一致
PREEMPH = 0.97          # 因果预加重系数
ADD_DELTAS = False      # 如需加一阶delta，可改 True
VOICING_PCT = 60        # voicing门限的RMS分位点（段内百分位）
GLOBAL_OFFSET_MS = 0.0  # 可选：若已知音频需整体延后/提前，对音频作整体移位（+为延后）

# ---------- 工具 ----------
def next_pow2(n):
    p = 1
    while p < n:
        p <<= 1
    return p

def ensure_dir(p):
    os.makedirs(p, exist_ok=True)

def causal_shift_signal(y, sr, shift_ms):
    """对音频做整体因果移位：shift_ms>0 => 向后推迟（前面补零）"""
    if abs(shift_ms) < 1e-6:
        return y
    shift_samp = int(round(sr * shift_ms / 1000.0))
    if shift_samp > 0:
        return np.concatenate([np.zeros(shift_samp, dtype=y.dtype), y])
    else:
        return y[-shift_samp:]  # 负移位：裁掉前面

def find_audio_for_segment(subject_root, base_name):
    """
    在被试目录内尝试找到与 base_name 匹配的音频文件。
    优先顺序：与特征同目录 / audio/ / wav/ / 整个subject内递归查找。
    """
    cand_names = [
        base_name.strip(".") + ".wav",
        base_name.strip(".") + ".flac",
        base_name.strip(".") + ".mp3",
    ]
    # 常见位置
    for sub in ["feature_normalized", ".", "audio", "wav", "Audio", "WAV"]:
        d = os.path.join(subject_root, sub)
        for cn in cand_names:
            p = os.path.join(d, cn)
            if os.path.exists(p):
                return p
    # 递归兜底
    for ext in ("*.wav","*.flac","*.mp3"):
        hits = glob.glob(os.path.join(subject_root, "**", ext), recursive=True)
        hits = [h for h in hits if os.path.basename(h).startswith(os.path.basename(base_name).strip("."))]
        if hits:
            return hits[0]
    return None

def causal_features_from_audio(wav_path, T_target, frame_ms=FRAME_MS, sr_target=SR_TARGET,
                               n_mels=N_MELS, n_fft_ms=N_FFT_MS, preemph=PREEMPH, add_deltas=ADD_DELTAS,
                               voicing_pct=VOICING_PCT, global_offset_ms=GLOBAL_OFFSET_MS):
    """
    从音频构造因果特征矩阵（T×D）与voicing掩码（T,）：
    - 左对齐帧: hop=frame_ms, center=False
    - 预加重采用因果lfilter
    - log-mel + RMS（附在最后一列），无NaN；可选 delta
    """
    y, sr = sf.read(wav_path, always_2d=False)
    if y.ndim > 1:
        y = np.mean(y, axis=1)  # to mono
    if sr != sr_target:
        y = librosa.resample(y.astype(np.float32), orig_sr=sr, target_sr=sr_target, res_type="kaiser_fast")
        sr = sr_target

    # 全局时间移位（若需要整体对齐补偿）
    y = causal_shift_signal(y, sr, global_offset_ms).astype(np.float32)

    # 因果预加重（y[n] - a*y[n-1]）
    y = lfilter([1.0, -preemph], [1.0], y)

    hop = int(round(sr * frame_ms / 1000.0))
    win = int(round(sr * n_fft_ms / 1000.0))
    n_fft = next_pow2(win)

    # STFT 特征（左对齐、因果窗；不使用center=True）
    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop, win_length=win,
        center=False, power=2.0
    )  # (n_mels, T0)
    S_db = librosa.power_to_db(S + 1e-12, ref=np.max)  # 对数能量（不会有NaN）
    # RMS 包络（同样 left-aligned、center=False）
    rms = librosa.feature.rms(y=y, frame_length=win, hop_length=hop, center=False)[0]  # (T0,)

    # 可选：Δ
    feats = [S_db.T]  # (T0, n_mels)
    if add_deltas:
        d1 = librosa.feature.delta(S_db, order=1).T
        feats.append(d1)
    feats.append(rms[:, None])  # 附加 RMS 一列
    X = np.concatenate(feats, axis=1).astype(np.float32)  # (T0, D)

    # 因果/左对齐到 EEG 帧数：严格截断或零填充以匹配 T_target
    T0 = X.shape[0]
    if T0 < T_target:
        pad = np.zeros((T_target - T0, X.shape[1]), dtype=np.float32)
        X = np.vstack([X, pad])
        rms2 = np.concatenate([rms, np.zeros(T_target - T0, dtype=np.float32)])
    else:
        X = X[:T_target]
        rms2 = rms[:T_target]

    # 段内分位点阈值做 voicing 掩码（因果阈值本身不引入相位）
    finite_rms = rms2[np.isfinite(rms2)]
    thr = np.percentile(finite_rms, voicing_pct) if finite_rms.size else 0.0
    voicing = (rms2 > thr)

    # 时间向 z-score（逐列），避免尺度漂移；RMS 列也 z 但不影响voicing
    mu = np.nanmean(X, axis=0, keepdims=True); sd = np.nanstd(X, axis=0, keepdims=True); sd[sd==0] = 1.0
    Xz = (X - mu) / sd
    Xz[~np.isfinite(Xz)] = 0.0

    return Xz.astype(np.float32), voicing.astype(np.bool_)

def causal_features_from_existing_matrix(S_old, T_target, frame_ms=FRAME_MS, n_mels=0,
                                         voicing_cols_guess=None, voicing_pct=VOICING_PCT):
    """
    兜底：若没有原始音频，用现有 Sound_aligned.npy 自救
    - 丢弃高NaN列、插值、z-score（与清洗一致）
    - 生成L2包络+RMS代理（从矩阵行范数），拼在最后
    - voicing：优先用高NaN列构造掩码，否则用能量分位点
    """
    S = np.array(S_old, dtype=np.float32, copy=True)
    T, D = S.shape
    # 丢弃 >60% NaN 的列
    nan_rate = np.mean(~np.isfinite(S), axis=0)
    keep = nan_rate <= 0.6
    if keep.sum() == 0:
        X = np.zeros((T_target, 1), dtype=np.float32)
    else:
        S2 = S[:, keep]
        # 时间向插值 + 端点外推
        for d in range(S2.shape[1]):
            col = S2[:, d]
            m = np.isfinite(col)
            if m.sum() == 0:
                S2[:, d] = 0.0
            else:
                idx = np.arange(T)
                S2[~m, d] = np.interp(idx[~m], idx[m], col[m])
        # z
        mu = S2.mean(0, keepdims=True); sd = S2.std(0, keepdims=True); sd[sd==0]=1.0
        S2 = (S2 - mu)/sd
        # L2 包络
        env = np.sqrt((S2**2).sum(axis=1, keepdims=True))
        X = np.hstack([S2, env]).astype(np.float32)

    # 匹配 T
    if T < T_target:
        pad = np.zeros((T_target-T, X.shape[1]), dtype=np.float32)
        X = np.vstack([X, pad])
        env1 = np.concatenate([X[:T, -1], np.zeros(T_target-T, dtype=np.float32)])
    else:
        X = X[:T_target]
        env1 = X[:T_target, -1]

    # voicing
    if voicing_cols_guess is not None and len(voicing_cols_guess) > 0:
        m = np.any(np.isfinite(S[:, [c for c in voicing_cols_guess if c < D]]), axis=1)
        if T < T_target:
            m = np.concatenate([m, np.zeros(T_target-T, dtype=bool)])
        else:
            m = m[:T_target]
        voicing = m
    else:
        thr = np.percentile(env1[np.isfinite(env1)], voicing_pct) if np.any(np.isfinite(env1)) else 0.0
        voicing = env1 > thr

    return X.astype(np.float32), voicing.astype(np.bool_)

def build_upstream_for_all(root_dir,
                           old_root_suffix="feature_normalized",
                           new_root_suffix="feature_causal",
                           frame_ms=FRAME_MS, sr_target=SR_TARGET,
                           n_mels=N_MELS, n_fft_ms=N_FFT_MS,
                           preemph=PREEMPH, add_deltas=ADD_DELTAS,
                           voicing_pct=VOICING_PCT, global_offset_ms=GLOBAL_OFFSET_MS):
    """
    对所有被试、所有段：
    - 读取 EEG_aligned.npy（获得 T）
    - 尝试匹配音频并生成因果特征；若找不到音频，使用旧矩阵自救
    - 保存到 Subject/new_root_suffix/ 同名文件：
        * Sound_aligned.npy  （新的因果特征）
        * Sound_voicing.npy  （布尔掩码）
        * EEG_aligned.npy    （从旧目录复制，确保载入兼容）
        * meta.json          （记录参数）
    """
    subjects = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]
    total_done = 0; total_audio = 0; total_fallback = 0

    for subj in tqdm(subjects, desc="Subjects"):
        subj_root = os.path.join(root_dir, subj)
        old_dir = os.path.join(subj_root, old_root_suffix)
        new_dir = os.path.join(subj_root, new_root_suffix)
        if not os.path.isdir(old_dir):
            continue
        ensure_dir(new_dir)

        eeg_files = glob.glob(os.path.join(old_dir, "*EEG_aligned.npy"))
        for eeg_path in eeg_files:
            base = os.path.basename(eeg_path).replace("EEG_aligned.npy", "")
            sound_old_path = os.path.join(old_dir, base + "Sound_aligned.npy")
            # 载入 EEG 以获知 T
            try:
                E = np.load(eeg_path)
                T_target = E.shape[0]
            except Exception as e:
                print("EEG载入失败，跳过：", eeg_path, e); continue

            # 搜索音频
            wav_path = find_audio_for_segment(subj_root, base)
            # 生成新特征
            try:
                if wav_path and os.path.exists(wav_path):
                    X, vm = causal_features_from_audio(
                        wav_path, T_target, frame_ms, sr_target,
                        n_mels, n_fft_ms, preemph, add_deltas,
                        voicing_pct, global_offset_ms
                    )
                    total_audio += 1
                else:
                    # 若无音频，则用旧矩阵兜底
                    if os.path.exists(sound_old_path):
                        S_old = np.load(sound_old_path)
                    else:
                        # 若连旧矩阵也不存在，就造一个全零占位
                        S_old = np.zeros((T_target, N_MELS), dtype=np.float32)
                    # 猜 voicing 列（极高NaN率）
                    if os.path.exists(sound_old_path):
                        nan_rate = np.mean(~np.isfinite(S_old), axis=0)
                        voicing_cols_guess = np.where(nan_rate > 0.95)[0]
                    else:
                        voicing_cols_guess = None
                    X, vm = causal_features_from_existing_matrix(S_old, T_target, frame_ms, voicing_cols_guess=voicing_cols_guess)
                    total_fallback += 1
            except Exception as e:
                print("特征构造失败，跳过：", base, e); continue

            # 保存：Sound, voicing, EEG（复制）
            np.save(os.path.join(new_dir, base + "Sound_aligned.npy"), X.astype(np.float32))
            np.save(os.path.join(new_dir, base + "Sound_voicing.npy"), vm.astype(np.bool_))
            # 复制 EEG 对齐文件，保持加载兼容性
            eeg_new_path = os.path.join(new_dir, base + "EEG_aligned.npy")
            if not os.path.exists(eeg_new_path):
                try:
                    shutil.copy2(eeg_path, eeg_new_path)
                except Exception as e:
                    print("复制EEG失败：", eeg_path, "->", eeg_new_path, e)

            # 元信息
            meta = dict(
                sr_target=sr_target, frame_ms=frame_ms, n_fft_ms=n_fft_ms, n_mels=n_mels,
                preemph=preemph, add_deltas=add_deltas, voicing_pct=voicing_pct,
                global_offset_ms=global_offset_ms, used_audio=bool(wav_path and os.path.exists(wav_path)),
                wav_path=wav_path if wav_path else None
            )
            with open(os.path.join(new_dir, base + "meta.json"), "w") as f:
                json.dump(meta, f, ensure_ascii=False, indent=2)

            total_done += 1

    print(f"\n完成上游构建：总段数={total_done} | 使用音频构建={total_audio} | 兜底自救={total_fallback}")
    print("已输出至各被试的", new_root_suffix, "目录。")

# ---------- 与现有分析管线对接 ----------
def analyze_with_causal_features(root_dir):
    """
    用新的 feature_causal 目录跑你已有的分析流程（无需改动核心代码）：
    - 直接用 load_segments_with_subject_ids(root_suffix='feature_causal') 即可
    """
    # 复用你现有的函数（确保之前的分析函数已在环境里）
    eeg_segments, sound_segments, subject_ids = load_segments_with_subject_ids(root_dir, root_suffix="feature_causal")
    # 后续就按你现有管线跑：
    # 例如：eeg_segments, sound_segments, subject_ids = filter_and_summarize(...)
    #       percol_nan_mean = summarize_sound_nan(sound_segments)
    #       sound_repaired, drop_stats = batch_repair_sound(sound_segments, drop_thr=0.6)
    #       ... 或直接调用你已装配的 analyze_eeg_sound_data 替身版本
    return eeg_segments, sound_segments, subject_ids




In [14]:
# ========== 使用示例 ==========
# 1) 指定数据根目录
root_dir = "/content/drive/MyDrive/data"

# 2) 生成新的“因果对齐”的声学特征
build_upstream_for_all(root_dir,
                       old_root_suffix="feature_normalized",
                       new_root_suffix="feature_causal",
                       frame_ms=FRAME_MS, sr_target=SR_TARGET,
                       n_mels=N_MELS, n_fft_ms=N_FFT_MS,
                       preemph=PREEMPH, add_deltas=ADD_DELTAS,
                       voicing_pct=VOICING_PCT, global_offset_ms=GLOBAL_OFFSET_MS)

# 3) 用新的特征目录跑现有分析（把 root_suffix 改为 'feature_causal'）
eeg_segments, sound_segments, subject_ids = analyze_with_causal_features(root_dir)

# 或者如果你使用我给的整合管线 analyze_eeg_sound_data，可写一个轻量封装版本：
results = analyze_eeg_sound_data(root_dir, save_dir="/content/eeg_sound_results_causal")

Subjects: 100%|██████████| 20/20 [28:47<00:00, 86.38s/it] 



完成上游构建：总段数=1794 | 使用音频构建=0 | 兜底自救=1794
已输出至各被试的 feature_causal 目录。
开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者: 100%|██████████| 20/20 [01:38<00:00,  4.93s/it]



加载完成！总共加载了 1794 个数据段。

--- 开始数据分析管线 ---
开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者: 100%|██████████| 20/20 [00:27<00:00,  1.39s/it]



加载完成！总共加载了 1794 个数据段。
过滤结果：总 1794 段 -> 保留 1794 段 | 空T: 0, 空D: 0, 过短(<20帧): 0
EEG: 段数=1794, 含NaN段=0 (0.0%), 含Inf段=0 (0.0%)
Sound: 段数=1794, 含NaN段=1794 (100.0%), 含Inf段=0 (0.0%)

=== 声音特征 NaN 概览 ===
- 段级：含NaN的段 = 1794/1794 (100.0%); 中位数NaN率 = 16.67%
- 列级（跨段平均）前10个最差列及其NaN率：
  列52: 平均NaN率=99.69%
  列51: 平均NaN率=99.22%
  列50: 平均NaN率=99.22%
  列48: 平均NaN率=99.22%
  列45: 平均NaN率=99.22%
  列46: 平均NaN率=99.22%
  列47: 平均NaN率=99.22%
  列49: 平均NaN率=99.22%
  列53: 平均NaN率=99.22%
  列0: 平均NaN率=0.00%


修复声音特征: 100%|██████████| 1794/1794 [00:03<00:00, 486.27it/s]


修复完成：平均保留列数 = 45.062430323299886  / 平均原始列数 = 54.0

=== 基于 PC1 的分块噪声上限 ===
{
  "r_eeg_blk_overall_PC1": 0.6935338185221477,
  "r_sound_blk_overall_PC1": 0.27392708407007144,
  "r_max_blk_overall_PC1": 0.4358643098623171
}


Lag PC1 / Envelope: 100%|██████████| 1794/1794 [01:45<00:00, 17.06it/s]



=== 时延（PC1↔PC1）汇总 ===
{
  "median_peak_r": 0.1365618066091414,
  "median_peak_lag_ms": 55.0,
  "median_peak_r_null": 0.14892065585890285,
  "frac_r_gt_null": 0.4793756967670011
}

=== 时延（PC1↔Envelope）汇总 ===
{
  "median_peak_r": 0.12626358925003917,
  "median_peak_lag_ms": 44.0,
  "median_peak_r_null": 0.15050690789462307,
  "frac_r_gt_null": 0.463768115942029
}


Channelwise ROI: 100%|██████████| 1794/1794 [15:58<00:00,  1.87it/s]



=== 通道级 ROI 汇总 ===
{
  "median_peak_r": 0.21005677445726784,
  "median_peak_lag_ms": 11.0,
  "median_peak_r_null": 0.13534070860063452,
  "frac_r_gt_null": 0.8876146788990825,
  "top_channels": [
    [
      10,
      147
    ],
    [
      1,
      126
    ],
    [
      28,
      77
    ],
    [
      23,
      75
    ],
    [
      4,
      74
    ],
    [
      8,
      74
    ],
    [
      5,
      73
    ],
    [
      11,
      70
    ],
    [
      17,
      67
    ],
    [
      14,
      65
    ]
  ]
}


TRF per subject: 100%|██████████| 15/15 [21:47<00:00, 87.19s/it]



=== TRF（总体）===
{
  "median_of_subject_medians": 0.012809939369599645,
  "median_of_subject_nulls": -0.003862342276028379,
  "frac_subjects_better_than_null": 0.8,
  "median_TRF_peak_ms_(>=0)": 121.0
}
结果已保存到: /content/eeg_sound_results_causal
--- 数据分析管线完成 ---



In [15]:
!pip -q install resampy==0.4.3


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/3.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m2.7/3.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
# ============================================
# 根据 CSV(wav_filename_base) + 原音频(Stimuli) 生成新声学特征
# 左对齐/因果预处理 + 与 EEG 段对齐（通过与旧特征包络互相关来定位）
# 输出到: <subject>/feature_from_wav_causal/
# 依赖: librosa, soundfile
# ============================================
!pip -q install librosa soundfile

import os, glob, json, math, shutil
import numpy as np
import pandas as pd
import librosa, soundfile as sf
from scipy.signal import lfilter
from tqdm import tqdm
import math
import numpy as np
import librosa, soundfile as sf
from scipy.signal import lfilter, resample_poly

# ---------- 基本参数 ----------
root_dir = "/content/drive/MyDrive/data"
stimuli_dir = "/content/drive/MyDrive/data/Stimuli/Stimuli"

FRAME_MS   = 11.0   # 每帧时长，与现有管线一致
SR_TARGET  = 16000  # 统一采样率
N_MELS     = 40
WIN_MS     = 25.0   # STFT窗长
PREEMPH    = 0.97   # 因果预加重系数
ADD_DELTAS = False  # 是否添加delta
VOICING_PCT = 60    # 段内RMS分位点做有声阈值
NEW_SUFFIX = "feature_from_wav_causal"  # 新输出目录名
OLD_SUFFIX = "feature_normalized"       # 旧特征目录，用于对齐参照

# ---------- 工具 ----------
def ensure_dir(p): os.makedirs(p, exist_ok=True)

def next_pow2(n):
    p = 1
    while p < n: p <<= 1
    return p

def load_subject_csv(subject_dir):
    """寻找并读取 artifact/*.csv；返回DataFrame（如果没找到返回None）。"""
    cands = glob.glob(os.path.join(subject_dir, "artifact", "*.csv"))
    if not cands:
        return None
    # 如果有多个，取名字最“像”的（也可以直接取第一个）
    cands.sort()
    return pd.read_csv(cands[0])

def build_stimuli_index(stim_dir):
    """建立 {basename(不含扩展): 完整路径} 的索引。"""
    idx = {}
    for p in glob.glob(os.path.join(stim_dir, "*.wav")):
        base = os.path.splitext(os.path.basename(p))[0]
        idx[base] = p
    return idx


def _resample_with_scipy(y, orig_sr, target_sr):
    if orig_sr == target_sr:
        return y.astype(np.float32, copy=False)
    g = math.gcd(int(orig_sr), int(target_sr))
    up, down = int(target_sr//g), int(orig_sr//g)
    return resample_poly(y.astype(np.float32, copy=False), up, down).astype(np.float32)

def causal_audio_features(wav_path, sr_target=16000, frame_ms=11.0, win_ms=25.0,
                          n_mels=40, preemph=0.97, add_deltas=False):
    # 读 & 重采样（不用 resampy）
    y, sr = sf.read(wav_path, always_2d=False)
    if y.ndim > 1:
        y = np.mean(y, axis=1)
    y = _resample_with_scipy(y, sr, sr_target)
    sr = sr_target

    # 因果预加重
    y = lfilter([1.0, -preemph], [1.0], y).astype(np.float32)

    hop = int(round(sr * frame_ms / 1000.0))
    win = int(round(sr * win_ms  / 1000.0))

    # 关键：n_fft 与 win 完全一致，center=False
    n_fft = win

    S = librosa.feature.melspectrogram(
        y=y, sr=sr, n_mels=n_mels, n_fft=n_fft, hop_length=hop, win_length=win,
        center=False, power=2.0
    )  # (n_mels, Tm)
    S_db = librosa.power_to_db(S + 1e-12, ref=np.max)

    rms = librosa.feature.rms(y=y, frame_length=win, hop_length=hop, center=False)[0]  # (Tr,)

    # 安全对齐：取共同最小帧数
    T0 = min(S_db.shape[1], rms.shape[0])
    if T0 <= 0:
        # 兜底：返回空特征
        return np.zeros((0, n_mels + (n_mels if add_deltas else 0) + 1), dtype=np.float32), \
               np.zeros((0,), dtype=np.float32), \
               np.zeros((0,), dtype=np.float32)
    S_db = S_db[:, :T0]
    rms  = rms[:T0]

    feats = [S_db.T]  # (T0, n_mels)
    if add_deltas:
        d1 = librosa.feature.delta(S_db, order=1).T  # 与 S_db 同 T0
        feats.append(d1)
    feats.append(rms[:, None])  # (T0, 1)
    X = np.concatenate(feats, axis=1).astype(np.float32)

    # 列 z-score
    mu = X.mean(axis=0, keepdims=True)
    sd = X.std(axis=0, keepdims=True); sd[sd==0] = 1.0
    Xz = (X - mu) / sd

    # mel-L2 包络（标准化，用于对齐/质检）
    mel = Xz[:, :n_mels]
    env = np.sqrt((mel**2).sum(axis=1)).astype(np.float32)
    env = (env - env.mean()) / (env.std() if env.std()>0 else 1.0)

    return Xz.astype(np.float32), env.astype(np.float32), rms.astype(np.float32)

print("✅ Patched causal_audio_features: n_fft == win, frames aligned.")

# 2) 保险：若其他代码调用了 librosa.resample，则把它指向 SciPy 版本
def _librosa_resample_wrapper(y, orig_sr, target_sr, *args, **kwargs):
    return _resample_with_scipy(np.asarray(y), orig_sr, target_sr)

try:
    librosa.resample = _librosa_resample_wrapper
except Exception:
    pass
def envelope_from_old_matrix(S_old):
    """从旧的 Sound_aligned.npy 构造一个稳健包络（丢高NaN列、时间插值）。"""
    S = np.array(S_old, dtype=np.float32, copy=True)
    T, D = S.shape
    nan_rate = np.mean(~np.isfinite(S), axis=0)
    keep = nan_rate <= 0.6
    if keep.sum() == 0:
        return np.zeros(T, dtype=np.float32)
    S2 = S[:, keep]
    # 时间向插值
    idx = np.arange(T)
    for d in range(S2.shape[1]):
        col = S2[:, d]
        m = np.isfinite(col)
        if m.sum() == 0: S2[:, d] = 0.0
        else: S2[~m, d] = np.interp(idx[~m], idx[m], col[m])
    # 列z
    mu = S2.mean(axis=0, keepdims=True)
    sd = S2.std(axis=0, keepdims=True); sd[sd==0]=1.0
    Z = (S2 - mu)/sd
    env = np.sqrt((Z**2).sum(axis=1))
    return (env - env.mean())/(env.std() if env.std()>0 else 1.0)

def crosscorr_align_offset(ref_env, full_env, max_search=None):
    """
    在 full_env 里寻找与 ref_env（长度T_ref）最匹配的起点。
    返回起点索引 start（使得 full_env[start:start+T_ref] 与 ref_env 的相关最大）。
    如果 max_search 设定，将搜索窗口限制在 [0, max_search]。
    """
    T_ref = len(ref_env); T_full = len(full_env)
    if T_full < T_ref:
        return 0
    # 为避免O(T^2)极慢，做简单而稳健的“窗口滑动 z-score 相关”
    # 这里为了简洁，直接粗略子采样搜索步长 = 1（帧），一般音频帧数也不至于太大
    # 可选：如果非常长，可设 stride>1，再在邻域细化（这里先用 stride=1）
    if max_search is None or max_search > T_full - T_ref:
        max_search = T_full - T_ref
    best_r, best_s = -np.inf, 0
    ref = (ref_env - np.mean(ref_env)) / (np.std(ref_env) if np.std(ref_env)>0 else 1.0)
    for s in range(0, max_search+1):
        seg = full_env[s:s+T_ref]
        if seg.std() == 0: continue
        r = np.corrcoef(ref, (seg - seg.mean())/seg.std())[0,1]
        if np.isfinite(r) and r > best_r:
            best_r, best_s = r, s
    return best_s

def segment_from_full(X_full, start, T_target):
    """从整段特征 X_full 中截取 [start, start+T_target)，不足补零。"""
    T_full, D = X_full.shape
    if start < 0: start = 0
    end = start + T_target
    if start >= T_full:
        return np.zeros((T_target, D), dtype=X_full.dtype)
    if end <= T_full:
        return X_full[start:end].copy()
    # 右侧不足补零
    pad = np.zeros((end - T_full, D), dtype=X_full.dtype)
    return np.vstack([X_full[start:T_full], pad])

def voicing_from_rms(rms_seg, pct=VOICING_PCT):
    """用段内 RMS 的分位点阈值得到有声掩码。"""
    x = rms_seg[np.isfinite(rms_seg)]
    thr = np.percentile(x, pct) if x.size else 0.0
    return (rms_seg > thr)

# ---------- 主流程 ----------
def build_features_from_table_and_audio(root_dir, stimuli_dir,
                                        old_suffix=OLD_SUFFIX, new_suffix=NEW_SUFFIX):
    stim_index = build_stimuli_index(stimuli_dir)
    subjects = [s for s in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, s))]
    cache = {}  # 缓存每个wav的整段特征，避免重复计算：{wav_base: (X_full, env_full, rms_full)}
    total = 0

    for subj in tqdm(subjects, desc="Subjects"):
        subj_dir = os.path.join(root_dir, subj)
        old_dir  = os.path.join(subj_dir, old_suffix)
        if not os.path.isdir(old_dir):
            continue
        new_dir  = os.path.join(subj_dir, new_suffix)
        ensure_dir(new_dir)

        # 读CSV，收集可能的 wav basenames
        df = load_subject_csv(subj_dir)
        wav_bases = []
        if df is not None and ("WAV_Filename_Base" in df.columns):
            wav_bases = sorted(set(os.path.splitext(os.path.basename(str(x)))[0]
                       for x in df["WAV_Filename_Base"].dropna().tolist()))

        else:
            print(f"[警告] {subj} 未找到 CSV 或缺列 'wav_filename_base'，将尝试使用所有 Stimuli 下的 wav（不推荐）。")
            wav_bases = list(stim_index.keys())

        # 将可用的 wav 预加载/缓存
        for wb in wav_bases:
            if wb in cache:
                continue
            wav_path = stim_index.get(wb)
            if not wav_path or not os.path.exists(wav_path):
                print(f"[警告] Stimuli 中找不到 {wb}.wav，跳过缓存预建。")
                continue
            cache[wb] = causal_audio_features(wav_path)

        # 遍历该被试的所有段
        eeg_files = glob.glob(os.path.join(old_dir, "*EEG_aligned.npy"))
        eeg_files.sort()
        for eeg_path in eeg_files:
            base = os.path.basename(eeg_path).replace("EEG_aligned.npy","")
            E = np.load(eeg_path)  # (T, C)
            T_target = E.shape[0]

            # 旧声音（用于对齐参照）
            old_sound_path = os.path.join(old_dir, base + "Sound_aligned.npy")
            if not os.path.exists(old_sound_path):
                # 没有旧矩阵就无法通过互相关定位，只能从第一帧起截取
                S_old = None
                ref_env = None
            else:
                S_old = np.load(old_sound_path)
                ref_env = envelope_from_old_matrix(S_old)

            # 选择音频：优先CSV里的唯一项；若多项，则挑与旧包络相关最高的那一个
            pick_wb, pick_start = None, 0
            best_r = -np.inf
            for wb in wav_bases:
                if wb not in cache:
                    wav_path = stim_index.get(wb)
                    if wav_path and os.path.exists(wav_path):
                        cache[wb] = causal_audio_features(wav_path)
                    else:
                        continue
                X_full, env_full, rms_full = cache[wb]
                # 定位起点
                if ref_env is not None and len(ref_env) > 10:
                    # 搜索上限：整段音频范围内都可以；可按需设定上限以加速
                    start = crosscorr_align_offset(ref_env, env_full, max_search=None)
                    # 计算对应窗口的相关，作为选择标准
                    seg = env_full[start:start+len(ref_env)]
                    if len(seg) == len(ref_env) and seg.std()>0 and np.std(ref_env)>0:
                        r = np.corrcoef((seg - seg.mean())/seg.std(),
                                        (ref_env - ref_env.mean())/ref_env.std())[0,1]
                    else:
                        r = -np.inf
                else:
                    # 没有参照包络，就从0开始
                    start, r = 0, 0.0
                if np.isfinite(r) and r > best_r:
                    best_r, pick_wb, pick_start = r, wb, start

            if pick_wb is None:
                # 仍然找不到可用音频；用零矩阵占位
                print(f"[警告] {subj}/{base}: 未找到匹配音频，输出零特征。")
                X_seg = np.zeros((T_target, N_MELS + (N_MELS if ADD_DELTAS else 0) + 1), dtype=np.float32)
                rms_seg = np.zeros(T_target, dtype=np.float32)
            else:
                X_full, env_full, rms_full = cache[pick_wb]
                X_seg = segment_from_full(X_full, pick_start, T_target)
                rms_seg = segment_from_full(rms_full.reshape(-1,1), pick_start, T_target).ravel()

            # 有声掩码
            vm = voicing_from_rms(rms_seg, pct=VOICING_PCT)

            # 保存
            np.save(os.path.join(new_dir, base + "Sound_aligned.npy"), X_seg.astype(np.float32))
            np.save(os.path.join(new_dir, base + "Sound_voicing.npy"), vm.astype(np.bool_))

            # 复制 EEG 对齐文件，保持加载兼容性
            eeg_new_path = os.path.join(new_dir, base + "EEG_aligned.npy")
            if not os.path.exists(eeg_new_path):
                shutil.copy2(eeg_path, eeg_new_path)

            # 写 meta
            meta = dict(
                picked_wav_base=pick_wb,
                pick_start_frame=int(pick_start),
                frame_ms=FRAME_MS, sr_target=SR_TARGET,
                n_mels=N_MELS, win_ms=WIN_MS, preemph=PREEMPH,
                add_deltas=ADD_DELTAS, voicing_pct=VOICING_PCT,
                align_with="crosscorr(old_env, full_env)", align_r=float(best_r) if np.isfinite(best_r) else None
            )
            with open(os.path.join(new_dir, base + "meta.json"), "w") as f:
                json.dump(meta, f, ensure_ascii=False, indent=2)

            total += 1

    print(f"\n完成！共输出 {total} 段到各被试的 '{NEW_SUFFIX}' 目录。")

# 运行构建
build_features_from_table_and_audio(root_dir, stimuli_dir)

# （可选）随后把你的分析管线的 root_suffix 改为 'feature_from_wav_causal' 再跑一遍：


✅ Patched causal_audio_features: n_fft == win, frames aligned.


Subjects: 100%|██████████| 20/20 [48:32<00:00, 145.63s/it]


完成！共输出 1794 段到各被试的 'feature_from_wav_causal' 目录。





In [17]:
# === Patch: 让 analyze_eeg_sound_data 支持 root_suffix 并实际使用它 ===
def analyze_eeg_sound_data(root_dir, frame_ms=11.0, max_lag_ms=300, null_shift_ratio=0.33,
                           min_frames=20, drop_thr=0.6, save_dir=None, root_suffix='feature_from_wav_causal'):
    """
    分析 EEG 和 Sound 数据，计算噪声上限、时延对齐、通道级 ROI 和 TRF。
    新增参数：
        root_suffix (str): 指定从哪个子目录加载段（如 'feature_from_wav_causal' 或 'feature_causal' 或 'feature_normalized'）
    """
    import os, json, numpy as np
    print("\n--- 开始数据分析管线 ---")
    print(f"[INFO] 使用 root_suffix='{root_suffix}' 加载数据段")

    # 1) 加载（关键：使用 root_suffix）
    eegs, snds, sids = load_segments_with_subject_ids(root_dir, root_suffix=root_suffix)

    # 2) 过滤
    eegs, snds, sids = filter_and_summarize(eegs, snds, sids, min_frames=min_frames)

    # 报告 NaN/Inf（打印）
    nan_inf_report(eegs, snds)

    # 3) 声音特征 NaN 概览 & 修复
    percol_nan_mean = summarize_sound_nan(snds)
    if percol_nan_mean is None:
        Dmax = max(S.shape[1] for S in snds) if snds else 0
        percol_nan_mean = np.zeros(Dmax)
    snds_rep, drop_stats = batch_repair_sound(snds, drop_thr=drop_thr)

    # 4) 基于 PC1 的分块噪声上限
    r_eeg_blk, r_snd_blk, r_max_blk = noise_ceiling_from_pc1(eegs, snds_rep)
    noise_ceiling_pc1 = {
        "r_eeg_blk_overall_PC1": r_eeg_blk,
        "r_sound_blk_overall_PC1": r_snd_blk,
        "r_max_blk_overall_PC1": r_max_blk
    }

    # 5) 时延：PC1↔PC1 与 PC1↔Envelope
    rows_pc1, rows_env = compute_lag_stats(eegs, snds_rep, frame_ms=frame_ms, max_lag_ms=max_lag_ms)
    def _summarize_lag_rows(rows):
        def med(key):
            vals = [r.get(key, np.nan) for r in rows if np.isfinite(r.get(key, np.nan))]
            return float(np.median(vals)) if len(vals) else np.nan
        frac = np.mean([
            (r.get("peak_r", np.nan) > r.get("peak_r_null", np.nan))
            for r in rows
            if np.isfinite(r.get("peak_r", np.nan)) and np.isfinite(r.get("peak_r_null", np.nan))
        ]) if any(np.isfinite(r.get("peak_r", np.nan)) and np.isfinite(r.get("peak_r_null", np.nan)) for r in rows) else np.nan
        return {
            "median_peak_r": med("peak_r"),
            "median_peak_lag_ms": med("peak_lag_ms"),
            "median_peak_r_null": med("peak_r_null"),
            "frac_r_gt_null": float(frac) if np.isfinite(frac) else np.nan
        }
    lag_alignment = {"pc1": _summarize_lag_rows(rows_pc1), "envelope": _summarize_lag_rows(rows_env)}

    # 6) 通道级 ROI（基于 Envelope）
    voicing_cols = np.where(percol_nan_mean > 0.95)[0].tolist()
    channel_roi_summary = compute_lag_stats_channelwise(
        eegs, snds, snds_rep, frame_ms=frame_ms, max_lag_ms=max_lag_ms,
        null_shift_ratio=null_shift_ratio, voicing_cols=voicing_cols
    )

    # 7) 被试级 TRF
    trf_results_per_subject = run_trf_analysis_per_subject(
        eegs, snds, snds_rep, sids,
        voicing_cols=voicing_cols, frame_ms=frame_ms
    )

    # 8) 段级 NaN 率
    per_segment_nan_rates = []
    for S in snds:
        overall, _ = nan_breakdown_per_segment(S)
        per_segment_nan_rates.append(float(overall))

    results = {
        "noise_ceiling_pc1": noise_ceiling_pc1,
        "lag_alignment": lag_alignment,
        "channel_roi_summary": channel_roi_summary,
        "trf_results_per_subject": trf_results_per_subject,
        "sound_nan_summary": {"per_segment_rates": per_segment_nan_rates,
                              "per_col_mean": percol_nan_mean.tolist() if hasattr(percol_nan_mean, "tolist") else None},
        "sound_repair_stats": drop_stats,
        "root_suffix_used": root_suffix
    }

    # 可选保存
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "summary.json"), "w") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        # 逐段 ROI
        import csv
        with open(os.path.join(save_dir, "channel_roi_per_segment.csv"), "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=["peak_r","peak_lag_ms","peak_r_null","best_ch"])
            w.writeheader(); w.writerows(channel_roi_summary["per_segment"])
        # 被试级 TRF
        with open(os.path.join(save_dir, "trf_per_subject.json"), "w") as f:
            json.dump(trf_results_per_subject["per_subject"], f, ensure_ascii=False, indent=2)
        # 声学列修复统计
        with open(os.path.join(save_dir, "sound_repair_stats.csv"), "w", newline="") as f:
            w = csv.DictWriter(f, fieldnames=["D_in","D_kept"])
            w.writeheader(); w.writerows(drop_stats)
        # 段级NaN率
        with open(os.path.join(save_dir, "sound_nan_rate_per_segment.csv"), "w", newline="") as f:
            w = csv.writer(f); w.writerow(["nan_rate"]); w.writerows([[x] for x in per_segment_nan_rates])
        print(f"结果已保存到: {save_dir}")

    print("--- 数据分析管线完成 ---")
    return results


In [18]:
results = analyze_eeg_sound_data(root_dir, save_dir="/content/eeg_sound_results_from_wav", root_suffix="feature_from_wav_causal")



--- 开始数据分析管线 ---
[INFO] 使用 root_suffix='feature_from_wav_causal' 加载数据段
开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者:  25%|██▌       | 5/20 [00:28<01:01,  4.07s/it]

加载文件 VIE_RechLn4_Cer&LIY25m_EP002_129_WomenFury2_EEG_aligned.npy 时出错: No data left in file


处理被试者: 100%|██████████| 20/20 [01:20<00:00,  4.02s/it]



加载完成！总共加载了 1793 个数据段。
过滤结果：总 1793 段 -> 保留 1793 段 | 空T: 0, 空D: 0, 过短(<20帧): 0
EEG: 段数=1793, 含NaN段=0 (0.0%), 含Inf段=0 (0.0%)
Sound: 段数=1793, 含NaN段=0 (0.0%), 含Inf段=0 (0.0%)

=== 声音特征 NaN 概览 ===
- 段级：含NaN的段 = 0/1793 (0.0%); 中位数NaN率 = 0.00%
- 列级（跨段平均）前10个最差列及其NaN率：
  列0: 平均NaN率=0.00%
  列1: 平均NaN率=0.00%
  列2: 平均NaN率=0.00%
  列3: 平均NaN率=0.00%
  列4: 平均NaN率=0.00%
  列5: 平均NaN率=0.00%
  列6: 平均NaN率=0.00%
  列7: 平均NaN率=0.00%
  列8: 平均NaN率=0.00%
  列9: 平均NaN率=0.00%


修复声音特征: 100%|██████████| 1793/1793 [00:02<00:00, 675.80it/s]


修复完成：平均保留列数 = 41.0  / 平均原始列数 = 41.0

=== 基于 PC1 的分块噪声上限 ===
{
  "r_eeg_blk_overall_PC1": 0.693258824616112,
  "r_sound_blk_overall_PC1": 0.6428187346463753,
  "r_max_blk_overall_PC1": 0.6675625516924706
}


Lag PC1 / Envelope: 100%|██████████| 1793/1793 [01:50<00:00, 16.28it/s]



=== 时延（PC1↔PC1）汇总 ===
{
  "median_peak_r": 0.1427395589178009,
  "median_peak_lag_ms": 0.0,
  "median_peak_r_null": 0.14893642109793823,
  "frac_r_gt_null": 0.4796430563301729
}

=== 时延（PC1↔Envelope）汇总 ===
{
  "median_peak_r": 0.1457437393733052,
  "median_peak_lag_ms": 0.0,
  "median_peak_r_null": 0.14751033550014306,
  "frac_r_gt_null": 0.4902398215281651
}


Channelwise ROI: 100%|██████████| 1793/1793 [17:35<00:00,  1.70it/s]



=== 通道级 ROI 汇总 ===
{
  "median_peak_r": 0.21002804880813247,
  "median_peak_lag_ms": 11.0,
  "median_peak_r_null": 0.13906479400341226,
  "frac_r_gt_null": 0.8901282766313441,
  "top_channels": [
    [
      10,
      174
    ],
    [
      1,
      145
    ],
    [
      8,
      99
    ],
    [
      28,
      82
    ],
    [
      30,
      76
    ],
    [
      17,
      72
    ],
    [
      14,
      70
    ],
    [
      2,
      70
    ],
    [
      13,
      63
    ],
    [
      11,
      61
    ]
  ]
}


TRF per subject: 100%|██████████| 15/15 [22:51<00:00, 91.40s/it]



=== TRF（总体）===
{
  "median_of_subject_medians": 0.0025876498736293607,
  "median_of_subject_nulls": 0.0015650909126277055,
  "frac_subjects_better_than_null": 0.5333333333333333,
  "median_TRF_peak_ms_(>=0)": 66.0
}
结果已保存到: /content/eeg_sound_results_from_wav
--- 数据分析管线完成 ---


In [19]:
# ===== A. 多子带 TRF 补丁（贴进一个新单元运行）=====

import numpy as np
from sklearn.model_selection import GroupKFold

# --- 小工具：二阶差分惩罚矩阵（D^T D） ---
def _diff2_mat(n):
    if n < 3:
        return np.zeros((n, n), dtype=np.float64)
    D = np.zeros((n-2, n), dtype=np.float64)
    for i in range(n-2):
        D[i, i]   = 1.0
        D[i, i+1] = -2.0
        D[i, i+2] = 1.0
    return D.T @ D  # (n,n)

# --- 设计矩阵：把 mel(B×T) 展成 [lag0全频 | lag1全频 | ...] -> (T, B*L) ---
def build_multiband_design(mel_TxB, lags_fr):
    mel = np.asarray(mel_TxB, dtype=np.float64)
    T, B = mel.shape
    L = len(lags_fr)
    X = np.zeros((T, B*L), dtype=np.float64)
    for j, Lg in enumerate(lags_fr):
        if Lg >= 0:
            X[Lg:, j*B:(j+1)*B] = mel[:T-Lg, :]
        else:
            X[:T+Lg, j*B:(j+1)*B] = mel[-Lg:, :]
    return X  # (T, B*L)

# --- 频带 & 滞后联合平滑的岭回归（alpha*I + beta_lag*kron(I_B, D2_L) + beta_band*kron(D2_B, I_L)）---
def trf_fit_multiband(Xtr, ytr, B, L, alpha=10.0, beta_lag=10.0, beta_band=1.0, add_voicing=False, alpha_voicing=None):
    # Xtr: (N, P)  其中 P = B*L [+1 if voicing]
    P = Xtr.shape[1]
    # XtX / XtY
    XtX = Xtr.T @ Xtr
    XtY = Xtr.T @ ytr

    # 正则块定位
    P0 = B*L  # 仅滞后块
    # alphaI
    A = XtX + alpha * np.eye(P)
    # 滞后方向二阶平滑（块对角）
    if beta_lag > 0 and L >= 3:
        D2_L = _diff2_mat(L)
        A[:P0, :P0] += beta_lag * np.kron(np.eye(B), D2_L)
    # 频带方向二阶平滑（跨频带）
    if beta_band > 0 and B >= 3:
        D2_B = _diff2_mat(B)
        A[:P0, :P0] += beta_band * np.kron(D2_B, np.eye(L))
    # voicing 列（如果有）：可给更强/更弱的岭
    if add_voicing and (P == P0 + 1) and (alpha_voicing is not None):
        A[P0, P0] += (alpha_voicing - alpha)

    # 求解
    w = np.linalg.solve(A, XtY)
    return w  # (P,)

# --- 方便从一段特征中取出 mel 子带与 RMS（约定：前 n_mels 列是 mel，最后一列是 RMS）---
def split_mel_rms_from_segment(Sseg, n_mels=40):
    S = np.asarray(Sseg, dtype=np.float64)
    T, D = S.shape
    if D < n_mels + 1:
        n_mels = max(1, D-1)  # 尽量向下兼容
    mel = S[:, :n_mels]
    rms = S[:, -1]
    return mel, rms

# --- 简单 RMS→voicing 掩码（段内分位数阈值）---
def voicing_from_rms(rms, pct=60):
    rms = np.asarray(rms, dtype=np.float64)
    x = rms[np.isfinite(rms)]
    thr = np.percentile(x, pct) if x.size else 0.0
    return (rms > thr)

# --- 计算多子带 TRF 的“峰滞后”（把每个滞后的 across-band L2 能量作为峰选择依据）---
def trf_peak_ms_from_weights(w, B, L, frame_ms=11.0):
    w0 = w[:B*L].reshape(L, B)  # (L, B)
    e = np.linalg.norm(w0, axis=1)  # 每个滞后的 across-band L2
    k = int(np.argmax(np.abs(e)))
    return float(k * frame_ms)

# --- 每被试的多子带 TRF 评估（嵌套CV，ROI-in-train，含可选 voicing 与超参网格） ---
def eval_subject_trf_multiband(subject_id,
                               eeg_segments, sound_segments, subject_ids,
                               frame_ms=11.0, n_mels=40,
                               lags_ms=np.arange(0, 201, 11),   # 0..200ms
                               roi_topk=3,
                               add_voicing=True, voicing_pct=60,
                               alpha_grid=(1.0, 10.0, 100.0),
                               beta_lag_grid=(0.0, 10.0, 100.0),
                               beta_band_grid=(0.0, 1.0, 10.0),
                               global_offset_ms=0,
                               standardize_XY=True):
    # 依赖：fit_standardizer, apply_standardizer, zscore1d, highpass_moving_average, select_roi_channels_for_subject
    idx = [i for i, sid in enumerate(subject_ids) if sid == subject_id]
    if len(idx) < 8:
        return {"subject": subject_id, "note": "too few segments"}
    groups = np.arange(len(idx))
    outer = GroupKFold(n_splits=min(5, len(idx)))

    # 滞后（帧）
    lags_fr = (np.array(lags_ms) / frame_ms).astype(int)
    L = len(lags_fr)

    seg_corrs, seg_corrs_null, peak_lags = [], [], []

    for tr_i, va_i in outer.split(np.zeros(len(idx)), groups=groups):
        tr_idx = [idx[i] for i in tr_i]
        va_idx = [idx[i] for i in va_i]

        # --- 训练折内选 ROI（沿用你现有选择器：基于 envelope 的 top-k 通道）---
        roi = select_roi_channels_for_subject(
            eeg_segments, sound_segments, sound_segments,  # 这里 sound_repaired==sound_segments（已无NaN）
            tr_idx, top_k=roi_topk, frame_ms=frame_ms, max_lag_ms=200, voicing_cols=None
        )

        # --- 构建数据集 ---
        def build_dataset(seg_indices):
            Xs, ys = [], []
            for k in seg_indices:
                E = eeg_segments[k]
                S = sound_segments[k]
                # mel & rms
                mel, rms = split_mel_rms_from_segment(S, n_mels=n_mels)
                # 全局正偏移（声音前移 = EEG滞后）
                if global_offset_ms and abs(global_offset_ms) > 1e-6:
                    shift = int(round(global_offset_ms / frame_ms))
                    if shift > 0:
                        # 前移：丢前面 shift 帧，后面补零
                        mel = np.vstack([mel[shift:], np.zeros((shift, mel.shape[1]))])
                        rms = np.concatenate([rms[shift:], np.zeros(shift)])
                    elif shift < 0:
                        # 负移（一般不用）
                        mel = np.vstack([np.zeros((-shift, mel.shape[1])), mel[:len(mel)+shift]])
                        rms = np.concatenate([np.zeros(-shift), rms[:len(rms)+shift]])
                # 设计矩阵（多子带）
                Xseg = build_multiband_design(mel, lags_fr)  # (T, B*L)
                # 可选：追加 voicing 1 列
                if add_voicing:
                    vm = voicing_from_rms(rms, pct=voicing_pct).astype(np.float64)
                    Xseg = np.hstack([Xseg[:len(vm)], vm.reshape(-1,1)])
                # 目标：ROI 通道平均，且高通
                ychs = []
                for ch in roi:
                    y = (E[:, ch] - E[:, ch].mean()) / (E[:, ch].std() if E[:, ch].std()>0 else 1.0)
                    y = highpass_moving_average(y, 15)
                    ychs.append(y[:Xseg.shape[0]])
                yseg = np.mean(np.vstack(ychs), axis=0) if len(ychs) else np.zeros(Xseg.shape[0])
                # 截齐
                T = min(Xseg.shape[0], len(yseg))
                Xs.append(Xseg[:T]); ys.append(yseg[:T])
            X = np.vstack(Xs); y = np.concatenate(ys)
            return X, y

        Xtr_raw, ytr_raw = build_dataset(tr_idx)
        Xva_raw_list, yva_raw_list = [], []
        for k in va_idx:
            Xk, yk = build_dataset([k])
            Xva_raw_list.append(Xk); yva_raw_list.append(yk)

        # 标准化
        if standardize_XY:
            Xmu, Xsd = fit_standardizer(Xtr_raw); Xtr = apply_standardizer(Xtr_raw, Xmu, Xsd)
            ytr, ymu, ysd = zscore1d(ytr_raw)
        else:
            Xmu, Xsd = np.zeros(Xtr_raw.shape[1]), np.ones(Xtr_raw.shape[1]); Xtr = Xtr_raw
            ytr = ytr_raw; ymu, ysd = 0.0, 1.0

        # 内层验证（80/20）
        n = len(ytr); ntr = max(1, int(0.8*n))
        Xtr_i, ytr_i = Xtr[:ntr], ytr[:ntr]
        Xvl_i, yvl_i = Xtr[ntr:], ytr[ntr:]

        # 超参网格
        B = min(n_mels, split_mel_rms_from_segment(sound_segments[tr_idx[0]], n_mels=n_mels)[0].shape[1])
        P0 = B*L
        best, best_hp = -np.inf, (10.0, 10.0, 1.0)
        for a in alpha_grid:
            for bl in beta_lag_grid:
                for bb in beta_band_grid:
                    w = trf_fit_multiband(Xtr_i, ytr_i, B, L, alpha=a, beta_lag=bl, beta_band=bb,
                                          add_voicing=add_voicing, alpha_voicing=a)
                    yhat = Xvl_i @ w
                    r = np.corrcoef(yvl_i, yhat)[0,1] if np.std(yvl_i)>0 and np.std(yhat)>0 else -np.inf
                    if np.isfinite(r) and r > best:
                        best, best_hp = r, (a, bl, bb)

        a, bl, bb = best_hp
        w = trf_fit_multiband(Xtr, ytr, B, L, alpha=a, beta_lag=bl, beta_band=bb,
                              add_voicing=add_voicing, alpha_voicing=a)

        # 记录“峰滞后”
        peak_lags.append(trf_peak_ms_from_weights(w, B, L, frame_ms=frame_ms))

        # 外层验证
        for Xk_raw, yk_raw in zip(Xva_raw_list, yva_raw_list):
            Xk = apply_standardizer(Xk_raw, Xmu, Xsd) if standardize_XY else Xk_raw
            yk = (yk_raw - ymu) / (ysd if ysd>0 else 1.0) if standardize_XY else yk_raw
            yhat = Xk @ w
            rk = np.corrcoef(yk, yhat)[0,1] if np.std(yk)>0 and np.std(yhat)>0 else np.nan
            # 置换对照：循环移位
            sh = max(1, int(0.33*len(yk)))
            yperm = np.roll(yk, sh)
            rk0 = np.corrcoef(yperm, yhat)[0,1] if np.std(yperm)>0 and np.std(yhat)>0 else np.nan
            seg_corrs.append(rk); seg_corrs_null.append(rk0)

    seg_corrs = np.array(seg_corrs, dtype=np.float64)
    seg_corrs_null = np.array(seg_corrs_null, dtype=np.float64)
    valid = np.isfinite(seg_corrs) & np.isfinite(seg_corrs_null)

    res = {
        "subject": subject_id,
        "n_segments_eval": int(valid.sum()),
        "median_pred_r": float(np.median(seg_corrs[valid])) if valid.any() else np.nan,
        "median_pred_r_null": float(np.median(seg_corrs_null[valid])) if valid.any() else np.nan,
        "frac_r_better_than_null": float(np.mean(seg_corrs[valid] > seg_corrs_null[valid])) if valid.any() else np.nan,
        "median_trf_peak_ms_(>=0)": float(np.median(peak_lags)) if len(peak_lags) else np.nan,
        "note": f"Multi-band TRF | B={n_mels}, lags={lags_ms[0]}..{lags_ms[-1]}ms | ridge + lag/band smooth"
    }
    return res

# --- 批量跑所有被试（多子带 TRF 版本） ---
def run_trf_analysis_per_subject_multiband(eeg_segments, sound_segments, subject_ids,
                                           frame_ms=11.0, n_mels=40,
                                           lags_ms=np.arange(0, 201, 11),
                                           global_offset_ms=0):
    subs = sorted(set(subject_ids))
    out = []
    for sid in subs:
        out.append(eval_subject_trf_multiband(
            sid, eeg_segments, sound_segments, subject_ids,
            frame_ms=frame_ms, n_mels=n_mels, lags_ms=lags_ms,
            global_offset_ms=global_offset_ms
        ))
    vals = [r for r in out if np.isfinite(r.get("median_pred_r", np.nan))]
    if vals:
        med_r  = float(np.median([r["median_pred_r"] for r in vals]))
        med_r0 = float(np.median([r["median_pred_r_null"] for r in vals]))
        frac   = float(np.mean([r["median_pred_r"] > r["median_pred_r_null"] for r in vals]))
        med_pk = float(np.median([r["median_trf_peak_ms_(>=0)"] for r in vals if np.isfinite(r["median_trf_peak_ms_(>=0)"])])) if any(np.isfinite(r["median_trf_peak_ms_(>=0)"]) for r in vals) else np.nan
        overall = {"median_of_subject_medians": med_r, "median_of_subject_nulls": med_r0,
                   "frac_subjects_better_than_null": frac, "median_TRF_peak_ms_(>=0)": med_pk}
    else:
        overall = {"median_of_subject_medians": np.nan, "median_of_subject_nulls": np.nan,
                   "frac_subjects_better_than_null": np.nan, "median_TRF_peak_ms_(>=0)": np.nan}
    return {"per_subject": out, "overall_summary": overall}


In [20]:
# 用你已经加载好的段（确保 root_suffix='feature_from_wav_causal'）
eegs, snds, sids = load_segments_with_subject_ids(root_dir, root_suffix='feature_from_wav_causal')

# 直接跑多子带 TRF（不做全局偏移；后面的补丁 B 会自动扫偏移）
trf_mb = run_trf_analysis_per_subject_multiband(
    eegs, snds, sids, frame_ms=11.0, n_mels=40, lags_ms=np.arange(0,201,11), global_offset_ms=0
)
print(trf_mb["overall_summary"])


开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者:  20%|██        | 4/20 [00:25<01:38,  6.17s/it]

加载文件 VIE_RechLn4_Cer&LIY25m_EP002_129_WomenFury2_EEG_aligned.npy 时出错: No data left in file


处理被试者: 100%|██████████| 20/20 [01:00<00:00,  3.04s/it]



加载完成！总共加载了 1793 个数据段。
{'median_of_subject_medians': -0.0014900903102061573, 'median_of_subject_nulls': -0.00995797873044558, 'frac_subjects_better_than_null': 0.8, 'median_TRF_peak_ms_(>=0)': 66.0}


In [21]:
# ===== B. 全局偏移扫描补丁（贴进新单元运行）=====

import numpy as np
from tqdm import tqdm

def _env_from_mel(S, n_mels=40):
    """从段的声学特征中抽 mel 子带并构造 L2 包络（已 z 的 mel 更佳，这里直接按输入做）。"""
    S = np.asarray(S, dtype=np.float64)
    mel = S[:, :min(n_mels, S.shape[1]-1)]
    e = np.sqrt((mel**2).sum(axis=1))
    return (e - e.mean())/(e.std() if e.std()>0 else 1.0)

def _shift_forward_matrix(S, shift_ms, frame_ms):
    """声音前移（EEG滞后）：+shift_ms 会把 S 向前滚动，尾部补零。"""
    k = int(round(shift_ms / frame_ms))
    if k <= 0:
        return S.copy()
    T, D = S.shape
    out = np.zeros_like(S)
    if k < T:
        out[k:] = S[:T-k]
    return out

def roi_score_with_offset(eeg_segments, sound_segments, frame_ms=11.0, max_lag_ms=300, offset_ms=0):
    """对所有段，计算“通道级 ROI”在给定全局偏移下的峰相关中位数（简化版）。"""
    max_lag = int(round(max_lag_ms/frame_ms))
    rs = []
    for E, S in zip(eeg_segments, sound_segments):
        Ssh = _shift_forward_matrix(S, offset_ms, frame_ms)
        ee = _env_from_mel(Ssh)
        best = -np.inf
        for ch in range(E.shape[1]):
            y = (E[:, ch] - E[:, ch].mean()) / (E[:, ch].std() if E[:, ch].std()>0 else 1.0)
            y = highpass_moving_average(y, 15)
            T = min(len(y), len(ee))
            y0 = (y[:T] - y[:T].mean())/(y[:T].std() if y[:T].std()>0 else 1.0)
            e0 = (ee[:T] - ee[:T].mean())/(ee[:T].std() if ee[:T].std()>0 else 1.0)
            vals = []
            for lag in range(-max_lag, max_lag+1):
                if lag >= 0:
                    a, b = y0[lag:], e0[:T-lag]
                else:
                    a, b = y0[:T+lag], e0[-lag:]
                if len(a) < 20 or a.std()==0 or b.std()==0:
                    vals.append(np.nan); continue
                vals.append(np.corrcoef(a, b)[0,1])
            r = np.nanmax(vals)
            if np.isfinite(r) and r > best:
                best = r
        if np.isfinite(best):
            rs.append(best)
    return float(np.median(rs)) if rs else np.nan

def pick_global_offset_ms(eeg_segments, sound_segments, candidates_ms=(0,33,55,88,121), frame_ms=11.0, max_lag_ms=300):
    scores = {}
    for m in candidates_ms:
        scores[m] = roi_score_with_offset(eeg_segments, sound_segments, frame_ms=frame_ms, max_lag_ms=max_lag_ms, offset_ms=m)
    # 选最高的；若并列，取偏移较小者
    best = sorted(scores.items(), key=lambda kv: (-np.nan_to_num(kv[1], nan=-np.inf), kv[0]))[0][0]
    return best, scores

# —— 一键：先扫偏移，再跑多子带 TRF（补丁 A 里的函数）——
def run_multiband_trf_with_auto_offset(root_dir, root_suffix='feature_from_wav_causal',
                                       frame_ms=11.0, n_mels=40, lags_ms=np.arange(0,201,11),
                                       offset_candidates=(0,33,55,88,121)):
    # 载入段
    eegs, snds, sids = load_segments_with_subject_ids(root_dir, root_suffix=root_suffix)
    # 简单过滤（若你已有 filter_and_summarize，可在此调用）
    # 这里默认你的数据已经是有效帧数
    # 扫描偏移
    best_off, all_scores = pick_global_offset_ms(eegs, snds, candidates_ms=offset_candidates, frame_ms=frame_ms)
    print("全局偏移扫描：", all_scores, " → 选择", best_off, "ms")

    # 跑多子带 TRF
    trf_mb = run_trf_analysis_per_subject_multiband(
        eegs, snds, sids, frame_ms=frame_ms, n_mels=n_mels, lags_ms=lags_ms,
        global_offset_ms=best_off
    )
    return {"best_offset_ms": best_off, "offset_scores": all_scores, "trf_multiband": trf_mb}


In [22]:
res_auto = run_multiband_trf_with_auto_offset(
    root_dir, root_suffix='feature_from_wav_causal',
    frame_ms=11.0, n_mels=40, lags_ms=np.arange(0,201,11),
    offset_candidates=(0,33,55,88,121)
)
print("最佳全局偏移(ms):", res_auto["best_offset_ms"])
print("多子带 TRF 总体：", res_auto["trf_multiband"]["overall_summary"])


开始从根目录 '/content/drive/MyDrive/data' 加载数据段及被试者ID...


处理被试者:  20%|██        | 4/20 [00:12<00:48,  3.05s/it]

加载文件 VIE_RechLn4_Cer&LIY25m_EP002_129_WomenFury2_EEG_aligned.npy 时出错: No data left in file


处理被试者: 100%|██████████| 20/20 [00:47<00:00,  2.36s/it]



加载完成！总共加载了 1793 个数据段。
全局偏移扫描： {0: 0.13130889287633654, 33: 0.13601997784196465, 55: 0.13779113666156684, 88: 0.13576752771856193, 121: 0.13391909700439705}  → 选择 55 ms
最佳全局偏移(ms): 55
多子带 TRF 总体： {'median_of_subject_medians': 0.0018318710500284817, 'median_of_subject_nulls': -0.004173565441407767, 'frac_subjects_better_than_null': 0.6, 'median_TRF_peak_ms_(>=0)': 143.0}
