In [1]:
#!/usr/bin/env python3
"""
Run statistical tests for all brain areas:
H0: corr(ViT_PC_i, BrainArea_PC_j) = 0

Performs Benjamini-Hochberg correction (q < 0.05) within each area.
Saves one summary CSV of all significant results.
"""

import os
import pickle
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from scipy.special import softmax
from skbio.stats.composition import clr
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests

# ---------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------
# Input artefacts: pickled ViT logits, the neural response matrix, and
# the per-neuron brain-area labels.
VIT_PATH = '/home/maria/Documents/HuggingMouseData/MouseViTEmbeddings/google_vit-base-patch16-224_embeddings_logits.pkl'
NEURAL_PATH = '/home/maria/LuckyMouse/pixel_transformer_neuro/data/processed/hybrid_neural_responses.npy'
AREAS_PATH = '/home/maria/MITNeuralComputation/visualization/brain_area.npy'

# Experiment layout used to unflatten each neuron's response vector.
N_IMAGES = 118
N_TRIALS = 50

# Level passed to the Benjamini-Hochberg procedure.
ALPHA_FDR = 0.05

# Destination of the per-pair summary table.
OUT_CSV = "vit_brainarea_corr_summary.csv"

# ---------------------------------------------------------------
# LOAD VIT EMBEDDINGS + PCA
# ---------------------------------------------------------------
print("üîπ Loading ViT embeddings...")
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point VIT_PATH at a pickle you produced yourself or otherwise trust.
with open(VIT_PATH, 'rb') as f:
    vit_logits = pickle.load(f)['natural_scenes']
embeddings = np.asarray(vit_logits)

# Logits -> row-wise probabilities -> centred log-ratio transform.
# The small offset keeps the log finite if a probability underflows to 0.
# NOTE(review): assumes one row per image - confirm against the pickle.
X = softmax(embeddings, axis=1)
X_clr = clr(X + 1e-12)

# Full-rank fit, used only to read off the cumulative variance spectrum.
# The solver is pinned to the exact SVD so this fit and the truncated
# refit below are guaranteed to use the same decomposition.
pca_vit_full = PCA(n_components=min(X_clr.shape), svd_solver='full', random_state=0)
pca_vit_full.fit(X_clr)
vit_cumvar = np.cumsum(pca_vit_full.explained_variance_ratio_)
# Smallest number of leading components whose cumulative ratio reaches 0.90.
vit_ncomp = int(np.searchsorted(vit_cumvar, 0.90)) + 1
print(f"ViT PCs covering 90% variance: {vit_ncomp}")

# With svd_solver left on "auto", a truncated fit of a matrix this size can
# silently switch to the randomized solver, giving approximate scores that
# do not correspond exactly to the spectrum used to choose vit_ncomp.
pca_vit = PCA(n_components=vit_ncomp, svd_solver='full', random_state=0)
vit_U = pca_vit.fit_transform(X_clr)  # one column of scores per retained PC
vit_var = pca_vit.explained_variance_ratio_

# ---------------------------------------------------------------
# LOAD NEURAL DATA
# ---------------------------------------------------------------
print("üîπ Loading neural data and area labels...")
# Memory-map the response matrix read-only instead of reading it eagerly;
# rows are later selected per area with a boolean mask over `areas`.
dat = np.load(NEURAL_PATH, mmap_mode='r')
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code - only use with a trusted AREAS_PATH file.
# Presumably one area label per row of `dat`; verify lengths match.
areas = np.load(AREAS_PATH, allow_pickle=True)
# Sorted distinct labels; each one is processed separately below.
unique_areas = np.unique(areas)
print(f"Found {len(unique_areas)} brain areas:", list(unique_areas))

# ---------------------------------------------------------------
# LOOP OVER AREAS
# ---------------------------------------------------------------
# For every brain area: average responses per image, run PCA across the
# area's neurons, correlate every retained ViT PC score vector with every
# retained brain PC loading vector (both indexed by image), and apply
# Benjamini-Hochberg correction within the area. One dict per tested pair
# is appended to `records`.
records = []

for area in unique_areas:
    print(f"\n===== Processing {area} =====")
    mask = areas == area
    dat_area = dat[mask]
    if dat_area.size == 0:
        continue

    n_neurons, n_total = dat_area.shape
    # Fail loudly if the flattened axis cannot be split into
    # images x trials x time, instead of floor-dividing and letting
    # reshape raise an opaque size-mismatch error.
    samples_per_timepoint = N_IMAGES * N_TRIALS
    if n_total % samples_per_timepoint != 0:
        raise ValueError(
            f"{area}: {n_total} samples per neuron is not a multiple of "
            f"N_IMAGES * N_TRIALS = {samples_per_timepoint}"
        )
    n_time = n_total // samples_per_timepoint
    dat_area = dat_area.reshape(n_neurons, N_IMAGES, N_TRIALS, n_time)
    X_mean = dat_area.mean(axis=(2, 3))  # (neurons x images)

    # Skip if <5 neurons (unstable PCA)
    if X_mean.shape[0] < 5:
        print(f"Skipping {area}: too few neurons ({X_mean.shape[0]}).")
        continue

    # PCA on area. Centre once and reuse for both fits.
    X_centered = X_mean - X_mean.mean(axis=0)

    # Full-rank exact fit, used only to pick the number of components.
    pca_brain_full = PCA(n_components=min(X_centered.shape), svd_solver='full', random_state=0)
    pca_brain_full.fit(X_centered)
    brain_cumvar = np.cumsum(pca_brain_full.explained_variance_ratio_)
    brain_ncomp = int(np.searchsorted(brain_cumvar, 0.90)) + 1

    # Pin the exact solver on the truncated refit as well: the "auto" policy
    # may otherwise choose the randomized solver, so the loadings would be
    # approximations that do not match the spectrum used for brain_ncomp.
    pca_brain = PCA(n_components=brain_ncomp, svd_solver='full', random_state=0)
    brain_V = pca_brain.fit_transform(X_centered)
    brain_loadings = pca_brain.components_.T  # (images x brain_ncomp)

    print(f"{area}: {brain_ncomp} PCs (90% variance)")

    # Correlation matrix, flattened in (ViT PC, brain PC) order with
    # 1-based component labels.
    r_vals, p_vals, pairs = [], [], []
    for i in range(vit_ncomp):
        for j in range(brain_ncomp):
            r, p = pearsonr(vit_U[:, i], brain_loadings[:, j])
            r_vals.append(r)
            p_vals.append(p)
            pairs.append((i + 1, j + 1))

    r_vals = np.array(r_vals)
    p_vals = np.array(p_vals)
    n_tests = len(p_vals)

    # FDR correction within this area
    reject, pvals_corr, _, _ = multipletests(p_vals, alpha=ALPHA_FDR, method='fdr_bh')
    sig_idx = np.where(reject)[0]

    if len(sig_idx) > 0:
        print(f"‚úÖ {len(sig_idx)}/{n_tests} significant correlations (q < {ALPHA_FDR})")
    else:
        print(f"‚ùå No significant correlations (n = {n_tests})")

    # Record every tested pair (not only the significant ones) so the
    # summary CSV holds the full result table.
    records.extend(
        {
            "area": area,
            "vit_pc": vit_pc,
            "brain_pc": brain_pc,
            "r": r_val,
            "p_raw": p_raw,
            "p_fdr": p_fdr,
            "significant": bool(is_sig),
            "n_tests_in_area": n_tests,
        }
        for (vit_pc, brain_pc), r_val, p_raw, p_fdr, is_sig
        in zip(pairs, r_vals, p_vals, pvals_corr, reject)
    )

# ---------------------------------------------------------------
# SAVE SUMMARY
# ---------------------------------------------------------------
# Declare the schema explicitly so the frame (and the CSV header) keeps its
# columns even when `records` is empty, e.g. because every area was skipped.
# Without this, df["significant"] below raises KeyError.
summary_columns = [
    "area",
    "vit_pc",
    "brain_pc",
    "r",
    "p_raw",
    "p_fdr",
    "significant",
    "n_tests_in_area",
]
df = pd.DataFrame(records, columns=summary_columns)
df.to_csv(OUT_CSV, index=False)
print(f"\nüíæ Results saved to {OUT_CSV}")

# Show top significant pairs. The cast keeps the mask boolean for an empty
# frame, whose "significant" column would otherwise have object dtype.
sig_df = df[df["significant"].astype(bool)].sort_values("p_fdr")
if not sig_df.empty:
    print("\nTop significant correlations after FDR correction:")
    print(sig_df[["area", "vit_pc", "brain_pc", "r", "p_fdr"]].head(10))
else:
    print("No significant pairs found across areas.")


üîπ Loading ViT embeddings...
ViT PCs covering 90% variance: 44
üîπ Loading neural data and area labels...
Found 6 brain areas: ['VISal', 'VISam', 'VISl', 'VISp', 'VISpm', 'VISrl']

===== Processing VISal =====
VISal: 71 PCs (90% variance)
‚ùå No significant correlations (n = 3124)

===== Processing VISam =====
VISam: 61 PCs (90% variance)
‚úÖ 1/2684 significant correlations (q < 0.05)

===== Processing VISl =====
VISl: 77 PCs (90% variance)
‚ùå No significant correlations (n = 3388)

===== Processing VISp =====
VISp: 80 PCs (90% variance)
‚ùå No significant correlations (n = 3520)

===== Processing VISpm =====
VISpm: 69 PCs (90% variance)
‚úÖ 2/3036 significant correlations (q < 0.05)

===== Processing VISrl =====
VISrl: 81 PCs (90% variance)
‚ùå No significant correlations (n = 3564)

üíæ Results saved to vit_brainarea_corr_summary.csv

Top significant correlations after FDR correction:
        area  vit_pc  brain_pc         r     p_fdr
12719  VISpm       1         4  0.436290  0.

In [2]:
#!/usr/bin/env python3
"""
Global (inter-area) correlation test between ViT and full neural population PCs.

Tests H0: corr(ViT_PC_i, Brain_PC_j) = 0
Applies Benjamini-Hochberg FDR correction (q < 0.05) across all ViT-Brain PC pairs.
"""

import os
import pickle
import numpy as np
from sklearn.decomposition import PCA
from scipy.special import softmax
from skbio.stats.composition import clr
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
import pandas as pd

# ---------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------
# Input artefacts: pickled ViT logits and the neural response matrix.
VIT_PATH = '/home/maria/Documents/HuggingMouseData/MouseViTEmbeddings/google_vit-base-patch16-224_embeddings_logits.pkl'
NEURAL_PATH = '/home/maria/LuckyMouse/pixel_transformer_neuro/data/processed/hybrid_neural_responses.npy'

# Experiment layout used to unflatten each neuron's response vector.
N_IMAGES = 118
N_TRIALS = 50

# Level passed to the Benjamini-Hochberg procedure.
ALPHA_FDR = 0.05

# Destination of the per-pair summary table.
OUT_PATH = "vit_brain_global_corr_summary.csv"

# ---------------------------------------------------------------
# LOAD ViT EMBEDDINGS + PCA
# ---------------------------------------------------------------
print("üîπ Loading ViT embeddings ...")
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# point VIT_PATH at a pickle you produced yourself or otherwise trust.
with open(VIT_PATH, 'rb') as f:
    vit_logits = pickle.load(f)['natural_scenes']

embeddings = np.asarray(vit_logits)
# Logits -> row-wise probabilities -> centred log-ratio transform.
# The small offset keeps the log finite if a probability underflows to 0.
# NOTE(review): assumes one row per image - confirm against the pickle.
X = softmax(embeddings, axis=1)
X_clr = clr(X + 1e-12)

# Full-rank fit, used only to read off the cumulative variance spectrum.
# The solver is pinned to the exact SVD so this fit and the truncated
# refit below are guaranteed to use the same decomposition.
pca_vit_full = PCA(n_components=min(X_clr.shape), svd_solver='full', random_state=0)
pca_vit_full.fit(X_clr)
vit_cumvar = np.cumsum(pca_vit_full.explained_variance_ratio_)
# Smallest number of leading components whose cumulative ratio reaches 0.90.
vit_ncomp = int(np.searchsorted(vit_cumvar, 0.90)) + 1
print(f"ViT PCs covering 90% variance: {vit_ncomp}")

# With svd_solver left on "auto", a truncated fit of a matrix this size can
# silently switch to the randomized solver, giving approximate scores that
# do not correspond exactly to the spectrum used to choose vit_ncomp.
pca_vit = PCA(n_components=vit_ncomp, svd_solver='full', random_state=0)
vit_U = pca_vit.fit_transform(X_clr)  # (images x vit_ncomp)
vit_var = pca_vit.explained_variance_ratio_

# ---------------------------------------------------------------
# LOAD NEURAL DATA (ALL AREAS COMBINED)
# ---------------------------------------------------------------
print("üîπ Loading all neural responses ...")
# Memory-mapped, read-only: (neurons x (images*trials*time))
dat = np.load(NEURAL_PATH, mmap_mode='r')
n_neurons, n_total = dat.shape

# Fail loudly if the flattened axis cannot be split into
# images x trials x time, instead of floor-dividing and letting reshape
# raise an opaque size-mismatch error (or averaging over an empty axis).
samples_per_timepoint = N_IMAGES * N_TRIALS
if n_total == 0 or n_total % samples_per_timepoint != 0:
    raise ValueError(
        f"{n_total} samples per neuron is not a positive multiple of "
        f"N_IMAGES * N_TRIALS = {samples_per_timepoint}"
    )
n_time = n_total // samples_per_timepoint

print(f"Data shape: {n_neurons} neurons, {N_IMAGES} images √ó {N_TRIALS} trials √ó {n_time} timepoints")

# Reshape and average across trials and time
dat = dat.reshape(n_neurons, N_IMAGES, N_TRIALS, n_time)
X_mean = dat.mean(axis=(2, 3))  # (neurons x images)
print("Averaged responses:", X_mean.shape)

# ---------------------------------------------------------------
# PCA ON ALL NEURONS (GLOBAL POPULATION)
# ---------------------------------------------------------------
print("üîπ Running PCA on full neural population ...")
# Neurons are the samples and images the features, so each principal axis
# is a vector over images.
X_centered = X_mean - X_mean.mean(axis=0, keepdims=True)

# Full-rank exact fit, used only to read off the cumulative variance
# spectrum and pick the number of components.
pca_brain_full = PCA(n_components=min(X_centered.shape), svd_solver='full', random_state=0)
pca_brain_full.fit(X_centered)
brain_cumvar = np.cumsum(pca_brain_full.explained_variance_ratio_)
# Smallest number of leading components whose cumulative ratio reaches 0.90.
brain_ncomp = int(np.searchsorted(brain_cumvar, 0.90)) + 1
print(f"Neural PCs covering 90% variance: {brain_ncomp}")

# Pin the exact solver on the truncated refit as well: the "auto" policy may
# otherwise choose the randomized solver, so the loadings would be
# approximations that do not match the spectrum used for brain_ncomp.
pca_brain = PCA(n_components=brain_ncomp, svd_solver='full', random_state=0)
brain_V = pca_brain.fit_transform(X_centered)   # (neurons x brain_ncomp)
brain_loadings = pca_brain.components_.T        # (images x brain_ncomp)
brain_var = pca_brain.explained_variance_ratio_

# ---------------------------------------------------------------
# CORRELATION TESTS
# ---------------------------------------------------------------
print("üîπ Computing ViT ‚Üî Neural PC correlations ...")

# Enumerate every (ViT PC, brain PC) combination up front, labelled from 1,
# with the ViT index varying slowest.
pairs = [
    (vit_idx + 1, brain_idx + 1)
    for vit_idx in range(vit_ncomp)
    for brain_idx in range(brain_ncomp)
]

# One Pearson test per pair: ViT scores against brain loadings.
r_vals, p_vals = [], []
for vit_label, brain_label in pairs:
    coeff, pval = pearsonr(vit_U[:, vit_label - 1], brain_loadings[:, brain_label - 1])
    r_vals.append(coeff)
    p_vals.append(pval)

r_vals = np.array(r_vals)
p_vals = np.array(p_vals)
n_tests = len(p_vals)
print(f"Total tests performed: {n_tests}")

# ---------------------------------------------------------------
# MULTIPLE TESTING CORRECTION
# ---------------------------------------------------------------
# Benjamini-Hochberg over the whole family of ViT x brain PC tests.
reject, pvals_corrected, _, _ = multipletests(p_vals, alpha=ALPHA_FDR, method='fdr_bh')

# Flat indices of the pairs whose null hypothesis was rejected.
sig_idx = np.where(reject)[0]
if len(sig_idx) > 0:
    print(f"‚úÖ {len(sig_idx)}/{n_tests} correlations survived FDR correction (q < {ALPHA_FDR}).")
    print("\nSignificant ViT‚ÄìBrain PC pairs:")
    for idx in sig_idx:
        i, j = pairs[idx]
        print(f"  ViT PC{i:2d} ‚Üî Brain PC{j:2d} | r = {r_vals[idx]:+.3f}, p = {p_vals[idx]:.2e}, q = {pvals_corrected[idx]:.2e}")
else:
    print("‚ùå No correlations survived FDR correction.")

# ---------------------------------------------------------------
# SAVE RESULTS
# ---------------------------------------------------------------
# Split the (ViT PC, brain PC) labels into two parallel columns.
vit_pc_col = [vit_label for vit_label, _ in pairs]
brain_pc_col = [brain_label for _, brain_label in pairs]

# One row per tested pair, significant or not.
summary = {
    "vit_pc": vit_pc_col,
    "brain_pc": brain_pc_col,
    "r": r_vals,
    "p_raw": p_vals,
    "p_fdr": pvals_corrected,
    "significant": reject,
}
df = pd.DataFrame(summary)

df.to_csv(OUT_PATH, index=False)
print(f"\nüíæ Results saved to {OUT_PATH}")

# Preview the strongest surviving pairs, smallest adjusted p-value first.
sig_df = df[df["significant"]].sort_values("p_fdr")
if sig_df.empty:
    print("No significant pairs found.")
else:
    print("\nTop significant pairs after FDR correction:")
    print(sig_df[["vit_pc", "brain_pc", "r", "p_fdr"]].head(10))


üîπ Loading ViT embeddings ...
ViT PCs covering 90% variance: 44
üîπ Loading all neural responses ...
Data shape: 39209 neurons, 118 images √ó 50 trials √ó 1 timepoints
Averaged responses: (39209, 118)
üîπ Running PCA on full neural population ...
Neural PCs covering 90% variance: 80
üîπ Computing ViT ‚Üî Neural PC correlations ...
Total tests performed: 3520
‚ùå No correlations survived FDR correction.

üíæ Results saved to vit_brain_global_corr_summary.csv
No significant pairs found.


‚úÖ Saved bayes_pc1.csv
   neuron_idx  pearson_r  beta_mean  beta_lo95  beta_hi95  post_P_beta_gt0  \
0           0   0.013737   0.013621  -0.167887   0.195128         0.558940   
1           1   0.160575   0.159214  -0.019976   0.338403         0.959459   
2           2   0.175615   0.174127  -0.004601   0.352855         0.971952   
3           3  -0.000108  -0.000107  -0.181632   0.181417         0.499534   
4           4  -0.012887  -0.012778  -0.194287   0.168732         0.444684   

   post_P_abs_beta_gt_0.1  
0                0.282743  
1                0.745403  
2                0.794908  
3                0.277535  
4                0.282120  
Neurons with P(beta>0) >= 0.95: 4196
Neurons with P(|beta| > 0.1) >= 0.95: 678
