In [2]:
#!/usr/bin/env python3
"""
Cross-domain cvPCA between ViT embeddings and mouse brain population responses.

Computes the singular values of the cross-covariance matrix between PCA-reduced
ViT embeddings and neural responses (averaged across trials and time bins).
These singular values quantify the shared, stimulus-driven covariance between
the two representational domains.

Author: Maria + Pläku 🐾
"""

import numpy as np
import pickle
from sklearn.decomposition import PCA
from scipy.special import softmax
from skbio.stats.composition import clr

# ---------------------------------------------------------------
# CONFIG
# ---------------------------------------------------------------
# Pickle holding the ViT logits; the 'natural_scenes' entry is read below.
VIT_PATH = (
    '/home/maria/Documents/HuggingMouseData/MouseViTEmbeddings/'
    'google_vit-base-patch16-224_embeddings_logits.pkl'
)
# Neural response matrix; rows are selected with the area labels.
NEURAL_PATH = (
    '/home/maria/LuckyMouse/pixel_transformer_neuro/data/processed/'
    'hybrid_neural_responses.npy'
)
# Array of brain-area labels compared against AREA_NAME.
AREAS_PATH = '/home/maria/MITNeuralComputation/visualization/brain_area.npy'
# Brain area to analyse, e.g., VISp, VISam, VISpm...
AREA_NAME = 'VISp'
# Factors used to unfold the response matrix's second axis into
# (images, trials, time).
N_IMAGES = 118
N_TRIALS = 50
# Cumulative explained-variance threshold used to pick the number of PCs.
VAR_CUTOFF = 0.90
# Passed to every PCA as random_state.
RANDOM_SEED = 42

# ---------------------------------------------------------------
# LOAD ViT EMBEDDINGS
# ---------------------------------------------------------------
print("üîπ Loading ViT embeddings...")
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only point VIT_PATH at a pickle you created or otherwise trust.
with open(VIT_PATH, 'rb') as f:
    vit_logits = pickle.load(f)['natural_scenes']  # shape: (images, D_vit)

# Logits -> per-image probabilities -> centred log-ratio coordinates.
# The small offset guards the log-ratio against probabilities that
# underflow to exactly zero.
Xv = softmax(np.asarray(vit_logits), axis=1)
Xv = clr(Xv + 1e-12)

# The cross-covariance step pairs ViT rows with neural rows image by image,
# so fail here with a clear message rather than later with a shape error.
if Xv.shape[0] != N_IMAGES:
    raise ValueError(
        f"Expected {N_IMAGES} ViT embedding rows (one per image), "
        f"got {Xv.shape[0]}"
    )

# PCA to reduce ViT to the VAR_CUTOFF fraction of variance.
# A full fit gives the cumulative variance curve; the number of PCs is the
# first position at which that curve reaches the cutoff.
vit_pca_full = PCA(random_state=RANDOM_SEED).fit(Xv)
vit_cumvar = np.cumsum(vit_pca_full.explained_variance_ratio_)
# Clamp: rounding can leave the last cumulative value just below 1.0, in
# which case a cutoff near 1 would otherwise ask for one PC too many.
vit_ncomp = min(int(np.searchsorted(vit_cumvar, VAR_CUTOFF)) + 1,
                vit_cumvar.size)
vit_pca = PCA(n_components=vit_ncomp, random_state=RANDOM_SEED)
Zv = vit_pca.fit_transform(Xv)  # (images x vit_ncomp)
# Report the configured cutoff instead of a hard-coded "90%".
print(f"ViT PCs covering {VAR_CUTOFF * 100:g}% variance: {vit_ncomp}")

# ---------------------------------------------------------------
# LOAD NEURAL DATA (AVERAGED RESPONSES)
# ---------------------------------------------------------------
print(f"üîπ Loading neural responses for {AREA_NAME}...")
dat = np.load(NEURAL_PATH, mmap_mode='r')
# NOTE(security): allow_pickle=True lets np.load unpickle object arrays,
# which can execute arbitrary code; only load a trusted AREAS_PATH.
areas = np.load(AREAS_PATH, allow_pickle=True)
mask = (areas == AREA_NAME)

# The mask must label every row of the response matrix exactly once.
if np.shape(mask) != (dat.shape[0],):
    raise ValueError(
        f"Area labels have shape {np.shape(mask)} but the response matrix "
        f"has {dat.shape[0]} rows"
    )
# An empty selection would otherwise surface as an obscure PCA error.
if not mask.any():
    raise ValueError(f"No neurons labelled {AREA_NAME!r} in {AREAS_PATH}")
dat = dat[mask]

n_neurons, n_total = dat.shape
# The second axis is unfolded as (image, trial, time) in C order.
# NOTE(review): confirm this matches the order used when the file was written.
n_time, leftover = divmod(n_total, N_IMAGES * N_TRIALS)
if n_time == 0 or leftover:
    raise ValueError(
        f"Response length {n_total} is not a positive multiple of "
        f"N_IMAGES * N_TRIALS = {N_IMAGES * N_TRIALS}"
    )
dat = dat.reshape(n_neurons, N_IMAGES, N_TRIALS, n_time)
# Average over trials and time, leaving one value per neuron and image.
X_mean = dat.mean(axis=(2, 3))  # (neurons x images)

# PCA to reduce neural data to the VAR_CUTOFF fraction of variance.
# Images are the samples, hence the transpose.
brain_pca_full = PCA(random_state=RANDOM_SEED).fit(X_mean.T)
brain_cumvar = np.cumsum(brain_pca_full.explained_variance_ratio_)
# Clamp: rounding can leave the last cumulative value just below 1.0, in
# which case a cutoff near 1 would otherwise ask for one PC too many.
brain_ncomp = min(int(np.searchsorted(brain_cumvar, VAR_CUTOFF)) + 1,
                  brain_cumvar.size)
brain_pca = PCA(n_components=brain_ncomp, random_state=RANDOM_SEED)
Zb = brain_pca.fit_transform(X_mean.T)  # (images x brain_ncomp)
# Report the configured cutoff instead of a hard-coded "90%".
print(f"{AREA_NAME} PCs covering {VAR_CUTOFF * 100:g}% variance: {brain_ncomp}")

# ---------------------------------------------------------------
# CROSS-DOMAIN cvPCA (CROSS-COVARIANCE SVD)
# ---------------------------------------------------------------
print("üîπ Computing cross-domain covariance and SVD ...")

# Subtract each column's mean over images; both score matrices are
# updated in place.
np.subtract(Zv, Zv.mean(axis=0, keepdims=True), out=Zv)
np.subtract(Zb, Zb.mean(axis=0, keepdims=True), out=Zb)

# Cross-covariance between ViT and brain scores, normalised by the number
# of rows: shape (vit_ncomp x brain_ncomp).
_n_rows = Zv.shape[0]
C = np.matmul(Zv.T, Zb) / _n_rows

# Thin SVD: columns of U live in ViT-PC space, rows of Vt in brain-PC space.
U, S, Vt = np.linalg.svd(C, full_matrices=False)

# Squared singular values, rescaled so they sum to one.
_sq_singular = np.square(S)
shared_var = _sq_singular / _sq_singular.sum()

# ---------------------------------------------------------------
# REPORT RESULTS
# ---------------------------------------------------------------
print("\n===== Cross-domain cvPCA results =====")
# One row per singular value: 1-based index, the value itself, and its
# normalised squared share expressed as a percentage.
row_template = "Component {:2d}:  shared œÉ = {:.4f}  |  fraction = {:.2f}%"
for rank, (sigma, share) in enumerate(zip(S, shared_var), start=1):
    print(row_template.format(rank, sigma, share * 100))

# shared_var is normalised to sum to one, so the total is 100% by construction.
print(f"\n‚úÖ Total shared variance across {len(S)} components = 100%")

# Save results for visualization; the archive keys are the dict keys below.
results_file = f"vit_{AREA_NAME}_cvpca_results.npz"
results = {
    "singular_values": S,
    "shared_fraction": shared_var,
    "vit_basis": U,
    "brain_basis": Vt.T,
    "vit_scores": Zv,
    "brain_scores": Zb,
}
np.savez(results_file, **results)

print(f"üíæ Saved to {results_file}")


üîπ Loading ViT embeddings...
ViT PCs covering 90% variance: 44
üîπ Loading neural responses for VISp...
VISp PCs covering 90% variance: 87
üîπ Computing cross-domain covariance and SVD ...

===== Cross-domain cvPCA results =====
Component  1:  shared œÉ = 14.5762  |  fraction = 33.68%
Component  2:  shared œÉ = 11.5751  |  fraction = 21.24%
Component  3:  shared œÉ = 5.8192  |  fraction = 5.37%
Component  4:  shared œÉ = 5.5102  |  fraction = 4.81%
Component  5:  shared œÉ = 5.2341  |  fraction = 4.34%
Component  6:  shared œÉ = 4.5741  |  fraction = 3.32%
Component  7:  shared œÉ = 4.2715  |  fraction = 2.89%
Component  8:  shared œÉ = 3.8619  |  fraction = 2.36%
Component  9:  shared œÉ = 3.6181  |  fraction = 2.08%
Component 10:  shared œÉ = 3.2361  |  fraction = 1.66%
Component 11:  shared œÉ = 3.0995  |  fraction = 1.52%
Component 12:  shared œÉ = 2.8749  |  fraction = 1.31%
Component 13:  shared œÉ = 2.7464  |  fraction = 1.20%
Component 14:  shared œÉ = 2.6803  |  fraction =