# Cross-validation to validate the accuracy of the model trained on specter

## Text to brian

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

import torch
from torch import nn
import warnings

from tqdm.notebook import tqdm

import os

from neurovlm.data import fetch_data
from neurovlm.models import NeuroAutoEncoder, TextAligner
from neurovlm.train import Trainer, which_device

device = torch.device('cpu')
warnings.filterwarnings("ignore")

In [2]:


# Load data once, onto the correct device
latent_text = torch.load(
    "specter/latent_text.pt", weights_only=False
).to("cpu")

with torch.no_grad():
    latent_text_detach = torch.load("specter/latent_text.pt").to("cpu").detach()

neuro_vectors = torch.load(
    "specter/neuro_vectors.pt", weights_only=False
).to("cpu")

neuro_vectors = neuro_vectors[:1000]

In [4]:
# Set up k-fold Cross-Validation
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Loss function for reconstruction
loss_fn = nn.BCELoss()

fold_val_losses = []

# Outer loop with tqdm
for fold_num, (train_idx, val_idx) in enumerate(
        tqdm(kf.split(neuro_vectors), total=k, desc="Fold"), start=1
    ):
    print(f"\n--- Fold {fold_num}/{k} ---")

    # Split `train_idx` further into sub-train and sub-test
    train_sub_idx, test_sub_idx = train_test_split(
        train_idx, train_size=0.8, random_state=0
    )

    # Build tensors for this fold
    X_train = neuro_vectors[train_sub_idx]
    X_test  = neuro_vectors[test_sub_idx]
    X_val   = neuro_vectors[val_idx]

    # 1) Train AutoEncoder
    trainer = Trainer(
        NeuroAutoEncoder(seed=fold_num),
        n_epochs=100,
        batch_size=256,
        lr=1e-4,
        loss_fn=loss_fn,
        optimizer=torch.optim.AdamW,
        X_val=X_test,
        device="cpu"
    )

    trainer.fit(X_train)
    autoencoder = trainer.model

    # 2) Train TextAligner
    # Encode MNI vectors and fully detach to break any graph connectivity
    with torch.no_grad():
        latent_neuro = autoencoder.encoder(
            neuro_vectors.to("cpu")
        )
    latent_neuro = latent_neuro.detach().clone()

    align_trainer = Trainer(
        TextAligner(
            latent_text_dim=latent_text.shape[-1],
            hidden_dim=384,
            latent_neuro_dim=latent_neuro.shape[-1],
            seed=0
        ),
        n_epochs=500,
        batch_size=1028,
        lr=2e-4,
        loss_fn=nn.MSELoss(),
        optimizer=torch.optim.AdamW,
        X_val=latent_text_detach[test_sub_idx],
        y_val=latent_neuro[test_sub_idx],
        device="cpu"
    )

    align_trainer.fit(
        latent_text_detach[train_sub_idx],
        latent_neuro[train_sub_idx]
    )

    aligner = align_trainer.model

    # 3) Compute final validation loss on the held-out fold
    autoencoder.eval()
    with torch.no_grad():
        latent_batch = latent_text[val_idx]
        aligned_batch = aligner(latent_batch)
        model_prediction = autoencoder.to('cpu').decoder(aligned_batch)
        val_loss = loss_fn(model_prediction, X_val).item()

    print(f"Validation BCE Loss: {val_loss:.4f}")
    fold_val_losses.append(val_loss)

# 4) Summarize across all folds
mean_loss = sum(fold_val_losses) / k
print(f"\nFold-wise losses: {fold_val_losses}")
print(f"Mean validation loss over {k} folds: {mean_loss:.4f}")

Fold:   0%|          | 0/5 [00:00<?, ?it/s]


--- Fold 1/5 ---
Validation BCE Loss: 0.6869108 (initial) -> 8.048 (current)))

--- Fold 2/5 ---
Validation BCE Loss: 0.6908949 (initial) -> 8.2855 (current)

--- Fold 3/5 ---
Validation BCE Loss: 0.6997182 (initial) -> 8.5162 (current))

--- Fold 4/5 ---
Validation BCE Loss: 0.6444784 (initial) -> 8.9977 (current))

--- Fold 5/5 ---
Validation BCE Loss: 0.6736145 (initial) -> 9.65 (current)t))

Fold-wise losses: [0.6868982911109924, 0.6908074021339417, 0.6997487545013428, 0.6443724036216736, 0.6736149191856384]
Mean validation loss over 5 folds: 0.6791


## Brain to text

In [5]:
import numpy as np
import torch
from torch import nn
from sklearn.model_selection import train_test_split, KFold
from tqdm.notebook import tqdm
from neurovlm.models import NeuroAutoEncoder, TextAligner
from neurovlm.train import Trainer, which_device

In [None]:
# Determine compute device
# You can set this to "cpu" or use which_device() to auto-select
device = "cpu"

# Load precomputed latents and brain vectors
latent_text = torch.load(
    "specter/latent_text.pt",
    map_location=device
).to(device)
# Detached copy for aligner training
latent_text_detach = latent_text.detach().clone()

neuro_vectors = torch.load(
    "specter/neuro_vectors.pt",
    map_location=device
).to(device)

In [6]:
# 5-fold CV
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

fold_scores = []
top_k = 5

for fold_num, (train_idx, val_idx) in enumerate(
        tqdm(kf.split(neuro_vectors), total=k, desc="Fold"), start=1
    ):
    print(f"\n--- Fold {fold_num}/{k} ---")

    # Split train into sub-train, sub-test
    train_sub_idx, test_sub_idx = train_test_split(
        train_idx, train_size=0.8, random_state=0
    )

    # Subsets
    X_train = neuro_vectors[train_sub_idx]
    X_test  = neuro_vectors[test_sub_idx]

    # 1) Train AutoEncoder on sub-train
    ae_trainer = Trainer(
        NeuroAutoEncoder(seed=fold_num),
        n_epochs=100,
        batch_size=256,
        lr=1e-4,
        loss_fn=nn.BCELoss(),
        optimizer=torch.optim.AdamW,
        X_val=X_test,
        device=device
    )
    ae_trainer.fit(X_train)
    autoencoder = ae_trainer.model.eval()

    # 2) Train TextAligner
    # Encode all neuro into detached tensor
    with torch.no_grad():
        latent_neuro = autoencoder.encoder(neuro_vectors)
    latent_neuro = latent_neuro.detach().clone()

    align_trainer = Trainer(
        TextAligner(
            latent_text_dim=latent_text.shape[-1],
            hidden_dim=384,
            latent_neuro_dim=latent_neuro.shape[-1],
            seed=0
        ),
        n_epochs=500,
        batch_size=1024,
        lr=2e-4,
        loss_fn=nn.MSELoss(),
        optimizer=torch.optim.AdamW,
        X_val=latent_text_detach[test_sub_idx],
        y_val=latent_neuro[test_sub_idx],
        device=device
    )
    align_trainer.fit(
        latent_text_detach[train_sub_idx],
        latent_neuro[train_sub_idx]
    )
    aligner = align_trainer.model.eval()

    # 3) Evaluate retrieval on held-out fold
    # Candidate documents: those in the outer training set
    docs = latent_text[train_idx]
    with torch.no_grad():
        aligned_docs = aligner(docs)
    docs_np = aligned_docs.detach().cpu().numpy().astype(np.float32)
    docs_norm = docs_np / np.linalg.norm(docs_np, axis=1, keepdims=True)

    # Query brain vectors for validation set
    with torch.no_grad():
        q_neuro = autoencoder.encoder(neuro_vectors[val_idx])
    queries_np = q_neuro.detach().cpu().numpy().astype(np.float32)

    # Compute average top-k cosine similarity per query
    fold_similarities = []
    for vec in queries_np:
        vec_norm = vec / np.linalg.norm(vec)
        sims = docs_norm @ vec_norm
        topk = np.sort(sims)[-top_k:]
        fold_similarities.append(topk.mean())

    fold_score = np.mean(fold_similarities)
    print(f"Fold {fold_num} avg top-{top_k} cos sim: {fold_score:.4f}")
    fold_scores.append(fold_score)

# Summary
mean_score = np.mean(fold_scores)
print(f"\nFold-wise scores: {fold_scores}")
print(f"Mean avg top-{top_k} cos sim over {k} folds: {mean_score:.4f}")

Fold:   0%|          | 0/5 [00:00<?, ?it/s]


--- Fold 1/5 ---
Fold 1 avg top-5 cos sim: 0.9456initial) -> 8.048 (current)))

--- Fold 2/5 ---
Fold 2 avg top-5 cos sim: 0.9403initial) -> 8.2855 (current)

--- Fold 3/5 ---
Fold 3 avg top-5 cos sim: 0.9456initial) -> 8.5162 (current))

--- Fold 4/5 ---
Fold 4 avg top-5 cos sim: 0.9456initial) -> 8.9977 (current))

--- Fold 5/5 ---
Fold 5 avg top-5 cos sim: 0.9432initial) -> 9.65 (current)t))

Fold-wise scores: [np.float32(0.94562376), np.float32(0.9403087), np.float32(0.94555545), np.float32(0.9455957), np.float32(0.9432132)]
Mean avg top-5 cos sim over 5 folds: 0.9441


In [None]:
# Ask ryan is the loss is
# 1. average cosine similarity between the between the unseen brain and the top k most similar title&abstract pairs
# 2. or the average cosine similarity between the unseen brain's title & abstract vector and the top k most similar title&abstract pairs