For combined data:

In [None]:
# Cell setup for the combined-data decoding run.
import sys
# Put the BLIP checkout on the module search path (presumably so the repo's
# own absolute imports resolve — TODO confirm).
sys.path.append('/shared/home/mis6559/neurobio240/BLIP')

# Overwrite the process arguments so that main()'s argparse parser can be
# driven from inside a notebook cell.
# NOTE: the ROI order given here (midventral before midlateral) differs from
# the parser's default --roi_str in main() (midlateral before midventral).
sys.argv = [
    'decode_captions.py',
    '--subject', 'subj01',
    '--gpu', '0',
    '--roi_str', 'early_ventral_midventral_midlateral_lateral_parietal'
]

import torch
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from BLIP.models.blip import blip_decoder
from transformers import logging
import argparse

logging.set_verbosity_error()  # suppress transformer warnings

def generate_from_imageembeds(
    model, device, image_embeds, sample=False, num_beams=3, max_length=30,
    min_length=10, top_p=0.9, repetition_penalty=1.0
):
    """Generate captions from image embeddings using the model's text decoder.

    The embeddings are handed to ``model.text_decoder.generate`` as
    ``encoder_hidden_states`` together with an all-ones attention mask, and
    the decoder is seeded with the tokenized ``model.prompt``.

    Args:
        model: Object exposing ``prompt``, ``tokenizer`` and ``text_decoder``.
        device: Device the prompt ids and attention mask are moved to.
        image_embeds: Tensor of embeddings; the mask covers every dimension
            except the last one.
        sample: If true, use sampling with ``top_p``; otherwise beam search
            with ``num_beams``.
        num_beams: Beam width (beam-search path only).
        max_length: Forwarded to ``generate``.
        min_length: Forwarded to ``generate``.
        top_p: Nucleus-sampling threshold (sampling path only).
        repetition_penalty: Forwarded to ``generate``.

    Returns:
        A list with one decoded string per returned sequence, with the first
        ``len(model.prompt)`` characters removed.
    """
    tokenizer = model.tokenizer

    # Build the decoder prompt: first token replaced by BOS, last token
    # dropped to avoid duplication.
    prompt_ids = tokenizer([model.prompt], return_tensors="pt").input_ids.to(device)
    prompt_ids[:, 0] = tokenizer.bos_token_id
    prompt_ids = prompt_ids[:, :-1]

    # The embeddings are passed as-is (not repeated per beam); every
    # embedding position is attended to.
    attention = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)

    # Only the decoding strategy differs between the two modes.
    if sample:
        strategy = {"do_sample": True, "top_p": top_p, "num_return_sequences": 1}
    else:
        strategy = {"num_beams": num_beams}

    sequences = model.text_decoder.generate(
        input_ids=prompt_ids,
        max_length=max_length,
        min_length=min_length,
        eos_token_id=tokenizer.sep_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=repetition_penalty,
        encoder_hidden_states=image_embeds,
        encoder_attention_mask=attention,
        **strategy,
    )

    # Strip the prompt prefix from each decoded sequence.
    prefix_len = len(model.prompt)
    captions = []
    for sequence in sequences:
        text = tokenizer.decode(sequence, skip_special_tokens=True)
        captions.append(text[prefix_len:])
    return captions

def main():
    """Decode captions from the combined predicted BLIP features of a subject.

    Parses ``--subject``, ``--roi_str`` and ``--gpu`` from ``sys.argv``, loads
    the BLIP decoder, loads
    ``{subject}_{roi_str}_scores_blip_combined.npy``, generates one caption
    per row and writes them to ``captions/captions_brain.csv``
    (tab-separated, no header, no index).

    Raises:
        ValueError: If a feature row holds fewer than 225 * 768 values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--subject", type=str, required=True)
    parser.add_argument("--roi_str", type=str, default="early_ventral_midlateral_midventral_lateral_parietal")
    parser.add_argument("--gpu", type=int, default=0)
    args = parser.parse_args()

    subject = args.subject
    roi_str = args.roi_str
    gpu = args.gpu

    # Select the CUDA device only when CUDA exists. Calling set_device
    # unconditionally raises on a CPU-only host, which made the CPU fallback
    # unreachable.
    if torch.cuda.is_available():
        torch.cuda.set_device(gpu)
        device = torch.device(f"cuda:{gpu}")
    else:
        device = torch.device("cpu")

    # Setup paths
    image_size = 240
    decoded_path = f"/shared/home/mis6559/neurobio240/decoded/{subject}/{subject}_{roi_str}_scores_blip_combined.npy"
    savedir = f"/shared/home/mis6559/neurobio240/decoded/{subject}/captions"
    os.makedirs(savedir, exist_ok=True)

    # Load model
    print("Loading BLIP decoder...")
    model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
    model_decoder = blip_decoder(
        pretrained=model_url,
        image_size=image_size,
        vit="base",
        med_config="/shared/home/mis6559/neurobio240/BLIP/configs/med_config.json"
    )
    model_decoder.eval()
    model_decoder = model_decoder.to(device)

    # Load predicted BLIP features
    print(f"Loading predicted features from: {decoded_path}")
    scores = np.load(decoded_path)
    print("Shape:", scores.shape)

    # Use the fixed number of tokens (225) expected by BLIP
    # NOTE(review): the solo-data cell in this file reshapes to 226 tokens;
    # confirm that keeping only the first 225 is intended here.
    expected_tokens = 225
    expected_length = expected_tokens * 768
    print(f"Using expected BLIP input shape: {expected_tokens} tokens ({expected_length} features)")

    # Generate captions
    captions_brain = []
    for imidx in tqdm(range(scores.shape[0])):
        flat = scores[imidx, :]

        if flat.shape[0] < expected_length:
            raise ValueError(f"[{imidx}] Too few features: {flat.shape[0]} < {expected_length}")
        elif flat.shape[0] > expected_length:
            # Surplus trailing features are dropped so the row reshapes
            # cleanly to (expected_tokens, 768).
            print(f"[{imidx}] Trimming from {flat.shape[0] // 768} → {expected_tokens} tokens")
            flat = flat[:expected_length]

        # Add a leading batch dimension of 1: (1, expected_tokens, 768).
        scores_test = torch.Tensor(flat.reshape(expected_tokens, 768)).unsqueeze(0).to(device)

        captions = generate_from_imageembeds(model_decoder, device, scores_test)
        captions_brain.append(captions[0])

    # Save output
    out_csv = os.path.join(savedir, "captions_brain.csv")
    pd.DataFrame(captions_brain).to_csv(out_csv, sep='\t', header=False, index=False)
    print(f"Saved {len(captions_brain)} captions to: {out_csv}")

# Entry point: run the combined-data decoding when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

For solo data:

In [None]:
# Cell setup for the solo-data decoding run.
import sys
# Put the BLIP checkout on the module search path (presumably so the repo's
# own absolute imports resolve — TODO confirm).
sys.path.append('/shared/home/mis6559/neurobio240/BLIP')

# Overwrite the process arguments so that main()'s argparse parser can be
# driven from inside a notebook cell; this run targets the 'early' ROI.
sys.argv = [
    'decode_captions.py',
    '--subject', 'subj01',
    '--gpu', '0',
    '--roi_str', 'early'
]

import torch
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from BLIP.models.blip import blip_decoder
from transformers import logging
import argparse

logging.set_verbosity_error()  # suppress transformer warnings

def generate_from_imageembeds(model, device, image_embeds, sample=False, num_beams=3, max_length=30, min_length=10, top_p=0.9, repetition_penalty=1.0):
    """Generate captions from image embeddings using the model's text decoder.

    The embeddings are handed to ``model.text_decoder.generate`` as
    ``encoder_hidden_states`` together with an all-ones attention mask, and
    the decoder is seeded with the tokenized ``model.prompt``.

    Args:
        model: Object exposing ``prompt``, ``tokenizer`` and ``text_decoder``.
        device: Device the prompt ids and attention mask are created on.
        image_embeds: Tensor of embeddings; the mask covers every dimension
            except the last one.
        sample: If true, use sampling with ``top_p``; otherwise beam search
            with ``num_beams``.
        num_beams: Beam width (beam-search path only).
        max_length: Forwarded to ``generate``.
        min_length: Forwarded to ``generate``.
        top_p: Nucleus-sampling threshold (sampling path only).
        repetition_penalty: Forwarded to ``generate``.

    Returns:
        A list with one decoded string per returned sequence, with the first
        ``len(model.prompt)`` characters removed.
    """
    prompt = [model.prompt]
    input_ids = model.tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    input_ids[:, 0] = model.tokenizer.bos_token_id
    input_ids = input_ids[:, :-1]  # drop the trailing token of the prompt

    image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=device)

    generate_kwargs = {
        "input_ids": input_ids,
        "max_length": max_length,
        "min_length": min_length,
        "eos_token_id": model.tokenizer.sep_token_id,
        "pad_token_id": model.tokenizer.pad_token_id,
        "repetition_penalty": repetition_penalty,
        "encoder_hidden_states": image_embeds,
        "encoder_attention_mask": image_atts,
    }

    if sample:
        generate_kwargs.update(do_sample=True, top_p=top_p, num_return_sequences=1)
    else:
        # generate() replicates the prompt once per beam on its own.
        # Pre-expanding input_ids and image_embeds by num_beams made it treat
        # a single image as a batch of num_beams images, which multiplied the
        # work and returned num_beams duplicate captions.
        # NOTE(review): transformers releases that do not expand
        # encoder_hidden_states per beam would rely on broadcasting here,
        # which only covers a single-image batch — confirm against the
        # installed version.
        generate_kwargs["num_beams"] = num_beams

    outputs = model.text_decoder.generate(**generate_kwargs)

    prompt_len = len(model.prompt)
    return [
        model.tokenizer.decode(output, skip_special_tokens=True)[prompt_len:]
        for output in outputs
    ]


def main():
    """Decode captions from the single-ROI predicted BLIP features of a subject.

    Parses ``--subject``, ``--roi_str`` and ``--gpu`` from ``sys.argv``, loads
    the BLIP decoder, loads ``{subject}_{roi_str}_scores_blip.npy``, generates
    one caption per row and writes them to ``captions/captions_brain.csv``
    (tab-separated, no header, no index).

    Raises:
        ValueError: If a feature row does not hold exactly 226 * 768 values.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--subject", type=str, required=True)
    parser.add_argument("--roi_str", type=str, default="early_ventral_midlateral_midventral_lateral_parietal")
    parser.add_argument("--gpu", type=int, default=0)
    args = parser.parse_args()

    subject = args.subject
    roi_str = args.roi_str
    gpu = args.gpu

    # Select the CUDA device only when CUDA exists. Calling set_device
    # unconditionally raises on a CPU-only host, which made the CPU fallback
    # unreachable.
    if torch.cuda.is_available():
        torch.cuda.set_device(gpu)
        device = torch.device(f"cuda:{gpu}")
    else:
        device = torch.device("cpu")

    # Setup paths
    image_size = 240
    # Honour --roi_str instead of a hard-coded "early"; the flag used to be
    # parsed and then silently ignored.
    decoded_path = f"/shared/home/mis6559/neurobio240/decoded/{subject}/{subject}_{roi_str}_scores_blip.npy"
    savedir = f"/shared/home/mis6559/neurobio240/decoded/{subject}/captions"
    os.makedirs(savedir, exist_ok=True)

    # Load model
    print("Loading BLIP decoder...")
    model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
    model_decoder = blip_decoder(
        pretrained=model_url,
        image_size=image_size,
        vit="base",
        med_config="/shared/home/mis6559/neurobio240/BLIP/configs/med_config.json"
    )
    model_decoder.eval()
    model_decoder = model_decoder.to(device)

    # Load BLIP predictions
    print(f"Loading predicted features from: {decoded_path}")
    scores = np.load(decoded_path)
    print(f"score shape: {scores.shape}")  # should be (228, N), where N = num_tokens * 768
    print(f"Element count per image: {scores.shape[1]}")
    print(f"Num tokens: {scores.shape[1] / 768}")

    num_tokens = 226  # full 226 tokens per image
    expected_length = num_tokens * 768

    # Generate captions
    captions_brain = []
    for imidx in tqdm(range(scores.shape[0])):
        flat = scores[imidx, :]
        # Raise explicitly rather than assert: asserts are stripped under -O.
        if flat.shape[0] != expected_length:
            raise ValueError(f"[{imidx}] Wrong vector length: {flat.shape[0]}")

        reshaped = flat.reshape(num_tokens, 768)
        scores_test = torch.Tensor(reshaped).unsqueeze(0).to(device)

        print(f"[{imidx}] Passing tensor with shape: {scores_test.shape}")  # should be (1, 226, 768)

        captions = generate_from_imageembeds(model_decoder, device, scores_test, num_beams=3, max_length=20, min_length=5, repetition_penalty=1.5)
        # generate_from_imageembeds returns a list; store the caption string
        # itself so the CSV holds exactly one caption per image.
        captions_brain.append(captions[0])

    # Save output
    # NOTE(review): this is the same output path the combined-data cell in
    # this file writes to for the same subject, so running both overwrites
    # the earlier result — confirm whether that is intended.
    out_csv = os.path.join(savedir, "captions_brain.csv")
    pd.DataFrame(captions_brain).to_csv(out_csv, sep='\t', header=False, index=False)
    print(f"Saved decoded captions to: {out_csv}")

# Entry point: run the solo-data decoding when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    main()