In [None]:
from pathlib import Path
import re

import pandas as pd
import numpy as np
import nibabel as nib
from tqdm.notebook import tqdm

from convergence.bold5000 import get_resource, load_dataset, load_mask, get_betas_roi

In [None]:
# Load the dataset restricted by the query below (ImageNet-sourced stimuli).
# NOTE(review): presumably a pandas-style query evaluated inside load_dataset — confirm.
imagenet_query = "stim_source=='imagenet'"
ds = load_dataset(query=imagenet_query)

In [None]:
# Root of the local BOLD5000 copy; every other path in this notebook hangs off it.
bold_path = Path("/mnt/tecla/Datasets/bold5000")

# Folder holding the per-subject / per-session stimulus presentation lists.
presentation_list = bold_path.joinpath("bold5000_stimuli/Stimuli_Presentation_Lists")

# Destination for everything this notebook generates. Only the leaf folder is
# created (no parents=True), so `bold_path` itself must already exist.
preprocessed_folder = bold_path.joinpath("preprocessed")
preprocessed_folder.mkdir(exist_ok=True)

In [None]:
def _entries_by_name(parent, pattern):
    """Yield the non-file entries of `parent` matching `pattern`, ordered by name."""
    for entry in sorted(parent.glob(pattern), key=lambda p: p.name):
        if entry.is_file():
            continue
        yield entry


def _describe_stimulus(entry):
    """Return (image_name, stim_source, repeated) for one presentation-list line."""
    # A leading "rep_" marks a repeated presentation; it is not part of the file name.
    repeated = entry.startswith('rep_')
    image_name = entry[4:] if repeated else entry

    if image_name.startswith('COCO'):
        stim_source = 'coco'
    elif re.match(r"n\d+_\d+.*", image_name):
        # e.g. n<digits>_<digits>... -> ImageNet naming scheme
        stim_source = 'imagenet'
    else:
        stim_source = 'scenes'
    return image_name, stim_source, repeated


# One record per presented stimulus, walking subject -> session -> run files.
data = []
for subject_dir in _entries_by_name(presentation_list, "CSI*"):
    subject = int(subject_dir.name.replace("CSI", ""))
    session_prefix = "CSI{}_sess".format(subject)

    for session_dir in _entries_by_name(subject_dir, "CSI*_sess*"):
        session = int(session_dir.name.replace(session_prefix, ""))

        for run_file in sorted(session_dir.glob("*.txt"), key=lambda p: p.name):
            # The run number is the last two characters of the file stem.
            run = int(run_file.stem[-2:])

            with open(run_file, "r") as handle:
                stripped = [line.strip() for line in handle]
            # Blank lines do not count as trials.
            entries = [entry for entry in stripped if entry]

            for run_index, entry in enumerate(entries):
                image_name, stim_source, repeated = _describe_stimulus(entry)
                data.append({
                    "subject": subject,
                    "session": session,
                    "run": run,
                    "run_index": run_index,
                    "image_name": image_name,
                    "stim_source": stim_source,
                    "repeated": repeated
                })

# Chronological order within each subject.
df_stim = (
    pd.DataFrame(data)
    .sort_values(by=["subject", "session", "run", "run_index"])
    .reset_index(drop=True)
)

# 0-based position of each trial inside its session and inside the whole subject.
df_stim["session_index"] = df_stim.groupby(["subject", "session"]).cumcount()
df_stim["subject_index"] = df_stim.groupby(["subject"]).cumcount()

# 1-based count of how many times the subject has seen this image so far.
df_stim["repetition"] = df_stim.groupby(["subject", "image_name"]).cumcount().add(1)


In [None]:
# One row per distinct image. `bold_id` is the df_stim row label of the image's
# first occurrence (drop_duplicates keeps the first row, reset_index exposes its label).
df_stim_images = (
    df_stim[['image_name', 'stim_source']]
    .copy()
    .drop_duplicates(subset=['image_name'])
    .reset_index()
    .rename(columns={'index': 'bold_id'})
)

# Each stimulus source has its own sub-folder of presented images.
presented_stimuli_folder = bold_path / "bold5000_stimuli/Scene_Stimuli/Presented_Stimuli"
dataset_folders = {
    'coco': presented_stimuli_folder / 'COCO',
    'imagenet': presented_stimuli_folder / 'ImageNet',
    'scenes': presented_stimuli_folder / 'Scene',
}

# Resolve every image to its file and make sure it is really on disk.
image_paths = [
    dataset_folders[source] / image_name
    for source, image_name in zip(df_stim_images['stim_source'], df_stim_images['image_name'])
]
assert all(path.exists() for path in image_paths), "Some images are missing"

# Store paths relative to the dataset root so the table does not depend on the mount point.
df_stim_images['image_path'] = [str(path.relative_to(bold_path)) for path in image_paths]

# Compact, explicit dtypes before writing to disk.
df_stim_images = df_stim_images.astype({
    'bold_id': 'uint16',
    'image_name': 'string',
    'image_path': 'string',
})
df_stim_images['stim_source'] = df_stim_images['stim_source'].astype("string").astype("category")

df_stim_images.to_csv(preprocessed_folder / "images.csv", index=False)
df_stim_images.to_parquet(preprocessed_folder / "images.parquet", index=False)
df_stim_images.head()


In [None]:

# Build the per-trial stimulus index: every presentation in df_stim, tagged with
# the bold_id of its image, then written as CSV and parquet.
images_ids = df_stim_images[['bold_id', 'image_name']]

# validate='many_to_one' makes pandas raise if an image_name maps to several bold_ids.
df_stim_ids = df_stim.merge(images_ids, on='image_name', validate='many_to_one')

# Lower-cased file name up to the first dot.
df_stim_ids['name'] = df_stim_ids['image_name'].str.split(".").str[0].str.lower()

# Output columns (in file order) and their storage dtypes.
stimulus_dtypes = {
    'subject': 'uint8',
    'session': 'uint8',
    'run': 'uint8',
    'bold_id': 'uint32',
    'subject_index': 'uint16',
    'session_index': 'uint16',
    'run_index': 'uint8',
    'repetition': 'uint8',
    'name': 'string',
    'stim_source': 'string',
    'image_name': 'string',
}
df_stim_ids = df_stim_ids[list(stimulus_dtypes)].astype(stimulus_dtypes)
# Categorical built on top of the string dtype, as for the images table.
df_stim_ids['stim_source'] = df_stim_ids['stim_source'].astype('category')

df_stim_ids.to_csv(preprocessed_folder / 'stimulus_index.csv', index=False)
# Fix: this file used to be written as 'stimulus_parquet.csv' (a parquet file with
# a .csv extension). It now shares the stem of the CSV written just above.
# NOTE(review): confirm no loader still expects the old 'stimulus_parquet.csv' name.
df_stim_ids.to_parquet(preprocessed_folder / 'stimulus_index.parquet', index=False)
df_stim_ids.head()

### Check masks processed by FSL

In [None]:
# Sanity check: every subject's mask must contain exactly 361 distinct values.
# NOTE(review): presumably 360 ROI labels plus a background label — confirm.
for subject in range(1, 5):
    mask = load_mask(subject=subject)
    print(mask.shape)
    n_distinct_labels = np.unique(mask).size
    assert n_distinct_labels == 361

In [None]:
def whole_session_betas(subject, session, betas_type="D"):
    """Load the GLM beta volume of one subject and session.

    Parameters
    ----------
    subject : int
        Subject number; inserted into the file name as ``CSI{subject}``.
    session : int
        Session number; zero-padded to two digits in the file name.
    betas_type : str, default "D"
        One of "A", "B", "C" or "D", selecting which beta file variant to read.

    Returns
    -------
    The array returned by ``get_fdata()`` on the loaded image. The file is read
    from ``bold_path / "bold5000"``.

    Raises
    ------
    KeyError
        If ``betas_type`` is not one of the four known letters.
    """
    filename_templates = {
        "A": "CSI{subject}_GLMbetas-TYPEA-ASSUMEHRF_ses-{session}.nii.gz",
        "B": "CSI{subject}_GLMbetas-TYPEB-FITHRF_ses-{session}.nii.gz",
        "C": "CSI{subject}_GLMbetas-TYPEC-FITHRF-GLMDENOISE_ses-{session}.nii.gz",
        "D": "CSI{subject}_GLMbetas-TYPED-FITHRF-GLMDENOISE-RR_ses-{session}.nii.gz",
    }
    # File names carry the session as two digits, e.g. 3 -> "03".
    session_tag = f"{session:02d}"
    filename = filename_templates[betas_type].format(subject=subject, session=session_tag)
    return nib.load(str(bold_path / "bold5000" / filename)).get_fdata()

In [None]:
# Output folder for the per-ROI beta arrays. Only the leaf is created, so
# `preprocessed_folder` must already exist.
betas_folder = preprocessed_folder.joinpath("betas")
betas_folder.mkdir(exist_ok=True)

# Rebinds `df_stim` to the "stimulus" resource returned by get_resource;
# the cells below read its `subject` and `session` columns.
df_stim = get_resource("stimulus")

In [None]:
# Export per-ROI beta matrices. For every beta variant and subject, the trials of
# all sessions are stacked (in ascending session order) into one
# (n_stim, n_voxels) array per ROI label of the subject's mask, saved as .npy.
for beta_type in ["D", "C", "B", "A"]:
    for subject in [1, 2, 3, 4]:
        subject_betas_folder = betas_folder / f"sub-{subject}"
        subject_betas_folder.mkdir(exist_ok=True)

        mask = load_mask(subject=subject)
        roi_ids, voxel_counts = np.unique(mask.ravel(), return_counts=True)
        df_stim_subject = df_stim[df_stim.subject == subject]
        n_stim = len(df_stim_subject)

        # One zero-initialised output buffer per ROI label; label 0 is skipped.
        # The voxel coordinates of each ROI are computed once here instead of
        # re-evaluating `mask == roi` for every session.
        betas_data = {}
        roi_voxels = {}
        for roi, count in zip(roi_ids, voxel_counts):
            if roi == 0:
                continue
            betas_data[roi] = np.zeros((n_stim, count))
            roi_voxels[roi] = np.nonzero(mask == roi)

        sessions = sorted(df_stim_subject.session.unique())
        stimulus_index = 0
        # Fill with roi betas
        for session in tqdm(sessions):
            session_betas = whole_session_betas(subject=subject, session=session, betas_type=beta_type)  # (x,y,z,trials)
            n_trials_session = session_betas.shape[-1]
            # Replace NaN and +/-inf with zeros
            session_betas = np.nan_to_num(session_betas, nan=0, posinf=0, neginf=0)
            trial_rows = slice(stimulus_index, stimulus_index + n_trials_session)
            # Iterate the labels actually present in the mask rather than a
            # hard-coded range(1, 361), so the loop always matches `betas_data`.
            for roi, voxels in roi_voxels.items():
                # Indexing with the nonzero() tuple selects the same voxels, in the
                # same order, as the boolean mask: result is (n_voxels, trials).
                betas_data[roi][trial_rows, :] = session_betas[voxels].T
            stimulus_index += n_trials_session

        # Without this check, a shortfall of trials would be saved silently as
        # all-zero rows at the end of every ROI array.
        assert stimulus_index == n_stim, (
            f"sub-{subject}, betas {beta_type}: loaded {stimulus_index} trials "
            f"but the stimulus index has {n_stim} rows"
        )

        for roi, betas in betas_data.items():
            betas_path = subject_betas_folder / f"sub-{subject}_hcpmmp_roi{roi}_{beta_type}.npy"
            np.save(betas_path, betas)
