# Extracting Shirer components from fMRI data

We'll be working with the afni-smooth derivatives from the Nastase et al. dataset. These NIfTIs have been through the following preprocessing steps:

#### fMRIPrep 20.0.5
* Motion correction: FSL MCFLIRT
* Slice timing correction: AFNI 3dTshift
* Susceptibility distortion correction: fieldmap-less method using BOLD-T1w registration with intensity inversion; ANTs antsRegistration
* Co-registration to T1w: boundary-based registration (bbregister, FreeSurfer)
* Spatial normalization: to MNI152NLin2009cAsym using the transforms from T1w → MNI
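* Resampling: BOLD resampled to MNI space (volumetric) with 2 mm isotropic resolution (and fsaverage for surface). Note: the `res-native` files used in this notebook are on a 3 × 3 × 4 mm grid (see the affine printed further down), not 2 mm.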
* Confounds extraction: motion parameters (6 DOF), FD, DVARS, aCompCor (from WM and CSF masks), global signal, cosine drift terms
##### Note: No spatial smoothing, no global signal regression, no temporal filtering

#### Confound regression in AFNI
* Nuisance regression of the confounds listed above
* No smoothing

#### Smoothing in AFNI
* 6 mm FWHM

##### Now, we will have to z-score voxelwise across time...

In [8]:
import os
import re
import pandas as pd
import glob
import json
import numpy as np
import nibabel as nib
from collections import defaultdict
from nilearn.image import resample_img
import gc
import h5py

Printing all available pieman niftis in the afni-smooth directory

In [2]:
# Root of the AFNI-smoothed derivatives
base_dir = '/home/jovyan/shared/data/narratives/derivatives/afni-smooth'

# Keep only subject directories numbered sub-001 through sub-082.
# The regex already requires the "sub-" prefix, so no separate startswith test is needed.
subject_dirs = []
for d in os.listdir(base_dir):
    match = re.match(r"sub-(\d+)", d)
    if match and 1 <= int(match.group(1)) <= 82:
        full_path = os.path.join(base_dir, d)
        if os.path.isdir(full_path):
            subject_dirs.append(full_path)

# Pie Man BOLD filenames exist both with and without a run-* entity
file_patterns = [
    "*_task-pieman_run-*_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz",
    "*_task-pieman_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz"
]

# Report the matching files for each subject, in subject order
for sub_dir in sorted(subject_dirs):
    subject_id = os.path.basename(sub_dir)
    func_dir = os.path.join(sub_dir, 'func')
    matched_files = []

    if os.path.exists(func_dir):
        for pattern in file_patterns:
            # glob returns paths in arbitrary order; sort so the listing is reproducible
            matched_files.extend(sorted(glob.glob(os.path.join(func_dir, pattern))))

    if matched_files:
        print(f"{subject_id} — {len(matched_files)} run(s) found:")
        for f in matched_files:
            print(f"    {f}")
    else:
        print(f"{subject_id} — no matching runs found.")

sub-001 — 2 run(s) found:
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-001/func/sub-001_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-001/func/sub-001_task-pieman_run-2_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-002 — 2 run(s) found:
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-002/func/sub-002_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-002/func/sub-002_task-pieman_run-2_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-003 — 2 run(s) found:
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-003/func/sub-003_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-003/func/sub-003_task-pieman_run-2_space-MNI152NL

Excluding NIfTIs using the exclusion list in `scan_exclude.json`

In [3]:
# === Load exclusion JSON ===
exclude_path = '/home/jovyan/scan_exclude.json'
with open(exclude_path, 'r') as f:
    exclusion_dict = json.load(f)

# Task-specific exclusions: looked up below as subject ID -> collection of scan identifiers
pieman_exclusions = exclusion_dict.get("pieman", {})

# === Base data directory ===
base_dir = '/home/jovyan/shared/data/narratives/derivatives/afni-smooth'

# Filter sub-001 to sub-082 (the regex already requires the "sub-" prefix)
subject_dirs = []
for d in os.listdir(base_dir):
    match = re.match(r"sub-(\d+)", d)
    if match and 1 <= int(match.group(1)) <= 82:
        full_path = os.path.join(base_dir, d)
        if os.path.isdir(full_path):
            subject_dirs.append(full_path)

# File patterns (with and without run)
file_patterns = [
    "*_task-pieman_run-*_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz",  # multi-run
    "*_task-pieman_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz"         # single-run
]

# === Collect included files ===
subject_to_files = defaultdict(list)

for sub_dir in sorted(subject_dirs):
    subject_id = os.path.basename(sub_dir)  # e.g., sub-014
    func_dir = os.path.join(sub_dir, 'func')
    matched_files = []

    if os.path.exists(func_dir):
        for pattern in file_patterns:
            # glob order is arbitrary; sort for a reproducible file order
            matched_files.extend(sorted(glob.glob(os.path.join(func_dir, pattern))))

    if not matched_files:
        # Subject has no Pie Man scans at all: nothing to register
        continue

    # Register the subject even if every run ends up excluded. Without this the
    # defaultdict never gets a key for such subjects, they vanish silently, and
    # the "no valid runs after exclusion" report further down can never fire.
    kept_files = subject_to_files[subject_id]
    excluded_scans = pieman_exclusions.get(subject_id, [])

    for fpath in matched_files:
        fname = os.path.basename(fpath)
        # Remove suffix after _space-... to get the scan identifier
        scan_id = fname.split("_space-")[0]  # e.g., sub-001_task-pieman_run-1 or sub-021_task-pieman

        # Skip if excluded
        if scan_id in excluded_scans:
            continue

        # Keep track of included files
        kept_files.append(fpath)

# === Keep only the earliest run per subject ===
def extract_run_index(fname):
    """Parse the ``_run-<N>`` entity out of a filename.

    Returns N as an int. Filenames without a ``_run-`` entity (the
    single-run case) yield 0. Only the first ``_run-`` occurrence is used.
    """
    hit = re.search(r"_run-(\d+)", fname)
    if hit is None:
        return 0
    return int(hit.group(1))

# === Pick the earliest remaining run for every subject (computed once) ===
# earliest_by_subject maps subject ID -> path of its lowest-numbered included run.
# It is the single source for the per-subject printout, usable_subjects and final_niftis.
earliest_by_subject = {}

for subject_id in sorted(subject_to_files.keys()):
    included_files = subject_to_files[subject_id]

    if not included_files:
        print(f"{subject_id} — no valid runs after exclusion.")
        continue

    # Sort by run index (files without a run entity count as 0) and take the first
    included_files.sort(key=lambda x: extract_run_index(os.path.basename(x)))
    earliest_file = included_files[0]
    earliest_by_subject[subject_id] = earliest_file

    print(f"{subject_id} — 1 run included (earliest):")
    print(f"    {earliest_file}")


# === Print subjects with usable NIfTIs ===
print("\nSubjects with usable NIfTIs:")
# Keys were inserted in sorted subject order, so this list is already sorted
usable_subjects = list(earliest_by_subject)

for sub in usable_subjects:
    print(f"  {sub}")

# Final list of NIfTI paths, one per usable subject, in subject order
final_niftis = [earliest_by_subject[sub] for sub in usable_subjects]

print("\n==============================")
print(f"Total number of subject NIfTIs included: {len(final_niftis)}")
print("==============================")


sub-002 — 1 run included (earliest):
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-002/func/sub-002_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-003 — 1 run included (earliest):
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-003/func/sub-003_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-004 — 1 run included (earliest):
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-004/func/sub-004_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-005 — 1 run included (earliest):
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-005/func/sub-005_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-006 — 1 run included (earliest):
    /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-006/func/sub-006_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
sub-007 — 

Now I'm just inspecting and locating the correct Shirer masks...

In [4]:
# Directory holding the Shirer network masks
mask_dir = "/home/jovyan/shirer2012"

# Every file whose name ends in "_mask.nii"
mask_files = [entry for entry in os.listdir(mask_dir) if entry.endswith("_mask.nii")]

# Filename layout: network-<name>_space-..._res-<NN>[_thresh-<NN>]_mask.nii
mask_name_re = re.compile(
    r"network-(?P<network>.+?)_space-MNI152NLin2009cAsym_res-(?P<res>\d{2})(?:_thresh-(?P<thresh>\d{2}))?_mask\.nii"
)

# One metadata record per parseable filename; anything else is reported
records = []
for fname in mask_files:
    parsed = mask_name_re.match(fname)
    if parsed is None:
        print(f"[WARN] Filename did not match pattern: {fname}")
        continue
    thresh = parsed.group("thresh")
    records.append({
        "filename": fname,
        "network": parsed.group("network"),
        "resolution": f"res-{parsed.group('res')}",
        "thresholded": "thresh-" + thresh if thresh else "none"
    })

df = pd.DataFrame(records)

# Per network: the distinct resolutions and threshold variants available
summary = df.groupby("network")[["resolution", "thresholded"]].agg(lambda x: sorted(set(x)))
print("=== Shirer Mask Summary ===")
print(summary)

# Select the res-02, thresh-01 variant of every mask
pattern = os.path.join(mask_dir, "*_res-02_thresh-01_mask.nii")
mask_paths = sorted(glob.glob(pattern))

print("=== res-02_thresh-01 Shirer Masks ===")
for path in mask_paths:
    print(path)

=== Shirer Mask Summary ===
                         resolution        thresholded
network                                               
anteriorsalience   [res-01, res-02]  [none, thresh-01]
auditory           [res-01, res-02]  [none, thresh-01]
basalganglia       [res-01, res-02]  [none, thresh-01]
dorsaldmn          [res-01, res-02]  [none, thresh-01]
highervisual       [res-01, res-02]  [none, thresh-01]
language           [res-01, res-02]  [none, thresh-01]
leftcontrol        [res-01, res-02]  [none, thresh-01]
posteriorsalience  [res-01, res-02]  [none, thresh-01]
precuneus          [res-01, res-02]  [none, thresh-01]
primaryvisual      [res-01, res-02]  [none, thresh-01]
rightcontrol       [res-01, res-02]  [none, thresh-01]
sensorimotor       [res-01, res-02]  [none, thresh-01]
ventraldmn         [res-01, res-02]  [none, thresh-01]
visuospatial       [res-01, res-02]  [none, thresh-01]
=== res-02_thresh-01 Shirer Masks ===
/home/jovyan/shirer2012/network-anteriorsalience_space

Now I want to inspect the shapes of the subject NIfTIs and the binary Shirer masks to verify spatial alignment before applying the masks:

In [5]:
# === Inspect shape of one subject NIfTI (example) ===
example_nifti_path = final_niftis[0]
nifti_img = nib.load(example_nifti_path)
# The image's .shape is read here without calling get_fdata(), so the full
# 4D BOLD array is not pulled into memory just to print its dimensions.
print(f"Subject NIfTI shape: {nifti_img.shape}")
print(f"Affine:\n{nifti_img.affine}\n")

# === Load all Shirer masks matching file pattern ===
mask_dir = "/home/jovyan/shirer2012"
mask_pattern = os.path.join(mask_dir, "network-*_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii")
mask_paths = sorted(glob.glob(mask_pattern))  # sort for consistent ordering

print("Shirer network mask shapes:\n")
for path in mask_paths:
    mask_img = nib.load(path)
    # Same as above: report the shape without loading the mask voxels
    print(f"{os.path.basename(path)} — shape: {mask_img.shape}")


Subject NIfTI shape: (65, 77, 49, 300)
Affine:
[[   3.    -0.    -0.   -96.5]
 [  -0.     3.    -0.  -132.5]
 [   0.     0.     4.   -78.5]
 [   0.     0.     0.     1. ]]

Shirer network mask shapes:

network-anteriorsalience_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-auditory_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-basalganglia_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-dorsaldmn_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-highervisual_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-language_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-leftcontrol_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-posteriorsalience_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii — shape: (97, 115, 97)
network-precuneus_space-MNI152NLi

Resampling Shirer masks to match subj nifti spatial resolution

In [6]:
# Target grid: the 3D part of the example subject image loaded earlier (nifti_img)
target_affine = nifti_img.affine
target_shape = nifti_img.shape[:3]

# Maps mask filename -> mask image resampled onto the target grid
resampled_masks = {}

for path in mask_paths:  # mask_paths should list the res-02 files
    mask_name = os.path.basename(path)
    mask_img = nib.load(path)

    # Nearest-neighbour interpolation preserves the binary nature of the mask
    resampled = resample_img(
        mask_img,
        target_affine=target_affine,
        target_shape=target_shape,
        interpolation='nearest',
        force_resample=True,
        copy_header=True
    )
    resampled_masks[mask_name] = resampled

    print(f"Resampled {mask_name} to shape {resampled.shape}")

Resampled network-anteriorsalience_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-auditory_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-basalganglia_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-dorsaldmn_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-highervisual_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-language_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-leftcontrol_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-posteriorsalience_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-precuneus_space-MNI152NLin2009cAsym_res-02_thresh-01_mask.nii to shape (65, 77, 49)
Resampled network-primaryvisual_space-MNI152NLin2009cAsym_res

Z score voxelwise subject data and average within networks to get network-level signals for each subject.

In [12]:
# ----------------------------
# Settings: output location and the fixed Pie Man TR trimming
# ----------------------------
output_dir = "/home/jovyan/narratives-project/shirer_components"
os.makedirs(output_dir, exist_ok=True)

DROP_FRONT_TRS = 10 + 3   # 10 TR intro + 3 TR HRF lag
DROP_END_TRS   = 8        # 8 TR silence at end

# The resampling cell must have produced a non-empty resampled_masks dict
if not globals().get("resampled_masks"):
    raise RuntimeError("resampled_masks is empty or undefined. Run your resampling cell first.")

# Fixed (sorted) mask order; row i of every output matches mask_keys[i]
mask_keys = sorted(resampled_masks)

# Boolean voxel-selection array for each mask, in mask_keys order
mask_bools = []
for key in mask_keys:
    mask_bools.append(resampled_masks[key].get_fdata().astype(bool))

# Network name = leading "network-<name>" filename entity with the prefix removed
network_names = [key.split("_", 1)[0].replace("network-", "") for key in mask_keys]
# Fixed-width byte strings, the form later written to HDF5
network_names_ascii = np.asarray(network_names, dtype="S")

# ----------------------------
# Helper: z-score across time
# ----------------------------
def zscore_time(arr4d):
    """Standardise each series along the last (time) axis.

    The input is converted to float32 if it is not already. The mean and
    standard deviation are taken over the last axis in float32. Wherever
    the standard deviation is exactly zero it is replaced by 1, so constant
    series come out as zeros instead of dividing by zero.

    Returns a new float32 array with the same shape as the input.
    """
    values = arr4d if arr4d.dtype == np.float32 else arr4d.astype(np.float32, copy=False)
    center = values.mean(axis=-1, keepdims=True, dtype=np.float32)
    spread = values.std(axis=-1, keepdims=True, dtype=np.float32)
    # Constant series have zero spread; substitute 1 so the division is safe
    safe_spread = np.where(spread == 0, np.float32(1.0), spread)
    return (values - center) / safe_spread

# ----------------------------
# Main loop: one subject at a time
# ----------------------------
# The masks were resampled onto one reference grid. A subject image must match
# that grid in BOTH shape and affine, otherwise the boolean masks would select
# the wrong anatomical locations.
mask_affine = resampled_masks[mask_keys[0]].affine

# An empty mask would silently yield an all-zero timeseries, so flag it once up front.
for mask_key, mask in zip(mask_keys, mask_bools):
    if not mask.any():
        print(f"[WARN] Mask {mask_key} selects no voxels after resampling; "
              f"its timeseries will be all zeros.")

for nifti_path in final_niftis:
    subj = os.path.basename(nifti_path).split("_")[0]
    print(f"[INFO] Processing {subj}: {nifti_path}")

    # Open the image; the shape is read here without calling get_fdata()
    img = nib.load(nifti_path)
    T   = img.shape[-1]

    # Compute slice: drop first 13 TRs, drop last 8 TRs
    start_idx = DROP_FRONT_TRS
    end_excl  = T - DROP_END_TRS
    if start_idx >= end_excl:
        raise ValueError(f"No data left after dropping TRs for {subj}: "
                         f"start={start_idx}, end={end_excl}, T={T}")
    print(f"  Dropping first {DROP_FRONT_TRS} TRs and last {DROP_END_TRS} TRs → "
          f"keeping [{start_idx}:{end_excl}) = {end_excl-start_idx} TRs")

    # Sanity-check the grid before paying for the full data load
    vol_shape = tuple(img.shape[:3])
    if any(m.shape != vol_shape for m in mask_bools):
        print(f"[ERROR] Mask/volume shape mismatch for {subj}: "
              f"volume={vol_shape}, mask={mask_bools[0].shape}. Skipping.")
        del img
        gc.collect()
        continue
    if not np.allclose(img.affine, mask_affine, atol=1e-3):
        print(f"[ERROR] Mask/volume affine mismatch for {subj}: "
              f"masks were resampled to a different grid. Skipping.")
        del img
        gc.collect()
        continue

    # caching="unchanged" stops nibabel keeping its own reference to the array,
    # so the `del` below really releases the full-length data.
    data = img.get_fdata(dtype=np.float32, caching="unchanged")  # shape (X,Y,Z,T)

    # Trim and z-score
    trimmed   = data[..., start_idx:end_excl]
    z_trimmed = zscore_time(trimmed)
    del data, trimmed
    gc.collect()

    # Average within each mask: boolean indexing gives (n_voxels, kept_TRs)
    kept_TRs    = z_trimmed.shape[-1]
    averaged_ts = np.zeros((len(mask_bools), kept_TRs), dtype=np.float32)
    for i, mask in enumerate(mask_bools):
        vox = z_trimmed[mask]
        if vox.size:
            averaged_ts[i, :] = vox.mean(axis=0)

    # Save to HDF5: rows of "timeseries" line up with entries of "network"
    out_f = os.path.join(output_dir, f"{subj}_desc-shirercomponents.h5")
    with h5py.File(out_f, "w") as f:
        f.create_dataset("timeseries", data=averaged_ts)
        f.create_dataset("network",    data=network_names_ascii)
        f.attrs["drop_front_TRs"] = DROP_FRONT_TRS
        f.attrs["drop_end_TRs"]   = DROP_END_TRS
        f.attrs["original_T"]     = T

    print(f"[INFO] Saved: {out_f}")
    del z_trimmed, averaged_ts, img
    gc.collect()

print("[DONE] All subjects processed.")

[INFO] Processing sub-002: /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-002/func/sub-002_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
  Dropping first 13 TRs and last 8 TRs → keeping [13:292) = 279 TRs
[INFO] Saved: /home/jovyan/narratives-project/shirer_components/sub-002_desc-shirercomponents.h5
[INFO] Processing sub-003: /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-003/func/sub-003_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
  Dropping first 13 TRs and last 8 TRs → keeping [13:292) = 279 TRs
[INFO] Saved: /home/jovyan/narratives-project/shirer_components/sub-003_desc-shirercomponents.h5
[INFO] Processing sub-004: /home/jovyan/shared/data/narratives/derivatives/afni-smooth/sub-004/func/sub-004_task-pieman_run-1_space-MNI152NLin2009cAsym_res-native_desc-sm6_bold.nii.gz
  Dropping first 13 TRs and last 8 TRs → keeping [13:292) = 279 TRs
[INFO] Saved: /home/jovyan/narratives-project/shir

inspecting outputs...

In [14]:
# Directory the extraction step wrote its HDF5 files to (adjust if needed)
h5_dir = "/home/jovyan/narratives-project/shirer_components"

# Alphabetical listing of the .h5 files present
files = sorted(entry for entry in os.listdir(h5_dir) if entry.endswith(".h5"))
print("Available HDF5 files:")
for fname in files:
    print(f"  {fname}")

# Pick one subject to look at (change as needed); the first matching file is used
subject = "sub-002"
matches = [entry for entry in files if entry.startswith(subject)]
if matches:
    file_path = os.path.join(h5_dir, matches[0])
    print(f"\nInspecting file: {file_path}")
    with h5py.File(file_path, "r") as f:
        for name, ds in f.items():
            # Only datasets are reported; any other member is skipped
            if not isinstance(ds, h5py.Dataset):
                continue
            print(f"Dataset '{name}': shape={ds.shape}, dtype={ds.dtype}")
            # Read the dataset and show its first ten flattened values
            data = ds[...]
            print("Preview:", data.flatten()[:10])
else:
    print(f"No files found for subject {subject}.")


Available HDF5 files:
  sub-002_desc-shirercomponents.h5
  sub-003_desc-shirercomponents.h5
  sub-004_desc-shirercomponents.h5
  sub-005_desc-shirercomponents.h5
  sub-006_desc-shirercomponents.h5
  sub-007_desc-shirercomponents.h5
  sub-008_desc-shirercomponents.h5
  sub-009_desc-shirercomponents.h5
  sub-010_desc-shirercomponents.h5
  sub-011_desc-shirercomponents.h5
  sub-012_desc-shirercomponents.h5
  sub-013_desc-shirercomponents.h5
  sub-014_desc-shirercomponents.h5
  sub-015_desc-shirercomponents.h5
  sub-016_desc-shirercomponents.h5
  sub-017_desc-shirercomponents.h5
  sub-018_desc-shirercomponents.h5
  sub-019_desc-shirercomponents.h5
  sub-020_desc-shirercomponents.h5
  sub-023_desc-shirercomponents.h5
  sub-024_desc-shirercomponents.h5
  sub-025_desc-shirercomponents.h5
  sub-026_desc-shirercomponents.h5
  sub-027_desc-shirercomponents.h5
  sub-028_desc-shirercomponents.h5
  sub-029_desc-shirercomponents.h5
  sub-030_desc-shirercomponents.h5
  sub-031_desc-shirercomponents.h