In [1]:
import pandas as pd
import numpy as np
import zipfile
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Load the IDA search export covering MRI + fMRI raw data, then show its
# (rows, columns) shape followed by a preview of the first five rows.
search_path = "../metadata/idaSearch_11_07_2025.csv"
search_df = pd.read_csv(filepath_or_buffer=search_path)
n_search_rows, n_search_cols = search_df.shape
print((n_search_rows, n_search_cols))
search_df.head(5)

(39123, 4)


Unnamed: 0,Subject ID,Sex,Age,Description
0,00705MAR14,M,59.0,ep2d_RESTING_STATE
1,00706FEB13,X,50.0,ep2d_RESTING_STATE
2,00706MAY15,X,50.0,ep2d_RESTING_STATE
3,00707MAY14,M,59.0,ep2d_RESTING_STATE
4,00708MAY15,X,50.0,ep2d_RESTING_STATE


In [3]:
# Tally how many search rows carry each series description, then display
# only the descriptions that occur more than 200 times.
desc_counts = search_df["Description"].value_counts()
is_frequent = desc_counts.gt(200)
desc_counts.loc[is_frequent]

Description
2D GRE-MT                                     5270
AX T2 GRE MT                                  2708
3D T2 FLAIR                                   1797
rsfMRI_RL                                     1519
3D T1-weighted                                1258
AXIAL 2D GRE-MT                               1205
Axial PD-T2 TSE FS                            1140
DTI_revB0_AP                                   975
rsfMRI_PA                                      971
rsfMRI_AP                                      968
rsfMRI_LR                                      966
2D GRE MT                                      840
MPRAGE GRAPPA                                  802
2D GRE-NM_MT                                   755
Axial PD-T2 TSE                                735
ep2d_RESTING_STATE                             577
AX GRE -MT                                     575
NM-MT                                          571
T2                                             530
2D_GRE-MT          

In [4]:
# Series descriptions to keep. These look like the main T1w and resting-state
# fMRI image types in the search results.
t1w_descs = ["3D T1-weighted"]
bold_descs = ["rsfMRI_LR", "rsfMRI_RL", "rsfMRI_AP", "rsfMRI_PA"]

# Same order as before: the T1w description first, then the four fMRI ones.
include_descs = t1w_descs + bold_descs
desc_counts.loc[include_descs]

Description
3D T1-weighted    1258
rsfMRI_LR          966
rsfMRI_RL         1519
rsfMRI_AP          968
rsfMRI_PA          971
Name: count, dtype: int64

In [5]:
# Image table exported after collecting every included image type into a
# package; print its (rows, columns) shape and preview the first five rows.
filtered_path = "../metadata/PPMI_T1+fMRI_11_07_2025.csv"
filtered_df = pd.read_csv(filepath_or_buffer=filtered_path)
n_filtered_rows, n_filtered_cols = filtered_df.shape
print((n_filtered_rows, n_filtered_cols))
filtered_df.head(5)

(5951, 12)


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I1660587,75565,Prodromal,M,69,V10,MRI,3D T1-weighted,Original,11/15/2022,DCM,
1,I11083227,75565,Prodromal,M,67,V06,fMRI,rsfMRI_RL,Original,12/17/2020,DCM,
2,I1660589,75565,Prodromal,M,69,V10,fMRI,rsfMRI_LR,Original,11/15/2022,DCM,
3,I11083226,75565,Prodromal,M,67,V06,fMRI,rsfMRI_LR,Original,12/17/2020,DCM,
4,I1660588,75565,Prodromal,M,69,V10,fMRI,rsfMRI_RL,Original,11/15/2022,DCM,


In [6]:
# Count anatomical (T1w) and functional (BOLD) images for each subject.
#
# Images are classified from the series Description: anything whose
# description contains "fmri" (case-insensitive) is BOLD, everything else is
# anatomical. This is the same rule the zip-derived run table in this notebook
# applies to the archive directory names, so the CSV and zip tallies can be
# compared like-for-like.
# NOTE(review): the previous version keyed on the Modality column
# ("MRI"/"fMRI"); the zip tally reports far more subjects with BOLD than that
# rule found, which suggests Modality does not reliably flag rsfMRI series —
# confirm against the raw table.
is_bold = (
    filtered_df["Description"]
    .str.lower()
    .str.contains("fmri", regex=False, na=False)  # missing description -> not BOLD
)
image_counts = (
    pd.DataFrame(
        {
            "t1w_count": (~is_bold).astype(int),
            "bold_count": is_bold.astype(int),
        }
    )
    # Grouping by the aligned Subject column yields one row per subject,
    # indexed by "Subject", with integer counts in both columns.
    .groupby(filtered_df["Subject"])
    .sum()
)

In [7]:
# Per-subject flags: does the CSV table list at least one image of each kind?
has_t1w = image_counts["t1w_count"].gt(0)
has_bold = image_counts["bold_count"].gt(0)

# Summary of subject coverage, one figure per line.
print(
    "CSV table counts",
    f"total subjects: {len(image_counts)}",
    f"with bold: {has_bold.sum()}",
    f"with t1w: {has_t1w.sum()}",
    f"with bold and t1w: {(has_bold & has_t1w).sum()}",
    sep="\n",
)

CSV table counts
total subjects: 1486
with bold: 765
with t1w: 1326
with bold and t1w: 605


In [8]:
# Index the downloaded archives on disk so their contents can be compared
# with the CSV table: archive stem -> archive path, in sorted path order.
zip_root = Path("/Volumes/bigboy/PPMI")
zip_dir = zip_root / "zips"
zips = {}
for zip_file in sorted(zip_dir.glob("*.zip")):
    zips[zip_file.stem] = zip_file
print(zips)

{'PPMI_01': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_01.zip'), 'PPMI_02': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_02.zip'), 'PPMI_03': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_03.zip'), 'PPMI_04': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_04.zip'), 'PPMI_05': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_05.zip'), 'PPMI_06': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_06.zip'), 'PPMI_07': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_07.zip'), 'PPMI_08': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_08.zip'), 'PPMI_09': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_09.zip'), 'PPMI_10': PosixPath('/Volumes/bigboy/PPMI/zips/PPMI_10.zip')}


In [9]:
# Map each run directory to the archive members stored beneath it.
# Keys are the member path with its last two components removed (the file name
# and its parent folder); values are "<zip stem>:<member path>" strings.
zip_run_map = defaultdict(list)
for name, zip_path in zips.items():
    with zipfile.ZipFile(zip_path) as archive:
        # desc labels each progress bar with the archive it belongs to.
        for info in tqdm(archive.infolist(), desc=name):
            # Archives may store explicit directory entries. They are not
            # image files, and taking parents[1] of a directory path would
            # file them under a shallower, bogus run key (or raise IndexError
            # for a top-level folder), so skip them.
            if info.is_dir():
                continue
            path = info.filename
            run = str(Path(path).parents[1])
            zip_run_map[run].append(f"{name}:{path}")

100%|██████████| 632555/632555 [00:01<00:00, 585166.89it/s]
100%|██████████| 529431/529431 [00:00<00:00, 566124.95it/s]
100%|██████████| 625274/625274 [00:01<00:00, 585713.16it/s]
100%|██████████| 809613/809613 [00:01<00:00, 587093.42it/s]
100%|██████████| 768838/768838 [00:01<00:00, 581910.40it/s]
100%|██████████| 833839/833839 [00:01<00:00, 590005.57it/s]
100%|██████████| 913729/913729 [00:01<00:00, 585799.57it/s]
100%|██████████| 788410/788410 [00:01<00:00, 589366.31it/s]
100%|██████████| 918563/918563 [00:01<00:00, 586779.30it/s]
100%|██████████| 888953/888953 [00:01<00:00, 584014.80it/s]


In [10]:
# Spot-check the mapping: show the first run key and up to ten of its members.
k = list(zip_run_map.keys())[0]
print(k)
first_members = zip_run_map[k][:10]
print(first_members)

PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0
['PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101244680_233_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101245854_105_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101246658_78_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101247145_148_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101247955_100_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101249903_240_S1061608_I1490732.dcm', 'PPMI_01:PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0/I1490732/PPMI_58099_MR_rsfMRI_RL__br_raw_20210913101250699_234_S1061608

In [11]:
# Build one summary row per run found in the archives.
run_records = []
for run, filelist in tqdm(zip_run_map.items()):
    # Each member string is "<zip>:<member path>". Component 4 of the
    # "/"-split string is the folder directly above the file (the image ID in
    # the sample printed earlier), and the text before ":" is the zip name.
    image_ids = {member.split("/")[4] for member in filelist}
    source_zips = {member.split(":")[0] for member in filelist}

    # The run key splits into <root>/<subject>/<acquisition>/<date>.
    parts = run.split("/")
    acq = parts[2].lower()

    run_records.append(
        {
            "run": run,
            "sub": parts[1],
            "acq": acq,
            "date": parts[3],
            # Anything with "fmri" in the acquisition name counts as BOLD;
            # every other acquisition is treated as anatomical.
            "datatype": "bold" if "fmri" in acq else "anat",
            "num_dcm": len(filelist),
            "num_img": len(image_ids),
            "num_zips": len(source_zips),
        }
    )
zip_run_df = pd.DataFrame.from_records(run_records)

print(zip_run_df.shape)
zip_run_df.head()

100%|██████████| 5411/5411 [00:01<00:00, 3355.42it/s]

(5411, 8)





Unnamed: 0,run,sub,acq,date,datatype,num_dcm,num_img,num_zips
0,PPMI/58099/rsfMRI_RL/2021-03-18_08_07_06.0,58099,rsfmri_rl,2021-03-18_08_07_06.0,bold,240,1,1
1,PPMI/101050/rsfMRI_RL/2021-04-19_10_12_48.0,101050,rsfmri_rl,2021-04-19_10_12_48.0,bold,240,1,1
2,PPMI/101174/rsfMRI_LR/2021-05-04_10_12_31.0,101174,rsfmri_lr,2021-05-04_10_12_31.0,bold,10,1,1
3,PPMI/101476/rsfMRI_RL/2021-06-10_11_30_59.0,101476,rsfmri_rl,2021-06-10_11_30_59.0,bold,240,1,1
4,PPMI/101479/3D_T1-weighted/2021-06-14_08_17_23.0,101479,3d_t1-weighted,2021-06-14_08_17_23.0,anat,192,1,1


In [12]:
# Per-subject number of anatomical and functional runs found in the zips.
# Named aggregation yields the two flat columns directly, indexed by "sub".
zip_image_counts = zip_run_df.groupby("sub")["datatype"].agg(
    t1w_count=lambda kinds: (kinds == "anat").sum(),
    bold_count=lambda kinds: (kinds == "bold").sum(),
)

In [13]:
# Per-subject flags: do the zips contain at least one run of each kind?
has_t1w = zip_image_counts["t1w_count"].gt(0)
has_bold = zip_image_counts["bold_count"].gt(0)

# Summary of subject coverage, one figure per line.
print(
    "Zip image counts",
    f"total subjects: {len(zip_image_counts)}",
    f"with bold: {has_bold.sum()}",
    f"with t1w: {has_t1w.sum()}",
    f"with bold and t1w: {(has_bold & has_t1w).sum()}",
    sep="\n",
)

Zip image counts
total subjects: 1486
with bold: 1391
with t1w: 890
with bold and t1w: 795


In [14]:
# How many runs share each (acquisition, DICOM-file-count) combination;
# display the 20 most common combinations.
run_length_counts = zip_run_df.groupby(["acq", "num_dcm"])[["run"]].count()
run_length_counts.sort_values(by="run", ascending=False).head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,run
acq,num_dcm,Unnamed: 2_level_1
3d_t1-weighted,192,980
rsfmri_ap,10,809
rsfmri_pa,600,809
rsfmri_rl,240,496
rsfmri_rl,9600,470
rsfmri_lr,10,443
rsfmri_lr,400,420
3d_t1-weighted,1,201
3d_t1-weighted,188,130
rsfmri_pa,10,121
