In [1]:
import pathlib as pl
import warnings
from glob import glob

import nibabel
import pandas as pd
from IPython.display import display


# Root directory of the data, given as a relative path.
DATA_ROOT = pl.Path('../data')
# C-PAC preprocessing output directory of the single subject/session inspected here.
DIR_BIDS = DATA_ROOT.joinpath(
    'C-PAC_Derivatives/preproc/output/cpac_cpac_preproc/sub-0025429_ses-1'
)
# Glob pattern (appended to DIR_BIDS) selecting files that carry an extension.
GLOB_BIDS_FILES = '/**/*.*'

In [2]:
# The following code groups all files in a BIDS directory by their tags and adds some
# meta information (sources, description, shape & dtype for images).

from common.bids_utils import (
    bids_file_fetch_json,
    bids_file_to_dict,
    get_all_file_extensions,
    group_bids_dicts,
    remove_all_file_extensions,
)

# Collect non-JSON files.
# `recursive=True` is required for `**` to span any number of directory levels;
# without it `**` behaves like a single `*` and only files exactly one
# directory below DIR_BIDS would be found.
bids_dicts = [
    bids_file_to_dict(file_path)
    for file_path in glob(DIR_BIDS.as_posix() + GLOB_BIDS_FILES, recursive=True)
    if not file_path.endswith('.json')
]

# Group bids data
bids_groups = group_bids_dicts(bids_dicts)

# Collect additional information
for bids_group in bids_groups:
    for bids_record in bids_group:
        file_path = bids_record['_filepath']
        # Only the extension-less file name is kept in the record (for display);
        # the full path stays available in `file_path` for the lookups below.
        bids_record['_filepath'] = pl.Path(remove_all_file_extensions(file_path)).name
        bids_record['_fileext'] = get_all_file_extensions(file_path)
        # May be None (checked below) -- presumably when no JSON sidecar exists.
        bids_json = bids_file_fetch_json(file_path)

        if bids_record['_fileext'] in ('nii.gz', 'nii'):
            # Best effort: an unreadable image must not abort the whole listing,
            # but the failure is reported instead of being silently dropped.
            # `Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit through.
            try:
                img_nii = nibabel.load(file_path)
                bids_record['_shape'] = f'{img_nii.shape}'
                bids_record['_dtype'] = f'{img_nii.get_data_dtype()}'
            except Exception as exc:
                warnings.warn(f'Could not read image metadata from {file_path}: {exc!r}')
        if bids_json is not None:
            if 'Sources' in bids_json:
                bids_record['_sources'] = ', '.join(bids_json['Sources'])
            if 'Description' in bids_json:
                bids_record['_description'] = bids_json['Description']

# Convert to data frames (one per group)
dfs = [pd.DataFrame.from_records(g) for g in bids_groups]

# Move the (long) filepath column to the end of each frame
for df in dfs:
    if '_filepath' in df.columns:
        column_filepath = df.pop('_filepath')
        df['_filepath'] = column_filepath

In [3]:
# Render every group table with the rich (HTML) DataFrame representation.
display(*dfs)

Unnamed: 0,ses,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_filepath
0,1,brain,sub-0025429,T1w,nii.gz,"(176, 256, 256)",float32,"desc-preproc_T1w, space-T1w_desc-brain_mask",sub-0025429_ses-1_desc-brain_T1w
1,1,preproc,sub-0025429,T1w,nii.gz,"(176, 256, 256)",float32,T1w,sub-0025429_ses-1_desc-preproc_T1w
2,1,reorient,sub-0025429,T1w,nii.gz,"(176, 256, 256)",float32,T1w,sub-0025429_ses-1_desc-reorient_T1w


Unnamed: 0,ses,from,to,mode,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_description,_filepath
0,1,T1w,template,image,linear,sub-0025429,xfm,nii.gz,"(91, 109, 91, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Linear (affine) transform from T1w native spac...,sub-0025429_ses-1_from-T1w_to-template_mode-im...
1,1,T1w,template,image,nonlinear,sub-0025429,xfm,nii.gz,"(91, 109, 91, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Nonlinear (warp field) transform from T1w nati...,sub-0025429_ses-1_from-T1w_to-template_mode-im...
2,1,template,T1w,image,linear,sub-0025429,xfm,nii.gz,"(176, 256, 256, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Linear (affine) transform from T1w-template sp...,sub-0025429_ses-1_from-template_to-T1w_mode-im...
3,1,template,T1w,image,nonlinear,sub-0025429,xfm,nii.gz,"(91, 109, 91, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Nonlinear (warp field) transform from T1w-temp...,sub-0025429_ses-1_from-template_to-T1w_mode-im...


Unnamed: 0,ses,from,to,mode,_source,_suffix,_fileext,_shape,_dtype,_sources,_description,_filepath
0,1,T1w,template,image,sub-0025429,xfm,nii.gz,"(91, 109, 91, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Composite (affine + warp field) transform from...,sub-0025429_ses-1_from-T1w_to-template_mode-im...
1,1,template,T1w,image,sub-0025429,xfm,nii.gz,"(176, 256, 256, 1, 3)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",Composite (affine + warp field) transform from...,sub-0025429_ses-1_from-template_to-T1w_mode-im...


Unnamed: 0,ses,label,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_filepath
0,1,CSF,preproc,sub-0025429,mask,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-CSF_desc-preproc_mask
1,1,GM,preproc,sub-0025429,mask,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-GM_desc-preproc_mask
2,1,WM,preproc,sub-0025429,mask,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-WM_desc-preproc_mask


Unnamed: 0,ses,label,_source,_suffix,_fileext,_shape,_dtype,_sources,_filepath
0,1,CSF,sub-0025429,mask,nii.gz,"(176, 256, 256)",int32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-CSF_mask
1,1,CSF,sub-0025429,probseg,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-CSF_probseg
2,1,GM,sub-0025429,mask,nii.gz,"(176, 256, 256)",int32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-GM_mask
3,1,GM,sub-0025429,probseg,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-GM_probseg
4,1,WM,sub-0025429,mask,nii.gz,"(176, 256, 256)",int32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-WM_mask
5,1,WM,sub-0025429,probseg,nii.gz,"(176, 256, 256)",float32,"desc-brain_T1w, space-T1w_desc-brain_mask, fro...",sub-0025429_ses-1_label-WM_probseg


Unnamed: 0,ses,space,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_description,_filepath
0,1,T1w,brain,sub-0025429,mask,nii.gz,"(176, 256, 256)",int16,desc-preproc_T1w,,sub-0025429_ses-1_space-T1w_desc-brain_mask
1,1,template,brain,sub-0025429,T1w,nii.gz,"(91, 109, 91)",float64,"desc-brain_T1w, space-T1w_desc-brain_mask, des...",The preprocessed T1w brain transformed to temp...,sub-0025429_ses-1_space-template_desc-brain_T1w


Unnamed: 0,ses,task,run,_source,_suffix,_fileext,_sources,_filepath
0,1,rest,1,sub-0025429,bold-snr-axial-qc,png,"desc-brain_bold, space-bold_desc-brain_mask, f...",sub-0025429_ses-1_task-rest_run-1_bold-snr-axi...
1,1,rest,1,sub-0025429,bold-snr-hist-qc,png,"desc-brain_bold, space-bold_desc-brain_mask, f...",sub-0025429_ses-1_task-rest_run-1_bold-snr-his...
2,1,rest,1,sub-0025429,bold-snr-qc,txt,"desc-brain_bold, space-bold_desc-brain_mask, f...",sub-0025429_ses-1_task-rest_run-1_bold-snr-qc
3,1,rest,1,sub-0025429,bold-snr-sagittal-qc,png,"desc-brain_bold, space-bold_desc-brain_mask, f...",sub-0025429_ses-1_task-rest_run-1_bold-snr-sag...
4,1,rest,1,sub-0025429,dvars,1D,"desc-preproc_bold, space-bold_desc-brain_mask,...",sub-0025429_ses-1_task-rest_run-1_dvars
5,1,rest,1,sub-0025429,framewise-displacement-jenkinson-plot-qc,png,framewise-displacement-jenkinson,sub-0025429_ses-1_task-rest_run-1_framewise-di...
6,1,rest,1,sub-0025429,framewise-displacement-jenkinson,1D,"desc-preproc_bold, space-bold_desc-brain_mask,...",sub-0025429_ses-1_task-rest_run-1_framewise-di...
7,1,rest,1,sub-0025429,framewise-displacement-power,1D,"desc-preproc_bold, space-bold_desc-brain_mask,...",sub-0025429_ses-1_task-rest_run-1_framewise-di...
8,1,rest,1,sub-0025429,max-displacement,1D,"desc-preproc_bold, motion-basefile",sub-0025429_ses-1_task-rest_run-1_max-displace...
9,1,rest,1,sub-0025429,motion-params,txt,"desc-preproc_bold, space-bold_desc-brain_mask,...",sub-0025429_ses-1_task-rest_run-1_motion-params


Unnamed: 0,ses,task,run,desc,_source,_suffix,_fileext,_sources,_shape,_dtype,_filepath
0,1,rest,1,1,sub-0025429,regressors,1D,"TR, regressors, space-bold_desc-brain_mask, fr...",,,sub-0025429_ses-1_task-rest_run-1_desc-1_regre...
1,1,rest,1,2,sub-0025429,regressors,1D,"TR, regressors, space-bold_desc-brain_mask, fr...",,,sub-0025429_ses-1_task-rest_run-1_desc-2_regre...
2,1,rest,1,mean,sub-0025429,bold,nii.gz,desc-preproc_bold,"(64, 64, 43)",float32,sub-0025429_ses-1_task-rest_run-1_desc-mean_bold
3,1,rest,1,preproc-1,sub-0025429,bold,nii.gz,"TR, regressors, space-bold_desc-brain_mask, fr...","(64, 64, 43, 300)",float32,sub-0025429_ses-1_task-rest_run-1_desc-preproc...
4,1,rest,1,preproc-2,sub-0025429,bold,nii.gz,"TR, regressors, space-bold_desc-brain_mask, fr...","(64, 64, 43, 300)",float32,sub-0025429_ses-1_task-rest_run-1_desc-preproc...


Unnamed: 0,ses,task,run,from,to,mode,desc,_source,_suffix,_fileext,_sources,_filepath
0,1,rest,1,bold,T1w,image,linear,sub-0025429,xfm,mat,"desc-reginput_bold, desc-motion_bold, desc-bra...",sub-0025429_ses-1_task-rest_run-1_from-bold_to...


Unnamed: 0,ses,task,run,from,to,mode,_source,_suffix,_fileext,_shape,_dtype,_sources,_filepath
0,1,rest,1,bold,template,image,sub-0025429,xfm,nii.gz,"(61, 73, 61, 1, 3)",float64,"desc-reginput_bold, from-bold_to-T1w_mode-imag...",sub-0025429_ses-1_task-rest_run-1_from-bold_to...
1,1,rest,1,template,bold,image,sub-0025429,xfm,nii.gz,"(64, 64, 43, 1, 3)",float64,"desc-reginput_bold, from-bold_to-T1w_mode-imag...",sub-0025429_ses-1_task-rest_run-1_from-templat...


Unnamed: 0,ses,task,run,space,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_description,_filepath
0,1,rest,1,bold,brain,sub-0025429,mask,nii.gz,"(64, 64, 43)",uint8,desc-preproc_bold,Binary brain mask of the BOLD functional time-...,sub-0025429_ses-1_task-rest_run-1_space-bold_d...
1,1,rest,1,T1w,mean,sub-0025429,bold,nii.gz,"(176, 256, 256)",float32,"desc-reginput_bold, desc-motion_bold, desc-bra...",,sub-0025429_ses-1_task-rest_run-1_space-T1w_de...
2,1,rest,1,template,bold,sub-0025429,mask,nii.gz,"(61, 73, 61)",float64,"space-bold_desc-brain_mask, from-bold_to-templ...",,sub-0025429_ses-1_task-rest_run-1_space-templa...
3,1,rest,1,template,mean,sub-0025429,bold,nii.gz,"(61, 73, 61)",float64,"desc-mean_bold, from-bold_to-template_mode-ima...",Single-volume mean of the BOLD time-series tra...,sub-0025429_ses-1_task-rest_run-1_space-templa...
4,1,rest,1,template,preproc-1,sub-0025429,bold,nii.gz,"(61, 73, 61, 300)",float32,"desc-preproc_bold, from-bold_to-template_mode-...",,sub-0025429_ses-1_task-rest_run-1_space-templa...
5,1,rest,1,template,preproc-2,sub-0025429,bold,nii.gz,"(61, 73, 61, 300)",float32,"desc-preproc_bold, from-bold_to-template_mode-...",,sub-0025429_ses-1_task-rest_run-1_space-templa...


Unnamed: 0,ses,task,run,space,res,desc,_source,_suffix,_fileext,_shape,_dtype,_sources,_filepath
0,1,rest,1,template,derivative,bold,sub-0025429,mask,nii.gz,"(61, 73, 61)",float64,"space-bold_desc-brain_mask, from-bold_to-templ...",sub-0025429_ses-1_task-rest_run-1_space-templa...


The code above is the most practical approach I could come up with for an automated solution to this challenge.

The task description mentions crawling with a GNU shell. I collected the following information with `tree` and by `grep`-ing the output of `find` (although I already had prior knowledge of BIDS and C-PAC).

### BIDS input directories

BIDS input directories are quite self-explanatory, following a (simplified) pattern of `<subject>/<session>/[anat | fmap | func]/<image>.{ext | json}`.

Where `anat` and `func` indicate anatomical and functional image data respectively.
I had to look up `fmap` in the [BIDS spec](https://bids-specification.readthedocs.io/en/stable/04-modality-specific-files/01-magnetic-resonance-imaging-data.html#echo-planar-imaging-and-b0-mapping): it holds fieldmap data, i.e. B0 field estimates used to correct distortions in echo-planar imaging (such as functional and diffusion scans).


### C-PAC Minimal Preprocessed Output Directory

Much of this should be covered by the code above and BIDS description, but here are some additional observations:
- `log` contains runtime logs, `working` contains data used during runtime (a lot of serialized/pickled files), and `output` contains the final image and data artifacts.
- There is a custom metadata field `CpacProvenance` which seems to model a full dependency graph.
- There are HTML visualizations of the dependencies in `working/cpac_sub-0025429_ses-1/index.html` and of runtime metrics in `log/pipeline_cpac_preproc/sub-0025429_ses-1/callback.log.html`.
