In [100]:
import os
import re
import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import nibabel as nb
import matplotlib.pyplot as plt
from pathlib import Path

In [54]:
def parse_session_hcp(file_path, scan_types):
    """
    Finds the BOLD ID associated with each named file from
    the session_hcp.txt file of a given session

    Parameters
    ----------
    file_path: str
        Path to the sessions_hcp.txt file
        Example:
            ~/fsx-mount/embarc-20201122-LHzJPHi4/sessions/CU0011_baseline/session_hcp.txt
    scan_types: list[str]
        The names of scans to process. The names must exactly match
        what appears in the session_hcp.txt file. Not all scan types
        need to appear in all of the session_hcp.txt files from every
        session, but "ert" won't pick up "bold ert", if "bold ert"
        is what appears in session_hcp.txt. Within a given study,
        some sessions will assign different IDs to the same scan types
        depending on scan availability within a session. QuNex doesn't
        like gaps in BOLD ID numbers within a session
        Example: ['bold ert', 'bold rest run-1', 'bold rest run-2',
            'bold reward' ]

    Returns
    -------
    mapping: dict:
        The names and their associated BOLD IDs
        Example: {
            'bold ert': '1', 'bold rest run-1': '2',
            'bold rest run-2': '3', 'bold reward': '4'
        }
    """

    # Build the mapping from rest/task to the bold scan, e.g., bold1, bold2, etc.
    with open(file_path, 'r') as f:
        info = f.read()
    mapping = dict()

    # The regex | means "or", i.e., allowed to match any of the scan_types
    scan_type_regex = '|'.join(scan_types)

    # Find all of the matches
    matches = re.findall(f'bold([0-9]+).+:({scan_type_regex})\n', info)
    for bold_id, bold_name in matches:
        mapping[bold_name] = bold_id

    # Try the first known HCP format if no matches found after standard mapping
    if len(mapping) == 0:
        matches = re.findall(f'bold([0-9]+):.+\s:\s({scan_type_regex}).+\n', info)
        for bold_id, bold_name in matches:
            mapping[bold_name] = bold_id
    return mapping

In [2]:
# set up
repo_dir = '/home/ubuntu/Projects/canbind'

In [3]:
# read visual QC dataframe
manifest_df = pd.read_csv(Path(repo_dir, 'reference/manifest_df_base.csv'))
manifest_df.shape

(1077, 5)

In [13]:
# Read CANBIND provided fMRI QC information
anhed_in_qc  = pd.read_excel('/home/ubuntu/canbind_upload/fMRI-Anhed/Documentation/QualityControl_Documentation_Anhedonia_Baseline_final.xlsx')
faces_qc     = pd.read_excel('/home/ubuntu/canbind_upload/fMRI-Faces/Documentation/QualityControl_Documentation_Faces_Baseline.xlsx')
gonogo_in_qc    = pd.read_excel('/home/ubuntu/canbind_upload/fMRI-GoNoGo/Documentation/QualityControl_Documentation_fMRI-GNG_Baseline_final.xlsx')


  warn(msg)
  warn(msg)
  warn(msg)


### Format Anhedonia QC DataFrame

In [5]:
# Format Anhedonia dataframe
anhed_qc         = anhed_in_qc.loc[:,~anhed_in_qc.columns.str.contains('Unnamed')]
anhed_qc.columns = ['subject_ID_long', 'subject_ID_short', 'Group', 'ImagingQC', 'ImagingComments']
anhed_qc         = anhed_qc.loc[anhed_qc.iloc[:,0].str.contains('_')]
cond_dict = {'Do not have this data': 'NoData',
                'Yes, useable ': 'Useable',
                'Yes, useable': 'Useable',
                'Useable ': 'Useable',
                'Yes, useable (Note: Data Quality Questionable)': 'Questionable',
                'No Data': 'NoData',
                'Unusable': 'Unusable',
                'No data': 'NoData'}
anhed_qc['DataUse'] = anhed_qc['ImagingQC'].map(cond_dict)
anhed_qc['DataUse'].value_counts()

NoData          146
Useable         132
Questionable     32
Unusable         13
Name: DataUse, dtype: int64

### Format Faces (ERT) QC DataFrame

In [6]:
faces_qc = faces_qc.iloc[np.where(faces_qc.iloc[:,0].str.contains('_'))]
faces_qc = faces_qc.loc[faces_qc.iloc[:,0].notna()]

In [7]:
%%capture
# run 1
faces_run01_qc = faces_qc[['subject_ID_long', 'subject_ID_short', 'Group', 
                            'Imaging Data: Faces - Run 1', 'Data Quality comments - specifics',  
                            'Behavioural Data: Faces - Run 1', 'Data Quality comments - specifics.1']]
faces_run01_qc.columns = ['subject_ID_long', 'subject_ID_short', 'Group', 'ImagingQC', 'ImagingComments', 'BehaviorQC', 'BehaviorComments']
faces_run01_qc['run']  = 1


In [8]:
%%capture
# run 2
faces_run02_qc = faces_qc[['subject_ID_long', 'subject_ID_short', 'Group', 
                            'Imaging Data: Faces - Run 2', 'Data Quality comments - specifics ',  
                            'Behavioural Data: Faces - Run 2', 'Data Quality comments - specifics.2']]
faces_run02_qc.columns = ['subject_ID_long', 'subject_ID_short', 'Group', 'ImagingQC', 'ImagingComments', 'BehaviorQC', 'BehaviorComments']
faces_run02_qc['run']  = 2

In [9]:
faces_long_qc = pd.concat([faces_run01_qc, faces_run02_qc]).reset_index(drop=True)
faces_long_qc = faces_long_qc.iloc[np.where(faces_long_qc.iloc[:,0].str.contains('_'))]
faces_long_qc = faces_long_qc.loc[faces_long_qc.iloc[:,0].notna()]
cond_dict = {'Do not have this data': 'NoData',
    'Yes, useable ': 'Useable',
    'Yes, useable': 'Useable',
    'Useable ': 'Useable',
    'Yes (Note: Data Quality Questionable)': 'Questionable',
    'Yes, useable (Note: Data Quality Questionable)': 'Questionable',
    'Yes, useable (Note: Data Quality)': 'Questionable',
    'No Data': 'NoData',
    'cannot be released due to incorrect consent form': 'NoData',
    'Unusable': 'Unusable',
    'Unusable ': 'Unusable',
    'No data': 'NoData'
}
faces_long_qc['DataUse'] = faces_long_qc['ImagingQC'].map(cond_dict)


In [10]:
faces_long_qc['DataUse'].value_counts()

NoData          406
Useable         198
Unusable         27
Questionable     15
Name: DataUse, dtype: int64

### Format GO/NoGo QC DataFrame

In [14]:
gonogo_in_qc.columns = ['subject_ID_long', 'subject_ID_short', 'Group', 'ImagingQC', 'ImagingComments']
gonogo_qc = gonogo_in_qc.loc[gonogo_in_qc.iloc[:,0].str.contains('_')]

cond_dict = {'Do not have this data': 'NoData',
    'Yes, useable ': 'Useable',
    'Yes, useable': 'Useable',
    'Useable ': 'Useable',
    'Yes, useable (Note: Data Quality)': 'Questionable',
    'No Data': 'NoData',
    'Unusable': 'Unusable',
    'No data': 'NoData'
}
gonogo_qc['DataUse'] = gonogo_qc['ImagingQC'].map(cond_dict)
gonogo_qc['DataUse'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gonogo_qc['DataUse'] = gonogo_qc['ImagingQC'].map(cond_dict)


NoData          146
Useable         133
Unusable         28
Questionable     16
Name: DataUse, dtype: int64

## QuNex Processed Data

In [69]:
study_dir = '/home/ubuntu/canbind-fsx/research/imaging/datasets/CANBIND/processed_data/pf-pipelines/qunex-nbridge/studies'
session_dirs       = list(Path(study_dir, 'CANBIND-20220818-mCcU5pi4/sessions').glob('*_01'))
#rerun_session_dirs = list(Path(study_dir, 'CANBIND-20220830-RBLE2Vqb/sessions').glob('*_01'))

In [70]:
session_dict = {x.stem:x for x in session_dirs}
#for x in rerun_session_dirs:
#    session_dict[x.stem] = x
session_list = list(session_dict.values())

In [112]:
scan_types = [
    'bold anhedonia run-01',
    'bold anhedonia run-02',
    'bold gonogo run-01',
    'bold gonogo run-02',
    'bold faces run-01',
    'bold faces run-02',
    'bold faces run-03',
    'bold rest run-01',
    'bold rest run-02'
]

info_list = []
for session_dir in tqdm.tqdm(session_list):
    session_file = Path(session_dir, 'session_hcp.txt')
    if not session_file.exists():
        continue
    # read scan to scan-name mapping
    scan_dict  = parse_session_hcp(session_file, scan_types)
    # create qunex scan inventory/df
    session_df = pd.DataFrame({'scan_info':scan_dict.keys(), 'scan_num': scan_dict.values()})
    session_df.insert(0, 'session_id', session_dir.stem)
    session_df['session_dir'] = session_dir

    # get properties of functional scans
    scanlen_dict = {}
    for scan_num in scan_dict.values():
        nii_file = Path(session_dir, f'images/functional/bold{scan_num}.nii.gz')
        nii_obj  = nb.load(nii_file)
        num_trs  = nii_obj.shape[-1]
        scanlen_dict[scan_num] = num_trs
    session_df['scan_len'] = session_df['scan_num'].map(scanlen_dict)
    info_list.append(session_df)




 32%|███▏      | 94/298 [11:06<24:06,  7.09s/it]


FileNotFoundError: No such file or no access: '/home/ubuntu/canbind-fsx/research/imaging/datasets/CANBIND/processed_data/pf-pipelines/qunex-nbridge/studies/CANBIND-20220818-mCcU5pi4/sessions/QNS0037_01/images/functional/bold1.nii.gz'

In [111]:
pd.concat(info_list)

Unnamed: 0,session_id,scan_info,scan_num,session_dir,scan_len
0,QNS0030_01,bold anhedonia run-01,1,/home/ubuntu/canbind-fsx/research/imaging/data...,350
1,QNS0030_01,bold gonogo run-01,2,/home/ubuntu/canbind-fsx/research/imaging/data...,300
2,QNS0030_01,bold rest run-01,3,/home/ubuntu/canbind-fsx/research/imaging/data...,300
0,UBC0034_01,bold anhedonia run-01,1,/home/ubuntu/canbind-fsx/research/imaging/data...,350
1,UBC0034_01,bold gonogo run-01,2,/home/ubuntu/canbind-fsx/research/imaging/data...,300
2,UBC0034_01,bold rest run-01,3,/home/ubuntu/canbind-fsx/research/imaging/data...,300
0,MCU0029_01,bold anhedonia run-01,1,/home/ubuntu/canbind-fsx/research/imaging/data...,350
1,MCU0029_01,bold gonogo run-01,2,/home/ubuntu/canbind-fsx/research/imaging/data...,300
2,MCU0029_01,bold rest run-01,3,/home/ubuntu/canbind-fsx/research/imaging/data...,300


In [108]:
qc_df = pd.read_csv('/home/ubuntu/canbind-fsx/research/imaging/datasets/CANBIND/imaging-features/CANBIND/production/qc/study-CANBIND_bold_motion.csv.gz', compression='gzip')
qc_df.shape


(17, 5)