-
Notifications
You must be signed in to change notification settings - Fork 579
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Brainomics fetcher #2097
Brainomics fetcher #2097
Changes from all commits
e3c9dfe
1123080
05899d0
0b488a5
4b183cc
d2274cf
d377484
00a2b21
cb48766
195ab8a
d1d036b
a99d78b
448f141
04510e5
e3ff9d3
37353ea
c5815ca
2675d85
ba48070
530d324
aa989e7
68532d8
703d046
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,7 @@ | |
|
||
tested_var = localizer_dataset.ext_vars['pseudo'] | ||
# Quality check / Remove subjects with bad tested variate | ||
mask_quality_check = np.where(tested_var != b'None')[0] | ||
mask_quality_check = np.where(tested_var != b'n/a')[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another change, though less important. These changes are problematic: we are breaking people's code. If it's hard to circumvent, I am willing to close my eyes on this one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While converting the dataset to BIDS, the empty cells of CSV files have been converted to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, let's live with this. Can you add a note about this in the what's new file. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, let's live with this. Can you add a note about this in the what's new file. Thanks! |
||
n_samples = mask_quality_check.size | ||
contrast_map_filenames = [localizer_dataset.cmaps[i] | ||
for i in mask_quality_check] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,7 @@ | |
from sklearn.utils import deprecated | ||
|
||
from .utils import (_get_dataset_dir, _fetch_files, _get_dataset_descr, | ||
_read_md5_sum_file, _tree, _filter_columns) | ||
_read_md5_sum_file, _tree, _filter_columns, _fetch_file) | ||
from .._utils import check_niimg | ||
from .._utils.compat import BytesIO, _basestring, _urllib | ||
from .._utils.numpy_conversions import csv_to_array | ||
|
@@ -824,7 +824,7 @@ def fetch_localizer_contrasts(contrasts, n_subjects=None, get_tmaps=False, | |
if n_subjects is None: | ||
n_subjects = 94 # 94 subjects available | ||
if (isinstance(n_subjects, numbers.Number) and | ||
((n_subjects > 94) or (n_subjects < 1))): | ||
((n_subjects > 94) or (n_subjects < 1))): | ||
warnings.warn("Wrong value for \'n_subjects\' (%d). The maximum " | ||
"value will be used instead (\'n_subjects=94\')") | ||
n_subjects = 94 # 94 subjects available | ||
|
@@ -886,28 +886,35 @@ def fetch_localizer_contrasts(contrasts, n_subjects=None, get_tmaps=False, | |
"button press vs calculation and sentence listening/reading": | ||
"auditory&visual motor vs cognitive processing"} | ||
allowed_contrasts = list(contrast_name_wrapper.values()) | ||
|
||
# convert contrast names | ||
contrasts_wrapped = [] | ||
# get a unique ID for each contrast. It is used to give a unique name to | ||
# each download file and avoid name collisions. | ||
contrasts_indices = [] | ||
for contrast in contrasts: | ||
if contrast in allowed_contrasts: | ||
contrasts_wrapped.append(contrast) | ||
contrasts_wrapped.append(contrast.title().replace(" ", "")) | ||
contrasts_indices.append(allowed_contrasts.index(contrast)) | ||
elif contrast in contrast_name_wrapper: | ||
name = contrast_name_wrapper[contrast] | ||
contrasts_wrapped.append(name) | ||
contrasts_wrapped.append(name.title().replace(" ", "")) | ||
contrasts_indices.append(allowed_contrasts.index(name)) | ||
else: | ||
raise ValueError("Contrast \'%s\' is not available" % contrast) | ||
|
||
# It is better to perform several small requests than a big one because: | ||
# - Brainomics server has no cache (can lead to timeout while the archive | ||
# is generated on the remote server) | ||
# - Local (cached) version of the files can be checked for each contrast | ||
opts = {'uncompress': True} | ||
# Get the dataset OSF index | ||
dataset_name = "brainomics_localizer" | ||
index_url = "https://osf.io/hwbm2/download" | ||
data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, | ||
verbose=verbose) | ||
index_file = _fetch_file(index_url, data_dir, verbose=verbose) | ||
with open(index_file, "rt") as of: | ||
index = json.load(of) | ||
|
||
# Build data URLs that will be fetched | ||
files = {} | ||
root_url = "https://osf.io/download/{0}" | ||
if isinstance(n_subjects, numbers.Number): | ||
subject_mask = np.arange(1, n_subjects + 1) | ||
subject_id_max = "S%02d" % n_subjects | ||
|
@@ -916,107 +923,106 @@ def fetch_localizer_contrasts(contrasts, n_subjects=None, get_tmaps=False, | |
subject_id_max = "S%02d" % np.max(n_subjects) | ||
n_subjects = len(n_subjects) | ||
subject_ids = ["S%02d" % s for s in subject_mask] | ||
data_types = ["c map"] | ||
data_types = ["cmaps"] | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if get_tmaps: | ||
data_types.append("t map") | ||
rql_types = str.join(", ", ["\"%s\"" % x for x in data_types]) | ||
root_url = "http://brainomics.cea.fr/localizer/" | ||
|
||
base_query = ("Any X,XT,XL,XI,XF,XD WHERE X is Scan, X type XT, " | ||
"X concerns S, " | ||
"X label XL, X identifier XI, " | ||
"X format XF, X description XD, " | ||
'S identifier <= "%s", ' % (subject_id_max, ) + | ||
'X type IN(%(types)s), X label "%(label)s"') | ||
|
||
urls = ["%sbrainomics_data_%d.zip?rql=%s&vid=data-zip" | ||
% (root_url, i, | ||
_urllib.parse.quote(base_query % {"types": rql_types, | ||
"label": c}, | ||
safe=',()')) | ||
for c, i in zip(contrasts_wrapped, contrasts_indices)] | ||
data_types.append("tmaps") | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
filenames = [] | ||
|
||
def _is_valid_path(path, index, verbose): | ||
if path not in index: | ||
if verbose > 0: | ||
print("Skiping path '{0}'...".format(path)) | ||
return False | ||
return True | ||
|
||
for subject_id in subject_ids: | ||
for data_type in data_types: | ||
for contrast_id, contrast in enumerate(contrasts_wrapped): | ||
name_aux = str.replace( | ||
str.join('_', [data_type, contrast]), ' ', '_') | ||
file_path = os.path.join( | ||
"brainomics_data", subject_id, "%s.nii.gz" % name_aux) | ||
file_tarball_url = urls[contrast_id] | ||
filenames.append((file_path, file_tarball_url, opts)) | ||
path = "/".join([ | ||
"/localizer", "derivatives", "spm_1st_level", | ||
"sub-%s" % subject_id, | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"sub-%s_task-localizer_acq-%s_%s.nii.gz" % ( | ||
subject_id, contrast, data_type)]) | ||
if _is_valid_path(path, index, verbose=verbose): | ||
file_url = root_url.format(index[path][1:]) | ||
opts = {"move": file_path} | ||
filenames.append((file_path, file_url, opts)) | ||
files.setdefault(data_type, []).append(file_path) | ||
|
||
# Fetch masks if asked by user | ||
if get_masks: | ||
urls.append("%sbrainomics_data_masks.zip?rql=%s&vid=data-zip" | ||
% (root_url, | ||
_urllib.parse.quote(base_query % {"types": '"boolean mask"', | ||
"label": "mask"}, | ||
safe=',()'))) | ||
for subject_id in subject_ids: | ||
file_path = os.path.join( | ||
"brainomics_data", subject_id, "boolean_mask_mask.nii.gz") | ||
file_tarball_url = urls[-1] | ||
filenames.append((file_path, file_tarball_url, opts)) | ||
path = "/".join([ | ||
"/localizer", "derivatives", "spm_1st_level", | ||
"sub-%s" % subject_id, "sub-%s_mask.nii.gz" % subject_id]) | ||
if _is_valid_path(path, index, verbose=verbose): | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
file_url = root_url.format(index[path][1:]) | ||
opts = {"move": file_path} | ||
filenames.append((file_path, file_url, opts)) | ||
files.setdefault("masks", []).append(file_path) | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Fetch anats if asked by user | ||
if get_anats: | ||
urls.append("%sbrainomics_data_anats.zip?rql=%s&vid=data-zip" | ||
% (root_url, | ||
_urllib.parse.quote(base_query % {"types": '"normalized T1"', | ||
"label": "anatomy"}, | ||
safe=',()'))) | ||
for subject_id in subject_ids: | ||
file_path = os.path.join( | ||
"brainomics_data", subject_id, | ||
"normalized_T1_anat_defaced.nii.gz") | ||
file_tarball_url = urls[-1] | ||
filenames.append((file_path, file_tarball_url, opts)) | ||
# Fetch subject characteristics (separated in two files) | ||
if url is None: | ||
url_csv = ("%sdataset/cubicwebexport.csv?rql=%s&vid=csvexport" | ||
% (root_url, _urllib.parse.quote("Any X WHERE X is Subject"))) | ||
url_csv2 = ("%sdataset/cubicwebexport2.csv?rql=%s&vid=csvexport" | ||
% (root_url, | ||
_urllib.parse.quote("Any X,XI,XD WHERE X is QuestionnaireRun, " | ||
"X identifier XI, X datetime " | ||
"XD", safe=',') | ||
)) | ||
else: | ||
url_csv = "%s/cubicwebexport.csv" % url | ||
url_csv2 = "%s/cubicwebexport2.csv" % url | ||
filenames += [("cubicwebexport.csv", url_csv, {}), | ||
("cubicwebexport2.csv", url_csv2, {})] | ||
path = "/".join([ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will break on Windows, we have to keep "/".join(). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, my wrong. These are URLs, not file paths. Sorry. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe change the variable name to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would avoid mixing the two terms url and path:
Therefore I would avoid url_path. Perhaps file_url, partial_url, tmp_url or whatever better describes an URL or part of an URL? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, like I said, something better, that clearly communicates what it is and what it is not. I'll leave the choice of name to @AGrigis There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah indeed, but is it worth adding a dependency on urllib just for this? Perhaps we should leave it as is. Antoine won't have much more time to invest in preparing this patch. If needed I'm willing to add minor changes later on, just tell me what needs to be done. Not sure how to take over Antoine commits his changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. keeping it as it is is also fine |
||
"/localizer", "derivatives", "spm_preprocessing", | ||
"sub-%s" % subject_id, "sub-%s_T1w.nii.gz" % subject_id]) | ||
if _is_valid_path(path, index, verbose=verbose): | ||
file_url = root_url.format(index[path][1:]) | ||
opts = {"move": file_path} | ||
filenames.append((file_path, file_url, opts)) | ||
files.setdefault("anats", []).append(file_path) | ||
|
||
# Fetch subject characteristics | ||
participants_file = os.path.join("brainomics_data", "participants.tsv") | ||
path = "/localizer/participants.tsv" | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if _is_valid_path(path, index, verbose=verbose): | ||
file_url = root_url.format(index[path][1:]) | ||
opts = {"move": participants_file} | ||
kchawla-pi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
filenames.append((participants_file, file_url, opts)) | ||
|
||
# Fetch behavioural | ||
behavioural_file = os.path.join( | ||
"brainomics_data", "phenotype", "behavioural.tsv") | ||
path = "/localizer/phenotype/behavioural.tsv" | ||
if _is_valid_path(path, index, verbose=verbose): | ||
file_url = root_url.format(index[path][1:]) | ||
opts = {"move": behavioural_file} | ||
filenames.append((behavioural_file, file_url, opts)) | ||
|
||
# Actual data fetching | ||
dataset_name = 'brainomics_localizer' | ||
data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, | ||
verbose=verbose) | ||
fdescr = _get_dataset_descr(dataset_name) | ||
files = _fetch_files(data_dir, filenames, verbose=verbose) | ||
anats = None | ||
masks = None | ||
tmaps = None | ||
# combine data from both covariates files into one single recarray | ||
_fetch_files(data_dir, filenames, verbose=verbose) | ||
for key, value in files.items(): | ||
files[key] = [os.path.join(data_dir, val) for val in value] | ||
|
||
# Load covariates file | ||
from numpy.lib.recfunctions import join_by | ||
ext_vars_file2 = files[-1] | ||
csv_data2 = np.recfromcsv(ext_vars_file2, delimiter=';') | ||
files = files[:-1] | ||
ext_vars_file = files[-1] | ||
csv_data = np.recfromcsv(ext_vars_file, delimiter=';') | ||
files = files[:-1] | ||
# join_by sorts the output along the key | ||
csv_data = join_by('subject_id', csv_data, csv_data2, | ||
usemask=False, asrecarray=True)[subject_mask - 1] | ||
if get_anats: | ||
anats = files[-n_subjects:] | ||
files = files[:-n_subjects] | ||
if get_masks: | ||
masks = files[-n_subjects:] | ||
files = files[:-n_subjects] | ||
if get_tmaps: | ||
tmaps = files[1::2] | ||
files = files[::2] | ||
return Bunch(cmaps=files, tmaps=tmaps, masks=masks, anats=anats, | ||
ext_vars=csv_data, description=fdescr) | ||
participants_file = os.path.join(data_dir, participants_file) | ||
csv_data = np.recfromcsv(participants_file, delimiter='\t') | ||
behavioural_file = os.path.join(data_dir, behavioural_file) | ||
csv_data2 = np.recfromcsv(behavioural_file, delimiter='\t') | ||
csv_data = join_by( | ||
"participant_id", csv_data, csv_data2, usemask=False, asrecarray=True) | ||
subject_names = csv_data["participant_id"].tolist() | ||
subjects_indices = [] | ||
for name in subject_ids: | ||
name = name.encode("utf8") | ||
if name not in subject_names: | ||
continue | ||
subjects_indices.append(subject_names.index(name)) | ||
csv_data = csv_data[subjects_indices] | ||
|
||
return Bunch(ext_vars=csv_data, description=fdescr, **files) | ||
|
||
|
||
def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None, | ||
|
@@ -1064,20 +1070,20 @@ def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None, | |
get_tmaps=False, get_masks=False, | ||
get_anats=False, data_dir=data_dir, | ||
url=url, resume=True, verbose=verbose) | ||
data.pop('tmaps') | ||
data.pop('masks') | ||
data.pop('anats') | ||
return data | ||
|
||
|
||
def fetch_localizer_button_task(data_dir=None, url=None, verbose=1): | ||
def fetch_localizer_button_task(n_subjects=None, data_dir=None, url=None, | ||
verbose=1): | ||
"""Fetch left vs right button press contrast maps from the localizer. | ||
|
||
This function ships only 2nd subject (S02) specific tmap and | ||
its normalized T1 image. | ||
|
||
Parameters | ||
---------- | ||
n_subjects: int, optional | ||
The number of subjects to load. If None is given, | ||
this function ships only 2nd subject (S02) specific tmap and | ||
its normalized T1 image. | ||
|
||
data_dir: string, optional | ||
Path of the data directory. Used to force data storage in a specified | ||
location. | ||
|
@@ -1093,6 +1099,7 @@ def fetch_localizer_button_task(data_dir=None, url=None, verbose=1): | |
------- | ||
data: Bunch | ||
Dictionary-like object, the interest attributes are : | ||
'cmaps': string list, giving paths to nifti contrast maps | ||
'tmap': string, giving paths to nifti contrast maps | ||
'anat': string, giving paths to normalized anatomical image | ||
|
||
|
@@ -1109,30 +1116,19 @@ def fetch_localizer_button_task(data_dir=None, url=None, verbose=1): | |
nilearn.datasets.fetch_localizer_contrasts | ||
|
||
""" | ||
# The URL can be retrieved from the nilearn account on OSF (Open | ||
# Science Framework). Uploaded files specific to S02 from | ||
# fetch_localizer_contrasts ['left vs right button press'] | ||
if url is None: | ||
url = 'https://osf.io/dx9jn/download' | ||
|
||
tmap = "t_map_left_auditory_&_visual_click_vs_right_auditory&visual_click.nii.gz" | ||
anat = "normalized_T1_anat_defaced.nii.gz" | ||
|
||
opts = {'uncompress': True} | ||
|
||
options = ('tmap', 'anat') | ||
filenames = [(os.path.join('localizer_button_task', name), url, opts) | ||
for name in (tmap, anat)] | ||
|
||
dataset_name = 'brainomics' | ||
data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, | ||
verbose=verbose) | ||
files = _fetch_files(data_dir, filenames, verbose=verbose) | ||
|
||
fdescr = _get_dataset_descr('brainomics_localizer') | ||
|
||
params = dict([('description', fdescr)] + list(zip(options, files))) | ||
return Bunch(**params) | ||
if n_subjects is None: | ||
n_subjects = [2] | ||
data = fetch_localizer_contrasts(["left vs right button press"], | ||
n_subjects=n_subjects, | ||
get_tmaps=True, get_masks=False, | ||
get_anats=True, data_dir=data_dir, | ||
url=url, resume=True, verbose=verbose) | ||
# TODO: remove -> only here for compatibility | ||
if len(data["tmaps"]) == 1: | ||
setattr(data, "tmap", data["tmaps"][0]) | ||
if len(data["anats"]) == 1: | ||
setattr(data, "anat", data["anats"][0]) | ||
return data | ||
|
||
|
||
def fetch_abide_pcp(data_dir=None, n_subjects=None, pipeline='cpac', | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that the return of datasets.fetch_localizer_button_task() has changed. I do not think that we can do this: we are going to break people's scripts. Fixing this is probably very easy by a simple modification of datasets.fetch_localizer_button_task().
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The 'fetch_localizer_button_task' function ships only the 2nd subject's (S02) specific tmap and its normalized T1 image. Now it's working for all subjects, so the output is a list. The API of the function also changed: new n_subjects parameter. I can make a little hack if only one subject is requested.