diff --git a/doc/changes/latest.rst b/doc/changes/latest.rst
index a1945f644d..747c38806f 100644
--- a/doc/changes/latest.rst
+++ b/doc/changes/latest.rst
@@ -200,3 +200,9 @@ Changes
 - Descriptions of datasets retrieved with fetchers from :mod:`nilearn.datasets`
   are now python strings rather than `bytes`. Therefore, decoding the
   descriptions is no longer necessary.
+- Dataset fetchers returning a :class:`numpy.recarray` can now return a
+  :class:`pandas.DataFrame` instead. These fetchers now have a ``legacy_format`` optional
+  argument defaulting to ``True`` for backward compatibility. Users will be warned that
+  this parameter will default to ``False`` in release 0.11.0, making
+  :class:`pandas.DataFrame` the default return type instead of :class:`numpy.recarray`.
+  (See PR `#2829 <https://github.com/nilearn/nilearn/pull/2829>`_).
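A minimal sketch of the behaviour this changelog entry describes (illustrative, not part of the patch; `fetch_coords_power_2011` reads a CSV bundled with nilearn, so it runs without a download):

    from nilearn import datasets

    # Default in this release: a numpy.recarray, plus a FutureWarning
    # announcing the 0.11 change of default.
    power = datasets.fetch_coords_power_2011()
    print(type(power.rois))            # <class 'numpy.recarray'>

    # Opting in to the future default: a pandas.DataFrame.
    power = datasets.fetch_coords_power_2011(legacy_format=False)
    print(power.rois[['x', 'y', 'z']].head())   # ROI coordinates in MNI space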
diff --git a/examples/01_plotting/plot_3d_map_to_surface_projection.py b/examples/01_plotting/plot_3d_map_to_surface_projection.py
index 76cca8faea..6cb001e6d2 100644
--- a/examples/01_plotting/plot_3d_map_to_surface_projection.py
+++ b/examples/01_plotting/plot_3d_map_to_surface_projection.py
@@ -209,7 +209,7 @@
 # averaging the labels between neighboring regions. Using nearest-neighbor
 # interpolation with zero radius will achieve this.
 
-destrieux = datasets.fetch_atlas_destrieux_2009()
+destrieux = datasets.fetch_atlas_destrieux_2009(legacy_format=False)
 
 view = plotting.view_img_on_surf(
     destrieux.maps,
diff --git a/examples/01_plotting/plot_dim_plotting.py b/examples/01_plotting/plot_dim_plotting.py
index 7f11b04031..fff80e5e3a 100644
--- a/examples/01_plotting/plot_dim_plotting.py
+++ b/examples/01_plotting/plot_dim_plotting.py
@@ -19,11 +19,12 @@
 
 from nilearn import datasets
 
-localizer_dataset = datasets.fetch_localizer_button_task()
+localizer_dataset = datasets.fetch_localizer_button_task(legacy_format=False)
 # Contrast map of motor task
 localizer_tmap_filename = localizer_dataset.tmap
 # Subject specific anatomical image
 localizer_anat_filename = localizer_dataset.anat
+
 ###########################################################################
 # Plotting with enhancement of background image with dim=-.5
 # --------------------------------------------------------------------------
diff --git a/examples/01_plotting/plot_prob_atlas.py b/examples/01_plotting/plot_prob_atlas.py
index ab38c45388..0d90599304 100644
--- a/examples/01_plotting/plot_prob_atlas.py
+++ b/examples/01_plotting/plot_prob_atlas.py
@@ -46,8 +46,9 @@
 # Dictionaries of Functional Modes (“DiFuMo”) atlas
 dim = 64
 res = 2
-difumo = datasets.fetch_atlas_difumo(dimension=dim,
-                                     resolution_mm=res)
+difumo = datasets.fetch_atlas_difumo(
+    dimension=dim, resolution_mm=res, legacy_format=False
+)
 
 # Visualization
 from nilearn import plotting
diff --git a/examples/02_decoding/plot_oasis_vbm.py b/examples/02_decoding/plot_oasis_vbm.py
index e438554b07..2dabca6f96 100644
--- a/examples/02_decoding/plot_oasis_vbm.py
+++ b/examples/02_decoding/plot_oasis_vbm.py
@@ -56,9 +56,11 @@
 ############################################################################
 # Load Oasis dataset
 # -------------------
-oasis_dataset = datasets.fetch_oasis_vbm(n_subjects=n_subjects)
+oasis_dataset = datasets.fetch_oasis_vbm(
+    n_subjects=n_subjects, legacy_format=False
+)
 gray_matter_map_filenames = oasis_dataset.gray_matter_maps
-age = oasis_dataset.ext_vars['age'].astype(float)
+age = oasis_dataset.ext_vars['age'].values
 
 # Split data into training set and test set
 from sklearn.model_selection import train_test_split
diff --git a/examples/02_decoding/plot_oasis_vbm_space_net.py b/examples/02_decoding/plot_oasis_vbm_space_net.py
index 4b6749edc6..5b6019fc10 100644
--- a/examples/02_decoding/plot_oasis_vbm_space_net.py
+++ b/examples/02_decoding/plot_oasis_vbm_space_net.py
@@ -16,7 +16,9 @@
 import numpy as np
 from nilearn import datasets
 
 n_subjects = 200  # increase this number if you have more RAM on your box
-dataset_files = datasets.fetch_oasis_vbm(n_subjects=n_subjects)
+dataset_files = datasets.fetch_oasis_vbm(
+    n_subjects=n_subjects, legacy_format=False
+)
 age = dataset_files.ext_vars['age'].astype(float)
 age = np.array(age)
 gm_imgs = np.array(dataset_files.gray_matter_maps)
diff --git a/examples/03_connectivity/plot_atlas_comparison.py b/examples/03_connectivity/plot_atlas_comparison.py
index 374531b2da..1c2bd68867 100644
--- a/examples/03_connectivity/plot_atlas_comparison.py
+++ b/examples/03_connectivity/plot_atlas_comparison.py
@@ -118,8 +118,9 @@ def lag_correlation(time_series, lag):
 
 # -----------------------------------------------------------------
 dim = 64
-difumo = datasets.fetch_atlas_difumo(dimension=dim,
-                                     resolution_mm=2)
+difumo = datasets.fetch_atlas_difumo(
+    dimension=dim, resolution_mm=2, legacy_format=False
+)
 
 ##########################################################################
 # Iterate over fetched atlases to extract coordinates - probabilistic
diff --git a/examples/03_connectivity/plot_sphere_based_connectome.py b/examples/03_connectivity/plot_sphere_based_connectome.py
index 5bbb053bec..f184ad053a 100644
--- a/examples/03_connectivity/plot_sphere_based_connectome.py
+++ b/examples/03_connectivity/plot_sphere_based_connectome.py
@@ -158,7 +158,7 @@
 #
 # First we fetch the coordinates of the Power atlas
 
-power = datasets.fetch_coords_power_2011()
+power = datasets.fetch_coords_power_2011(legacy_format=False)
 
 print('Power atlas comes with {0}.'.format(power.keys()))
 
@@ -306,7 +306,7 @@
 # -------------------------------------------
 #
 # We repeat the same steps for Dosenbach's atlas.
-dosenbach = datasets.fetch_coords_dosenbach_2010()
+dosenbach = datasets.fetch_coords_dosenbach_2010(legacy_format=False)
 
 coords = np.vstack((
     dosenbach.rois['x'],
diff --git a/examples/05_glm_second_level/plot_oasis.py b/examples/05_glm_second_level/plot_oasis.py
index c79dfd28d1..1e5dda05d8 100644
--- a/examples/05_glm_second_level/plot_oasis.py
+++ b/examples/05_glm_second_level/plot_oasis.py
@@ -33,13 +33,15 @@
 # ------------------
 from nilearn import datasets
 
-oasis_dataset = datasets.fetch_oasis_vbm(n_subjects=n_subjects)
+oasis_dataset = datasets.fetch_oasis_vbm(
+    n_subjects=n_subjects, legacy_format=False
+)
 gray_matter_map_filenames = oasis_dataset.gray_matter_maps
 age = oasis_dataset.ext_vars['age'].astype(float)
 
 ###############################################################################
 # Sex is encoded as 'M' or 'F'. Hence, we make it a binary variable.
-sex = oasis_dataset.ext_vars['mf'] == b'F'
+sex = oasis_dataset.ext_vars['mf'] == 'F'
 
 ###############################################################################
 # Print basic information on the dataset.
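One behavioural consequence of the pandas switch shows up in plot_oasis.py above: `np.recfromcsv` returned `bytes`, `pandas.read_csv` returns `str`, so the `b'F'` literal goes away. A tiny sketch with toy values (not the real OASIS covariates):

    import pandas as pd

    ext_vars = pd.DataFrame({'mf': ['F', 'M', 'F']})  # toy stand-in for ext_vars
    sex = ext_vars['mf'] == 'F'   # recfromcsv yielded b'F'/b'M', hence the old b'F'
    print(sex.tolist())           # [True, False, True]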
diff --git a/examples/05_glm_second_level/plot_proportion_activated_voxels.py b/examples/05_glm_second_level/plot_proportion_activated_voxels.py
index 431ee3060d..6a17955303 100644
--- a/examples/05_glm_second_level/plot_proportion_activated_voxels.py
+++ b/examples/05_glm_second_level/plot_proportion_activated_voxels.py
@@ -21,9 +21,10 @@
 # BOLD activity estimate divided by the uncertainty about this estimate.
 from nilearn.datasets import fetch_localizer_contrasts
 n_subjects = 16
-data = fetch_localizer_contrasts(["left vs right button press"], n_subjects,
-                                 get_tmaps=True)
-
+data = fetch_localizer_contrasts(
+    ["left vs right button press"], n_subjects,
+    get_tmaps=True, legacy_format=False
+)
 from nilearn import plotting
 
 ############################################################################
diff --git a/examples/05_glm_second_level/plot_second_level_association_test.py b/examples/05_glm_second_level/plot_second_level_association_test.py
index 989ae7603c..5dffedd56e 100644
--- a/examples/05_glm_second_level/plot_second_level_association_test.py
+++ b/examples/05_glm_second_level/plot_second_level_association_test.py
@@ -19,7 +19,9 @@
 from nilearn import datasets
 n_samples = 94
 localizer_dataset = datasets.fetch_localizer_contrasts(
-    ['left button press (auditory cue)'], n_subjects=n_samples)
+    ['left button press (auditory cue)'],
+    n_subjects=n_samples, legacy_format=False
+)
 
 ##############################################################################
 # Let's print basic information on the dataset.
@@ -34,11 +36,13 @@
 ##############################################################################
 # It is worth to do a quality check and remove subjects with missing values.
 import numpy as np
-mask_quality_check = np.where(tested_var != b'n/a')[0]
+mask_quality_check = np.where(
+    np.logical_not(np.isnan(tested_var))
+)[0]
 n_samples = mask_quality_check.size
 contrast_map_filenames = [localizer_dataset.cmaps[i]
                           for i in mask_quality_check]
-tested_var = tested_var[mask_quality_check].astype(float).reshape((-1, 1))
+tested_var = tested_var[mask_quality_check].values.reshape((-1, 1))
 print("Actual number of subjects after quality check: %d" % n_samples)
 
 ############################################################################
diff --git a/examples/05_glm_second_level/plot_second_level_one_sample_test.py b/examples/05_glm_second_level/plot_second_level_one_sample_test.py
index a053b51e9e..acea9504e4 100644
--- a/examples/05_glm_second_level/plot_second_level_one_sample_test.py
+++ b/examples/05_glm_second_level/plot_second_level_one_sample_test.py
@@ -27,8 +27,10 @@
 # estimate.
 from nilearn.datasets import fetch_localizer_contrasts
 n_subjects = 16
-data = fetch_localizer_contrasts(["left vs right button press"], n_subjects,
-                                 get_tmaps=True)
+data = fetch_localizer_contrasts(
+    ["left vs right button press"], n_subjects,
+    get_tmaps=True, legacy_format=False
+)
 
 ###########################################################################
 # Display subject t_maps
diff --git a/examples/05_glm_second_level/plot_second_level_two_sample_test.py b/examples/05_glm_second_level/plot_second_level_two_sample_test.py
index ad045c576d..4f9264b8b5 100644
--- a/examples/05_glm_second_level/plot_second_level_two_sample_test.py
+++ b/examples/05_glm_second_level/plot_second_level_two_sample_test.py
@@ -35,9 +35,13 @@
 # localizer dataset.
 n_subjects = 16
 sample_vertical = fetch_localizer_contrasts(
-    ["vertical checkerboard"], n_subjects, get_tmaps=True)
+    ["vertical checkerboard"], n_subjects,
+    get_tmaps=True, legacy_format=False
+)
 sample_horizontal = fetch_localizer_contrasts(
-    ["horizontal checkerboard"], n_subjects, get_tmaps=True)
+    ["horizontal checkerboard"], n_subjects,
+    get_tmaps=True, legacy_format=False
+)
 
 # Implicitly, there is a one-to-one correspondence between the two samples:
 # the first image of both samples comes from subject S1, the second from subject S2 etc.
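Why the quality-check hunks above change shape — `np.recfromcsv` kept missing entries as the literal bytes `b'n/a'`, while `pandas.read_csv` parses `'n/a'` as NaN in a float column. A self-contained sketch (toy values):

    import numpy as np
    import pandas as pd

    # Legacy behaviour: missing entries survive as the bytes b'n/a' ...
    legacy = np.array([b'1.0', b'n/a', b'2.5'])
    mask_legacy = np.where(legacy != b'n/a')[0]

    # ... new behaviour: the column is float with NaN holes, so the
    # mask is built with np.isnan instead.
    tested_var = pd.Series([1.0, np.nan, 2.5])
    mask = np.where(np.logical_not(np.isnan(tested_var)))[0]

    assert list(mask) == list(mask_legacy) == [0, 2]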
diff --git a/examples/05_glm_second_level/plot_thresholding.py b/examples/05_glm_second_level/plot_thresholding.py
index 3e0edcc532..2e60ebfca8 100644
--- a/examples/05_glm_second_level/plot_thresholding.py
+++ b/examples/05_glm_second_level/plot_thresholding.py
@@ -18,7 +18,8 @@
 from nilearn import datasets
 n_samples = 20
 localizer_dataset = datasets.fetch_localizer_calculation_task(
-    n_subjects=n_samples)
+    n_subjects=n_samples, legacy_format=False
+)
 
 #########################################################################
 # Get the set of individual statistical maps (contrast estimates)
diff --git a/examples/07_advanced/plot_localizer_mass_univariate_methods.py b/examples/07_advanced/plot_localizer_mass_univariate_methods.py
index 75128fe650..02956fc0e9 100644
--- a/examples/07_advanced/plot_localizer_mass_univariate_methods.py
+++ b/examples/07_advanced/plot_localizer_mass_univariate_methods.py
@@ -29,7 +29,9 @@
 # Load Localizer contrast
 n_samples = 94
 localizer_dataset = datasets.fetch_localizer_contrasts(
-    ['left button press (auditory cue)'], n_subjects=n_samples)
+    ['left button press (auditory cue)'],
+    n_subjects=n_samples, legacy_format=False
+)
 
 # print basic information on the dataset
 print('First contrast nifti image (3D) is located at: %s' %
@@ -37,11 +39,13 @@
 tested_var = localizer_dataset.ext_vars['pseudo']
 # Quality check / Remove subjects with bad tested variate
-mask_quality_check = np.where(tested_var != b'n/a')[0]
+mask_quality_check = np.where(
+    np.logical_not(np.isnan(tested_var))
+)[0]
 n_samples = mask_quality_check.size
 contrast_map_filenames = [localizer_dataset.cmaps[i]
                           for i in mask_quality_check]
-tested_var = tested_var[mask_quality_check].astype(float).reshape((-1, 1))
+tested_var = tested_var[mask_quality_check].values.reshape((-1, 1))
 print("Actual number of subjects after quality check: %d" % n_samples)
diff --git a/examples/07_advanced/plot_localizer_simple_analysis.py b/examples/07_advanced/plot_localizer_simple_analysis.py
index a1cd1aeb5a..2c91a4d53f 100644
--- a/examples/07_advanced/plot_localizer_simple_analysis.py
+++ b/examples/07_advanced/plot_localizer_simple_analysis.py
@@ -25,7 +25,8 @@
 # Load Localizer contrast
 n_samples = 20
 localizer_dataset = datasets.fetch_localizer_calculation_task(
-    n_subjects=n_samples)
+    n_subjects=n_samples, legacy_format=False
+)
 tested_var = np.ones((n_samples, 1))
 
diff --git a/nilearn/_utils/docs.py b/nilearn/_utils/docs.py
index 01a6a11b4c..473db8ad6d 100644
--- a/nilearn/_utils/docs.py
+++ b/nilearn/_utils/docs.py
@@ -35,6 +35,13 @@
 ax : :class:`~matplotlib.axes.Axes`
     The matplotlib axes in which the plots will be drawn."""
 
+# Legacy_format
+docdict['legacy_format'] = """
+legacy_format : :obj:`bool`, optional
+    If set to ``True``, the fetcher will return recarrays. Otherwise,
+    it will return pandas dataframes.
+    Default=True."""
+
 # Resume
 docdict['resume'] = """
 resume : :obj:`bool`, optional
diff --git a/nilearn/datasets/__init__.py b/nilearn/datasets/__init__.py
index 19ee5000c3..64ad875b81 100644
--- a/nilearn/datasets/__init__.py
+++ b/nilearn/datasets/__init__.py
@@ -2,7 +2,6 @@
 Helper functions to download NeuroImaging datasets
 """
 
-from warnings import warn
 from .struct import (fetch_icbm152_2009, load_mni152_template,
                      load_mni152_brain_mask, load_mni152_gm_template,
                      load_mni152_gm_mask, load_mni152_wm_template,
@@ -90,7 +89,3 @@
     'fetch_fiac_first_level',
 ]
 
-warn("Fetchers from the nilearn.datasets module will be "
-     "updated in version 0.9 to return python strings "
-     "instead of bytes and Pandas dataframes instead of "
-     "Numpy arrays.", FutureWarning)
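The `%(legacy_format)s` placeholders used throughout the fetcher docstrings are expanded by nilearn's `fill_doc` decorator from the `docdict` entry added above. Roughly, as a simplified sketch (the real helper in nilearn/_utils/docs.py also normalises indentation before substituting):

    docdict = {
        'legacy_format': """legacy_format : :obj:`bool`, optional
        If set to ``True``, the fetcher will return recarrays. Otherwise,
        it will return pandas dataframes.
        Default=True.""",
    }

    def fill_doc(func):
        # %-format the docstring against the shared parameter dictionary.
        func.__doc__ = func.__doc__ % docdict
        return func

    @fill_doc
    def fetch_example(legacy_format=True):
        """Fetch a toy dataset.

        Parameters
        ----------
        %(legacy_format)s
        """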
diff --git a/nilearn/datasets/atlas.py b/nilearn/datasets/atlas.py
index cd316a493d..b4dee2c7b9 100644
--- a/nilearn/datasets/atlas.py
+++ b/nilearn/datasets/atlas.py
@@ -10,6 +10,7 @@
 import nibabel as nb
 import numpy as np
+import pandas as pd
 from numpy.lib import recfunctions
 import re
 from sklearn.utils import Bunch
@@ -20,9 +21,16 @@
 
 _TALAIRACH_LEVELS = ['hemisphere', 'lobe', 'gyrus', 'tissue', 'ba']
 
+_LEGACY_FORMAT_MSG = (
+    "`legacy_format` will default to `False` in release 0.11. "
+    "Dataset fetchers will then return pandas dataframes by default "
+    "instead of recarrays."
+)
+
 
 @fill_doc
-def fetch_atlas_difumo(dimension=64, resolution_mm=2, data_dir=None, resume=True, verbose=1):
+def fetch_atlas_difumo(dimension=64, resolution_mm=2, data_dir=None,
+                       resume=True, verbose=1, legacy_format=True):
     """Fetch DiFuMo brain atlas
 
     Dictionaries of Functional Modes, or “DiFuMo”, can serve as atlases to extract
@@ -56,6 +64,7 @@
     %(data_dir)s
     %(resume)s
     %(verbose)s
+    %(legacy_format)s
 
     Returns
     -------
@@ -70,6 +79,8 @@
         the regions. The length of the label array corresponds to the
         number of dimensions requested. ``data.labels[i]`` is the label
         corresponding to volume ``i`` in the 'maps' image.
+        If ``legacy_format`` is set to ``False``, this is a
+        :class:`pandas.DataFrame`.
     - 'description': :obj:`str`, general description of the dataset.
 
     References
@@ -112,7 +123,11 @@
 
     # Download the zip file, first
     files_ = _fetch_files(data_dir, files, verbose=verbose)
-    labels = np.recfromcsv(files_[0])
+    labels = pd.read_csv(files_[0])
+    labels = labels.rename(columns={c: c.lower() for c in labels.columns})
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        labels = labels.to_records(index=False)
 
     # README
     readme_files = [('README.md', 'https://osf.io/4k9bf/download',
@@ -199,7 +214,7 @@ def fetch_atlas_craddock_2012(data_dir=None, url=None, resume=True, verbose=1):
 
 @fill_doc
 def fetch_atlas_destrieux_2009(lateralized=True, data_dir=None, url=None,
-                               resume=True, verbose=1):
+                               resume=True, verbose=1, legacy_format=True):
     """Download and load the Destrieux cortical atlas (dated 2009).
 
     See :footcite:`Fischl2004Automatically`,
@@ -220,6 +235,7 @@
     %(url)s
     %(resume)s
     %(verbose)s
+    %(legacy_format)s
 
     Returns
     -------
@@ -233,6 +249,8 @@
       indices in the list of labels.
     - 'labels': :class:`numpy.recarray`, rec array containing the
       names of the ROIs.
+      If ``legacy_format`` is set to ``False``, this is a
+      :class:`pandas.DataFrame`.
     - 'description': :obj:`str`, description of the atlas.
 
     References
@@ -259,7 +277,12 @@
     files_ = _fetch_files(data_dir, files, resume=resume,
                           verbose=verbose)
 
-    params = dict(maps=files_[1], labels=np.recfromcsv(files_[0]))
+    params = dict(maps=files_[1],
+                  labels=pd.read_csv(files_[0], index_col=0))
+
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        params['labels'] = params['labels'].to_records()
 
     with open(files_[2], 'r') as rst_file:
         params['description'] = rst_file.read()
@@ -712,25 +735,32 @@
     data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir,
                                 verbose=verbose)
     files = _fetch_files(data_dir, files, resume=resume, verbose=verbose)
-    csv_data = np.recfromcsv(files[0])
+    csv_data = pd.read_csv(files[0])
     labels = [name.strip() for name in csv_data['name'].tolist()]
-    labels = [label.decode("utf-8") for label in labels]
+
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore', module='numpy',
                                 category=FutureWarning)
-        region_coords = csv_data[['x', 'y', 'z']].tolist()
-    net_names = [net_name.strip() for net_name in csv_data['net_name'].tolist()]
+        region_coords = csv_data[['x', 'y', 'z']].values.tolist()
+    net_names = [
+        net_name.strip() for net_name in csv_data['net name'].tolist()
+    ]
     fdescr = _get_dataset_descr(dataset_name)
 
     return Bunch(maps=files[1], labels=labels, region_coords=region_coords,
                  networks=net_names, description=fdescr)
 
 
-def fetch_coords_power_2011():
+@fill_doc
+def fetch_coords_power_2011(legacy_format=True):
     """Download and load the Power et al. brain atlas composed of 264 ROIs.
 
     See :footcite:`Power2011Functional`.
 
+    Parameters
+    ----------
+    %(legacy_format)s
+
     Returns
     -------
     data : :func:`sklearn.utils.Bunch`
@@ -738,6 +768,8 @@
 
     - 'rois': :class:`numpy.recarray`, rec array containing the
       coordinates of 264 ROIs in :term:`MNI` space.
+      If ``legacy_format`` is set to ``False``, this is a
+      :class:`pandas.DataFrame`.
     - 'description': :obj:`str`, description of the atlas.
 
 
@@ -750,8 +782,13 @@
     fdescr = _get_dataset_descr(dataset_name)
     package_directory = os.path.dirname(os.path.abspath(__file__))
     csv = os.path.join(package_directory, "data", "power_2011.csv")
-    params = dict(rois=np.recfromcsv(csv), description=fdescr)
-
+    params = dict(rois=pd.read_csv(csv), description=fdescr)
+    params['rois'] = params['rois'].rename(
+        columns={c: c.lower() for c in params['rois'].columns}
+    )
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        params['rois'] = params['rois'].to_records(index=False)
     return Bunch(**params)
 
 
@@ -1166,7 +1203,8 @@
     return Bunch(**params)
 
 
-def fetch_coords_dosenbach_2010(ordered_regions=True):
+@fill_doc
+def fetch_coords_dosenbach_2010(ordered_regions=True, legacy_format=True):
     """Load the Dosenbach et al. 160 ROIs. These ROIs cover
     much of the cerebral cortex and cerebellum and are assigned to 6
     networks.
@@ -1179,6 +1217,7 @@
     ordered_regions : :obj:`bool`, optional
         ROIs from same networks are grouped together and ordered with
         respect to their names and their locations (anterior to
        posterior). Default=True.
+    %(legacy_format)s
 
     Returns
     -------
@@ -1187,6 +1226,8 @@
 
     - 'rois': :class:`numpy.recarray`, rec array with the coordinates
       of the 160 ROIs in :term:`MNI` space.
+      If ``legacy_format`` is set to ``False``, this is a
+      :class:`pandas.DataFrame`.
     - 'labels': :class:`numpy.ndarray` of :obj:`str`, list of label
       names for the 160 ROIs.
     - 'networks': :class:`numpy.ndarray` of :obj:`str`, list of network
@@ -1202,10 +1243,10 @@
     fdescr = _get_dataset_descr(dataset_name)
     package_directory = os.path.dirname(os.path.abspath(__file__))
     csv = os.path.join(package_directory, "data", "dosenbach_2010.csv")
-    out_csv = np.recfromcsv(csv)
+    out_csv = pd.read_csv(csv)
 
     if ordered_regions:
-        out_csv = np.sort(out_csv, order=['network', 'name', 'y'])
+        out_csv = out_csv.sort_values(by=['network', 'name', 'y'])
 
     # We add the ROI number to its name, since names are not unique
     names = out_csv['name']
@@ -1216,10 +1257,15 @@
                  labels=labels,
                  networks=out_csv['network'], description=fdescr)
 
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        params['rois'] = params['rois'].to_records(index=False)
+
     return Bunch(**params)
 
 
-def fetch_coords_seitzman_2018(ordered_regions=True):
+@fill_doc
+def fetch_coords_seitzman_2018(ordered_regions=True, legacy_format=True):
     """Load the Seitzman et al. 300 ROIs. These ROIs cover cortical,
     subcortical and cerebellar regions and are
@@ -1238,6 +1284,7 @@
     ordered_regions : :obj:`bool`, optional
         ROIs from same networks are grouped together and ordered with
         respect to their locations (anterior to posterior). Default=True.
+    %(legacy_format)s
 
     Returns
     -------
@@ -1246,6 +1293,8 @@
 
     - 'rois': :class:`numpy.recarray`, rec array with the coordinates
      of the 300 ROIs in :term:`MNI` space.
+      If ``legacy_format`` is set to ``False``, this is a
+      :class:`pandas.DataFrame`.
     - 'radius': :class:`numpy.ndarray` of :obj:`int`, radius of each
       ROI in mm.
     - 'networks': :class:`numpy.ndarray` of :obj:`str`, names of the
@@ -1267,10 +1316,8 @@
     anatomical_file = os.path.join(package_directory, "data",
                                    "seitzman_2018_ROIs_anatomicalLabels.txt")
 
-    rois = np.recfromcsv(roi_file, delimiter=" ")
-    rois = recfunctions.rename_fields(rois, {"netname": "network",
-                                             "radiusmm": "radius"})
-    rois.network = rois.network.astype(str)
+    rois = pd.read_csv(roi_file, delimiter=" ")
+    rois = rois.rename(columns={"netName": "network", "radius(mm)": "radius"})
 
     # get integer regional labels and convert to text labels with mapping
     # from header line
@@ -1284,17 +1331,21 @@
     anatomical = np.genfromtxt(anatomical_file, skip_header=1)
     anatomical_names = np.array([region_mapping[a] for a in anatomical])
 
-    rois = recfunctions.merge_arrays((rois, anatomical_names),
-                                     asrecarray=True, flatten=True)
-    rois.dtype.names = rois.dtype.names[:-1] + ("region",)
+    rois = pd.concat([rois, pd.DataFrame(anatomical_names)], axis=1)
+    rois.columns = list(rois.columns[:-1]) + ["region"]
 
     if ordered_regions:
-        rois = np.sort(rois, order=['network', 'y'])
+        rois = rois.sort_values(by=['network', 'y'])
+
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        rois = rois.to_records()
 
     params = dict(rois=rois[['x', 'y', 'z']],
-                  radius=rois['radius'],
-                  networks=rois['network'].astype(str),
-                  regions=rois['region'], description=fdescr)
+                  radius=np.array(rois['radius']),
+                  networks=np.array(rois['network']),
+                  regions=np.array(rois['region']),
+                  description=fdescr)
 
     return Bunch(**params)
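The atlas.py changes above mostly swap recarray idioms for their DataFrame equivalents. A self-contained sketch of the two idioms side by side (made-up coordinates, not atlas data):

    import numpy as np
    import pandas as pd

    # A recarray with a fixed-width string field, as np.recfromcsv used to build ...
    rois_rec = np.rec.array(
        [(0, 5, 2, 'default'), (10, -5, 8, 'cingulo-opercular')],
        dtype=[('x', int), ('y', int), ('z', int), ('network', 'U20')])
    # ... and its DataFrame counterpart, as returned with legacy_format=False.
    rois_df = pd.DataFrame(rois_rec)

    # Column access is spelled identically for both containers.
    assert list(rois_rec['x']) == list(rois_df['x'])

    # Sorting is not, which is why the fetchers swap
    # np.sort(..., order=...) for DataFrame.sort_values(by=...).
    ordered_rec = np.sort(rois_rec, order=['network', 'y'])
    ordered_df = rois_df.sort_values(by=['network', 'y'])
    assert list(ordered_rec['network']) == list(ordered_df['network'])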
"The Functional Localizer is a simple and fast acquisition @@ -525,6 +533,7 @@ def fetch_localizer_contrasts(contrasts, n_subjects=None, get_tmaps=False, %(url)s %(resume)s %(verbose)s + %(legacy_format)s Returns ------- @@ -740,26 +749,26 @@ def _is_valid_path(path, index, verbose): # Load covariates file from numpy.lib.recfunctions import join_by participants_file = os.path.join(data_dir, participants_file) - csv_data = np.recfromcsv(participants_file, delimiter='\t') + csv_data = pd.read_csv(participants_file, delimiter='\t') behavioural_file = os.path.join(data_dir, behavioural_file) - csv_data2 = np.recfromcsv(behavioural_file, delimiter='\t') - csv_data = join_by( - "participant_id", csv_data, csv_data2, usemask=False, asrecarray=True) + csv_data2 = pd.read_csv(behavioural_file, delimiter='\t') + csv_data = csv_data.merge(csv_data2) subject_names = csv_data["participant_id"].tolist() subjects_indices = [] for name in subject_ids: - name = name.encode("utf8") if name not in subject_names: continue subjects_indices.append(subject_names.index(name)) - csv_data = csv_data[subjects_indices] - + csv_data = csv_data.iloc[subjects_indices] + if legacy_format: + warnings.warn(_LEGACY_FORMAT_MSG) + csv_data = csv_data.to_records(index=False) return Bunch(ext_vars=csv_data, description=fdescr, **files) @fill_doc def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None, - verbose=1): + verbose=1, legacy_format=True): """Fetch calculation task contrast maps from the localizer. Parameters @@ -770,6 +779,7 @@ def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None, %(data_dir)s %(url)s %(verbose)s + %(legacy_format)s Returns ------- @@ -793,13 +803,14 @@ def fetch_localizer_calculation_task(n_subjects=1, data_dir=None, url=None, n_subjects=n_subjects, get_tmaps=False, get_masks=False, get_anats=False, data_dir=data_dir, - url=url, resume=True, verbose=verbose) + url=url, resume=True, verbose=verbose, + legacy_format=legacy_format) return data @fill_doc def fetch_localizer_button_task(data_dir=None, url=None, - verbose=1): + verbose=1, legacy_format=True): """Fetch left vs right button press contrast maps from the localizer. Parameters @@ -807,6 +818,7 @@ def fetch_localizer_button_task(data_dir=None, url=None, %(data_dir)s %(url)s %(verbose)s + %(legacy_format)s Returns ------- @@ -833,7 +845,8 @@ def fetch_localizer_button_task(data_dir=None, url=None, n_subjects=[2], get_tmaps=True, get_masks=False, get_anats=True, data_dir=data_dir, - url=url, resume=True, verbose=verbose) + url=url, resume=True, verbose=verbose, + legacy_format=legacy_format) # Additional keys for backward compatibility data['tmap'] = data['tmaps'][0] data['anat'] = data['anats'][0] @@ -844,7 +857,8 @@ def fetch_localizer_button_task(data_dir=None, url=None, def fetch_abide_pcp(data_dir=None, n_subjects=None, pipeline='cpac', band_pass_filtering=False, global_signal_regression=False, derivatives=['func_preproc'], - quality_checked=True, url=None, verbose=1, **kwargs): + quality_checked=True, url=None, verbose=1, + legacy_format=True, **kwargs): """Fetch ABIDE dataset. Fetch the Autism Brain Imaging Data Exchange (ABIDE) dataset wrt criteria @@ -886,6 +900,7 @@ def fetch_abide_pcp(data_dir=None, n_subjects=None, pipeline='cpac', passed quality assessment for all raters. Default=True. %(url)s %(verbose)s + %(legacy_format)s kwargs : parameter list, optional Any extra keyword argument will be used to filter downloaded subjects according to the CSV phenotypic file. 
         Some examples of filters are
@@ -982,10 +997,10 @@
     # bytes (encode()) needed for python 2/3 compat with numpy
     pheno = '\n'.join(pheno).encode()
     pheno = BytesIO(pheno)
-    pheno = np.recfromcsv(pheno, comments='$', case_sensitive=True)
+    pheno = pd.read_csv(pheno, comment='$')
 
     # First, filter subjects with no filename
-    pheno = pheno[pheno['FILE_ID'] != b'no_filename']
+    pheno = pheno[pheno['FILE_ID'] != 'no_filename']
     # Apply user defined filters
     user_filter = _filter_columns(pheno, kwargs)
     pheno = pheno[user_filter]
@@ -996,11 +1011,15 @@
 
     # Get the files
     results = {}
-    file_ids = [file_id.decode() for file_id in pheno['FILE_ID']]
+    file_ids = pheno['FILE_ID'].tolist()
     if n_subjects is not None:
         file_ids = file_ids[:n_subjects]
         pheno = pheno[:n_subjects]
 
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        pheno = pheno.to_records(index=False)
+
     results['description'] = _get_dataset_descr(dataset_name)
     results['phenotypic'] = pheno
     for derivative in derivatives:
@@ -1667,7 +1686,7 @@
         out_file = in_file.replace('desc-confounds',
                                    'desc-reducedConfounds')
         if not os.path.isfile(out_file):
-            confounds = np.recfromcsv(in_file, delimiter='\t')
+            confounds = pd.read_csv(in_file, delimiter='\t').to_records()
             selected_confounds = confounds[keep_confounds]
             header = '\t'.join(selected_confounds.dtype.names)
             np.savetxt(out_file, np.array(selected_confounds.tolist()),
diff --git a/nilearn/datasets/struct.py b/nilearn/datasets/struct.py
index 49b13c0a5d..6fdf3a3f1c 100644
--- a/nilearn/datasets/struct.py
+++ b/nilearn/datasets/struct.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 import numpy as np
+import pandas as pd
 from scipy import ndimage
 from sklearn.utils import Bunch
 
@@ -30,6 +31,11 @@
                      "mni_icbm152_wm_tal_nlin_sym_09a_converted.nii.gz")
 FSAVERAGE5_PATH = os.path.join(_package_directory, "data", "fsaverage5")
 
+_LEGACY_FORMAT_MSG = (
+    "`legacy_format` will default to `False` in release 0.11. "
+    "Dataset fetchers will then return pandas dataframes by default "
+    "instead of recarrays."
+)
 
 # workaround for
 # https://github.com/nilearn/nilearn/pull/2738#issuecomment-869018842
@@ -551,7 +557,7 @@
 
 @fill_doc
 def fetch_oasis_vbm(n_subjects=None, dartel_version=True, data_dir=None,
-                    url=None, resume=True, verbose=1):
+                    url=None, resume=True, verbose=1, legacy_format=True):
     """Download and load Oasis "cross-sectional MRI" dataset (416 subjects).
 
     For more information, see :footcite:`OASISbrain`,
@@ -570,6 +576,7 @@
     %(url)s
     %(resume)s
     %(verbose)s
+    %(legacy_format)s
 
     Returns
     -------
@@ -735,18 +742,24 @@
     data_usage_agreement = files[-1]
 
     # Keep CSV information only for selected subjects
-    csv_data = np.recfromcsv(ext_vars_file)
+    csv_data = pd.read_csv(ext_vars_file)
     # Comparisons to recfromcsv data must be bytes.
     actual_subjects_ids = [("OAS1" + str.split(os.path.basename(x),
-                                                "OAS1")[1][:9]).encode()
+                                                "OAS1")[1][:9])
                            for x in gm_maps]
     subject_mask = np.asarray([subject_id in actual_subjects_ids
-                               for subject_id in csv_data['id']])
+                               for subject_id in csv_data['ID']])
     csv_data = csv_data[subject_mask]
-
+    csv_data = csv_data.rename(
+        columns={c: c.lower().replace("/", "") for c in csv_data.columns}
+    )
     fdescr = _get_dataset_descr(dataset_name)
 
+    if legacy_format:
+        warnings.warn(_LEGACY_FORMAT_MSG)
+        csv_data = csv_data.to_records(index=False)
+
     return Bunch(
         gray_matter_maps=gm_maps,
         white_matter_maps=wm_maps,
diff --git a/nilearn/datasets/tests/test_atlas.py b/nilearn/datasets/tests/test_atlas.py
index ed4ee35f53..2d509233e8 100644
--- a/nilearn/datasets/tests/test_atlas.py
+++ b/nilearn/datasets/tests/test_atlas.py
@@ -359,7 +359,7 @@ def test_fetch_atlas_destrieux_2009(tmp_path, request_mocker, lateralized):
 def test_fetch_atlas_msdl(tmp_path, request_mocker):
     labels = pd.DataFrame(
         {"x": [1.5, 1.2], "y": [1.5, 1.3],
-         "z": [1.5, 1.4], "name": ["Aud", "DMN"], "net_name": ["Aud", "DMN"]})
+         "z": [1.5, 1.4], "name": ["Aud", "DMN"], "net name": ["Aud", "DMN"]})
     root = Path("MSDL_rois")
     archive = {root / "msdl_rois_labels.csv": labels.to_csv(index=False),
                root / "msdl_rois.nii": "",
diff --git a/nilearn/datasets/tests/test_func.py b/nilearn/datasets/tests/test_func.py
index c6a44775ec..c46b807d0b 100644
--- a/nilearn/datasets/tests/test_func.py
+++ b/nilearn/datasets/tests/test_func.py
@@ -165,7 +165,8 @@ def test_fetch_localizer_contrasts(tmp_path, request_mocker, localizer_mocker):
         ['checkerboard'],
         n_subjects=2,
         data_dir=tmp_path,
-        verbose=1)
+        verbose=1,
+        legacy_format=True)
     assert not hasattr(dataset, 'anats')
     assert not hasattr(dataset, 'tmaps')
     assert not hasattr(dataset, 'masks')
@@ -174,16 +175,31 @@
     assert len(dataset.cmaps) == 2
     assert dataset.ext_vars.size == 2
 
+    dataset = func.fetch_localizer_contrasts(
+        ['checkerboard'],
+        n_subjects=2,
+        data_dir=tmp_path,
+        verbose=1,
+        legacy_format=False)
+    assert not hasattr(dataset, 'anats')
+    assert not hasattr(dataset, 'tmaps')
+    assert not hasattr(dataset, 'masks')
+    assert isinstance(dataset.cmaps[0], str)
+    assert isinstance(dataset.ext_vars, pd.DataFrame)
+    assert len(dataset.cmaps) == 2
+    assert len(dataset['ext_vars']) == 2
+
     # Multiple contrasts
     dataset = func.fetch_localizer_contrasts(
         ['checkerboard', 'horizontal checkerboard'],
         n_subjects=2,
         data_dir=tmp_path,
-        verbose=1)
-    assert isinstance(dataset.ext_vars, np.recarray)
+        verbose=1,
+        legacy_format=False)
+    assert isinstance(dataset.ext_vars, pd.DataFrame)
     assert isinstance(dataset.cmaps[0], str)
     assert len(dataset.cmaps) == 2 * 2  # two contrasts are fetched
-    assert dataset.ext_vars.size == 2
+    assert len(dataset['ext_vars']) == 2
 
     # all get_*=True
     dataset = func.fetch_localizer_contrasts(
@@ -193,13 +209,14 @@
         get_anats=True,
         get_masks=True,
         get_tmaps=True,
-        verbose=1)
-    assert isinstance(dataset.ext_vars, np.recarray)
+        verbose=1,
+        legacy_format=False)
+    assert isinstance(dataset.ext_vars, pd.DataFrame)
     assert isinstance(dataset.anats[0], str)
     assert isinstance(dataset.cmaps[0], str)
     assert isinstance(dataset.masks[0], str)
     assert isinstance(dataset.tmaps[0], str)
-    assert dataset.ext_vars.size == 1
+    assert len(dataset['ext_vars']) == 1
     assert len(dataset.anats) == 1
     assert len(dataset.cmaps) == 1
     assert len(dataset.masks) == 1
@@ -211,11 +228,13 @@
         ['checkerboard'],
         n_subjects=[2, 3, 5],
         data_dir=tmp_path,
-        verbose=1)
-    assert dataset2.ext_vars.size == 3
+        verbose=1,
+        legacy_format=False)
+    assert len(dataset2['ext_vars']) == 3
     assert len(dataset2.cmaps) == 3
-    assert ([row[0] for row in dataset2.ext_vars] ==
-            [b'S02', b'S03', b'S05'])
+    assert (list(dataset2['ext_vars']['participant_id'].values) == ['S02',
+                                                                    'S03',
+                                                                    'S05'])
 
 
 def test_fetch_localizer_calculation_task(tmp_path, request_mocker,
@@ -224,7 +243,19 @@
     dataset = func.fetch_localizer_calculation_task(
         n_subjects=2,
         data_dir=tmp_path,
-        verbose=1)
+        verbose=1,
+        legacy_format=False)
+    assert isinstance(dataset.ext_vars, pd.DataFrame)
+    assert isinstance(dataset.cmaps[0], str)
+    assert len(dataset['ext_vars']) == 2
+    assert len(dataset.cmaps) == 2
+    assert dataset.description != ''
+
+    dataset = func.fetch_localizer_calculation_task(
+        n_subjects=2,
+        data_dir=tmp_path,
+        verbose=1,
+        legacy_format=True)
     assert isinstance(dataset.ext_vars, np.recarray)
     assert isinstance(dataset.cmaps[0], str)
     assert dataset.ext_vars.size == 2
diff --git a/nilearn/datasets/tests/test_struct.py b/nilearn/datasets/tests/test_struct.py
index 5859b6e0e7..10147a2742 100644
--- a/nilearn/datasets/tests/test_struct.py
+++ b/nilearn/datasets/tests/test_struct.py
@@ -110,27 +110,37 @@
     return dict_to_archive(data)
 
 
-def test_fetch_oasis_vbm(tmp_path, request_mocker):
+@pytest.mark.parametrize('legacy_format', [True, False])
+def test_fetch_oasis_vbm(tmp_path, request_mocker, legacy_format):
     request_mocker.url_mapping["*archive_dartel.tgz*"] = _make_oasis_data()
     request_mocker.url_mapping["*archive.tgz*"] = _make_oasis_data(False)
 
     dataset = struct.fetch_oasis_vbm(
-        data_dir=str(tmp_path), verbose=0)
+        data_dir=str(tmp_path), verbose=0, legacy_format=legacy_format
+    )
     assert len(dataset.gray_matter_maps) == 403
     assert len(dataset.white_matter_maps) == 403
     assert isinstance(dataset.gray_matter_maps[0], str)
     assert isinstance(dataset.white_matter_maps[0], str)
-    assert isinstance(dataset.ext_vars, np.recarray)
+    if legacy_format:
+        assert isinstance(dataset.ext_vars, np.recarray)
+    else:
+        assert isinstance(dataset.ext_vars, pd.DataFrame)
     assert isinstance(dataset.data_usage_agreement, str)
    assert request_mocker.url_count == 1
 
-    dataset = struct.fetch_oasis_vbm(data_dir=str(tmp_path),
-                                     dartel_version=False, verbose=0)
+    dataset = struct.fetch_oasis_vbm(
+        data_dir=str(tmp_path), dartel_version=False, verbose=0,
+        legacy_format=legacy_format
+    )
     assert len(dataset.gray_matter_maps) == 415
     assert len(dataset.white_matter_maps) == 415
     assert isinstance(dataset.gray_matter_maps[0], str)
     assert isinstance(dataset.white_matter_maps[0], str)
-    assert isinstance(dataset.ext_vars, np.recarray)
+    if legacy_format:
+        assert isinstance(dataset.ext_vars, np.recarray)
+    else:
+        assert isinstance(dataset.ext_vars, pd.DataFrame)
     assert isinstance(dataset.data_usage_agreement, str)
     assert request_mocker.url_count == 2
     assert dataset.description != ''
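The updated tests assert on the returned types but not on the warning itself. A possible additional check, sketched here as a hypothetical test that is not part of the patch (`fetch_coords_power_2011` reads a CSV bundled with nilearn, so no request mocking is needed):

    import pytest
    from nilearn.datasets import atlas

    def test_legacy_format_warns():
        # warnings.warn without an explicit category emits a UserWarning,
        # so the _LEGACY_FORMAT_MSG text should be caught here.
        with pytest.warns(UserWarning, match="legacy_format"):
            atlas.fetch_coords_power_2011(legacy_format=True)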