diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index b9a1079be..f2212145d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -137,7 +137,7 @@ def _get_cached_dataset_arff(dataset_id): "cached" % dataset_id) -def list_datasets(offset=None, size=None, tag=None, status=None): +def list_datasets(offset=None, size=None, status=None, **kwargs): """Return a list of all dataset which are on OpenML. Parameters @@ -146,12 +146,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None): The number of datasets to skip, starting from the first. size : int, optional The maximum number of datasets to show. - tag : str, optional - Only include datasets matching this tag. status : str, optional Should be {active, in_preparation, deactivated}. By default active datasets are returned, but also datasets - from another status can be requested. + from another status can be requested. + kwargs : dict, optional + Legal filter operators (keys in the dict); `status`, `limit` and `offset` are covered by the named parameters and must not be passed here: + {tag, data_name, data_version, number_instances, number_features, number_classes, number_missing_values}. 
Returns ------- @@ -175,12 +176,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None): if size is not None: api_call += "/limit/%d" % int(size) - if tag is not None: - api_call += "/tag/%s" % tag - if status is not None: api_call += "/status/%s" %status + if kwargs is not None: + for filter, value in kwargs.items(): + api_call += "/%s/%s" % (filter, value) + return _list_datasets(api_call) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 0f55b503d..85986fdf1 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -115,6 +115,9 @@ def _check_dataset(self, dataset): self.assertIsInstance(dataset['status'], six.string_types) self.assertIn(dataset['status'], ['in_preparation', 'active', 'deactivated']) + def _check_datasets(self, datasets): + for did in datasets: + self._check_dataset(datasets[did]) def test_tag_untag_dataset(self): tag = 'test_tag_%d' %random.randint(1, 1000000) @@ -129,14 +132,37 @@ def test_list_datasets(self): datasets = openml.datasets.list_datasets() # 1087 as the number of datasets on openml.org self.assertGreaterEqual(len(datasets), 100) - for did in datasets: - self._check_dataset(datasets[did]) + self._check_datasets(datasets) def test_list_datasets_by_tag(self): datasets = openml.datasets.list_datasets(tag='study_14') self.assertGreaterEqual(len(datasets), 100) - for did in datasets: - self._check_dataset(datasets[did]) + self._check_datasets(datasets) + + def test_list_datasets_by_number_instances(self): + datasets = openml.datasets.list_datasets(number_instances="5..100") + self.assertGreaterEqual(len(datasets), 4) + self._check_datasets(datasets) + + def test_list_datasets_by_number_features(self): + datasets = openml.datasets.list_datasets(number_features="50..100") + self.assertGreaterEqual(len(datasets), 8) + self._check_datasets(datasets) + + def test_list_datasets_by_number_classes(self): + 
datasets = openml.datasets.list_datasets(number_classes="5") + self.assertGreaterEqual(len(datasets), 3) + self._check_datasets(datasets) + + def test_list_datasets_by_number_missing_values(self): + datasets = openml.datasets.list_datasets(number_missing_values="5..100") + self.assertGreaterEqual(len(datasets), 5) + self._check_datasets(datasets) + + def test_list_datasets_combined_filters(self): + datasets = openml.datasets.list_datasets(tag='study_14', number_instances="100..1000", number_missing_values="800..1000") + self.assertGreaterEqual(len(datasets), 1) + self._check_datasets(datasets) def test_list_datasets_paginate(self): size = 10 @@ -144,8 +170,7 @@ def test_list_datasets_paginate(self): for i in range(0, max, size): datasets = openml.datasets.list_datasets(offset=i, size=size) self.assertGreaterEqual(size, len(datasets)) - for did in datasets: - self._check_dataset(datasets[did]) + self._check_datasets(datasets) def test_list_datasets_empty(self): datasets = openml.datasets.list_datasets(tag='NoOneWouldUseThisTagAnyway')