Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def _get_cached_dataset_arff(dataset_id):
"cached" % dataset_id)


def list_datasets(offset=None, size=None, tag=None, status=None):
def list_datasets(offset=None, size=None, status=None, **kwargs):
"""Return a list of all dataset which are on OpenML.

Parameters
Expand All @@ -146,12 +146,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None):
The number of datasets to skip, starting from the first.
size : int, optional
The maximum number of datasets to show.
tag : str, optional
Only include datasets matching this tag.
status : str, optional
Should be {active, in_preparation, deactivated}. By
default active datasets are returned, but also datasets
from another status can be requested.
from another status can be requested.
kwargs : dict, optional
Legal filter operators (keys in the dict):
{tag, status, limit, offset, data_name, data_version, number_instances, number_features, number_classes, number_missing_values}.

Returns
-------
Expand All @@ -175,12 +176,13 @@ def list_datasets(offset=None, size=None, tag=None, status=None):
if size is not None:
api_call += "/limit/%d" % int(size)

if tag is not None:
api_call += "/tag/%s" % tag

if status is not None:
api_call += "/status/%s" %status

if kwargs is not None:
for filter, value in kwargs.items():
api_call += "/%s/%s" % (filter, value)

return _list_datasets(api_call)


Expand Down
37 changes: 31 additions & 6 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ def _check_dataset(self, dataset):
self.assertIsInstance(dataset['status'], six.string_types)
self.assertIn(dataset['status'], ['in_preparation', 'active',
'deactivated'])
def _check_datasets(self, datasets):
for did in datasets:
self._check_dataset(datasets[did])

def test_tag_untag_dataset(self):
tag = 'test_tag_%d' %random.randint(1, 1000000)
Expand All @@ -129,23 +132,45 @@ def test_list_datasets(self):
datasets = openml.datasets.list_datasets()
# 1087 as the number of datasets on openml.org
self.assertGreaterEqual(len(datasets), 100)
for did in datasets:
self._check_dataset(datasets[did])
self._check_datasets(datasets)

def test_list_datasets_by_tag(self):
    """Listing filtered by tag returns a sizeable, well-formed result."""
    tagged = openml.datasets.list_datasets(tag='study_14')
    self.assertGreaterEqual(len(tagged), 100)
    self._check_datasets(tagged)

def test_list_datasets_by_number_instances(self):
    """Filtering by an instance-count range yields matching datasets."""
    result = openml.datasets.list_datasets(number_instances="5..100")
    self.assertGreaterEqual(len(result), 4)
    self._check_datasets(result)

def test_list_datasets_by_number_features(self):
    """Filtering by a feature-count range yields matching datasets."""
    result = openml.datasets.list_datasets(number_features="50..100")
    self.assertGreaterEqual(len(result), 8)
    self._check_datasets(result)

def test_list_datasets_by_number_classes(self):
    """Filtering by an exact class count yields matching datasets."""
    result = openml.datasets.list_datasets(number_classes="5")
    self.assertGreaterEqual(len(result), 3)
    self._check_datasets(result)

def test_list_datasets_by_number_missing_values(self):
    """Filtering by a missing-value-count range yields matching datasets."""
    result = openml.datasets.list_datasets(number_missing_values="5..100")
    self.assertGreaterEqual(len(result), 5)
    self._check_datasets(result)

def test_list_datasets_combined_filters(self):
    """Several filters combined still produce at least one valid dataset."""
    result = openml.datasets.list_datasets(
        tag='study_14',
        number_instances="100..1000",
        number_missing_values="800..1000",
    )
    self.assertGreaterEqual(len(result), 1)
    self._check_datasets(result)

def test_list_datasets_paginate(self):
    """Page through the dataset listing via offset/size.

    Each page must contain at most ``page_size`` entries, and every
    returned dataset must pass the standard sanity checks.
    """
    page_size = 10
    # Renamed from `max`, which shadowed the builtin of the same name.
    max_offset = 100
    for offset in range(0, max_offset, page_size):
        datasets = openml.datasets.list_datasets(offset=offset, size=page_size)
        # The server may return fewer than page_size entries, never more.
        self.assertGreaterEqual(page_size, len(datasets))
        self._check_datasets(datasets)

def test_list_datasets_empty(self):
datasets = openml.datasets.list_datasets(tag='NoOneWouldUseThisTagAnyway')
Expand Down