Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,10 @@ def _list_datasets(api_call):
assert datasets_dict['oml:data']['@xmlns:oml'] == \
'http://openml.org/openml'

datasets = []
datasets = dict()
for dataset_ in datasets_dict['oml:data']['oml:dataset']:
dataset = {'did': int(dataset_['oml:did']),
did = int(dataset_['oml:did'])
dataset = {'did': did,
'name': dataset_['oml:name'],
'format': dataset_['oml:format'],
'status': dataset_['oml:status']}
Expand All @@ -173,9 +174,7 @@ def _list_datasets(api_call):
if abs(int(quality['#text']) - quality['#text']) < 0.0000001:
quality['#text'] = int(quality['#text'])
dataset[quality['@name']] = quality['#text']

datasets.append(dataset)
datasets.sort(key=lambda t: t['did'])
datasets[did] = dataset

return datasets

Expand Down
43 changes: 16 additions & 27 deletions tests/datasets/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,7 @@ def test_get_cached_dataset_arff_not_cached(self):
openml.datasets.functions._get_cached_dataset_arff,
3)

def test_list_datasets(self):
# We can only perform a smoke test here because we test on dynamic
# data from the internet...
datasets = openml.datasets.list_datasets()
# 1087 as the number of datasets on openml.org
self.assertGreaterEqual(len(datasets), 1087)
for dataset in datasets:
def _check_dataset(self, dataset):
self.assertEqual(type(dataset), dict)
self.assertGreaterEqual(len(dataset), 2)
self.assertIn('did', dataset)
Expand All @@ -87,34 +81,29 @@ def test_list_datasets(self):
self.assertIn(dataset['status'], ['in_preparation', 'active',
'deactivated'])

def test_list_datasets(self):
    # Only a smoke test is possible here: the result depends on live
    # data served by openml.org and changes over time.
    datasets = openml.datasets.list_datasets()
    # openml.org hosted at least 1087 datasets when this was written.
    self.assertGreaterEqual(len(datasets), 1087)
    for did, dataset in datasets.items():
        self._check_dataset(dataset)

def test_list_datasets_by_tag(self):
    """Smoke-test listing datasets filtered by the 'uci' tag.

    Only the result size and per-entry structure are checked, because
    the listing comes from the live openml.org server.
    """
    datasets = openml.datasets.list_datasets(tag='uci')
    self.assertGreaterEqual(len(datasets), 5)
    # list_datasets now returns a dict keyed by dataset id; validate
    # each entry with the shared helper instead of inline assertions.
    for did in datasets:
        self._check_dataset(datasets[did])

def test_list_datasets_paginate(self):
    """Page through the dataset listing and validate every entry.

    Requests successive pages with explicit offset/size; the server may
    return fewer entries than requested (e.g. on the last page) but
    never more.
    """
    size = 10
    max_offset = 100  # renamed from 'max' to avoid shadowing the builtin
    for offset in range(0, max_offset, size):
        datasets = openml.datasets.list_datasets(offset=offset, size=size)
        # A page may be short, but must never exceed the requested size.
        self.assertGreaterEqual(size, len(datasets))
        # list_datasets returns a dict keyed by dataset id.
        for did in datasets:
            self._check_dataset(datasets[did])

@unittest.skip('See https://github.com/openml/openml-python/issues/149')
def test_check_datasets_active(self):
Expand Down