From 7d056bfc33cf6e4f5bcc3cf165af64b4da30b23c Mon Sep 17 00:00:00 2001 From: janvanrijn Date: Fri, 2 Sep 2016 14:51:09 +0200 Subject: [PATCH] changed dataset list to dict --- openml/datasets/functions.py | 9 +++---- tests/datasets/test_datasets.py | 43 ++++++++++++--------------------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 30178cd48..856837d30 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -160,9 +160,10 @@ def _list_datasets(api_call): assert datasets_dict['oml:data']['@xmlns:oml'] == \ 'http://openml.org/openml' - datasets = [] + datasets = dict() for dataset_ in datasets_dict['oml:data']['oml:dataset']: - dataset = {'did': int(dataset_['oml:did']), + did = int(dataset_['oml:did']) + dataset = {'did': did, 'name': dataset_['oml:name'], 'format': dataset_['oml:format'], 'status': dataset_['oml:status']} @@ -173,9 +174,7 @@ def _list_datasets(api_call): if abs(int(quality['#text']) - quality['#text']) < 0.0000001: quality['#text'] = int(quality['#text']) dataset[quality['@name']] = quality['#text'] - - datasets.append(dataset) - datasets.sort(key=lambda t: t['did']) + datasets[did] = dataset return datasets diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 4293e33fa..e4ba693c8 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -71,13 +71,7 @@ def test_get_cached_dataset_arff_not_cached(self): openml.datasets.functions._get_cached_dataset_arff, 3) - def test_list_datasets(self): - # We can only perform a smoke test here because we test on dynamic - # data from the internet... - datasets = openml.datasets.list_datasets() - # 1087 as the number of datasets on openml.org - self.assertGreaterEqual(len(datasets), 1087) - for dataset in datasets: + def _check_dataset(self, dataset): self.assertEqual(type(dataset), dict) self.assertGreaterEqual(len(dataset), 2) self.assertIn('did', dataset) @@ -87,34 +81,29 @@ def test_list_datasets(self): self.assertIn(dataset['status'], ['in_preparation', 'active', 'deactivated']) + def test_list_datasets(self): + # We can only perform a smoke test here because we test on dynamic + # data from the internet... + datasets = openml.datasets.list_datasets() + # 1087 as the number of datasets on openml.org + self.assertGreaterEqual(len(datasets), 1087) + for did in datasets: + self._check_dataset(datasets[did]) + def test_list_datasets_by_tag(self): datasets = openml.datasets.list_datasets(tag='uci') self.assertGreaterEqual(len(datasets), 5) - for dataset in datasets: - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn('did', dataset) - self.assertIsInstance(dataset['did'], int) - self.assertIn('status', dataset) - self.assertTrue(is_string(dataset['status'])) - self.assertIn(dataset['status'], ['in_preparation', 'active', - 'deactivated']) + for did in datasets: + self._check_dataset(datasets[did]) def test_list_datasets_paginate(self): size = 10 max = 100 for i in range(0, max, size): - data = openml.datasets.list_datasets(offset=i, size=size) - self.assertGreaterEqual(size, len(data)) - for dataset in data: - self.assertEqual(type(dataset), dict) - self.assertGreaterEqual(len(dataset), 2) - self.assertIn('did', dataset) - self.assertIsInstance(dataset['did'], int) - self.assertIn('status', dataset) - self.assertTrue(is_string(dataset['status'])) - self.assertIn(dataset['status'], ['in_preparation', - 'active', 'deactivated']) + datasets = openml.datasets.list_datasets(offset=i, size=size) + self.assertGreaterEqual(size, len(datasets)) + for did in datasets: + self._check_dataset(datasets[did]) @unittest.skip('See https://github.com/openml/openml-python/issues/149') def test_check_datasets_active(self):