From fda4b905668c31196f2123cc9dc4d03c56b39880 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 13:08:36 +0100 Subject: [PATCH 01/43] Add unit test for list of lists dataset upload --- examples/create_upload_tutorial.py | 4 +- openml/datasets/__init__.py | 13 +++- openml/datasets/dataset.py | 4 +- openml/datasets/functions.py | 2 +- tests/test_datasets/test_dataset_functions.py | 62 +++++++++++++++++++ 5 files changed, 77 insertions(+), 8 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 962c9b98e..6e757ecff 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -30,8 +30,8 @@ data = np.concatenate((X, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) attributes = [ - (attribute_name, 'REAL') for attribute_name in attribute_names -] + [('class', 'REAL')] + (attribute_name, 'REAL') for attribute_name in attribute_names + ] + [('class', targets)] ############################################################################ # Create the dataset object diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index d7b82cc6d..e4c81d203 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,7 +1,14 @@ from .functions import (list_datasets, check_datasets_active, - get_datasets, get_dataset) + get_datasets, get_dataset, create_dataset) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature -__all__ = ['check_datasets_active', 'get_dataset', 'get_datasets', - 'OpenMLDataset', 'OpenMLDataFeature', 'list_datasets'] +__all__ = [ + 'check_datasets_active', + 'get_dataset', + 'get_datasets', + 'OpenMLDataset', + 'OpenMLDataFeature', + 'list_datasets', + 'create_dataset', +] \ No newline at end of file diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index fe05fa29f..a1ce10328 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -30,7 +30,7 @@ class OpenMLDataset(object): Name of the dataset. description : str Description of the dataset. - format : str + format : str, optional Format of the dataset. Only 'arff' for now. dataset_id : int, optional Id autogenerated by the server. @@ -86,7 +86,7 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, format, dataset_id=None, + def __init__(self, name, description, format='arff', dataset_id=None, version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index e916246cf..3df496da5 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -609,7 +609,7 @@ def _create_dataset_from_description(description, features, qualities, arff_file Parameters ---------- description : dict - Description of a dataset in xmlish dict. + Description of a dataset in xml dict. arff_file : string Path of dataset arff file. diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index db6025b1a..900f156b8 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -342,13 +342,16 @@ def test_upload_dataset_with_url(self): dataset.publish() self.assertIsInstance(dataset.dataset_id, int) + def test_create_dataset_numpy(self): + data = np.array([[1, 2, 3], [1.2, 2.5, 3.8], [2, 5, 8], [0, 1, 0]]).T attributes = [('col_{}'.format(i), 'REAL') for i in range(data.shape[1])] + name = 'NumPy_testing_dataset' description = 'Synthetic dataset created from a NumPy array' creator = 'OpenML tester' @@ -379,3 +382,62 @@ def test_create_dataset_numpy(self): paper_url=paper_url ) dataset.publish() + + + def test_create_dataset_list(self): + + data = [ + ['a', 'sunny', 85.0, 85.0, 'FALSE', 'no'], + ['b', 'sunny', 80.0, 90.0, 'TRUE', 'no'], + ['c', 'overcast', 83.0, 86.0, 'FALSE', 'yes'], + ['d', 'rainy', 70.0, 96.0, 'FALSE', 'yes'], + ['e', 'rainy', 68.0, 80.0, 'FALSE', 'yes'], + ['f', 'rainy', 65.0, 70.0, 'TRUE', 'no'], + ['g', 'overcast', 64.0, 65.0, 'TRUE', 'yes'], + ['h', 'sunny', 72.0, 95.0, 'FALSE', 'no'], + ['i', 'sunny', 69.0, 70.0, 'FALSE', 'yes'], + ['j', 'rainy', 75.0, 80.0, 'FALSE', 'yes'], + ['k', 'sunny', 75.0, 70.0, 'TRUE', 'yes'], + ['l', 'overcast', 72.0, 90.0, 'TRUE', 'yes'], + ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'], + ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'], + ] + column_names = [ + ('rnd_str', 'STRING'), + ('outlook', ['sunny', 'overcast', 'rainy']), + ('temperature', 'REAL'), + ('humidity', 'REAL'), + ('windy', ['TRUE', 'FALSE']), + ('play', ['yes', 'no']), + ] + + name = "Wind dataset" + description = 'Testing dataset upload when the data is a list of lists' + creator = 'OpenML test' + collection_date = '21-09-2018' + language = 'English' + licence = 'MIT' + default_target_attribute = 'play' + citation = 'None' + original_data_url = 'http://openml.github.io/openml-python' + paper_url = 'http://openml.github.io/openml-python' + + dataset = openml.datasets.functions.create_dataset( + name=name, + description=description, + creator=creator, + contributor=None, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=None, + ignore_attribute=None, + citation=citation, + attributes=column_names, + data=data, + version_label='test', + original_data_url=original_data_url, + paper_url=paper_url + ) + dataset.publish() From 2c7fd304c340ab0aa80bf6679c2acc0c4546f1be Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 14:04:46 +0100 Subject: [PATCH 02/43] Fixing xml pattern typo --- tests/test_datasets/test_dataset_functions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 900f156b8..8de4f477b 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -383,7 +383,6 @@ def test_create_dataset_numpy(self): ) dataset.publish() - def test_create_dataset_list(self): data = [ @@ -411,7 +410,7 @@ def test_create_dataset_list(self): ('play', ['yes', 'no']), ] - name = "Wind dataset" + name = "Wind_dataset_test" description = 'Testing dataset upload when the data is a list of lists' creator = 'OpenML test' collection_date = '21-09-2018' From ed33768b2dc3bda42e8e75182336e11eb47e1fef Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 15:05:11 +0100 Subject: [PATCH 03/43] Fix pep8 no newline at the end of file --- openml/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index e4c81d203..776cb83e3 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -11,4 +11,4 @@ 'OpenMLDataFeature', 'list_datasets', 'create_dataset', -] \ No newline at end of file +] From a4ebfa7d69257a1773f89fc7f7434eb9de552ae0 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 15:19:43 +0100 Subject: [PATCH 04/43] Remove format from definitions --- openml/datasets/dataset.py | 10 ++++------ openml/datasets/functions.py | 6 ++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index a1ce10328..62819570c 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -30,8 +30,6 @@ class OpenMLDataset(object): Name of the dataset. description : str Description of the dataset. - format : str, optional - Format of the dataset. Only 'arff' for now. dataset_id : int, optional Id autogenerated by the server. version : int, optional @@ -86,15 +84,15 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, format='arff', dataset_id=None, + def __init__(self, name, description, dataset_id=None, version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, row_id_attribute=None, ignore_attribute=None, version_label=None, citation=None, tag=None, visibility=None, original_data_url=None, paper_url=None, update_comment=None, - md5_checksum=None, data_file=None, features=None, qualities=None, - dataset=None): + md5_checksum=None, data_file=None, features=None, + qualities=None, dataset=None): # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API @@ -102,7 +100,7 @@ def __init__(self, name, description, format='arff', dataset_id=None, self.name = name self.version = int(version) if version is not None else None self.description = description - self.format = format + self.format = 'arff' self.creator = creator self.contributor = contributor self.collection_date = collection_date diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3df496da5..6f228de4a 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -355,7 +355,7 @@ def get_dataset(dataset_id): def create_dataset(name, description, creator, contributor, collection_date, language, licence, attributes, data, default_target_attribute, - row_id_attribute, ignore_attribute, citation, format="arff", + row_id_attribute, ignore_attribute, citation, original_data_url=None, paper_url=None, update_comment=None, version_label=None): """Create a dataset. @@ -396,8 +396,6 @@ def create_dataset(name, description, creator, contributor, collection_date, Attributes that should be excluded in modelling, such as identifiers and indexes. citation : str Reference(s) that should be cited when building on this data. - format : str, optional - Format of the dataset. Only 'arff' for now. version_label : str, optional Version label provided by user, can be a date, hash, or some other type of id. original_data_url : str, optional @@ -428,7 +426,7 @@ def create_dataset(name, description, creator, contributor, collection_date, raise ValueError("The arguments you have provided \ do not construct a valid arff file") - return OpenMLDataset(name, description, format, creator=creator, + return OpenMLDataset(name, description, creator=creator, contributor=contributor, collection_date=collection_date, language=language, licence=licence, default_target_attribute=default_target_attribute, row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, citation=citation, From ebd7113766e64941a93905094bdffd7fc8383a95 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 17:09:58 +0100 Subject: [PATCH 05/43] Restoring format in dataset --- openml/datasets/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 62819570c..b75bbfc59 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -84,7 +84,7 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, dataset_id=None, + def __init__(self, name, description, dataset_id=None, format='arff', version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, @@ -100,7 +100,7 @@ def __init__(self, name, description, dataset_id=None, self.name = name self.version = int(version) if version is not None else None self.description = description - self.format = 'arff' + self.format = format self.creator = creator self.contributor = contributor self.collection_date = collection_date From 5d6053eafbfca7477353e9d6eef7560f7e5dd68d Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 17:27:50 +0100 Subject: [PATCH 06/43] Fixing a couple of unused imports and fixings bugs with create_dataset call --- tests/test_datasets/test_dataset_functions.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 8de4f477b..cd6311044 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -295,7 +295,7 @@ def test__get_dataset_qualities(self): def test_deletion_of_cache_dir(self): # Simple removal - did_cache_dir = openml.utils._create_cache_directory_for_id( + did_cache_dir = _create_cache_directory_for_id( DATASETS_CACHE_DIR_NAME, 1, ) self.assertTrue(os.path.exists(did_cache_dir)) @@ -362,7 +362,7 @@ def test_create_dataset_numpy(self): citation = 'None' original_data_url = 'http://openml.github.io/openml-python' paper_url = 'http://openml.github.io/openml-python' - dataset = openml.datasets.functions.create_dataset( + dataset = create_dataset( name=name, description=description, creator=creator, @@ -376,7 +376,6 @@ def test_create_dataset_numpy(self): citation=citation, attributes=attributes, data=data, - format='arff', version_label='test', original_data_url=original_data_url, paper_url=paper_url @@ -421,7 +420,7 @@ def test_create_dataset_list(self): original_data_url = 'http://openml.github.io/openml-python' paper_url = 'http://openml.github.io/openml-python' - dataset = openml.datasets.functions.create_dataset( + dataset = create_dataset( name=name, description=description, creator=creator, From fbc1f6be8bd8371ecd0eea3503ab361953c161a5 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 17:54:05 +0100 Subject: [PATCH 07/43] Adapting unit tests to changes --- openml/datasets/functions.py | 5 +++-- tests/test_datasets/test_dataset_functions.py | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 6f228de4a..c56a58b35 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -619,7 +619,6 @@ def _create_dataset_from_description(description, features, qualities, arff_file dataset = OpenMLDataset( description["oml:name"], description.get("oml:description"), - description["oml:format"], description["oml:id"], description["oml:version"], description.get("oml:creator"), @@ -642,5 +641,7 @@ def _create_dataset_from_description(description, features, qualities, arff_file description.get("oml:md5_checksum"), data_file=arff_file, features=features, - qualities=qualities) + qualities=qualities, + format=description["oml:format"], + ) return dataset diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index cd6311044..57e4f800f 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -317,12 +317,19 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): self.assertEqual(len(os.listdir(datasets_cache_dir)), 0) def test_publish_dataset(self): + openml.datasets.get_dataset(3) file_path = os.path.join(openml.config.get_cache_directory(), "datasets", "3", "dataset.arff") dataset = OpenMLDataset( - "anneal", "test", "ARFF", - version=1, licence="public", default_target_attribute="class", data_file=file_path) + "anneal", + "test", + format="ARFF", + version=1, + licence="public", + default_target_attribute="class", + data_file=file_path, + ) dataset.publish() self.assertIsInstance(dataset.dataset_id, int) @@ -335,10 +342,14 @@ def test__retrieve_class_labels(self): self.assertEqual(labels, ['C', 'H', 'G']) def test_upload_dataset_with_url(self): + dataset = OpenMLDataset( - "UploadTestWithURL", "test", "ARFF", + "UploadTestWithURL", + "test", + format="ARFF", version=1, - url="https://www.openml.org/data/download/61/dataset_61_iris.arff") + url="https://www.openml.org/data/download/61/dataset_61_iris.arff", + ) dataset.publish() self.assertIsInstance(dataset.dataset_id, int) From 6a3ffb818afbeca3927b2d6d6e17706f46105b61 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 18:14:44 +0100 Subject: [PATCH 08/43] Fixing failing unit tests --- openml/datasets/dataset.py | 2 +- openml/datasets/functions.py | 42 ++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index b75bbfc59..d4ebe8549 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -84,7 +84,7 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, dataset_id=None, format='arff', + def __init__(self, name, description, format='arff', dataset_id=None, version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index c56a58b35..6cc321fa2 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -619,29 +619,29 @@ def _create_dataset_from_description(description, features, qualities, arff_file dataset = OpenMLDataset( description["oml:name"], description.get("oml:description"), - description["oml:id"], - description["oml:version"], - description.get("oml:creator"), - description.get("oml:contributor"), - description.get("oml:collection_date"), - description.get("oml:upload_date"), - description.get("oml:language"), - description.get("oml:licence"), - description["oml:url"], - description.get("oml:default_target_attribute"), - description.get("oml:row_id_attribute"), - description.get("oml:ignore_attribute"), - description.get("oml:version_label"), - description.get("oml:citation"), - description.get("oml:tag"), - description.get("oml:visibility"), - description.get("oml:original_data_url"), - description.get("oml:paper_url"), - description.get("oml:update_comment"), - description.get("oml:md5_checksum"), + format=description["oml:format"], + dataset_id=description["oml:id"], + version=description["oml:version"], + creator=description.get("oml:creator"), + contributor=description.get("oml:contributor"), + collection_date=description.get("oml:collection_date"), + upload_date=description.get("oml:upload_date"), + language=description.get("oml:language"), + license=description.get("oml:licence"), + url=description["oml:url"], + default_target_attribute=description.get("oml:default_target_attribute"), + row_id_attribute=description.get("oml:row_id_attribute"), + ignore_attribute=description.get("oml:ignore_attribute"), + version_label=description.get("oml:version_label"), + citation=description.get("oml:citation"), + tag=description.get("oml:tag"), + visibility=description.get("oml:visibility"), + original_data_url=description.get("oml:original_data_url"), + paper_url=description.get("oml:paper_url"), + update_comment=description.get("oml:update_comment"), + md5_checksum=description.get("oml:md5_checksum"), data_file=arff_file, features=features, qualities=qualities, - format=description["oml:format"], ) return dataset From 7dc9355c604f5f4a498130c578c9892bc41905cf Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 18:40:20 +0100 Subject: [PATCH 09/43] fixing typo --- openml/datasets/functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 6cc321fa2..f30d93760 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -627,7 +627,7 @@ def _create_dataset_from_description(description, features, qualities, arff_file collection_date=description.get("oml:collection_date"), upload_date=description.get("oml:upload_date"), language=description.get("oml:language"), - license=description.get("oml:licence"), + licence=description.get("oml:licence"), url=description["oml:url"], default_target_attribute=description.get("oml:default_target_attribute"), row_id_attribute=description.get("oml:row_id_attribute"), From 7b0fdde0c4a3bea7ae9ddc8f759f556acb34439b Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 19:14:34 +0100 Subject: [PATCH 10/43] Enforce pep8 style guide, fix doc tutorial trying to invoke create_dataset with format attribute --- examples/create_upload_tutorial.py | 2 -- openml/datasets/functions.py | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 6e757ecff..762b9f5d3 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -73,8 +73,6 @@ # Attributes of the data attributes=attributes, data=data, - # Format of the dataset. Only 'arff' for now. - format='arff', # A version label which is provided by the user. version_label='test', original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)', diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index f30d93760..d7c7a8fd0 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -629,7 +629,8 @@ def _create_dataset_from_description(description, features, qualities, arff_file language=description.get("oml:language"), licence=description.get("oml:licence"), url=description["oml:url"], - default_target_attribute=description.get("oml:default_target_attribute"), + default_target_attribute= + description.get("oml:default_target_attribute"), row_id_attribute=description.get("oml:row_id_attribute"), ignore_attribute=description.get("oml:ignore_attribute"), version_label=description.get("oml:version_label"), From 2d7b75cece147ee324e29265b7da892598a59bbf Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 19:37:22 +0100 Subject: [PATCH 11/43] Workaround for pep8 style guide --- openml/datasets/functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index d7c7a8fd0..c910ccf97 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -629,8 +629,8 @@ def _create_dataset_from_description(description, features, qualities, arff_file language=description.get("oml:language"), licence=description.get("oml:licence"), url=description["oml:url"], - default_target_attribute= - description.get("oml:default_target_attribute"), + default_target_attribute=description. + get("oml:default_target_attribute"), row_id_attribute=description.get("oml:row_id_attribute"), ignore_attribute=description.get("oml:ignore_attribute"), version_label=description.get("oml:version_label"), From 2919dd69b1020004e16be9c1cd37c0735006475e Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 21 Sep 2018 19:51:58 +0100 Subject: [PATCH 12/43] fix long time typo --- openml/datasets/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index d4ebe8549..de9c80c2c 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -126,7 +126,7 @@ def __init__(self, name, description, format='arff', dataset_id=None, self.original_data_url = original_data_url self.paper_url = paper_url self.update_comment = update_comment - self.md5_cheksum = md5_checksum + self.md5_checksum = md5_checksum self.data_file = data_file self.features = None self.qualities = None From 1c4faff68b6fc1013ce9c20a8c8437fd654be20e Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Sat, 22 Sep 2018 11:19:57 +0100 Subject: [PATCH 13/43] update pep8 failing statement and bug fix for dataset upload tutorial --- examples/create_upload_tutorial.py | 2 +- openml/datasets/functions.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 762b9f5d3..16dde6125 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -31,7 +31,7 @@ attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names - ] + [('class', targets)] + ] + [('class', list(targets))] ############################################################################ # Create the dataset object diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index c910ccf97..b7d3faeb4 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -629,8 +629,9 @@ def _create_dataset_from_description(description, features, qualities, arff_file language=description.get("oml:language"), licence=description.get("oml:licence"), url=description["oml:url"], - default_target_attribute=description. - get("oml:default_target_attribute"), + default_target_attribute=description.get( + "oml:default_target_attribute" + ), row_id_attribute=description.get("oml:row_id_attribute"), ignore_attribute=description.get("oml:ignore_attribute"), version_label=description.get("oml:version_label"), From 3602739eae5f49458d9b8f96cb61e641bb6db0aa Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 24 Sep 2018 12:27:18 +0200 Subject: [PATCH 14/43] fixed problem with arff file --- examples/create_upload_tutorial.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 16dde6125..d05969c92 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -9,7 +9,8 @@ import sklearn.datasets ############################################################################ -# For this example we will upload to the test server to not pollute the live server with countless copies of the same dataset. +# For this example we will upload to the test server to not pollute the live +# server with countless copies of the same dataset. openml.config.server = 'https://test.openml.org/api/v1/xml' ############################################################################ @@ -21,7 +22,6 @@ X = breast_cancer.data y = breast_cancer.target attribute_names = breast_cancer.feature_names -targets = breast_cancer.target_names description = breast_cancer.DESCR ############################################################################ @@ -31,7 +31,7 @@ attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names - ] + [('class', list(targets))] + ] + [('class', 'INTEGER')] ############################################################################ # Create the dataset object @@ -76,7 +76,8 @@ # A version label which is provided by the user. version_label='test', original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)', - paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1' + paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/' + 'Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1' ) ############################################################################ From 46cf1fa630c5ae5e705c6c838db872422364800e Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 24 Sep 2018 13:57:09 +0200 Subject: [PATCH 15/43] Fix pep8 line too long --- examples/create_upload_tutorial.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index d05969c92..ea85e0a3d 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -75,9 +75,11 @@ data=data, # A version label which is provided by the user. version_label='test', - original_data_url='https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)', - paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/1905/0000/' - 'Nuclear-feature-extraction-for-breast-tumor-diagnosis/10.1117/12.148698.short?SSO=1' + original_data_url='https://archive.ics.uci.edu/ml/datasets/' + 'Breast+Cancer+Wisconsin+(Diagnostic)', + paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' + '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' + '10.1117/12.148698.short?SSO=1' ) ############################################################################ From 693c3680c97bb8df45861199bde3fc74b946d75c Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Tue, 25 Sep 2018 13:09:13 +0200 Subject: [PATCH 16/43] Extending the unit test for dataset upload, changing upload tutorial --- examples/create_upload_tutorial.py | 19 ++++++++----- openml/datasets/dataset.py | 28 +++++++++++++++---- tests/test_datasets/test_dataset_functions.py | 4 ++- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index ea85e0a3d..b41fb1225 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -21,9 +21,10 @@ name = 'BreastCancer(scikit-learn)' X = breast_cancer.data y = breast_cancer.target +target_names = breast_cancer.target_names +y = np.array([target_names[i] for i in y]) attribute_names = breast_cancer.feature_names description = breast_cancer.DESCR - ############################################################################ # OpenML does not distinguish between the attributes and targets on the data level and stores all data in a # single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data). @@ -31,7 +32,7 @@ attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names - ] + [('class', 'INTEGER')] + ] + [('class', list(breast_cancer.target_names))] ############################################################################ # Create the dataset object @@ -75,11 +76,15 @@ data=data, # A version label which is provided by the user. version_label='test', - original_data_url='https://archive.ics.uci.edu/ml/datasets/' - 'Breast+Cancer+Wisconsin+(Diagnostic)', - paper_url='https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' - '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' - '10.1117/12.148698.short?SSO=1' + original_data_url=( + 'https://archive.ics.uci.edu/ml/datasets/' + 'Breast+Cancer+Wisconsin+(Diagnostic)' + ), + paper_url=( + 'https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' + '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' + '10.1117/12.148698.short?SSO=1' + ) ) ############################################################################ diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index de9c80c2c..f51e074e9 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -206,15 +206,31 @@ def remove_tag(self, tag): openml._api_calls._perform_api_call("/data/untag", data=data) def __eq__(self, other): + + server_fields = { + 'dataset_id':True, + 'version':True, + 'upload_date':True, + 'url':True, + 'dataset':True, + 'data_file': True, + } + if type(other) != OpenMLDataset: return False - elif ( - self.dataset_id == other.dataset_id - or (self.name == other._name and self.version == other._version) - ): - return True else: - return False + for field in self.__dict__: + if field not in server_fields: + if field in other.__dict__: + if self.__dict__[field] != other.__dict__[field]: + return False + else: + return False + return True + + def __ne__(self, other): + """Only needed for python 2, unnecessary in Python 3""" + return not self.__eq__(other) def _get_arff(self, format): """Read ARFF file and return decoded arff. diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 57e4f800f..e368ed3fe 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -449,4 +449,6 @@ def test_create_dataset_list(self): original_data_url=original_data_url, paper_url=paper_url ) - dataset.publish() + uploaded_did = dataset.publish() + uploaded_dataset = openml.datasets.get_dataset(uploaded_did) + self.assertTrue(dataset == uploaded_dataset) From e29cf4d345da4f065f49feb1c4624c671050c75c Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Thu, 27 Sep 2018 14:04:54 +0200 Subject: [PATCH 17/43] Workaround for the dataset upload unit test --- openml/datasets/functions.py | 21 +++++++++++++++++++ tests/test_datasets/test_dataset_functions.py | 10 ++++++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index b7d3faeb4..7618d81bd 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -647,3 +647,24 @@ def _create_dataset_from_description(description, features, qualities, arff_file qualities=qualities, ) return dataset + + +def _get_online_dataset_arff(did): + """Download the arff file for a given dataset id + from the OpenML website. + + Parameters + ---------- + did : int + A dataset id. + + Returns + ------- + str + A string representation of an arff file. + """ + dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) + # use the url from the dataset description and return the arff string + return openml._api_calls._read_url( + xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'] + ) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index e368ed3fe..1188242e8 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -31,7 +31,8 @@ _get_dataset_arff, _get_dataset_features, _get_dataset_qualities, - DATASETS_CACHE_DIR_NAME) + DATASETS_CACHE_DIR_NAME, + _get_online_dataset_arff) class TestOpenMLDataset(TestBase): @@ -450,5 +451,8 @@ def test_create_dataset_list(self): paper_url=paper_url ) uploaded_did = dataset.publish() - uploaded_dataset = openml.datasets.get_dataset(uploaded_did) - self.assertTrue(dataset == uploaded_dataset) + self.assertEqual( + _get_online_dataset_arff(uploaded_did), + dataset._dataset, + "Uploaded arff does not match original one" + ) From f0d8200cf30b9299c6dd01d2c5a6b7e1dcac986d Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Fri, 28 Sep 2018 11:51:32 +0200 Subject: [PATCH 18/43] Adding example with weather dataset into the dataset upload tutorial --- examples/create_upload_tutorial.py | 99 +++++++++++++++++++++++++++--- openml/datasets/functions.py | 2 +- 2 files changed, 93 insertions(+), 8 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index b41fb1225..300d7c287 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -7,19 +7,29 @@ import numpy as np import openml import sklearn.datasets +from openml.datasets.functions import create_dataset ############################################################################ -# For this example we will upload to the test server to not pollute the live +# For this tutorial we will upload to the test server to not pollute the live # server with countless copies of the same dataset. openml.config.server = 'https://test.openml.org/api/v1/xml' ############################################################################ -# Prepare the data -# ^^^^^^^^^^^^^^^^ +# The dataset that you upload to OpenML can be: +# +# * A numpy array. +# * A list of lists. + +############################################################################ +# Dataset is a numpy array +# ^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Prepare dataset +# =============== # Load an example dataset from scikit-learn which we will upload to OpenML.org via the API. breast_cancer = sklearn.datasets.load_breast_cancer() name = 'BreastCancer(scikit-learn)' -X = breast_cancer.data +x = breast_cancer.data y = breast_cancer.target target_names = breast_cancer.target_names y = np.array([target_names[i] for i in y]) @@ -28,7 +38,8 @@ ############################################################################ # OpenML does not distinguish between the attributes and targets on the data level and stores all data in a # single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data). -data = np.concatenate((X, y.reshape((-1, 1))), axis=1) + +data = np.concatenate((x, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names @@ -36,11 +47,11 @@ ############################################################################ # Create the dataset object -# ^^^^^^^^^^^^^^^^^^^^^^^^^ +# ========================= # The definition of all fields can be found in the XSD files describing the expected format: # # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd -dataset = openml.datasets.functions.create_dataset( +dataset = create_dataset( # The name of the dataset (needs to be unique). # Must not be longer than 128 characters and only contain # a-z, A-Z, 0-9 and the following special characters: _\-\.(), @@ -93,3 +104,77 @@ print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id)) except openml.exceptions.PyOpenMLError as err: print("OpenML: {0}".format(err)) + +############################################################################ +# Dataset is a list of lists +# ^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Weather dataset: +# http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html +data = [ + ['sunny', 85, 85, 'FALSE', 'no'], + ['sunny', 80, 90, 'TRUE', 'no'], + ['overcast', 83, 86, 'FALSE', 'yes'], + ['rainy', 70, 96, 'FALSE', 'yes'], + ['rainy', 68, 80, 'FALSE', 'yes'], + ['rainy', 65, 70, 'TRUE', 'no'], + ['overcast', 64, 65, 'TRUE', 'yes'], + ['sunny', 72, 95, 'FALSE', 'no'], + ['sunny', 69, 70, 'FALSE', 'yes'], + ['rainy', 75, 80, 'FALSE', 'yes'], + ['sunny', 75, 70, 'TRUE', 'yes'], + ['overcast', 72, 90, 'TRUE', 'yes'], + ['overcast', 81, 75, 'FALSE', 'yes'], + ['rainy', 71, 91, 'TRUE', 'no'], +] + +column_names = [ + ('outlook', ['sunny', 'overcast', 'rainy']), + ('temperature', 'REAL'), + ('humidity', 'REAL'), + ('windy', ['TRUE', 'FALSE']), + ('play', ['yes', 'no']), +] + +name = "Wind" +description = ( + 'The weather problem is a tiny dataset that we will use repeatedly' + ' to illustrate machine learning methods. Entirely fictitious, it ' + 'supposedly concerns the conditions that are suitable for playing ' + 'some unspecified game. In general, instances in a dataset are ' + 'characterized by the values of features, or attributes, that measure ' + 'different aspects of the instance. In this case there are four ' + 'attributes: outlook, temperature, humidity, and windy. ' + 'The outcome is whether to play or not.' +) +collection_date = '01-01-2011' +language = 'English' +default_target_attribute = 'play' +citation = 'I. H. Witten, E. Frank, M. A. Hall, and ITPro,' \ + ' Data mining practical machine learning tools and techniques, ' \ + 'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011' + +dataset = create_dataset( + name=name, + description=description, + creator=None, + contributor=None, + collection_date=collection_date, + language=language, + licence=None, + default_target_attribute=default_target_attribute, + row_id_attribute=None, + ignore_attribute=None, + citation=citation, + attributes=column_names, + data=data, + version_label='example', +) + +uploaded_did = dataset.publish() +############################################################################ +try: + upload_id = dataset.publish() + print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id)) +except openml.exceptions.PyOpenMLError as err: + print("OpenML: {0}".format(err)) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 7618d81bd..f85b14a09 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -383,7 +383,7 @@ def create_dataset(name, description, creator, contributor, collection_date, License of the data. attributes : list A list of tuples. Each tuple consists of the attribute name and type. - data : numpy.ndarray + data : numpy.ndarray | list An array that contains both the attributes and the targets, with shape=(n_samples, n_features). The target feature is indicated as meta-data of the dataset. From be7791fdb13b6d0147244ec8197c34ba81a13135 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 12:07:42 +0200 Subject: [PATCH 19/43] Fixing builds failure --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index ed2c4e235..4bde22b5e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,6 +23,8 @@ env: - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" RUN_FLAKE8="true" SKIP_TESTS="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.19.2" +before_install: + - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)" install: source ci_scripts/install.sh script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result" From 50112169bf875e9cca8bb82ee552b1b23162d095 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 15:20:44 +0200 Subject: [PATCH 20/43] Adding support for sparse datasets, implementing corresponding unit tests --- examples/create_upload_tutorial.py | 126 ++++++++++++------ examples/tasks_tutorial.py | 3 +- openml/datasets/dataset.py | 10 +- openml/datasets/functions.py | 54 +++++++- tests/test_datasets/test_dataset_functions.py | 107 ++++++++++++++- 5 files changed, 241 insertions(+), 59 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 300d7c287..bfe2adb80 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -7,6 +7,7 @@ import numpy as np import openml import sklearn.datasets +from scipy.sparse import coo_matrix from openml.datasets.functions import create_dataset ############################################################################ @@ -19,14 +20,16 @@ # # * A numpy array. # * A list of lists. +# * A sparse matrix ############################################################################ # Dataset is a numpy array -# ^^^^^^^^^^^^^^^^^^^^^^^^ +# ======================== # # Prepare dataset -# =============== +# ^^^^^^^^^^^^^^^ # Load an example dataset from scikit-learn which we will upload to OpenML.org via the API. + breast_cancer = sklearn.datasets.load_breast_cancer() name = 'BreastCancer(scikit-learn)' x = breast_cancer.data @@ -44,14 +47,26 @@ attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names ] + [('class', list(breast_cancer.target_names))] +citation = ( + "W.N. Street, W.H. Wolberg and O.L. Mangasarian. " + "Nuclear feature extraction for breast tumor diagnosis. " + "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, " + "volume 1905, pages 861-870, San Jose, CA, 1993." +) +paper_url = ( + 'https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' + '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' + '10.1117/12.148698.short?SSO=1' +) ############################################################################ # Create the dataset object -# ========================= +# ^^^^^^^^^^^^^^^^^^^^^^^^^ # The definition of all fields can be found in the XSD files describing the expected format: # # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd -dataset = create_dataset( + +bc_dataset = create_dataset( # The name of the dataset (needs to be unique). # Must not be longer than 128 characters and only contain # a-z, A-Z, 0-9 and the following special characters: _\-\.(), @@ -76,12 +91,7 @@ # Attributes that should be excluded in modelling, such as identifiers and indexes. ignore_attribute=None, # How to cite the paper. - citation=( - "W.N. Street, W.H. Wolberg and O.L. Mangasarian. " - "Nuclear feature extraction for breast tumor diagnosis. " - "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, " - "volume 1905, pages 861-870, San Jose, CA, 1993." - ), + citation=citation, # Attributes of the data attributes=attributes, data=data, @@ -91,26 +101,21 @@ 'https://archive.ics.uci.edu/ml/datasets/' 'Breast+Cancer+Wisconsin+(Diagnostic)' ), - paper_url=( - 'https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' - '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' - '10.1117/12.148698.short?SSO=1' - ) + paper_url=paper_url, ) ############################################################################ -try: - upload_id = dataset.publish() - print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id)) -except openml.exceptions.PyOpenMLError as err: - print("OpenML: {0}".format(err)) + +upload_did = bc_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) ############################################################################ # Dataset is a list of lists -# ^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ========================== # # Weather dataset: # http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html + data = [ ['sunny', 85, 85, 'FALSE', 'no'], ['sunny', 80, 90, 'TRUE', 'no'], @@ -128,7 +133,7 @@ ['rainy', 71, 91, 'TRUE', 'no'], ] -column_names = [ +attribute_names = [ ('outlook', ['sunny', 'overcast', 'rainy']), ('temperature', 'REAL'), ('humidity', 'REAL'), @@ -136,7 +141,6 @@ ('play', ['yes', 'no']), ] -name = "Wind" description = ( 'The weather problem is a tiny dataset that we will use repeatedly' ' to illustrate machine learning methods. Entirely fictitious, it ' @@ -147,34 +151,72 @@ 'attributes: outlook, temperature, humidity, and windy. ' 'The outcome is whether to play or not.' ) -collection_date = '01-01-2011' -language = 'English' -default_target_attribute = 'play' -citation = 'I. H. Witten, E. Frank, M. A. Hall, and ITPro,' \ - ' Data mining practical machine learning tools and techniques, ' \ - 'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011' - -dataset = create_dataset( - name=name, + +citation = ( + 'I. H. Witten, E. Frank, M. A. Hall, and ITPro,' + 'Data mining practical machine learning tools and techniques, ' + 'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011' +) + +wind_dataset = create_dataset( + name="Wind", description=description, creator=None, contributor=None, - collection_date=collection_date, - language=language, + collection_date='01-01-2011', + language='English', licence=None, - default_target_attribute=default_target_attribute, + default_target_attribute='play', row_id_attribute=None, ignore_attribute=None, citation=citation, - attributes=column_names, + attributes=attribute_names, data=data, version_label='example', ) -uploaded_did = dataset.publish() ############################################################################ -try: - upload_id = dataset.publish() - print('URL for dataset: %s/data/%d' % (openml.config.server, upload_id)) -except openml.exceptions.PyOpenMLError as err: - print("OpenML: {0}".format(err)) + +upload_did = wind_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) + +############################################################################ +# Dataset is a sparse matrix +# +# Sparse data can be represented as a +# `scipy.sparse.coo https://docs.scipy.org/doc/scipy/reference/sparse.html>`_. +# or a list of dictionaries in the arff object. + +sparse_data = coo_matrix(( + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 0, 2, 0, 1], +)) + +column_names = [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), +] + +xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='example', +) + +############################################################################ + +upload_did = xor_dataset.publish() +print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py index ee4b17d69..03e03d094 100644 --- a/examples/tasks_tutorial.py +++ b/examples/tasks_tutorial.py @@ -58,7 +58,8 @@ print(len(filtered_tasks)) ############################################################################ -# Resampling strategies can be found on the `OpenML Website `_. +# Resampling strategies can be found on +# the `OpenML Website `_. # # Similar to listing tasks by task type, we can list tasks by tags: diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index f51e074e9..e29b2df95 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -208,11 +208,11 @@ def remove_tag(self, tag): def __eq__(self, other): server_fields = { - 'dataset_id':True, - 'version':True, - 'upload_date':True, - 'url':True, - 'dataset':True, + 'dataset_id': True, + 'version': True, + 'upload_date': True, + 'url': True, + 'dataset': True, 'data_file': True, } diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index f85b14a09..3fc52933d 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -3,10 +3,10 @@ import io import os import re -import shutil +import numpy as np import six import arff - +from scipy.sparse import coo_matrix from oslo_concurrency import lockutils import xmltodict @@ -383,7 +383,7 @@ def create_dataset(name, description, creator, contributor, collection_date, License of the data. attributes : list A list of tuples. Each tuple consists of the attribute name and type. - data : numpy.ndarray | list + data : numpy.ndarray | list | scipy.sparse.coo_matrix An array that contains both the attributes and the targets, with shape=(n_samples, n_features). The target feature is indicated as meta-data of the dataset. @@ -416,6 +416,21 @@ def create_dataset(name, description, creator, contributor, collection_date, 'data': data } + # Determine arff format from the dataset + if isinstance(data, list): + if isinstance(data[0], list): + d_format = 'arff' + elif isinstance(data[0], dict): + d_format = 'sparse_arff' + else: + raise ValueError('Illegal value, only list of lists/dicts is supported') + elif isinstance(data, np.ndarray): + d_format = 'arff' + elif isinstance(data, coo_matrix): + d_format = 'sparse_arff' + else: + raise ValueError('Illegal value, please check the function documentation') + # serializes the arff dataset object and returns a string arff_dataset = arff.dumps(arff_object) try: @@ -426,11 +441,13 @@ def create_dataset(name, description, creator, contributor, collection_date, raise ValueError("The arguments you have provided \ do not construct a valid arff file") - return OpenMLDataset(name, description, creator=creator, + return OpenMLDataset(name, description, creator=creator, format=d_format, contributor=contributor, collection_date=collection_date, - language=language, licence=licence, default_target_attribute=default_target_attribute, - row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, citation=citation, - version_label=version_label, original_data_url=original_data_url, paper_url=paper_url, + language=language, licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, + citation=citation, version_label=version_label, + original_data_url=original_data_url, paper_url=paper_url, update_comment=update_comment, dataset=arff_dataset) @@ -664,7 +681,30 @@ def _get_online_dataset_arff(did): A string representation of an arff file. """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) + # build a dict from the xml. # use the url from the dataset description and return the arff string return openml._api_calls._read_url( xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'] ) + + +def _get_online_dataset_format(did): + """Get the dataset format for a given dataset id + from the OpenML website. + + Parameters + ---------- + did : int + A dataset id. + + Returns + ------- + str + Dataset format. + """ + dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) + # build a dict from the xml and get the format from the dataset description + return xmltodict.parse(dataset_xml) + ['oml:data_set_description'] + ['oml:format']\ + .lower() diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 1188242e8..cb1af4f87 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -32,7 +32,8 @@ _get_dataset_features, _get_dataset_qualities, DATASETS_CACHE_DIR_NAME, - _get_online_dataset_arff) + _get_online_dataset_arff, + _get_online_dataset_format) class TestOpenMLDataset(TestBase): @@ -392,7 +393,18 @@ def test_create_dataset_numpy(self): original_data_url=original_data_url, paper_url=paper_url ) - dataset.publish() + + upload_did = dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'arff', + "Wrong format for dataset" + ) def test_create_dataset_list(self): @@ -450,9 +462,96 @@ def test_create_dataset_list(self): original_data_url=original_data_url, paper_url=paper_url ) - uploaded_did = dataset.publish() + upload_did = dataset.publish() self.assertEqual( - _get_online_dataset_arff(uploaded_did), + _get_online_dataset_arff(upload_did), dataset._dataset, "Uploaded arff does not match original one" ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'arff', + "Wrong format for dataset" + ) + + +def test_create_dataset_sparse(self): + + # test the scipy.sparse.coo_matrix + sparse_data = scipy.sparse.coo_matrix(( + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + [0, 1, 1, 2, 2, 3, 3], + [0, 1, 2, 0, 2, 0, 1], + )) + + column_names = [ + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) + + # test the list of dicts sparse representation + sparse_data = [ + {}, + {1: 1.0, 2: 1.0}, + {0: 1.0, 2: 1.0}, + {0: 1.0, 1: 1.0} + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) From 005649ae1ed42a22a138b3ad0ba5f9e03f6b8cc0 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 15:44:41 +0200 Subject: [PATCH 21/43] fix bug --- openml/datasets/functions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 3fc52933d..27f257ed5 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -704,7 +704,5 @@ def _get_online_dataset_format(did): """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) # build a dict from the xml and get the format from the dataset description - return xmltodict.parse(dataset_xml) - ['oml:data_set_description'] - ['oml:format']\ - .lower() + return xmltodict.parse(dataset_xml)\ + ['oml:data_set_description']['oml:format'].lower() From b4103dfd998f79be0489dd80c4f5bed02281f269 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 16:03:00 +0200 Subject: [PATCH 22/43] More unit tests and bug fix --- tests/test_datasets/test_dataset_functions.py | 202 +++++++++++------- 1 file changed, 130 insertions(+), 72 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index cb1af4f87..4f236e061 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -475,83 +475,141 @@ def test_create_dataset_list(self): ) -def test_create_dataset_sparse(self): + def test_create_dataset_sparse(self): - # test the scipy.sparse.coo_matrix - sparse_data = scipy.sparse.coo_matrix(( + # test the scipy.sparse.coo_matrix + sparse_data = scipy.sparse.coo_matrix(( [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1], - )) + )) - column_names = [ + column_names = [ ('input1', 'REAL'), ('input2', 'REAL'), ('y', 'REAL'), - ] - - xor_dataset = create_dataset( - name="XOR", - description='Dataset representing the XOR operation', - creator=None, - contributor=None, - collection_date=None, - language='English', - licence=None, - default_target_attribute='y', - row_id_attribute=None, - ignore_attribute=None, - citation=None, - attributes=column_names, - data=sparse_data, - version_label='test', - ) - - upload_did = xor_dataset.publish() - self.assertEqual( - _get_online_dataset_arff(upload_did), - xor_dataset._dataset, - "Uploaded arff does not match original one" - ) - self.assertEqual( - _get_online_dataset_format(upload_did), - 'sparse_arff', - "Wrong format for dataset" - ) - - # test the list of dicts sparse representation - sparse_data = [ - {}, - {1: 1.0, 2: 1.0}, - {0: 1.0, 2: 1.0}, - {0: 1.0, 1: 1.0} - ] - - xor_dataset = create_dataset( - name="XOR", - description='Dataset representing the XOR operation', - creator=None, - contributor=None, - collection_date=None, - language='English', - licence=None, - default_target_attribute='y', - row_id_attribute=None, - ignore_attribute=None, - citation=None, - attributes=column_names, - data=sparse_data, - version_label='test', - ) - - upload_did = xor_dataset.publish() - self.assertEqual( - _get_online_dataset_arff(upload_did), - xor_dataset._dataset, - "Uploaded arff does not match original one" - ) - self.assertEqual( - _get_online_dataset_format(upload_did), - 'sparse_arff', - "Wrong format for dataset" - ) + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) + + # test the list of dicts sparse representation + sparse_data = [ + {}, + {1: 1.0, 2: 1.0}, + {0: 1.0, 2: 1.0}, + {0: 1.0, 1: 1.0} + ] + + xor_dataset = create_dataset( + name="XOR", + description='Dataset representing the XOR operation', + creator=None, + contributor=None, + collection_date=None, + language='English', + licence=None, + default_target_attribute='y', + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=column_names, + data=sparse_data, + version_label='test', + ) + + upload_did = xor_dataset.publish() + self.assertEqual( + _get_online_dataset_arff(upload_did), + xor_dataset._dataset, + "Uploaded arff does not match original one" + ) + self.assertEqual( + _get_online_dataset_format(upload_did), + 'sparse_arff', + "Wrong format for dataset" + ) + + + def test_create_invalid_dataset(self): + + data = [ + 'sunny', + 'overcast', + 'overcast', + 'rainy', + 'rainy', + 'rainy', + 'overcast', + 'sunny', + 'sunny', + 'rainy', + 'sunny', + 'overcast', + 'overcast', + 'rainy', + ] + + self.assertRaises( + ValueError, + create_dataset, + name=None, + description=None, + creator=None, + contributor=None, + collection_date=None, + language=None, + licence=None, + default_target_attribute=None, + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=None, + data=data, + ) + + data = ["sunny"] + + self.assertRaises( + ValueError, + create_dataset, + name=None, + description=None, + creator=None, + contributor=None, + collection_date=None, + language=None, + licence=None, + default_target_attribute=None, + row_id_attribute=None, + ignore_attribute=None, + citation=None, + attributes=None, + data=data, + ) From 2e898ee08e46bf58e11a02a44082898c09856d30 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 16:36:29 +0200 Subject: [PATCH 23/43] Fixing bugs --- openml/datasets/functions.py | 5 ++++- tests/test_datasets/test_dataset_functions.py | 8 ++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 27f257ed5..f90669bcb 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -436,7 +436,10 @@ def create_dataset(name, description, creator, contributor, collection_date, try: # check if arff is valid decoder = arff.ArffDecoder() - decoder.decode(arff_dataset, encode_nominal=True) + if d_format == 'arff': + decoder.decode(arff_dataset, encode_nominal=True) + if d_format == 'sparse_arff': + decoder.decode(arff_dataset, encode_nominal=True, return_type=arff.COO) except arff.ArffException: raise ValueError("The arguments you have provided \ do not construct a valid arff file") diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 4f236e061..c00af34b7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -355,7 +355,6 @@ def test_upload_dataset_with_url(self): dataset.publish() self.assertIsInstance(dataset.dataset_id, int) - def test_create_dataset_numpy(self): data = np.array([[1, 2, 3], @@ -474,14 +473,12 @@ def test_create_dataset_list(self): "Wrong format for dataset" ) - def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix(( [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0, 1, 1, 2, 2, 3, 3], - [0, 1, 2, 0, 2, 0, 1], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]) )) column_names = [ @@ -521,7 +518,7 @@ def test_create_dataset_sparse(self): # test the list of dicts sparse representation sparse_data = [ - {}, + {0: 0.0}, {1: 1.0, 2: 1.0}, {0: 1.0, 2: 1.0}, {0: 1.0, 1: 1.0} @@ -556,7 +553,6 @@ def test_create_dataset_sparse(self): "Wrong format for dataset" ) - def test_create_invalid_dataset(self): data = [ From 43c653098444d812cbe805462fcac7aeb374c3a3 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 16:57:19 +0200 Subject: [PATCH 24/43] Fix bug and pep8 errors --- examples/create_upload_tutorial.py | 14 +++--- openml/datasets/functions.py | 49 +++++++++++++------ tests/test_datasets/test_dataset_functions.py | 10 ++-- 3 files changed, 48 insertions(+), 25 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index bfe2adb80..9523ff8ff 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -40,7 +40,9 @@ description = breast_cancer.DESCR ############################################################################ # OpenML does not distinguish between the attributes and targets on the data level and stores all data in a -# single matrix. The target feature is indicated as meta-data of the dataset (and tasks on that data). +# single matrix. +# +# The target feature is indicated as meta-data of the dataset (and tasks on that data). data = np.concatenate((x, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) @@ -182,15 +184,15 @@ ############################################################################ # Dataset is a sparse matrix +# ========================== # -# Sparse data can be represented as a -# `scipy.sparse.coo https://docs.scipy.org/doc/scipy/reference/sparse.html>`_. -# or a list of dictionaries in the arff object. +# Sparse data can be represented in the arff object as a +# `scipy.sparse.coo `_, +# or a list of dictionaries. sparse_data = coo_matrix(( [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - [0, 1, 1, 2, 2, 3, 3], - [0, 1, 2, 0, 2, 0, 1], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]), )) column_names = [ diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index f90669bcb..639f268cd 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -423,13 +423,15 @@ def create_dataset(name, description, creator, contributor, collection_date, elif isinstance(data[0], dict): d_format = 'sparse_arff' else: - raise ValueError('Illegal value, only list of lists/dicts is supported') + raise ValueError('Illegal dataset value, ' + 'only list of lists/dicts is supported') elif isinstance(data, np.ndarray): d_format = 'arff' elif isinstance(data, coo_matrix): d_format = 'sparse_arff' else: - raise ValueError('Illegal value, please check the function documentation') + raise ValueError('Illegal dataset value, ' + 'please check the function documentation') # serializes the arff dataset object and returns a string arff_dataset = arff.dumps(arff_object) @@ -437,21 +439,39 @@ def create_dataset(name, description, creator, contributor, collection_date, # check if arff is valid decoder = arff.ArffDecoder() if d_format == 'arff': - decoder.decode(arff_dataset, encode_nominal=True) + decoder.decode( + arff_dataset, + encode_nominal=True, + ) if d_format == 'sparse_arff': - decoder.decode(arff_dataset, encode_nominal=True, return_type=arff.COO) + decoder.decode( + arff_dataset, + encode_nominal=True, + return_type=arff.COO, + ) except arff.ArffException: raise ValueError("The arguments you have provided \ do not construct a valid arff file") - return OpenMLDataset(name, description, creator=creator, format=d_format, - contributor=contributor, collection_date=collection_date, - language=language, licence=licence, - default_target_attribute=default_target_attribute, - row_id_attribute=row_id_attribute, ignore_attribute=ignore_attribute, - citation=citation, version_label=version_label, - original_data_url=original_data_url, paper_url=paper_url, - update_comment=update_comment, dataset=arff_dataset) + return OpenMLDataset( + name, + description, + creator=creator, + format=d_format, + contributor=contributor, + collection_date=collection_date, + language=language, + licence=licence, + default_target_attribute=default_target_attribute, + row_id_attribute=row_id_attribute, + ignore_attribute=ignore_attribute, + citation=citation, + version_label=version_label, + original_data_url=original_data_url, + paper_url=paper_url, + update_comment=update_comment, + dataset=arff_dataset, + ) def _get_dataset_description(did_cache_dir, dataset_id): @@ -707,5 +727,6 @@ def _get_online_dataset_format(did): """ dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) # build a dict from the xml and get the format from the dataset description - return xmltodict.parse(dataset_xml)\ - ['oml:data_set_description']['oml:format'].lower() + return xmltodict\ + .parse(dataset_xml)['oml:data_set_description']['oml:format']\ + .lower() diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index c00af34b7..a0aaabe93 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -477,14 +477,14 @@ def test_create_dataset_sparse(self): # test the scipy.sparse.coo_matrix sparse_data = scipy.sparse.coo_matrix(( - [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], - ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]) + [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], + ([0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1]) )) column_names = [ - ('input1', 'REAL'), - ('input2', 'REAL'), - ('y', 'REAL'), + ('input1', 'REAL'), + ('input2', 'REAL'), + ('y', 'REAL'), ] xor_dataset = create_dataset( From f45adbfc3a0c74845f69c8e229ae8fccef0b9d9d Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 17:27:27 +0200 Subject: [PATCH 25/43] Enforcing pep8 and fixing changing the name of attribute format as it is a built-in --- examples/create_upload_tutorial.py | 10 ++++++---- examples/tasks_tutorial.py | 3 ++- openml/datasets/dataset.py | 12 ++++++------ openml/datasets/functions.py | 4 ++-- tests/test_datasets/test_dataset_functions.py | 4 ++-- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 9523ff8ff..202f48828 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -39,10 +39,11 @@ attribute_names = breast_cancer.feature_names description = breast_cancer.DESCR ############################################################################ -# OpenML does not distinguish between the attributes and targets on the data level and stores all data in a -# single matrix. +# OpenML does not distinguish between the attributes and +# targets on the data level and stores all data in a single matrix. # -# The target feature is indicated as meta-data of the dataset (and tasks on that data). +# The target feature is indicated as meta-data of the +# dataset (and tasks on that data). data = np.concatenate((x, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) @@ -52,7 +53,8 @@ citation = ( "W.N. Street, W.H. Wolberg and O.L. Mangasarian. " "Nuclear feature extraction for breast tumor diagnosis. " - "IS&T/SPIE 1993 International Symposium on Electronic Imaging: Science and Technology, " + "IS&T/SPIE 1993 International Symposium on Electronic " + "Imaging: Science and Technology, " "volume 1905, pages 861-870, San Jose, CA, 1993." ) paper_url = ( diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py index 03e03d094..b9c63ef66 100644 --- a/examples/tasks_tutorial.py +++ b/examples/tasks_tutorial.py @@ -59,7 +59,8 @@ ############################################################################ # Resampling strategies can be found on -# the `OpenML Website `_. +# the `OpenML Website `_. # # Similar to listing tasks by task type, we can list tasks by tags: diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index e29b2df95..7479c1918 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -84,7 +84,7 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, format='arff', dataset_id=None, + def __init__(self, name, description, data_format='arff', dataset_id=None, version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, @@ -100,7 +100,7 @@ def __init__(self, name, description, format='arff', dataset_id=None, self.name = name self.version = int(version) if version is not None else None self.description = description - self.format = format + self.data_format = data_format self.creator = creator self.contributor = contributor self.collection_date = collection_date @@ -157,7 +157,7 @@ def __init__(self, name, description, format='arff', dataset_id=None, logger.debug("Data pickle file already exists.") else: try: - data = self._get_arff(self.format) + data = self._get_arff(self.data_format) except OSError as e: logger.critical("Please check that the data file %s is there " "and can be read.", self.data_file) @@ -415,12 +415,12 @@ def retrieve_class_labels(self, target_name='class'): # Should make a method that only reads the attributes arffFileName = self.data_file - if self.format.lower() == 'arff': + if self.data_format.lower() == 'arff': return_type = arff.DENSE - elif self.format.lower() == 'sparse_arff': + elif self.data_format.lower() == 'sparse_arff': return_type = arff.COO else: - raise ValueError('Unknown data format %s' % self.format) + raise ValueError('Unknown data format %s' % self.data_format) with io.open(arffFileName, encoding='utf8') as fh: arffData = arff.ArffDecoder().decode(fh, return_type=return_type) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 639f268cd..ef0df953f 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -457,7 +457,7 @@ def create_dataset(name, description, creator, contributor, collection_date, name, description, creator=creator, - format=d_format, + data_format=d_format, contributor=contributor, collection_date=collection_date, language=language, @@ -659,7 +659,7 @@ def _create_dataset_from_description(description, features, qualities, arff_file dataset = OpenMLDataset( description["oml:name"], description.get("oml:description"), - format=description["oml:format"], + data_format=description["oml:format"], dataset_id=description["oml:id"], version=description["oml:version"], creator=description.get("oml:creator"), diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index a0aaabe93..091138b42 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -326,7 +326,7 @@ def test_publish_dataset(self): dataset = OpenMLDataset( "anneal", "test", - format="ARFF", + data_format="ARFF", version=1, licence="public", default_target_attribute="class", @@ -348,7 +348,7 @@ def test_upload_dataset_with_url(self): dataset = OpenMLDataset( "UploadTestWithURL", "test", - format="ARFF", + data_format="ARFF", version=1, url="https://www.openml.org/data/download/61/dataset_61_iris.arff", ) From cfd57675e1ee639218f416c142cb62f785a52489 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 17:37:35 +0200 Subject: [PATCH 26/43] Implementing change in a better way --- openml/datasets/dataset.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 7479c1918..64cf7ae84 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -100,7 +100,7 @@ def __init__(self, name, description, data_format='arff', dataset_id=None, self.name = name self.version = int(version) if version is not None else None self.description = description - self.data_format = data_format + self.format = data_format self.creator = creator self.contributor = contributor self.collection_date = collection_date @@ -157,7 +157,7 @@ def __init__(self, name, description, data_format='arff', dataset_id=None, logger.debug("Data pickle file already exists.") else: try: - data = self._get_arff(self.data_format) + data = self._get_arff(self.format) except OSError as e: logger.critical("Please check that the data file %s is there " "and can be read.", self.data_file) @@ -415,12 +415,12 @@ def retrieve_class_labels(self, target_name='class'): # Should make a method that only reads the attributes arffFileName = self.data_file - if self.data_format.lower() == 'arff': + if self.format.lower() == 'arff': return_type = arff.DENSE - elif self.data_format.lower() == 'sparse_arff': + elif self.format.lower() == 'sparse_arff': return_type = arff.COO else: - raise ValueError('Unknown data format %s' % self.data_format) + raise ValueError('Unknown data format %s' % self.format) with io.open(arffFileName, encoding='utf8') as fh: arffData = arff.ArffDecoder().decode(fh, return_type=return_type) From 82c71737d3ebaf2b08bdfc8fd62595d00e6b44be Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 17:52:00 +0200 Subject: [PATCH 27/43] Fixing bugs introduced by changing the format in the constructor --- openml/datasets/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 64cf7ae84..6d86d5862 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -167,13 +167,13 @@ def __init__(self, name, description, data_format='arff', dataset_id=None, for name, type_ in data['attributes']] attribute_names = [name for name, type_ in data['attributes']] - if format.lower() == 'sparse_arff': + if data_format.lower() == 'sparse_arff': X = data['data'] X_shape = (max(X[1]) + 1, max(X[2]) + 1) X = scipy.sparse.coo_matrix( (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) X = X.tocsr() - elif format.lower() == 'arff': + elif data_format.lower() == 'arff': X = np.array(data['data'], dtype=np.float32) else: raise Exception() @@ -259,9 +259,9 @@ def _get_arff(self, format): if bits != 64 and os.path.getsize(filename) > 120000000: return NotImplementedError("File too big") - if format.lower() == 'arff': + if data_format.lower() == 'arff': return_type = arff.DENSE - elif format.lower() == 'sparse_arff': + elif data_format.lower() == 'sparse_arff': return_type = arff.COO else: raise ValueError('Unknown data format %s' % format) From 61cd547dfc7cd45c2e92bd8ccb3b145fd84bbbf8 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 1 Oct 2018 18:09:45 +0200 Subject: [PATCH 28/43] Another try to tackle the bugs --- openml/datasets/dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 6d86d5862..40fa246e8 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -167,13 +167,13 @@ def __init__(self, name, description, data_format='arff', dataset_id=None, for name, type_ in data['attributes']] attribute_names = [name for name, type_ in data['attributes']] - if data_format.lower() == 'sparse_arff': + if self.format.lower() == 'sparse_arff': X = data['data'] X_shape = (max(X[1]) + 1, max(X[2]) + 1) X = scipy.sparse.coo_matrix( (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) X = X.tocsr() - elif data_format.lower() == 'arff': + elif self.format.lower() == 'arff': X = np.array(data['data'], dtype=np.float32) else: raise Exception() @@ -259,9 +259,9 @@ def _get_arff(self, format): if bits != 64 and os.path.getsize(filename) > 120000000: return NotImplementedError("File too big") - if data_format.lower() == 'arff': + if self.format.lower() == 'arff': return_type = arff.DENSE - elif data_format.lower() == 'sparse_arff': + elif self.format.lower() == 'sparse_arff': return_type = arff.COO else: raise ValueError('Unknown data format %s' % format) From 4ec6b23278382f67e94387add5ebd0c3843c806e Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 3 Oct 2018 10:48:24 +0100 Subject: [PATCH 29/43] Small refactor --- tests/test_datasets/test_dataset_functions.py | 128 ++++++++---------- 1 file changed, 53 insertions(+), 75 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 091138b42..67bd19819 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -357,43 +357,39 @@ def test_upload_dataset_with_url(self): def test_create_dataset_numpy(self): - data = np.array([[1, 2, 3], - [1.2, 2.5, 3.8], - [2, 5, 8], - [0, 1, 0]]).T + data = np.array( + [ + [1, 2, 3], + [1.2, 2.5, 3.8], + [2, 5, 8], + [0, 1, 0] + ] + ).T + attributes = [('col_{}'.format(i), 'REAL') for i in range(data.shape[1])] - name = 'NumPy_testing_dataset' - description = 'Synthetic dataset created from a NumPy array' - creator = 'OpenML tester' - collection_date = '01-01-2018' - language = 'English' - licence = 'MIT' - default_target_attribute = 'col_{}'.format(data.shape[1] - 1) - citation = 'None' - original_data_url = 'http://openml.github.io/openml-python' - paper_url = 'http://openml.github.io/openml-python' dataset = create_dataset( - name=name, - description=description, - creator=creator, + name='NumPy_testing_dataset', + description='Synthetic dataset created from a NumPy array', + creator='OpenML tester', contributor=None, - collection_date=collection_date, - language=language, - licence=licence, - default_target_attribute=default_target_attribute, + collection_date='01-01-2018', + language='English', + licence='MIT', + default_target_attribute='col_{}'.format(data.shape[1] - 1), row_id_attribute=None, ignore_attribute=None, - citation=citation, + citation='None', attributes=attributes, data=data, version_label='test', - original_data_url=original_data_url, - paper_url=paper_url + original_data_url='http://openml.github.io/openml-python', + paper_url='http://openml.github.io/openml-python' ) upload_did = dataset.publish() + self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, @@ -423,7 +419,8 @@ def test_create_dataset_list(self): ['m', 'overcast', 81.0, 75.0, 'FALSE', 'yes'], ['n', 'rainy', 71.0, 91.0, 'TRUE', 'no'], ] - column_names = [ + + attributes = [ ('rnd_str', 'STRING'), ('outlook', ['sunny', 'overcast', 'rainy']), ('temperature', 'REAL'), @@ -432,35 +429,25 @@ def test_create_dataset_list(self): ('play', ['yes', 'no']), ] - name = "Wind_dataset_test" - description = 'Testing dataset upload when the data is a list of lists' - creator = 'OpenML test' - collection_date = '21-09-2018' - language = 'English' - licence = 'MIT' - default_target_attribute = 'play' - citation = 'None' - original_data_url = 'http://openml.github.io/openml-python' - paper_url = 'http://openml.github.io/openml-python' - dataset = create_dataset( - name=name, - description=description, - creator=creator, + name="Wind_dataset", + description='Testing dataset upload when the data is a list of lists', + creator='OpenML test', contributor=None, - collection_date=collection_date, - language=language, - licence=licence, - default_target_attribute=default_target_attribute, + collection_date='21-09-2018', + language='English', + licence='MIT', + default_target_attribute='play', row_id_attribute=None, ignore_attribute=None, - citation=citation, - attributes=column_names, + citation='None', + attributes=attributes, data=data, version_label='test', - original_data_url=original_data_url, - paper_url=paper_url + original_data_url='http://openml.github.io/openml-python', + paper_url='http://openml.github.io/openml-python' ) + upload_did = dataset.publish() self.assertEqual( _get_online_dataset_arff(upload_did), @@ -572,40 +559,31 @@ def test_create_invalid_dataset(self): 'rainy', ] + param = { + 'name': None, + 'description': None, + 'creator': None, + 'contributor': None, + 'collection_date': None, + 'language': None, + 'licence': None, + 'default_target_attribute': None, + 'row_id_attribute': None, + 'ignore_attribute': None, + 'citation': None, + 'attributes': None, + 'data': data + } + self.assertRaises( ValueError, create_dataset, - name=None, - description=None, - creator=None, - contributor=None, - collection_date=None, - language=None, - licence=None, - default_target_attribute=None, - row_id_attribute=None, - ignore_attribute=None, - citation=None, - attributes=None, - data=data, + **param, ) - data = ["sunny"] - + param['data'] = data[0] self.assertRaises( ValueError, create_dataset, - name=None, - description=None, - creator=None, - contributor=None, - collection_date=None, - language=None, - licence=None, - default_target_attribute=None, - row_id_attribute=None, - ignore_attribute=None, - citation=None, - attributes=None, - data=data, + **param, ) From 45321d24e968e9b3b23661351d9304390e536dba Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 3 Oct 2018 11:14:50 +0100 Subject: [PATCH 30/43] Fixing pep8 error --- tests/test_datasets/test_dataset_functions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 67bd19819..e46766f19 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -431,7 +431,9 @@ def test_create_dataset_list(self): dataset = create_dataset( name="Wind_dataset", - description='Testing dataset upload when the data is a list of lists', + description=( + 'Testing dataset upload when the data is a list of lists' + ), creator='OpenML test', contributor=None, collection_date='21-09-2018', From 654cbd0969376d9a18cf5b81b84108a7e36d2372 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Thu, 4 Oct 2018 10:58:06 +0100 Subject: [PATCH 31/43] Fix python2.7 bug --- tests/test_datasets/test_dataset_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index e46766f19..894090a89 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -580,12 +580,12 @@ def test_create_invalid_dataset(self): self.assertRaises( ValueError, create_dataset, - **param, + **param ) param['data'] = data[0] self.assertRaises( ValueError, create_dataset, - **param, + **param ) From 714619f78ca1158bd337cafd6c2e5b825897b5f7 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 8 Oct 2018 17:10:08 +0100 Subject: [PATCH 32/43] making changes in accordance with Guillaume's suggestions --- openml/datasets/__init__.py | 8 +- openml/datasets/dataset.py | 28 +++-- openml/datasets/functions.py | 101 ++++++++++-------- tests/test_datasets/test_dataset_functions.py | 16 +-- 4 files changed, 86 insertions(+), 67 deletions(-) diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index 776cb83e3..5baa9245a 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,14 +1,14 @@ -from .functions import (list_datasets, check_datasets_active, - get_datasets, get_dataset, create_dataset) +from .functions import (check_datasets_active, create_dataset, + get_datasets, get_dataset, list_datasets) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature __all__ = [ 'check_datasets_active', + 'create_dataset', 'get_dataset', 'get_datasets', + 'list_datasets', 'OpenMLDataset', 'OpenMLDataFeature', - 'list_datasets', - 'create_dataset', ] diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 40fa246e8..4f8c7d73c 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,20 +1,20 @@ -from collections import OrderedDict +import arff import gzip import io import logging import os import six - -import arff - import numpy as np import scipy.sparse -from six.moves import cPickle as pickle import xmltodict +from collections import OrderedDict +from six.moves import cPickle as pickle +import openml._api_calls +from warnings import warn from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError -import openml._api_calls + logger = logging.getLogger(__name__) @@ -30,6 +30,8 @@ class OpenMLDataset(object): Name of the dataset. description : str Description of the dataset. + format : str + Format of the dataset. Only 'arff' for now. dataset_id : int, optional Id autogenerated by the server. version : int, optional @@ -84,7 +86,7 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, data_format='arff', dataset_id=None, + def __init__(self, name, description, format=None, data_format='arff', dataset_id=None, version=None, creator=None, contributor=None, collection_date=None, upload_date=None, language=None, licence=None, url=None, default_target_attribute=None, @@ -93,14 +95,20 @@ def __init__(self, name, description, data_format='arff', dataset_id=None, original_data_url=None, paper_url=None, update_comment=None, md5_checksum=None, data_file=None, features=None, qualities=None, dataset=None): - # TODO add function to check if the name is casual_string128 + # TODO add function to check if the name is casual_string128 # Attributes received by querying the RESTful API self.dataset_id = int(dataset_id) if dataset_id is not None else None self.name = name self.version = int(version) if version is not None else None self.description = description - self.format = data_format + if format is None: + self.format = data_format + else: + warn("The format parameter in the init will be deprecated " + "in the future." + "Please use data_format instead", DeprecationWarning) + self.format = format self.creator = creator self.contributor = contributor self.collection_date = collection_date @@ -538,8 +546,6 @@ def _to_xml(self): xml_dataset : str XML description of the data. """ - xml_dataset = ('\n') props = ['id', 'name', 'version', 'description', 'format', 'creator', 'contributor', 'collection_date', 'upload_date', 'language', 'licence', 'url', 'default_target_attribute', diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index ef0df953f..49d867965 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import hashlib import io import os @@ -6,15 +5,21 @@ import numpy as np import six import arff +import xmltodict from scipy.sparse import coo_matrix from oslo_concurrency import lockutils -import xmltodict +from collections import OrderedDict +from warnings import warn import openml.utils import openml._api_calls from .dataset import OpenMLDataset -from ..exceptions import OpenMLCacheException, OpenMLServerException, \ - OpenMLHashException, PrivateDatasetError +from ..exceptions import ( + OpenMLCacheException, + OpenMLHashException, + OpenMLServerException, + PrivateDatasetError, +) from ..utils import ( _create_cache_directory, _remove_cache_dir_for_id, @@ -355,7 +360,7 @@ def get_dataset(dataset_id): def create_dataset(name, description, creator, contributor, collection_date, language, licence, attributes, data, default_target_attribute, - row_id_attribute, ignore_attribute, citation, + row_id_attribute, ignore_attribute, citation, format=None, original_data_url=None, paper_url=None, update_comment=None, version_label=None): """Create a dataset. @@ -370,6 +375,8 @@ def create_dataset(name, description, creator, contributor, collection_date, Name of the dataset. description : str Description of the dataset. + format : str, optional + Format of the dataset. Only 'arff' for now. creator : str The person who created the dataset. contributor : str @@ -409,55 +416,61 @@ def create_dataset(name, description, creator, contributor, collection_date, ------- class:`openml.OpenMLDataset` Dataset description.""" - arff_object = { - 'relation': name, - 'description': description, - 'attributes': attributes, - 'data': data - } - # Determine arff format from the dataset + if format is not None: + warn("The format parameter will be deprecated in the future," + " the method will determine the format of the ARFF " + "based on the given data.", DeprecationWarning) + + # Determine ARFF format from the dataset if isinstance(data, list): if isinstance(data[0], list): d_format = 'arff' elif isinstance(data[0], dict): d_format = 'sparse_arff' else: - raise ValueError('Illegal dataset value, ' - 'only list of lists/dicts is supported') + raise ValueError( + 'When giving a list, the list should contain a list for dense ' + 'data or a dictionary for sparse data. Got {!r} instead.' + .format(data[0]) + ) elif isinstance(data, np.ndarray): d_format = 'arff' elif isinstance(data, coo_matrix): d_format = 'sparse_arff' else: - raise ValueError('Illegal dataset value, ' - 'please check the function documentation') + raise ValueError( + 'Invalid data type. The data type can be a list of ' + 'lists or a numpy ndarray for dense data. Otherwise, ' + 'it can be a list of dicts or scipy.sparse.coo_matrix' + 'for sparse data.' + ) + arff_object = { + 'relation': name, + 'description': description, + 'attributes': attributes, + 'data': data + } - # serializes the arff dataset object and returns a string + # serializes the ARFF dataset object and returns a string arff_dataset = arff.dumps(arff_object) try: - # check if arff is valid + # check if ARFF is valid decoder = arff.ArffDecoder() - if d_format == 'arff': - decoder.decode( - arff_dataset, - encode_nominal=True, - ) - if d_format == 'sparse_arff': - decoder.decode( - arff_dataset, - encode_nominal=True, - return_type=arff.COO, - ) + decoder.decode( + arff_dataset, + encode_nominal=True, + return_type=arff.COO if d_format == 'sparse_arff' else arff.DENSE + ) except arff.ArffException: raise ValueError("The arguments you have provided \ - do not construct a valid arff file") + do not construct a valid ARFF file") return OpenMLDataset( name, description, - creator=creator, data_format=d_format, + creator=creator, contributor=contributor, collection_date=collection_date, language=language, @@ -514,7 +527,7 @@ def _get_dataset_description(did_cache_dir, dataset_id): def _get_dataset_arff(did_cache_dir, description): - """Get the filepath to the dataset arff + """Get the filepath to the dataset ARFF Checks if the file is in the cache, if yes, return the path to the file. If not, downloads the file and caches it, then returns the file path. @@ -532,7 +545,7 @@ def _get_dataset_arff(did_cache_dir, description): Returns ------- output_filename : string - Location of arff file. + Location of ARFF file. """ output_file_path = os.path.join(did_cache_dir, "dataset.arff") md5_checksum_fixture = description.get("oml:md5_checksum") @@ -649,12 +662,12 @@ def _create_dataset_from_description(description, features, qualities, arff_file description : dict Description of a dataset in xml dict. arff_file : string - Path of dataset arff file. + Path of dataset ARFF file. Returns ------- dataset : dataset object - Dataset object from dict and arff. + Dataset object from dict and ARFF. """ dataset = OpenMLDataset( description["oml:name"], @@ -689,35 +702,35 @@ def _create_dataset_from_description(description, features, qualities, arff_file return dataset -def _get_online_dataset_arff(did): - """Download the arff file for a given dataset id +def _get_online_dataset_arff(dataset_id): + """Download the ARFF file for a given dataset id from the OpenML website. Parameters ---------- - did : int + dataset_id : int A dataset id. Returns ------- str - A string representation of an arff file. + A string representation of an ARFF file. """ - dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) + dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id) # build a dict from the xml. - # use the url from the dataset description and return the arff string + # use the url from the dataset description and return the ARFF string return openml._api_calls._read_url( xmltodict.parse(dataset_xml)['oml:data_set_description']['oml:url'] ) -def _get_online_dataset_format(did): +def _get_online_dataset_format(dataset_id): """Get the dataset format for a given dataset id from the OpenML website. Parameters ---------- - did : int + dataset_id : int A dataset id. Returns @@ -725,7 +738,7 @@ def _get_online_dataset_format(did): str Dataset format. """ - dataset_xml = openml._api_calls._perform_api_call("data/%d" % did) + dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id) # build a dict from the xml and get the format from the dataset description return xmltodict\ .parse(dataset_xml)['oml:data_set_description']['oml:format']\ diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 894090a89..5ed34a15d 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -27,13 +27,13 @@ _get_cached_dataset_features, _get_cached_dataset_qualities, _get_cached_datasets, - _get_dataset_description, _get_dataset_arff, + _get_dataset_description, _get_dataset_features, _get_dataset_qualities, - DATASETS_CACHE_DIR_NAME, _get_online_dataset_arff, - _get_online_dataset_format) + _get_online_dataset_format, + DATASETS_CACHE_DIR_NAME) class TestOpenMLDataset(TestBase): @@ -326,7 +326,7 @@ def test_publish_dataset(self): dataset = OpenMLDataset( "anneal", "test", - data_format="ARFF", + data_format="arff", version=1, licence="public", default_target_attribute="class", @@ -348,7 +348,7 @@ def test_upload_dataset_with_url(self): dataset = OpenMLDataset( "UploadTestWithURL", "test", - data_format="ARFF", + data_format="arff", version=1, url="https://www.openml.org/data/download/61/dataset_61_iris.arff", ) @@ -454,7 +454,7 @@ def test_create_dataset_list(self): self.assertEqual( _get_online_dataset_arff(upload_did), dataset._dataset, - "Uploaded arff does not match original one" + "Uploaded ARFF does not match original one" ) self.assertEqual( _get_online_dataset_format(upload_did), @@ -497,7 +497,7 @@ def test_create_dataset_sparse(self): self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, - "Uploaded arff does not match original one" + "Uploaded ARFF does not match original one" ) self.assertEqual( _get_online_dataset_format(upload_did), @@ -534,7 +534,7 @@ def test_create_dataset_sparse(self): self.assertEqual( _get_online_dataset_arff(upload_did), xor_dataset._dataset, - "Uploaded arff does not match original one" + "Uploaded ARFF does not match original one" ) self.assertEqual( _get_online_dataset_format(upload_did), From e85868911cb9001f612b1e01344483bd1737b222 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 8 Oct 2018 19:07:33 +0100 Subject: [PATCH 33/43] Adding unit tests, small refactoring --- openml/datasets/dataset.py | 16 ++--- openml/datasets/functions.py | 42 +++++++------ tests/test_datasets/test_dataset.py | 17 +++++- tests/test_datasets/test_dataset_functions.py | 60 ++++++++++++------- 4 files changed, 85 insertions(+), 50 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 4f8c7d73c..03036191d 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -31,7 +31,7 @@ class OpenMLDataset(object): description : str Description of the dataset. format : str - Format of the dataset. Only 'arff' for now. + Format of the dataset which can be either 'arff' or 'sparse-arff'. dataset_id : int, optional Id autogenerated by the server. version : int, optional @@ -86,13 +86,15 @@ class OpenMLDataset(object): dataset: string, optional Serialized arff dataset string. """ - def __init__(self, name, description, format=None, data_format='arff', dataset_id=None, - version=None, creator=None, contributor=None, - collection_date=None, upload_date=None, language=None, - licence=None, url=None, default_target_attribute=None, + def __init__(self, name, description, format=None, + data_format='arff', dataset_id=None, version=None, + creator=None, contributor=None, collection_date=None, + upload_date=None, language=None, licence=None, + url=None, default_target_attribute=None, row_id_attribute=None, ignore_attribute=None, - version_label=None, citation=None, tag=None, visibility=None, - original_data_url=None, paper_url=None, update_comment=None, + version_label=None, citation=None, tag=None, + visibility=None, original_data_url=None, + paper_url=None, update_comment=None, md5_checksum=None, data_file=None, features=None, qualities=None, dataset=None): diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 49d867965..e8b399672 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -376,7 +376,10 @@ def create_dataset(name, description, creator, contributor, collection_date, description : str Description of the dataset. format : str, optional - Format of the dataset. Only 'arff' for now. + Format of the dataset which can be either 'arff' or 'sparse-arff'. + By default, the format is automatically inferred. + .. deprecated: 0.8 + ``format`` is deprecated in 0.8 and will be removed in 0.10. creator : str The person who created the dataset. contributor : str @@ -421,30 +424,33 @@ def create_dataset(name, description, creator, contributor, collection_date, warn("The format parameter will be deprecated in the future," " the method will determine the format of the ARFF " "based on the given data.", DeprecationWarning) + d_format = format # Determine ARFF format from the dataset - if isinstance(data, list): - if isinstance(data[0], list): + else: + if isinstance(data, list): + if isinstance(data[0], list): + d_format = 'arff' + elif isinstance(data[0], dict): + d_format = 'sparse_arff' + else: + raise ValueError( + 'When giving a list, the list should contain a list for dense ' + 'data or a dictionary for sparse data. Got {!r} instead.' + .format(data[0]) + ) + elif isinstance(data, np.ndarray): d_format = 'arff' - elif isinstance(data[0], dict): + elif isinstance(data, coo_matrix): d_format = 'sparse_arff' else: raise ValueError( - 'When giving a list, the list should contain a list for dense ' - 'data or a dictionary for sparse data. Got {!r} instead.' - .format(data[0]) + 'Invalid data type. The data type can be a list of ' + 'lists or a numpy ndarray for dense data. Otherwise, ' + 'it can be a list of dicts or scipy.sparse.coo_matrix' + 'for sparse data.' ) - elif isinstance(data, np.ndarray): - d_format = 'arff' - elif isinstance(data, coo_matrix): - d_format = 'sparse_arff' - else: - raise ValueError( - 'Invalid data type. The data type can be a list of ' - 'lists or a numpy ndarray for dense data. Otherwise, ' - 'it can be a list of dicts or scipy.sparse.coo_matrix' - 'for sparse data.' - ) + arff_object = { 'relation': name, 'description': description, diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 5ec6c816b..580d16069 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,10 +1,11 @@ import numpy as np -from scipy import sparse import six +import openml from time import time +from scipy import sparse +from warnings import filterwarnings, catch_warnings from openml.testing import TestBase -import openml class OpenMLDatasetTest(TestBase): @@ -97,6 +98,18 @@ def test_get_data_with_ignore_attributes(self): self.assertEqual(len(categorical), 38) # TODO test multiple ignore attributes! + def test_dataset_format_constructor(self): + + with catch_warnings(): + filterwarnings('error') + self.assertRaises( + DeprecationWarning, + openml.OpenMLDataset, + 'Test', + 'Test', + format='arff' + ) + class OpenMLDatasetTestOnTestServer(TestBase): def setUp(self): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 5ed34a15d..50f1d0533 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,19 +1,16 @@ import unittest import os import sys - +import numpy as np +import scipy.sparse +import random +import six if sys.version_info[0] >= 3: from unittest import mock else: import mock - -import random -import six - from oslo_concurrency import lockutils - -import numpy as np -import scipy.sparse +from warnings import filterwarnings, catch_warnings import openml from openml import OpenMLDataset @@ -60,6 +57,24 @@ def _remove_pickle_files(self): except: pass + def _get_empty_param_for_dataset(self): + + return { + 'name': None, + 'description': None, + 'creator': None, + 'contributor': None, + 'collection_date': None, + 'language': None, + 'licence': None, + 'default_target_attribute': None, + 'row_id_attribute': None, + 'ignore_attribute': None, + 'citation': None, + 'attributes': None, + 'data': None + } + def test__list_cached_datasets(self): openml.config.cache_directory = self.static_cache_dir cached_datasets = openml.datasets.functions._list_cached_datasets() @@ -561,21 +576,8 @@ def test_create_invalid_dataset(self): 'rainy', ] - param = { - 'name': None, - 'description': None, - 'creator': None, - 'contributor': None, - 'collection_date': None, - 'language': None, - 'licence': None, - 'default_target_attribute': None, - 'row_id_attribute': None, - 'ignore_attribute': None, - 'citation': None, - 'attributes': None, - 'data': data - } + param = self._get_empty_param_for_dataset() + param['data'] = data self.assertRaises( ValueError, @@ -589,3 +591,15 @@ def test_create_invalid_dataset(self): create_dataset, **param ) + + def test_create_dataset_warning(self): + + parameters = self._get_empty_param_for_dataset() + parameters['format'] = 'arff' + with catch_warnings(): + filterwarnings('error') + self.assertRaises( + DeprecationWarning, + create_dataset, + **parameters + ) From e711267cc4acdfa82610e880fb7307ec9959a2bd Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 8 Oct 2018 20:01:16 +0100 Subject: [PATCH 34/43] Enforcing pep8 style --- openml/datasets/functions.py | 5 ++-- tests/test_datasets/test_dataset_functions.py | 27 +++++++++---------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index e8b399672..7898a2dfc 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -435,8 +435,9 @@ def create_dataset(name, description, creator, contributor, collection_date, d_format = 'sparse_arff' else: raise ValueError( - 'When giving a list, the list should contain a list for dense ' - 'data or a dictionary for sparse data. Got {!r} instead.' + 'When giving a list, the list should contain a ' + 'list for dense data or a dictionary for sparse ' + 'data. Got {!r} instead.' .format(data[0]) ) elif isinstance(data, np.ndarray): diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 50f1d0533..39e2d069f 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,24 +1,11 @@ -import unittest -import os -import sys -import numpy as np -import scipy.sparse -import random -import six -if sys.version_info[0] >= 3: - from unittest import mock -else: - import mock +import openml from oslo_concurrency import lockutils from warnings import filterwarnings, catch_warnings - -import openml from openml import OpenMLDataset from openml.exceptions import OpenMLCacheException, PyOpenMLError, \ OpenMLHashException, PrivateDatasetError from openml.testing import TestBase from openml.utils import _tag_entity, _create_cache_directory_for_id - from openml.datasets.functions import (create_dataset, _get_cached_dataset, _get_cached_dataset_features, @@ -32,6 +19,18 @@ _get_online_dataset_format, DATASETS_CACHE_DIR_NAME) +import unittest +import os +import sys +import numpy as np +import scipy.sparse +import random +import six +if sys.version_info[0] >= 3: + from unittest import mock +else: + import mock + class TestOpenMLDataset(TestBase): _multiprocess_can_split_ = True From a3dbb9a2efe5298824380f7b1f39fd8257e561cc Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 15 Oct 2018 13:43:56 +0200 Subject: [PATCH 35/43] Following Matthias's suggestions --- .travis.yml | 3 +++ examples/create_upload_tutorial.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4bde22b5e..f9f8415d3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -23,8 +23,11 @@ env: - DISTRIB="conda" PYTHON_VERSION="3.6" SKLEARN_VERSION="0.18.2" RUN_FLAKE8="true" SKIP_TESTS="true" - DISTRIB="conda" PYTHON_VERSION="3.7" SKLEARN_VERSION="0.19.2" +# Travis issue +# https://github.com/travis-ci/travis-ci/issues/8920 before_install: - python -c "import fcntl; fcntl.fcntl(1, fcntl.F_SETFL, 0)" + install: source ci_scripts/install.sh script: bash ci_scripts/test.sh after_success: source ci_scripts/success.sh && source ci_scripts/create_doc.sh $TRAVIS_BRANCH "doc_result" diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 202f48828..24b04d624 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -32,7 +32,7 @@ breast_cancer = sklearn.datasets.load_breast_cancer() name = 'BreastCancer(scikit-learn)' -x = breast_cancer.data +X = breast_cancer.data y = breast_cancer.target target_names = breast_cancer.target_names y = np.array([target_names[i] for i in y]) From 4ae71bec82a57a792cbd08db0e171767239fa913 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 15 Oct 2018 13:46:46 +0200 Subject: [PATCH 36/43] Fixing bug introduced by variable name change --- examples/create_upload_tutorial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 24b04d624..309474faa 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -45,7 +45,7 @@ # The target feature is indicated as meta-data of the # dataset (and tasks on that data). -data = np.concatenate((x, y.reshape((-1, 1))), axis=1) +data = np.concatenate((X, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names From f922654326922eedf72e89c1aad92c887fed1e0f Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 15 Oct 2018 15:34:04 +0200 Subject: [PATCH 37/43] Changing the breast_cancer dataset to diabetes, fixing typo with weather dataset, adding creator of weather dataset --- examples/create_upload_tutorial.py | 51 ++++++++++++++---------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 309474faa..daa31e1dd 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -5,9 +5,10 @@ A tutorial on how to create and upload a dataset to OpenML. """ import numpy as np -import openml import sklearn.datasets from scipy.sparse import coo_matrix + +import openml from openml.datasets.functions import create_dataset ############################################################################ @@ -30,14 +31,13 @@ # ^^^^^^^^^^^^^^^ # Load an example dataset from scikit-learn which we will upload to OpenML.org via the API. -breast_cancer = sklearn.datasets.load_breast_cancer() -name = 'BreastCancer(scikit-learn)' -X = breast_cancer.data -y = breast_cancer.target -target_names = breast_cancer.target_names -y = np.array([target_names[i] for i in y]) -attribute_names = breast_cancer.feature_names -description = breast_cancer.DESCR +diabetes = sklearn.datasets.load_diabetes() +name = 'Diabetes(scikit-learn)' +X = diabetes.data +y = diabetes.target +attribute_names = diabetes.feature_names +description = diabetes.DESCR + ############################################################################ # OpenML does not distinguish between the attributes and # targets on the data level and stores all data in a single matrix. @@ -49,18 +49,15 @@ attribute_names = list(attribute_names) attributes = [ (attribute_name, 'REAL') for attribute_name in attribute_names - ] + [('class', list(breast_cancer.target_names))] + ] + [('class', 'INTEGER')] citation = ( - "W.N. Street, W.H. Wolberg and O.L. Mangasarian. " - "Nuclear feature extraction for breast tumor diagnosis. " - "IS&T/SPIE 1993 International Symposium on Electronic " - "Imaging: Science and Technology, " - "volume 1905, pages 861-870, San Jose, CA, 1993." + "Bradley Efron, Trevor Hastie, Iain Johnstone and " + "Robert Tibshirani (2004) “Least Angle Regression,” " + "Annals of Statistics (with discussion), 407-499" ) paper_url = ( - 'https://www.spiedigitallibrary.org/conference-proceedings-of-spie/' - '1905/0000/Nuclear-feature-extraction-for-breast-tumor-diagnosis/' - '10.1117/12.148698.short?SSO=1' + 'http://web.stanford.edu/~hastie/Papers/' + 'LARS/LeastAngle_2002.pdf' ) ############################################################################ @@ -70,7 +67,7 @@ # # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd -bc_dataset = create_dataset( +diabetes_dataset = create_dataset( # The name of the dataset (needs to be unique). # Must not be longer than 128 characters and only contain # a-z, A-Z, 0-9 and the following special characters: _\-\.(), @@ -78,11 +75,12 @@ # Textual description of the dataset. description=description, # The person who created the dataset. - creator='Dr. William H. Wolberg, W. Nick Street, Olvi L. Mangasarian', + creator="Bradley Efron, Trevor Hastie, " + "Iain Johnstone and Robert Tibshirani", # People who contributed to the current version of the dataset. contributor=None, # The date the data was originally collected, given by the uploader. - collection_date='01-11-1995', + collection_date='09-01-2012', # Language in which the data is represented. # Starts with 1 upper case letter, rest lower case, e.g. 'English'. language='English', @@ -102,15 +100,14 @@ # A version label which is provided by the user. version_label='test', original_data_url=( - 'https://archive.ics.uci.edu/ml/datasets/' - 'Breast+Cancer+Wisconsin+(Diagnostic)' + 'http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html' ), paper_url=paper_url, ) ############################################################################ -upload_did = bc_dataset.publish() +upload_did = diabetes_dataset.publish() print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) ############################################################################ @@ -162,10 +159,10 @@ 'third edition. Burlington, Mass.: Morgan Kaufmann Publishers, 2011' ) -wind_dataset = create_dataset( +weather_dataset = create_dataset( name="Wind", description=description, - creator=None, + creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro', contributor=None, collection_date='01-01-2011', language='English', @@ -181,7 +178,7 @@ ############################################################################ -upload_did = wind_dataset.publish() +upload_did = weather_dataset.publish() print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) ############################################################################ From e84c42debd2b9b7392c4ae4b710ff2cdef2b9467 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Mon, 15 Oct 2018 16:57:30 +0200 Subject: [PATCH 38/43] Further changes --- examples/create_upload_tutorial.py | 14 +++++++--- openml/datasets/__init__.py | 9 ++++-- openml/datasets/dataset.py | 44 ++++++++++++++++-------------- openml/datasets/functions.py | 15 ++++++---- 4 files changed, 49 insertions(+), 33 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index daa31e1dd..b209b1215 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -17,7 +17,11 @@ openml.config.server = 'https://test.openml.org/api/v1/xml' ############################################################################ -# The dataset that you upload to OpenML can be: +# The dataset that you upload to OpenML can be an +# iterable object that returns iterables: +# +# Below we will cover the following cases of the +# dataset object: # # * A numpy array. # * A list of lists. @@ -29,7 +33,8 @@ # # Prepare dataset # ^^^^^^^^^^^^^^^ -# Load an example dataset from scikit-learn which we will upload to OpenML.org via the API. +# Load an example dataset from scikit-learn which we +# will upload to OpenML.org via the API. diabetes = sklearn.datasets.load_diabetes() name = 'Diabetes(scikit-learn)' @@ -63,7 +68,8 @@ ############################################################################ # Create the dataset object # ^^^^^^^^^^^^^^^^^^^^^^^^^ -# The definition of all fields can be found in the XSD files describing the expected format: +# The definition of all fields can be found in the +# XSD files describing the expected format: # # https://github.com/openml/OpenML/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.data.upload.xsd @@ -185,7 +191,7 @@ # Dataset is a sparse matrix # ========================== # -# Sparse data can be represented in the arff object as a +# Sparse data can be passed as a # `scipy.sparse.coo `_, # or a list of dictionaries. diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index 5baa9245a..cbf010a85 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -1,5 +1,10 @@ -from .functions import (check_datasets_active, create_dataset, - get_datasets, get_dataset, list_datasets) +from .functions import ( + check_datasets_active, + create_dataset, + get_dataset, + get_datasets, + list_datasets, +) from .dataset import OpenMLDataset from .data_feature import OpenMLDataFeature diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 03036191d..45059c417 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -1,17 +1,18 @@ -import arff import gzip import io import logging import os -import six +from collections import OrderedDict + +import arff import numpy as np import scipy.sparse import xmltodict -from collections import OrderedDict +import six from six.moves import cPickle as pickle +from warnings import warn import openml._api_calls -from warnings import warn from .data_feature import OpenMLDataFeature from ..exceptions import PyOpenMLError @@ -31,7 +32,7 @@ class OpenMLDataset(object): description : str Description of the dataset. format : str - Format of the dataset which can be either 'arff' or 'sparse-arff'. + Format of the dataset which can be either 'arff' or 'sparse_arff'. dataset_id : int, optional Id autogenerated by the server. version : int, optional @@ -217,26 +218,27 @@ def remove_tag(self, tag): def __eq__(self, other): + if type(other) != OpenMLDataset: + return False + server_fields = { - 'dataset_id': True, - 'version': True, - 'upload_date': True, - 'url': True, - 'dataset': True, - 'data_file': True, + 'dataset_id', + 'version', + 'upload_date', + 'url', + 'dataset', + 'data_file', } - if type(other) != OpenMLDataset: + # check that the keys are identical + self_keys = set(self.__dict__.keys()) - server_fields + other_keys = set(other.__dict__.keys()) - server_fields + if self_keys != other_keys: return False - else: - for field in self.__dict__: - if field not in server_fields: - if field in other.__dict__: - if self.__dict__[field] != other.__dict__[field]: - return False - else: - return False - return True + + # check that values of the common keys are identical + return all(self.__dict__[key] == other.__dict__[key] + for key in self_keys) def __ne__(self, other): """Only needed for python 2, unnecessary in Python 3""" diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 7898a2dfc..744661599 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -2,6 +2,7 @@ import io import os import re + import numpy as np import six import arff @@ -358,11 +359,13 @@ def get_dataset(dataset_id): return dataset -def create_dataset(name, description, creator, contributor, collection_date, - language, licence, attributes, data, default_target_attribute, - row_id_attribute, ignore_attribute, citation, format=None, - original_data_url=None, paper_url=None, update_comment=None, - version_label=None): +def create_dataset(name, description, creator, contributor, + collection_date, language, + licence, attributes, data, + default_target_attribute, row_id_attribute, + ignore_attribute, citation, format=None, + original_data_url=None, paper_url=None, + update_comment=None, version_label=None): """Create a dataset. This function creates an OpenMLDataset object. @@ -376,7 +379,7 @@ def create_dataset(name, description, creator, contributor, collection_date, description : str Description of the dataset. format : str, optional - Format of the dataset which can be either 'arff' or 'sparse-arff'. + Format of the dataset which can be either 'arff' or 'sparse_arff'. By default, the format is automatically inferred. .. deprecated: 0.8 ``format`` is deprecated in 0.8 and will be removed in 0.10. From 1d7f8eb300dbb30f75319ca9d7155ec489d372eb Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Tue, 16 Oct 2018 01:39:58 +0100 Subject: [PATCH 39/43] Adding more changes --- examples/create_upload_tutorial.py | 21 +++---- openml/datasets/dataset.py | 4 +- openml/datasets/functions.py | 19 +++--- tests/test_datasets/test_dataset.py | 5 +- tests/test_datasets/test_dataset_functions.py | 61 +++++++++++++++---- 5 files changed, 70 insertions(+), 40 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index b209b1215..4fac26b6b 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -17,19 +17,18 @@ openml.config.server = 'https://test.openml.org/api/v1/xml' ############################################################################ -# The dataset that you upload to OpenML can be an -# iterable object that returns iterables: -# # Below we will cover the following cases of the # dataset object: # -# * A numpy array. -# * A list of lists. +# * A numpy array +# * A list # * A sparse matrix ############################################################################ # Dataset is a numpy array # ======================== +# A numpy array can contain lists in the case of dense data +# or it can contain OrderedDicts in the case of sparse data. # # Prepare dataset # ^^^^^^^^^^^^^^^ @@ -57,7 +56,7 @@ ] + [('class', 'INTEGER')] citation = ( "Bradley Efron, Trevor Hastie, Iain Johnstone and " - "Robert Tibshirani (2004) “Least Angle Regression,” " + "Robert Tibshirani (2004) (Least Angle Regression) " "Annals of Statistics (with discussion), 407-499" ) paper_url = ( @@ -117,8 +116,10 @@ print('URL for dataset: %s/data/%d' % (openml.config.server, upload_did)) ############################################################################ -# Dataset is a list of lists -# ========================== +# Dataset is a list +# ================= +# A list can contain lists in the case of dense data +# or it can contain OrderedDicts in the case of sparse data. # # Weather dataset: # http://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html @@ -190,10 +191,6 @@ ############################################################################ # Dataset is a sparse matrix # ========================== -# -# Sparse data can be passed as a -# `scipy.sparse.coo `_, -# or a list of dictionaries. sparse_data = coo_matrix(( [0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 45059c417..b4213e91a 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -271,9 +271,9 @@ def _get_arff(self, format): if bits != 64 and os.path.getsize(filename) > 120000000: return NotImplementedError("File too big") - if self.format.lower() == 'arff': + if format.lower() == 'arff': return_type = arff.DENSE - elif self.format.lower() == 'sparse_arff': + elif format.lower() == 'sparse_arff': return_type = arff.COO else: raise ValueError('Unknown data format %s' % format) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 744661599..82b56548c 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -431,28 +431,25 @@ def create_dataset(name, description, creator, contributor, # Determine ARFF format from the dataset else: - if isinstance(data, list): - if isinstance(data[0], list): + if isinstance(data, list) or isinstance(data, np.ndarray): + if isinstance(data[0], list) or isinstance(data[0], np.ndarray): d_format = 'arff' - elif isinstance(data[0], dict): + elif isinstance(data[0], OrderedDict): d_format = 'sparse_arff' else: raise ValueError( - 'When giving a list, the list should contain a ' - 'list for dense data or a dictionary for sparse ' + 'When giving a list or a numpy.ndarray, ' + 'they should contain a list/ numpy.ndarray ' + 'for dense data or a dictionary for sparse ' 'data. Got {!r} instead.' .format(data[0]) ) - elif isinstance(data, np.ndarray): - d_format = 'arff' elif isinstance(data, coo_matrix): d_format = 'sparse_arff' else: raise ValueError( - 'Invalid data type. The data type can be a list of ' - 'lists or a numpy ndarray for dense data. Otherwise, ' - 'it can be a list of dicts or scipy.sparse.coo_matrix' - 'for sparse data.' + 'Invalid data type. The data type can be a list, ' + 'a numpy ndarray or a scipy.sparse.coo_matrix' ) arff_object = { diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 580d16069..c2e507350 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1,10 +1,11 @@ +from time import time + import numpy as np import six -import openml -from time import time from scipy import sparse from warnings import filterwarnings, catch_warnings +import openml from openml.testing import TestBase diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 39e2d069f..95253066b 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -1,6 +1,20 @@ -import openml +import unittest +import os +import sys +import random +if sys.version_info[0] >= 3: + from unittest import mock +else: + import mock + +import arff +import six +import numpy as np +import scipy.sparse from oslo_concurrency import lockutils from warnings import filterwarnings, catch_warnings + +import openml from openml import OpenMLDataset from openml.exceptions import OpenMLCacheException, PyOpenMLError, \ OpenMLHashException, PrivateDatasetError @@ -19,18 +33,6 @@ _get_online_dataset_format, DATASETS_CACHE_DIR_NAME) -import unittest -import os -import sys -import numpy as np -import scipy.sparse -import random -import six -if sys.version_info[0] >= 3: - from unittest import mock -else: - import mock - class TestOpenMLDataset(TestBase): _multiprocess_can_split_ = True @@ -602,3 +604,36 @@ def test_create_dataset_warning(self): create_dataset, **parameters ) + + def test_get_online_dataset_arff(self): + + # Australian dataset + dataset_id = 100 + dataset = openml.datasets.get_dataset(dataset_id) + decoder = arff.ArffDecoder() + # check if the arff from the dataset is + # the same as the arff from _get_arff function + d_format = (dataset.format).lower() + + self.assertEqual( + dataset._get_arff(d_format), + decoder.decode( + _get_online_dataset_arff(dataset_id), + encode_nominal=True, + return_type=arff.DENSE + if d_format == 'arff' else arff.COO + ), + "ARFF files are not equal" + ) + + def test_get_online_dataset_format(self): + + # Phoneme dataset + dataset_id = 77 + dataset = openml.datasets.get_dataset(dataset_id) + + self.assertEqual( + (dataset.format).lower(), + _get_online_dataset_format(dataset_id), + "The format of the ARFF files is different" + ) From 82b47589ed7ba864e89d3a6dd9c573f8f403b8bb Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Tue, 16 Oct 2018 02:10:29 +0100 Subject: [PATCH 40/43] Fixing bug --- ci_scripts/flake8_diff.sh | 2 +- openml/datasets/functions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci_scripts/flake8_diff.sh b/ci_scripts/flake8_diff.sh index 90d7923ad..9207163bb 100644 --- a/ci_scripts/flake8_diff.sh +++ b/ci_scripts/flake8_diff.sh @@ -125,7 +125,7 @@ check_files() { if [ -n "$files" ]; then # Conservative approach: diff without context (--unified=0) so that code # that was not changed does not create failures - git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --diff --show-source $options + git diff --unified=0 $COMMIT_RANGE -- $files | flake8 --ignore E402 --diff --show-source $options fi } diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 82b56548c..dc3159b3f 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -434,7 +434,7 @@ def create_dataset(name, description, creator, contributor, if isinstance(data, list) or isinstance(data, np.ndarray): if isinstance(data[0], list) or isinstance(data[0], np.ndarray): d_format = 'arff' - elif isinstance(data[0], OrderedDict): + elif isinstance(data[0], dict): d_format = 'sparse_arff' else: raise ValueError( From 0edea314349d8d801af79bf1c31ee20804957491 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Tue, 16 Oct 2018 02:44:02 +0100 Subject: [PATCH 41/43] Pep8 enforce --- openml/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openml/datasets/__init__.py b/openml/datasets/__init__.py index ff57b6417..c0ce3676e 100644 --- a/openml/datasets/__init__.py +++ b/openml/datasets/__init__.py @@ -18,4 +18,4 @@ 'OpenMLDataset', 'OpenMLDataFeature', 'status_update', -] \ No newline at end of file +] From 751f8c9e17b0813eb03653402d1bb91dfdc99ee3 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Tue, 16 Oct 2018 10:15:21 +0100 Subject: [PATCH 42/43] few changes --- examples/create_upload_tutorial.py | 4 ++-- examples/tasks_tutorial.py | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index 4fac26b6b..e3e8bae9f 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -52,8 +52,8 @@ data = np.concatenate((X, y.reshape((-1, 1))), axis=1) attribute_names = list(attribute_names) attributes = [ - (attribute_name, 'REAL') for attribute_name in attribute_names - ] + [('class', 'INTEGER')] + (attribute_name, 'REAL') for attribute_name in attribute_names +] + [('class', 'INTEGER')] citation = ( "Bradley Efron, Trevor Hastie, Iain Johnstone and " "Robert Tibshirani (2004) (Least Angle Regression) " diff --git a/examples/tasks_tutorial.py b/examples/tasks_tutorial.py index b9c63ef66..ee4b17d69 100644 --- a/examples/tasks_tutorial.py +++ b/examples/tasks_tutorial.py @@ -58,9 +58,7 @@ print(len(filtered_tasks)) ############################################################################ -# Resampling strategies can be found on -# the `OpenML Website `_. +# Resampling strategies can be found on the `OpenML Website `_. # # Similar to listing tasks by task type, we can list tasks by tags: From 0c66cfc5843368c673022722ba369d4e96bcefd8 Mon Sep 17 00:00:00 2001 From: ArlindKadra Date: Wed, 17 Oct 2018 13:39:36 +0200 Subject: [PATCH 43/43] Fixing typo in dataset name attributes --- examples/create_upload_tutorial.py | 2 +- tests/test_datasets/test_dataset_functions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/create_upload_tutorial.py b/examples/create_upload_tutorial.py index e3e8bae9f..d68100648 100644 --- a/examples/create_upload_tutorial.py +++ b/examples/create_upload_tutorial.py @@ -167,7 +167,7 @@ ) weather_dataset = create_dataset( - name="Wind", + name="Weather", description=description, creator='I. H. Witten, E. Frank, M. A. Hall, and ITPro', contributor=None, diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 5a2cad4ea..bea0b8317 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -480,7 +480,7 @@ def test_create_dataset_list(self): ] dataset = create_dataset( - name="Wind_dataset", + name="ModifiedWeather", description=( 'Testing dataset upload when the data is a list of lists' ),