From d4d764e32eaf0bcfb9f22bbc16b3f471f6a80803 Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Mon, 14 Oct 2019 18:02:03 +0200 Subject: [PATCH 1/3] Replacing numpy conversion with pandas categorical encoding --- openml/datasets/dataset.py | 7 +++++-- tests/test_datasets/test_dataset.py | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 92cf63f0a..aa87fd3fc 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -245,6 +245,7 @@ def _get_arff(self, format: str) -> Dict: when converted to lower case. + Returns ------- dict @@ -319,13 +320,15 @@ def _parse_data_from_arff( attribute_names = [] categories_names = {} categorical = [] - for name, type_ in data['attributes']: + for i, (name, type_) in enumerate(data['attributes']): # if the feature is nominal and the a sparse matrix is # requested, the categories need to be numeric if (isinstance(type_, list) and self.format.lower() == 'sparse_arff'): try: - np.array(type_, dtype=np.float32) + # checks if the strings which should be the class labels + # can be encoded into integers + pd.factorize(type_)[0] except ValueError: raise ValueError( "Categorical data needs to be numeric when " diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 132cf4584..a3c0acb42 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -320,6 +320,14 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.assertListEqual(categorical, [False] * 19998) self.assertEqual(y.shape, (600, )) + def test_get_sparse_categorical_data_id_395(self): + dataset = openml.datasets.get_dataset(395, download_data=False) + feature = dataset.features[3758] + self.assertEqual(dataset.name, 're1.wc') + self.assertEqual(feature.name, 'CLASS_LABEL') + self.assertEqual(feature.data_type, 'nominal') + self.assertEqual(len(feature.nominal_values), 25) + class 
OpenMLDatasetQualityTest(TestBase): def test__check_qualities(self): From b06d348509b58b86b595f42de0a11ee3a1f4f3eb Mon Sep 17 00:00:00 2001 From: neeratyoy Date: Tue, 15 Oct 2019 00:22:37 +0200 Subject: [PATCH 2/3] Adding more unit test checks --- tests/test_datasets/test_dataset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index a3c0acb42..6fa0a04e5 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -9,6 +9,7 @@ import openml from openml.testing import TestBase from openml.exceptions import PyOpenMLError +from openml.datasets import OpenMLDataset, OpenMLDataFeature class OpenMLDatasetTest(TestBase): @@ -323,6 +324,8 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): def test_get_sparse_categorical_data_id_395(self): dataset = openml.datasets.get_dataset(395, download_data=False) feature = dataset.features[3758] + self.assertTrue(isinstance(dataset, OpenMLDataset)) + self.assertTrue(isinstance(feature, OpenMLDataFeature)) self.assertEqual(dataset.name, 're1.wc') self.assertEqual(feature.name, 'CLASS_LABEL') self.assertEqual(feature.data_type, 'nominal') From 6b0b036a679d39a8f451b399fa8b057eceafb8bd Mon Sep 17 00:00:00 2001 From: Neeratyoy Mallik Date: Tue, 15 Oct 2019 09:36:16 +0200 Subject: [PATCH 3/3] Changing unit test data fetch parameter --- tests/test_datasets/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 6fa0a04e5..5f8e9b328 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -322,7 +322,7 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.assertEqual(y.shape, (600, )) def test_get_sparse_categorical_data_id_395(self): - dataset = openml.datasets.get_dataset(395, download_data=False) + dataset = openml.datasets.get_dataset(395, 
download_data=True) feature = dataset.features[3758] self.assertTrue(isinstance(dataset, OpenMLDataset)) self.assertTrue(isinstance(feature, OpenMLDataFeature))