diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 92cf63f0a..aa87fd3fc 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -245,6 +245,7 @@ def _get_arff(self, format: str) -> Dict: when converted to lower case. + Returns ------- dict @@ -319,13 +320,15 @@ def _parse_data_from_arff( attribute_names = [] categories_names = {} categorical = [] - for name, type_ in data['attributes']: + for i, (name, type_) in enumerate(data['attributes']): # if the feature is nominal and the a sparse matrix is # requested, the categories need to be numeric if (isinstance(type_, list) and self.format.lower() == 'sparse_arff'): try: - np.array(type_, dtype=np.float32) + # checks if the strings which should be the class labels + # can be encoded into integers + pd.factorize(type_)[0] except ValueError: raise ValueError( "Categorical data needs to be numeric when " diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 132cf4584..5f8e9b328 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -9,6 +9,7 @@ import openml from openml.testing import TestBase from openml.exceptions import PyOpenMLError +from openml.datasets import OpenMLDataset, OpenMLDataFeature class OpenMLDatasetTest(TestBase): @@ -320,6 +321,16 @@ def test_get_sparse_dataset_rowid_and_ignore_and_target(self): self.assertListEqual(categorical, [False] * 19998) self.assertEqual(y.shape, (600, )) + def test_get_sparse_categorical_data_id_395(self): + dataset = openml.datasets.get_dataset(395, download_data=True) + feature = dataset.features[3758] + self.assertTrue(isinstance(dataset, OpenMLDataset)) + self.assertTrue(isinstance(feature, OpenMLDataFeature)) + self.assertEqual(dataset.name, 're1.wc') + self.assertEqual(feature.name, 'CLASS_LABEL') + self.assertEqual(feature.data_type, 'nominal') + self.assertEqual(len(feature.nominal_values), 25) + class OpenMLDatasetQualityTest(TestBase): def test__check_qualities(self):