From 4c21ad88a8e48d120dd41569d0b4eb144c4e9284 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 17 Mar 2019 15:49:48 +0200
Subject: [PATCH 01/19] First iteration of lazy loading. Does not yet take
 into account all places that might use the arff file internally.

---
 openml/datasets/functions.py                  | 56 +++++++++++++------
 tests/test_datasets/test_dataset_functions.py | 43 ++++++++++++++
 2 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 8b43625c6..5c9bf4333 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -3,6 +3,7 @@
 import os
 import re
 import warnings
+from typing import List, Dict, Union
 
 import numpy as np
 import arff
@@ -298,7 +299,10 @@ def check_datasets_active(dataset_ids):
     return active
 
 
-def get_datasets(dataset_ids):
+def get_datasets(
+    dataset_ids: List[Union[str, int]],
+    download_data: bool=True,
+) -> List[OpenMLDataset]:
     """Download datasets.
 
     This function iterates :meth:`openml.datasets.get_dataset`.
@@ -307,6 +311,11 @@ def get_datasets(dataset_ids):
     ----------
     dataset_ids : iterable
         Integers representing dataset ids.
+    download_data : bool, optional
+        If True, also download the data file. Beware that some datasets are large and it might
+        make the operation noticeably slower. Metadata is also still retrieved.
+        If False, create the OpenMLDataset and only populate it with the metadata.
+        The data may later be retrieved through the `OpenMLDataset.get_data` method.
 
     Returns
     -------
@@ -315,21 +324,26 @@ def get_datasets(dataset_ids):
     """
     datasets = []
    for dataset_id in dataset_ids:
-        datasets.append(get_dataset(dataset_id))
+        datasets.append(get_dataset(dataset_id, download_data))
     return datasets
 
 
-def get_dataset(dataset_id):
-    """Download a dataset.
-
-    TODO: explain caching!
+def get_dataset(dataset_id: int, download_data: bool = True) -> OpenMLDataset:
+    """ Download the OpenML dataset representation, optionally also download the actual data file.
 
     This function is thread/multiprocessing safe.
+    This function uses caching. A check will be performed to determine if the information has
+    previously been downloaded, and if so, it is loaded from disk instead of retrieved from the server.
 
     Parameters
     ----------
     dataset_id : int
         Dataset ID of the dataset to download
+    download_data : bool, optional (default=True)
+        If True, also download the data file. Beware that some datasets are large and it might
+        make the operation noticeably slower. Metadata is also still retrieved.
+        If False, create the OpenMLDataset and only populate it with the metadata.
+        The data may later be retrieved through the `OpenMLDataset.get_data` method.
     Returns
     -------
@@ -352,9 +366,14 @@ def get_dataset(dataset_id):
     try:
         remove_dataset_cache = True
         description = _get_dataset_description(did_cache_dir, dataset_id)
-        arff_file = _get_dataset_arff(did_cache_dir, description)
         features = _get_dataset_features(did_cache_dir, dataset_id)
         qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
+
+        if download_data:
+            arff_file = _get_dataset_arff(did_cache_dir, description)
+        else:
+            arff_file = None
+
         remove_dataset_cache = False
     except OpenMLServerException as e:
         # if there was an exception,
@@ -814,17 +833,23 @@ def _get_dataset_qualities(did_cache_dir, dataset_id):
     return qualities
 
 
-def _create_dataset_from_description(description,
-                                     features,
-                                     qualities,
-                                     arff_file):
+def _create_dataset_from_description(
+    description: Dict[str, str],
+    features: Dict,
+    qualities: List,
+    arff_file: str=None,
+) -> OpenMLDataset:
     """Create a dataset object from a description dict.
 
     Parameters
     ----------
     description : dict
         Description of a dataset in xml dict.
-    arff_file : string
+    features : dict
+        Description of the dataset features.
+    qualities : list
+        Description of the dataset qualities.
+    arff_file : string, optional
         Path of dataset ARFF file.
 
     Returns
@@ -832,7 +857,7 @@ def _create_dataset_from_description(description,
     dataset : dataset object
         Dataset object from dict and ARFF.
     """
-    dataset = OpenMLDataset(
+    return OpenMLDataset(
         description["oml:name"],
         description.get("oml:description"),
         data_format=description["oml:format"],
@@ -845,9 +870,7 @@ def _create_dataset_from_description(description,
         language=description.get("oml:language"),
         licence=description.get("oml:licence"),
         url=description["oml:url"],
-        default_target_attribute=description.get(
-            "oml:default_target_attribute"
-        ),
+        default_target_attribute=description.get("oml:default_target_attribute"),
         row_id_attribute=description.get("oml:row_id_attribute"),
         ignore_attribute=description.get("oml:ignore_attribute"),
         version_label=description.get("oml:version_label"),
@@ -862,7 +885,6 @@ def _create_dataset_from_description(description,
         features=features,
         qualities=qualities,
     )
-    return dataset
 
 
 def _get_online_dataset_arff(dataset_id):
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 06ebe4f6e..386fbcc82 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -239,6 +239,28 @@ def test_get_datasets(self):
         self.assertTrue(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "2", "qualities.xml")))
 
+    def test_get_datasets_lazy(self):
+        dids = [1, 2]
+        datasets = openml.datasets.get_datasets(dids, download_data=False)
+        self.assertEqual(len(datasets), 2)
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "description.xml")))
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "2", "description.xml")))
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "features.xml")))
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "2", "features.xml")))
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "qualities.xml")))
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "2", "qualities.xml")))
+
+        self.assertFalse(os.path.exists(os.path.join(
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"))) + self.assertFalse(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "2", "dataset.arff"))) + def test_get_dataset(self): dataset = openml.datasets.get_dataset(1) self.assertEqual(type(dataset), OpenMLDataset) @@ -259,6 +281,27 @@ def test_get_dataset(self): openml.config.server = self.production_server self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) + def test_get_dataset_lazy(self): + dataset = openml.datasets.get_dataset(1, download_data=False) + self.assertEqual(type(dataset), OpenMLDataset) + self.assertEqual(dataset.name, 'anneal') + self.assertTrue(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "description.xml"))) + self.assertTrue(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "features.xml"))) + self.assertTrue(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "qualities.xml"))) + + self.assertFalse(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"))) + + self.assertGreater(len(dataset.features), 1) + self.assertGreater(len(dataset.qualities), 4) + + # Issue324 Properly handle private datasets when trying to access them + openml.config.server = self.production_server + self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45) + def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X = dataset.get_data(dataset_format='array') From 312650fe22a3ac2060ae236d2519b0537c85e018 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 17 Mar 2019 16:03:02 +0200 Subject: [PATCH 02/19] Factor functionality of loading ARFF to correct data format and pickling it out of __init__. --- openml/datasets/dataset.py | 206 +++++++++++++++++++------------------ 1 file changed, 106 insertions(+), 100 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 0e7d0b5b7..047d3ab94 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -167,96 +167,103 @@ def __init__(self, name, description, format=None, self.qualities = _check_qualities(qualities) if data_file is not None: - self.data_pickle_file = data_file.replace('.arff', '.pkl.py3') + self.data_pickle_file = self.data_arff_to_pickle(data_file) + else: + self.data_pickle_file = None - if os.path.exists(self.data_pickle_file): - logger.debug("Data pickle file already exists.") - else: - try: - data = self._get_arff(self.format) - except OSError as e: - logger.critical("Please check that the data file %s is " - "there and can be read.", self.data_file) - raise e - - ARFF_DTYPES_TO_PD_DTYPE = { - 'INTEGER': 'integer', - 'REAL': 'floating', - 'NUMERIC': 'floating', - 'STRING': 'string' - } - attribute_dtype = {} - attribute_names = [] - categories_names = {} - categorical = [] - for name, type_ in data['attributes']: - # if the feature is nominal and the a sparse matrix is - # requested, the categories need to be numeric - if (isinstance(type_, list) - and self.format.lower() == 'sparse_arff'): - try: - np.array(type_, dtype=np.float32) - except ValueError: - raise ValueError( - "Categorical data needs to be numeric when " - "using sparse ARFF." 
- ) - # string can only be supported with pandas DataFrame - elif (type_ == 'STRING' - and self.format.lower() == 'sparse_arff'): + def data_arff_to_pickle(self, data_file): + data_pickle_file = data_file.replace('.arff', '.pkl.py3') + if os.path.exists(data_pickle_file): + logger.debug("Data pickle file already exists.") + else: + try: + data = self._get_arff(self.format) + except OSError as e: + logger.critical("Please check that the data file %s is " + "there and can be read.", data_file) + raise e + + ARFF_DTYPES_TO_PD_DTYPE = { + 'INTEGER': 'integer', + 'REAL': 'floating', + 'NUMERIC': 'floating', + 'STRING': 'string' + } + attribute_dtype = {} + attribute_names = [] + categories_names = {} + categorical = [] + for name, type_ in data['attributes']: + # if the feature is nominal and the a sparse matrix is + # requested, the categories need to be numeric + if (isinstance(type_, list) + and self.format.lower() == 'sparse_arff'): + try: + np.array(type_, dtype=np.float32) + except ValueError: raise ValueError( - "Dataset containing strings is not supported " - "with sparse ARFF." + "Categorical data needs to be numeric when " + "using sparse ARFF." ) - - # infer the dtype from the ARFF header - if isinstance(type_, list): - categorical.append(True) - categories_names[name] = type_ - if len(type_) == 2: - type_norm = [cat.lower().capitalize() - for cat in type_] - if set(['True', 'False']) == set(type_norm): - categories_names[name] = [ - True if cat == 'True' else False - for cat in type_norm - ] - attribute_dtype[name] = 'boolean' - else: - attribute_dtype[name] = 'categorical' + # string can only be supported with pandas DataFrame + elif (type_ == 'STRING' + and self.format.lower() == 'sparse_arff'): + raise ValueError( + "Dataset containing strings is not supported " + "with sparse ARFF." + ) + + # infer the dtype from the ARFF header + if isinstance(type_, list): + categorical.append(True) + categories_names[name] = type_ + if len(type_) == 2: + type_norm = [cat.lower().capitalize() + for cat in type_] + if set(['True', 'False']) == set(type_norm): + categories_names[name] = [ + True if cat == 'True' else False + for cat in type_norm + ] + attribute_dtype[name] = 'boolean' else: attribute_dtype[name] = 'categorical' else: - categorical.append(False) - attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_] - attribute_names.append(name) - - if self.format.lower() == 'sparse_arff': - X = data['data'] - X_shape = (max(X[1]) + 1, max(X[2]) + 1) - X = scipy.sparse.coo_matrix( - (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32) - X = X.tocsr() - - elif self.format.lower() == 'arff': - X = pd.DataFrame(data['data'], columns=attribute_names) - - col = [] - for column_name in X.columns: - if attribute_dtype[column_name] in ('categorical', - 'boolean'): - col.append(self._unpack_categories( - X[column_name], categories_names[column_name])) - else: - col.append(X[column_name]) - X = pd.concat(col, axis=1) - - # Pickle the dataframe or the sparse matrix. 
-            with open(self.data_pickle_file, "wb") as fh:
-                pickle.dump((X, categorical, attribute_names), fh, -1)
-            logger.debug("Saved dataset %d: %s to file %s" %
-                         (int(self.dataset_id or -1), self.name,
-                          self.data_pickle_file))
+                    attribute_dtype[name] = 'categorical'
+                else:
+                    categorical.append(False)
+                    attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
+                attribute_names.append(name)
+
+            if self.format.lower() == 'sparse_arff':
+                X = data['data']
+                X_shape = (max(X[1]) + 1, max(X[2]) + 1)
+                X = scipy.sparse.coo_matrix(
+                    (X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
+                X = X.tocsr()
+
+            elif self.format.lower() == 'arff':
+                X = pd.DataFrame(data['data'], columns=attribute_names)
+
+                col = []
+                for column_name in X.columns:
+                    if attribute_dtype[column_name] in ('categorical',
+                                                        'boolean'):
+                        col.append(self._unpack_categories(
+                            X[column_name], categories_names[column_name]))
+                    else:
+                        col.append(X[column_name])
+                X = pd.concat(col, axis=1)
+
+            # Pickle the dataframe or the sparse matrix.
+            with open(data_pickle_file, "wb") as fh:
+                pickle.dump((X, categorical, attribute_names), fh, -1)
+            logger.debug("Saved dataset {did}: {name} to file {path}"
+                         .format(did=int(self.dataset_id or -1),
+                                 name=self.name,
+                                 path=data_pickle_file)
+                         )
+            return data_pickle_file
 
     def push_tag(self, tag):
         """Annotates this data set with a tag on the server.
@@ -394,13 +401,13 @@ def _unpack_categories(series, categories):
         return pd.Series(col, index=series.index, dtype='category',
                          name=series.name)
 
-    def get_data(self, target=None,
-                 include_row_id=False,
-                 include_ignore_attributes=False,
-                 return_categorical_indicator=False,
-                 return_attribute_names=False,
-                 dataset_format=None):
-        """Returns dataset content as dataframes or sparse matrices.
+    def get_data(self, target: str=None,
+                 include_row_id: bool=False,
+                 include_ignore_attributes: bool=False,
+                 return_categorical_indicator: bool=False,
+                 return_attribute_names: bool=False,
+                 dataset_format: str=None):
+        """ Returns dataset content as dataframes or sparse matrices.
 
         Parameters
         ----------
@@ -416,10 +423,10 @@ def get_data(self, target=None,
             categorical.
         return_attribute_names : boolean (default=False)
             Whether to return attribute names.
-        dataset_format : string
-            The format of returned dataset. If ``array``, the returned dataset
-            will be a NumPy array or a SciPy sparse matrix. If ``dataframe``,
-            the returned dataset will be a Pandas DataFrame or SparseDataFrame.
+        dataset_format : string, optional
+            The format of returned dataset.
+            If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
+            If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.
 
         Returns
         -------
@@ -428,12 +435,11 @@ def get_data(self, target=None,
         y : ndarray or series, shape (n_samples,)
             Target column(s). Only returned if target is not None.
         categorical_indicator : boolean ndarray
-            Mask that indicate categorical features. Only returned if
-            return_categorical_indicator is True.
+            Mask that indicates categorical features.
+            Only returned if return_categorical_indicator is True.
         return_attribute_names : list of strings
-            List of attribute names. Returned only if return_attribute_names is
-            True.
-
+            List of attribute names.
+            Only returned if return_attribute_names is True.
""" if dataset_format is None: warn('The default of "dataset_format" will change from "array" to' From a1b8c93059a2f18a7508a07bbe933094a0a01642 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 17 Mar 2019 19:42:42 +0200 Subject: [PATCH 03/19] Extracted a more general 'download_text_file' function that is now used when downloading the arff file. --- openml/datasets/functions.py | 38 +++++--------- openml/utils.py | 52 +++++++++++++++++++ tests/test_datasets/test_dataset_functions.py | 5 +- 3 files changed, 69 insertions(+), 26 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 5c9bf4333..882ce4a8f 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -168,6 +168,11 @@ def _get_cached_dataset_arff(dataset_id): "cached" % dataset_id) +def _get_cache_directory(dataset: OpenMLDataset) -> str: + """ Return the cache directory of the OpenMLDataset """ + return _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, dataset.dataset_id) + + def list_datasets(offset=None, size=None, status=None, tag=None, **kwargs): """ @@ -724,33 +729,18 @@ def _get_dataset_arff(did_cache_dir, description): """ output_file_path = os.path.join(did_cache_dir, "dataset.arff") md5_checksum_fixture = description.get("oml:md5_checksum") - did = description.get("oml:id") + url = description['oml:url'] - # This means the file is still there; whether it is useful is up to - # the user and not checked by the program. try: - with io.open(output_file_path, encoding='utf8'): - pass - return output_file_path - except (OSError, IOError): - pass - - url = description['oml:url'] - arff_string = openml._api_calls._read_url(url, request_method='get') - md5 = hashlib.md5() - md5.update(arff_string.encode('utf-8')) - md5_checksum = md5.hexdigest() - if md5_checksum != md5_checksum_fixture: - raise OpenMLHashException( - 'Checksum %s of downloaded dataset %d is unequal to the checksum ' - '%s sent by the server.' % ( - md5_checksum, int(did), md5_checksum_fixture - ) + openml.utils._download_text_file( + source=url, + output_path=output_file_path, + md5_checksum=md5_checksum_fixture ) - - with io.open(output_file_path, "w", encoding='utf8') as fh: - fh.write(arff_string) - del arff_string + except OpenMLHashException as e: + additional_info = " Raised when downloading dataset {}.".format(description.get('oml:id')) + e.args = (e.args[0] + additional_info,) + raise return output_file_path diff --git a/openml/utils.py b/openml/utils.py index a95e1c96b..53c48fde9 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -1,8 +1,10 @@ import os +import hashlib import xmltodict import shutil import openml._api_calls +import openml.exceptions from . import config @@ -284,3 +286,53 @@ def _create_lockfiles_dir(): except OSError: pass return dir + + +def _download_text_file(source: str, + output_path: str, + md5_checksum: str=None, + exists_ok: bool=True, + encoding: str='utf8', + ) -> None: + """ Download the text file at `source` and store it in `output_path`. + + By default, do nothing if a file already exists in `output_path`. + The downloaded file can be checked against an expected md5 checksum. + + Parameters + ---------- + source : str + url of the file to be downloaded + output_path : str + full path, including filename, of where the file should be stored. + md5_checksum : str, optional (default=None) + If not None, should be a string of hexidecimal digits of the expected digest value. 
+    exists_ok : bool, optional (default=True)
+        If False, raise a FileExistsError if there already exists a file at `output_path`.
+    encoding : str, optional (default='utf8')
+        The encoding with which the file should be stored.
+    """
+    try:
+        with open(output_path, encoding=encoding):
+            if exists_ok:
+                return
+            else:
+                raise FileExistsError
+    except FileNotFoundError:
+        pass
+
+    downloaded_file = openml._api_calls._read_url(source, request_method='get')
+
+    if md5_checksum is not None:
+        md5 = hashlib.md5()
+        md5.update(downloaded_file.encode('utf-8'))
+        md5_checksum_download = md5.hexdigest()
+        if md5_checksum != md5_checksum_download:
+            raise openml.exceptions.OpenMLHashException(
+                'Checksum {} of downloaded file is unequal to the expected checksum {}.'
+                .format(md5_checksum_download, md5_checksum))
+
+    with open(output_path, "w", encoding=encoding) as fh:
+        fh.write(downloaded_file)
+
+    del downloaded_file
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 386fbcc82..eebe40bae 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -335,8 +335,9 @@ def test__getarff_md5_issue(self):
         }
         self.assertRaisesRegex(
             OpenMLHashException,
-            'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded dataset 5 '
-            'is unequal to the checksum abc sent by the server.',
+            'Checksum ad484452702105cbf3d30f8deaba39a9 of downloaded file '
+            'is unequal to the expected checksum abc. '
+            'Raised when downloading dataset 5.',
             _get_dataset_arff,
             self.workdir, description,
         )

From 1b1407831c80d774535a34532ef8be931ecf397d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 17 Mar 2019 19:59:57 +0200
Subject: [PATCH 04/19] Download data when get_data is called and it had not
 yet been downloaded.

---
 openml/datasets/dataset.py                    | 11 ++++--
 openml/datasets/functions.py                  | 35 +++++++++++++------
 tests/test_datasets/test_dataset_functions.py |  4 +--
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 047d3ab94..0122f86ae 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -167,11 +167,11 @@ def __init__(self, name, description, format=None,
         self.qualities = _check_qualities(qualities)
 
         if data_file is not None:
-            self.data_pickle_file = self.data_arff_to_pickle(data_file)
+            self.data_pickle_file = self._data_arff_to_pickle(data_file)
         else:
             self.data_pickle_file = None
 
-    def data_arff_to_pickle(self, data_file):
+    def _data_arff_to_pickle(self, data_file):
         data_pickle_file = data_file.replace('.arff', '.pkl.py3')
         if os.path.exists(data_pickle_file):
             logger.debug("Data pickle file already exists.")
@@ -448,6 +448,13 @@ def get_data(self, target: str=None,
 
         rval = []
 
+        if self.data_pickle_file is None:
+            if self.data_file is None:
+                # import required here to avoid circular import.
+                from .functions import _get_dataset_arff
+                self.data_file = _get_dataset_arff(self)
+            self.data_pickle_file = self._data_arff_to_pickle(self.data_file)
+
         path = self.data_pickle_file
         if not os.path.exists(path):
             raise ValueError("Cannot find a pickle file for dataset %s at "
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 882ce4a8f..5ff739724 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -375,7 +375,7 @@ def get_dataset(dataset_id: int, download_data: bool = True) -> OpenMLDataset:
         qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
 
         if download_data:
-            arff_file = _get_dataset_arff(did_cache_dir, description)
+            arff_file = _get_dataset_arff(description)
         else:
             arff_file = None
 
@@ -706,30 +706,43 @@ def _get_dataset_description(did_cache_dir, dataset_id):
     return description
 
 
-def _get_dataset_arff(did_cache_dir, description):
-    """Get the filepath to the dataset ARFF
+def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str=None) -> str:
+    """ Return the path to the local arff file of the dataset. If it is not cached, it is downloaded.
 
     Checks if the file is in the cache, if yes, return the path to the file.
     If not, downloads the file and caches it, then returns the file path.
+    The cache directory is generated based on dataset information, but can also be specified.
 
     This function is NOT thread/multiprocessing safe.
 
     Parameters
     ----------
-    did_cache_dir : str
-        Cache subdirectory for this dataset.
+    description : dictionary or OpenMLDataset
+        Either a dataset description as dict or OpenMLDataset.
 
-    description : dictionary
-        Dataset description dict.
+    cache_directory: str, optional (default=None)
+        Folder to store the arff file in.
+        If None, use the default cache directory for the dataset.
 
     Returns
     -------
     output_filename : string
         Location of ARFF file.
""" - output_file_path = os.path.join(did_cache_dir, "dataset.arff") - md5_checksum_fixture = description.get("oml:md5_checksum") - url = description['oml:url'] + if isinstance(description, dict): + md5_checksum_fixture = description.get("oml:md5_checksum") + url = description['oml:url'] + did = description.get('oml:id') + elif isinstance(description, OpenMLDataset): + md5_checksum_fixture = description.md5_checksum + url = description.url + did = description.dataset_id + else: + raise TypeError("`description` should be either OpenMLDataset or Dict.") + + if cache_directory is None: + cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did) + output_file_path = os.path.join(cache_directory, "dataset.arff") try: openml.utils._download_text_file( @@ -738,7 +751,7 @@ def _get_dataset_arff(did_cache_dir, description): md5_checksum=md5_checksum_fixture ) except OpenMLHashException as e: - additional_info = " Raised when downloading dataset {}.".format(description.get('oml:id')) + additional_info = " Raised when downloading dataset {}.".format(did) e.args = (e.args[0] + additional_info,) raise diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index eebe40bae..ee5548d5b 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -323,7 +323,7 @@ def test__get_dataset_description(self): def test__getarff_path_dataset_arff(self): openml.config.cache_directory = self.static_cache_dir description = openml.datasets.functions._get_cached_dataset_description(2) - arff_path = _get_dataset_arff(self.workdir, description) + arff_path = _get_dataset_arff(description, cache_directory=self.workdir) self.assertIsInstance(arff_path, str) self.assertTrue(os.path.exists(arff_path)) @@ -339,7 +339,7 @@ def test__getarff_md5_issue(self): 'is unequal to the expected checksum abc. ' 'Raised when downloading dataset 5.', _get_dataset_arff, - self.workdir, description, + description, ) def test__get_dataset_features(self): From 4090c05645b65f9bb4532bd5dae9946397791a0f Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 17 Mar 2019 20:03:46 +0200 Subject: [PATCH 05/19] Update unit tests. 
---
 tests/test_datasets/test_dataset_functions.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index ee5548d5b..04ffba4db 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -261,6 +261,14 @@ def test_get_datasets_lazy(self):
         self.assertFalse(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")))
 
+        datasets[0].get_data()
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+
+        datasets[1].get_data()
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "2", "dataset.arff")))
+
     def test_get_dataset(self):
         dataset = openml.datasets.get_dataset(1)
         self.assertEqual(type(dataset), OpenMLDataset)
@@ -298,6 +306,10 @@ def test_get_dataset_lazy(self):
         self.assertGreater(len(dataset.features), 1)
         self.assertGreater(len(dataset.qualities), 4)
 
+        dataset.get_data()
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+
         # Issue324 Properly handle private datasets when trying to access them
         openml.config.server = self.production_server
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)

From a01a029c4da18c4addb01c14e9970ea94ee5b21d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 17 Mar 2019 20:17:47 +0200
Subject: [PATCH 06/19] Also check if download is required for retrieve class
 labels.

---
 openml/datasets/dataset.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index 0122f86ae..3d48716f4 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -401,6 +401,12 @@ def _unpack_categories(series, categories):
         return pd.Series(col, index=series.index, dtype='category',
                          name=series.name)
 
+    def _download_data(self) -> None:
+        """ Download ARFF data file to standard cache directory. Set `self.data_file`. """
+        # import required here to avoid circular import.
+        from .functions import _get_dataset_arff
+        self.data_file = _get_dataset_arff(self)
+
     def get_data(self, target: str=None,
                  include_row_id: bool=False,
                  include_ignore_attributes: bool=False,
                  return_categorical_indicator: bool=False,
                  return_attribute_names: bool=False,
                  dataset_format: str=None):
@@ -450,9 +456,7 @@ def get_data(self, target: str=None,
 
         if self.data_pickle_file is None:
             if self.data_file is None:
-                # import required here to avoid circular import.
-                from .functions import _get_dataset_arff
-                self.data_file = _get_dataset_arff(self)
+                self._download_data()
             self.data_pickle_file = self._data_arff_to_pickle(self.data_file)
 
         path = self.data_pickle_file
@@ -570,6 +574,9 @@ def retrieve_class_labels(self, target_name='class'):
 
         # TODO improve performance, currently reads the whole file
         # Should make a method that only reads the attributes
+        if self.data_file is None:
+            self._download_data()
+
         arffFileName = self.data_file
 
         if self.format.lower() == 'arff':

From 50a9c3f015b1f39d4a85d561ba8d99560398687a Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 17 Mar 2019 20:32:08 +0200
Subject: [PATCH 07/19] add test to ensure all functionality works without
 retrieving data.
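
A sketch of what "all functionality" covers at this point in the series (the
tag name is illustrative; note the test below documents that
retrieve_class_labels still forces the ARFF download for now):

    dataset = openml.datasets.get_dataset(1, download_data=False)
    dataset.push_tag('some_tag')                       # server-side only
    dataset.remove_tag('some_tag')
    nominal = dataset.get_features_by_type('nominal')  # uses features.xml only
    labels = dataset.retrieve_class_labels()           # downloads dataset.arff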
---
 tests/test_datasets/test_dataset_functions.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 04ffba4db..01351da0b 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -314,6 +314,32 @@ def test_get_dataset_lazy(self):
         openml.config.server = self.production_server
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
 
+    def test_get_dataset_lazy_all_functions(self):
+        """ Test that all expected functionality is available without downloading the dataset. """
+        dataset = openml.datasets.get_dataset(1, download_data=False)
+        # We only test functions as general integrity is tested by test_get_dataset_lazy
+
+        dataset.push_tag('lazy_tag')
+        self.assertFalse(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+
+        dataset.remove_tag('lazy_tag')
+        self.assertFalse(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+
+        nominal_indices = dataset.get_features_by_type('nominal')
+        self.assertFalse(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+        correct = [0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                   20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38]
+        self.assertEqual(nominal_indices, correct)
+
+        # Due to the current implementation, retrieve_class_labels must download the file
+        classes = dataset.retrieve_class_labels()
+        self.assertTrue(os.path.exists(os.path.join(
+            openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
+        self.assertEqual(classes, ['1', '2', '3', '4', '5', 'U'])
+
     def test_get_dataset_sparse(self):
         dataset = openml.datasets.get_dataset(102)
         X = dataset.get_data(dataset_format='array')

From d13f0c4817f4fe2c277edb4b8caea0ca91cbd493 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Sun, 17 Mar 2019 20:34:17 +0200
Subject: [PATCH 08/19] update doc/hint.

---
 openml/datasets/functions.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index 5ff739724..e60b1fded 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -315,7 +315,7 @@ def get_datasets(
     Parameters
     ----------
     dataset_ids : iterable
-        Integers representing dataset ids.
+        Integers or strings representing dataset ids.
     download_data : bool, optional
         If True, also download the data file. Beware that some datasets are large and it might
         make the operation noticeably slower. Metadata is also still retrieved.
@@ -333,7 +333,7 @@ def get_datasets(
     return datasets
 
 
-def get_dataset(dataset_id: int, download_data: bool = True) -> OpenMLDataset:
+def get_dataset(dataset_id: Union[int, str], download_data: bool = True) -> OpenMLDataset:
     """ Download the OpenML dataset representation, optionally also download the actual data file.
 
     This function is thread/multiprocessing safe.
@@ -342,7 +342,7 @@ def get_dataset(dataset_id: int, download_data: bool = True) -> OpenMLDataset: Parameters ---------- - dataset_id : int + dataset_id : int or str Dataset ID of the dataset to download download_data : bool, optional (default=True) If True, also download the data file. Beware that some datasets are large and it might From dd6a064df756eb973cdd5579d946fe8e8a9d0155 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 17 Mar 2019 20:44:40 +0200 Subject: [PATCH 09/19] Flake8, unused imports, spacing around = --- openml/datasets/dataset.py | 12 ++++++------ openml/datasets/functions.py | 8 ++++---- openml/utils.py | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 3d48716f4..8b3455981 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -407,12 +407,12 @@ def _download_data(self) -> None: from .functions import _get_dataset_arff self.data_file = _get_dataset_arff(self) - def get_data(self, target: str=None, - include_row_id: bool=False, - include_ignore_attributes: bool=False, - return_categorical_indicator: bool=False, - return_attribute_names: bool=False, - dataset_format: str=None): + def get_data(self, target: str = None, + include_row_id: bool = False, + include_ignore_attributes: bool = False, + return_categorical_indicator: bool = False, + return_attribute_names: bool = False, + dataset_format: str = None): """ Returns dataset content as dataframes or sparse matrices. Parameters diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index e60b1fded..cf9da7506 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -1,4 +1,3 @@ -import hashlib import io import os import re @@ -306,7 +305,7 @@ def check_datasets_active(dataset_ids): def get_datasets( dataset_ids: List[Union[str, int]], - download_data: bool=True, + download_data: bool = True, ) -> List[OpenMLDataset]: """Download datasets. @@ -706,7 +705,8 @@ def _get_dataset_description(did_cache_dir, dataset_id): return description -def _get_dataset_arff(description: Union[Dict, OpenMLDataset], cache_directory: str=None) -> str: +def _get_dataset_arff(description: Union[Dict, OpenMLDataset], + cache_directory: str = None) -> str: """ Return the path to the local arff file of the dataset. If is not cached, it is downloaded. Checks if the file is in the cache, if yes, return the path to the file. @@ -840,7 +840,7 @@ def _create_dataset_from_description( description: Dict[str, str], features: Dict, qualities: List, - arff_file: str=None, + arff_file: str = None, ) -> OpenMLDataset: """Create a dataset object from a description dict. diff --git a/openml/utils.py b/openml/utils.py index 53c48fde9..25e0582ab 100644 --- a/openml/utils.py +++ b/openml/utils.py @@ -290,9 +290,9 @@ def _create_lockfiles_dir(): def _download_text_file(source: str, output_path: str, - md5_checksum: str=None, - exists_ok: bool=True, - encoding: str='utf8', + md5_checksum: str = None, + exists_ok: bool = True, + encoding: str = 'utf8', ) -> None: """ Download the text file at `source` and store it in `output_path`. From 9cd81761eeabbf687503fa170a2cf43c9f3911c7 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Sun, 17 Mar 2019 22:09:59 +0200 Subject: [PATCH 10/19] Always return path to pickle file. 
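
The early-exit branch of _data_arff_to_pickle logged and fell through without
a value, so on a warm cache the dataset object ended up with
data_pickle_file = None. Sketch of the symptom (assuming a cache already
populated by an earlier call):

    d1 = openml.datasets.get_dataset(2)      # writes dataset.pkl.py3
    d2 = openml.datasets.get_dataset(2)      # pickle exists -> early branch
    assert d2.data_pickle_file is not None   # failed before this commit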
--- openml/datasets/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py index 8b3455981..b7205cdbb 100644 --- a/openml/datasets/dataset.py +++ b/openml/datasets/dataset.py @@ -175,6 +175,7 @@ def _data_arff_to_pickle(self, data_file): data_pickle_file = data_file.replace('.arff', '.pkl.py3') if os.path.exists(data_pickle_file): logger.debug("Data pickle file already exists.") + return data_pickle_file else: try: data = self._get_arff(self.format) From 18eda4d1d4cbf6f4577e31a12b340ee52c93fbb2 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 18 Mar 2019 10:21:17 +0200 Subject: [PATCH 11/19] Add notice of lazy loading to dataset tutorial. --- examples/datasets_tutorial.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/datasets_tutorial.py b/examples/datasets_tutorial.py index 95d19db65..4d5b7ad84 100644 --- a/examples/datasets_tutorial.py +++ b/examples/datasets_tutorial.py @@ -77,6 +77,15 @@ print(X.head()) print(X.info()) +############################################################################ +# Sometimes you only need access to a dataset's metadata. +# In those cases, you can download the dataset without downloading the +# data file. The dataset object can be used as normal. +# Whenever you use any functionality that requires the data, +# such as `get_data`, the data will be downloaded. +dataset = openml.datasets.get_dataset(68, download_data=False) + + ############################################################################ # Exercise 2 # ********** From 3d8dedab869e154a0f1be39028c299408a1d989e Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 18 Mar 2019 10:38:25 +0200 Subject: [PATCH 12/19] Simplified `retrieve_class_labels` using the already downloaded feature metadata. 
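
With the nominal values now parsed into each OpenMLDataFeature, the lookup is
a pure metadata operation. A sketch (labels shown are those of the anneal
fixture used in the tests; no ARFF file is read):

    dataset = openml.datasets.get_dataset(1, download_data=False)
    dataset.retrieve_class_labels()   # ['1', '2', '3', '4', '5', 'U']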
---
 openml/datasets/dataset.py | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
index b7205cdbb..21260d370 100644
--- a/openml/datasets/dataset.py
+++ b/openml/datasets/dataset.py
@@ -157,7 +157,7 @@ def __init__(self, name, description, format=None,
                 feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
                                             xmlfeature['oml:name'],
                                             xmlfeature['oml:data_type'],
-                                            None,
+                                            xmlfeature.get('oml:nominal_value'),
                                             int(nr_missing))
                 if idx != feature.index:
                     raise ValueError('Data features not provided '
@@ -572,29 +572,10 @@ def retrieve_class_labels(self, target_name='class'):
         -------
         list
         """
-
-        # TODO improve performance, currently reads the whole file
-        # Should make a method that only reads the attributes
-        if self.data_file is None:
-            self._download_data()
-
-        arffFileName = self.data_file
-
-        if self.format.lower() == 'arff':
-            return_type = arff.DENSE
-        elif self.format.lower() == 'sparse_arff':
-            return_type = arff.COO
-        else:
-            raise ValueError('Unknown data format %s' % self.format)
-
-        with io.open(arffFileName, encoding='utf8') as fh:
-            arffData = arff.ArffDecoder().decode(fh, return_type=return_type)
-
-        dataAttributes = dict(arffData['attributes'])
-        if target_name in dataAttributes:
-            return dataAttributes[target_name]
-        else:
-            return None
+        for feature in self.features.values():
+            if (feature.name == target_name) and (feature.data_type == 'nominal'):
+                return feature.nominal_values
+        return None
 
     def get_features_by_type(self, data_type, exclude=None,
                              exclude_ignore_attributes=True,

From 6ca05beaaa63bf49c7cbe122b347eae29102cf43 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 18 Mar 2019 11:42:51 +0200
Subject: [PATCH 13/19] Fix a bug where a nominal feature with a single unique
 value is treated differently from one with multiple values (e.g. feature 5
 of dataset 2).

---
 openml/datasets/functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
index cf9da7506..11f76ba9d 100644
--- a/openml/datasets/functions.py
+++ b/openml/datasets/functions.py
@@ -792,7 +792,7 @@ def _get_dataset_features(did_cache_dir, dataset_id):
         with io.open(features_file, "w", encoding='utf8') as fh:
             fh.write(features_xml)
 
-    xml_as_dict = xmltodict.parse(features_xml, force_list=('oml:feature',))
+    xml_as_dict = xmltodict.parse(features_xml, force_list=('oml:feature','oml:nominal_value'))
     features = xml_as_dict["oml:data_features"]
 
     return features

From 5f2919ffebaee7e96f684f2dc86175338979b52f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 18 Mar 2019 12:02:41 +0200
Subject: [PATCH 14/19] Apply AppVeyor fix.

---
 appveyor.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 6f8b75917..a4aecd8b7 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -29,9 +29,8 @@ install:
   - rmdir C:\\cygwin /s /q
 
   # Update previous packages and install the build and runtime dependencies of the project.
-  # XXX: setuptools>23 is currently broken on Win+py3 with numpy
-  # (https://github.com/pypa/setuptools/issues/728)
-  - conda update --all --yes setuptools=23
+  - conda update conda --yes
+  - conda update --all --yes
 
   # Install the build and runtime dependencies of the project.
   - "cd C:\\projects\\openml-python"

From 062e2e971777ddf12c7525af35487b9f074db827 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 18 Mar 2019 12:49:23 +0200
Subject: [PATCH 15/19] Update feature xml to most recent.
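
The regenerated fixture is what makes the force_list fix from two commits ago
observable: without it, xmltodict collapses a feature with a single
<oml:nominal_value> child (e.g. the temper_rolling feature below, whose only
value is T) into a bare string instead of a one-element list. Illustrative
snippet:

    import xmltodict

    xml = "<f><oml:nominal_value>T</oml:nominal_value></f>"
    xmltodict.parse(xml)["f"]["oml:nominal_value"]
    # 'T'
    xmltodict.parse(xml, force_list=("oml:nominal_value",))["f"]["oml:nominal_value"]
    # ['T']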
--- .../org/openml/test/datasets/2/features.xml | 159 +++++++++++++----- 1 file changed, 119 insertions(+), 40 deletions(-) diff --git a/tests/files/org/openml/test/datasets/2/features.xml b/tests/files/org/openml/test/datasets/2/features.xml index 5d3f034cd..8b994ccaa 100644 --- a/tests/files/org/openml/test/datasets/2/features.xml +++ b/tests/files/org/openml/test/datasets/2/features.xml @@ -3,7 +3,16 @@ 0 family nominal - false + GB + GK + GS + TN + ZA + ZF + ZH + ZM + ZS + false false false 772 @@ -12,7 +21,10 @@ 1 product-type nominal - false + C + H + G + false false false 0 @@ -21,7 +33,15 @@ 2 steel nominal - false + R + A + U + K + M + S + W + V + false false false 86 @@ -30,7 +50,7 @@ 3 carbon numeric - false + false false false 0 @@ -39,7 +59,7 @@ 4 hardness numeric - false + false false false 0 @@ -48,7 +68,8 @@ 5 temper_rolling nominal - false + T + false false false 761 @@ -57,7 +78,10 @@ 6 condition nominal - false + S + A + X + false false false 303 @@ -66,7 +90,12 @@ 7 formability nominal - false + 1 + 2 + 3 + 4 + 5 + false false false 318 @@ -75,7 +104,7 @@ 8 strength numeric - false + false false false 0 @@ -84,7 +113,8 @@ 9 non-ageing nominal - false + N + false false false 793 @@ -93,7 +123,9 @@ 10 surface-finish nominal - false + P + M + false false false 889 @@ -102,7 +134,11 @@ 11 surface-quality nominal - false + D + E + F + G + false false false 244 @@ -111,7 +147,12 @@ 12 enamelability nominal - false + 1 + 2 + 3 + 4 + 5 + false false false 882 @@ -120,7 +161,8 @@ 13 bc nominal - false + Y + false false false 897 @@ -129,7 +171,8 @@ 14 bf nominal - false + Y + false false false 769 @@ -138,7 +181,8 @@ 15 bt nominal - false + Y + false false false 824 @@ -147,7 +191,9 @@ 16 bw%2Fme nominal - false + B + M + false false false 687 @@ -156,7 +202,8 @@ 17 bl nominal - false + Y + false false false 749 @@ -165,7 +212,8 @@ 18 m nominal - false + Y + false false false 898 @@ -174,7 +222,8 @@ 19 chrom nominal - false + C + false false false 872 @@ -183,7 +232,8 @@ 20 phos nominal - false + P + false false false 891 @@ -192,7 +242,8 @@ 21 cbond nominal - false + Y + false false false 824 @@ -201,7 +252,8 @@ 22 marvi nominal - false + Y + false false false 898 @@ -210,7 +262,8 @@ 23 exptl nominal - false + Y + false false false 896 @@ -219,7 +272,8 @@ 24 ferro nominal - false + Y + false false false 868 @@ -228,7 +282,8 @@ 25 corr nominal - false + Y + false false false 898 @@ -237,7 +292,11 @@ 26 blue%2Fbright%2Fvarn%2Fclean nominal - false + B + R + V + C + false false false 892 @@ -246,7 +305,8 @@ 27 lustre nominal - false + Y + false false false 847 @@ -255,7 +315,8 @@ 28 jurofm nominal - false + Y + false false false 898 @@ -264,7 +325,8 @@ 29 s nominal - false + Y + false false false 898 @@ -273,7 +335,8 @@ 30 p nominal - false + Y + false false false 898 @@ -282,7 +345,9 @@ 31 shape nominal - false + COIL + SHEET + false false false 0 @@ -291,7 +356,7 @@ 32 thick numeric - false + false false false 0 @@ -300,7 +365,7 @@ 33 width numeric - false + false false false 0 @@ -309,7 +374,7 @@ 34 len numeric - false + false false false 0 @@ -318,7 +383,9 @@ 35 oil nominal - false + Y + N + false false false 834 @@ -327,7 +394,11 @@ 36 bore nominal - false + 0 + 500 + 600 + 760 + false false false 0 @@ -336,7 +407,10 @@ 37 packing nominal - false + 1 + 2 + 3 + false false false 889 @@ -345,10 +419,15 @@ 38 class nominal - true + 1 + 2 + 3 + 4 + 5 + U + true false false 0 - From 391f30abc79672dd5cf8813bf5b9ba906581ff89 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 18 Mar 
2019 12:51:20 +0200 Subject: [PATCH 16/19] Update test to reflect retrieve_class_labels is now available with lazy loading. --- tests/test_datasets/test_dataset_functions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 01351da0b..55e91d121 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -334,11 +334,11 @@ def test_get_dataset_lazy_all_functions(self): 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 35, 36, 37, 38] self.assertEqual(nominal_indices, correct) - # Due to the current implementation, retrieve_class_labels must download the file classes = dataset.retrieve_class_labels() - self.assertTrue(os.path.exists(os.path.join( - openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"))) self.assertEqual(classes, ['1', '2', '3', '4', '5', 'U']) + + self.assertFalse(os.path.exists(os.path.join( + openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"))) def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) From 860322463f45f76f4038c0a125d9c76349662470 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 18 Mar 2019 13:46:27 +0200 Subject: [PATCH 17/19] Unify loading of features between cached and downloaded. --- openml/datasets/functions.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py index 11f76ba9d..7e3fd8421 100644 --- a/openml/datasets/functions.py +++ b/openml/datasets/functions.py @@ -129,9 +129,7 @@ def _get_cached_dataset_features(dataset_id): ) features_file = os.path.join(did_cache_dir, "features.xml") try: - with io.open(features_file, encoding='utf8') as fh: - features_xml = fh.read() - return xmltodict.parse(features_xml)["oml:data_features"] + return _load_features_from_file(features_file) except (IOError, OSError): raise OpenMLCacheException("Dataset features for dataset id %d not " "cached" % dataset_id) @@ -273,6 +271,14 @@ def __list_datasets(api_call): return datasets +def _load_features_from_file(features_file: str) -> Dict: + with io.open(features_file, encoding='utf8') as fh: + features_xml = fh.read() + xml_dict = xmltodict.parse(features_xml, + force_list=('oml:feature', 'oml:nominal_value')) + return xml_dict["oml:data_features"] + + def check_datasets_active(dataset_ids): """Check if the dataset ids provided are active. @@ -782,20 +788,13 @@ def _get_dataset_features(did_cache_dir, dataset_id): features_file = os.path.join(did_cache_dir, "features.xml") # Dataset features aren't subject to change... - try: - with io.open(features_file, encoding='utf8') as fh: - features_xml = fh.read() - except (OSError, IOError): + if not os.path.isfile(features_file): url_extension = "data/features/{}".format(dataset_id) features_xml = openml._api_calls._perform_api_call(url_extension, 'get') - with io.open(features_file, "w", encoding='utf8') as fh: fh.write(features_xml) - xml_as_dict = xmltodict.parse(features_xml, force_list=('oml:feature','oml:nominal_value')) - features = xml_as_dict["oml:data_features"] - - return features + return _load_features_from_file(features_file) def _get_dataset_qualities(did_cache_dir, dataset_id): From 76e5bb9d6dd6ce5187aed36c1b1d8ff1ffe974c3 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Mon, 18 Mar 2019 14:06:53 +0200 Subject: [PATCH 18/19] Flake8. 
---
 tests/test_datasets/test_dataset_functions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index 55e91d121..f58d8f38f 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -336,7 +336,7 @@ def test_get_dataset_lazy_all_functions(self):
 
         classes = dataset.retrieve_class_labels()
         self.assertEqual(classes, ['1', '2', '3', '4', '5', 'U'])
-        
+
         self.assertFalse(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))

From b812904c41e4f80d2cdc4da5308390ab671686b9 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Mon, 18 Mar 2019 14:33:41 +0200
Subject: [PATCH 19/19] Add random element to tag to avoid race conditions in
 parallel tests.

---
 tests/test_datasets/test_dataset_functions.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
index f58d8f38f..ff6d1c6c4 100644
--- a/tests/test_datasets/test_dataset_functions.py
+++ b/tests/test_datasets/test_dataset_functions.py
@@ -319,11 +319,12 @@ def test_get_dataset_lazy_all_functions(self):
         dataset = openml.datasets.get_dataset(1, download_data=False)
         # We only test functions as general integrity is tested by test_get_dataset_lazy
 
-        dataset.push_tag('lazy_tag')
+        tag = 'test_lazy_tag_%d' % random.randint(1, 1000000)
+        dataset.push_tag(tag)
         self.assertFalse(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
 
-        dataset.remove_tag('lazy_tag')
+        dataset.remove_tag(tag)
         self.assertFalse(os.path.exists(os.path.join(
             openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")))
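
For reference, the race the last patch avoids: parallel CI runs previously
pushed and removed the same literal tag ('lazy_tag') on the shared test
server, so one run could remove the tag another run had just pushed. A
per-run random suffix keeps the runs independent (sketch):

    import random

    tag = 'test_lazy_tag_%d' % random.randint(1, 1000000)
    dataset.push_tag(tag)    # no concurrent run uses the same tag
    dataset.remove_tag(tag)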