Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
4c21ad8
First iteration of lazy loading. Does not yet take into account all p…
PGijsbers Mar 17, 2019
312650f
Factor functionality of loading ARFF to correct data format and pickl…
PGijsbers Mar 17, 2019
a1b8c93
Extracted a more general 'download_text_file' function that is now us…
PGijsbers Mar 17, 2019
1b14078
Download data when get_data is called and it had not yet been downloa…
PGijsbers Mar 17, 2019
4090c05
Update unit tests.
PGijsbers Mar 17, 2019
a01a029
Also check if download is required for retrieve class labels.
PGijsbers Mar 17, 2019
50a9c3f
add test to ensure all functionality works without retrieving data.
PGijsbers Mar 17, 2019
d13f0c4
update doc/hint.
PGijsbers Mar 17, 2019
dd6a064
Flake8, unused imports, spacing around =
PGijsbers Mar 17, 2019
9cd8176
Always return path to pickle file.
PGijsbers Mar 17, 2019
18eda4d
Add notice of lazy loading to dataset tutorial.
PGijsbers Mar 18, 2019
3d8deda
Simplified `retrieve_class_labels` using the already downloaded featu…
PGijsbers Mar 18, 2019
6ca05be
Fix a bug where nominal feature with a single unique value is treated…
PGijsbers Mar 18, 2019
5f2919f
Apply AppVeyor fix.
PGijsbers Mar 18, 2019
062e2e9
Update feature xml to most recent.
PGijsbers Mar 18, 2019
391f30a
Update test to reflect retrieve_class_labels is now available with la…
PGijsbers Mar 18, 2019
8603224
Unify loading of features between cached and downloaded.
PGijsbers Mar 18, 2019
76e5bb9
Flake8.
PGijsbers Mar 18, 2019
b812904
Add random element to tag to avoid race conditions in parallel tests.
PGijsbers Mar 18, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,8 @@ install:
- rmdir C:\\cygwin /s /q

# Update previous packages and install the build and runtime dependencies of the project.
# XXX: setuptools>23 is currently broken on Win+py3 with numpy
# (https://github.com/pypa/setuptools/issues/728)
- conda update --all --yes setuptools=23
- conda update conda --yes
- conda update --all --yes

# Install the build and runtime dependencies of the project.
- "cd C:\\projects\\openml-python"
Expand Down
9 changes: 9 additions & 0 deletions examples/datasets_tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,15 @@
print(X.head())
print(X.info())

############################################################################
# Sometimes you only need access to a dataset's metadata.
# In those cases, you can download the dataset without downloading the
# data file. The dataset object can be used as normal.
# Whenever you use any functionality that requires the data,
# such as `get_data`, the data will be downloaded.
dataset = openml.datasets.get_dataset(68, download_data=False)


############################################################################
# Exercise 2
# **********
Expand Down
244 changes: 123 additions & 121 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def __init__(self, name, description, format=None,
feature = OpenMLDataFeature(int(xmlfeature['oml:index']),
xmlfeature['oml:name'],
xmlfeature['oml:data_type'],
None,
xmlfeature.get('oml:nominal_value'),
int(nr_missing))
if idx != feature.index:
raise ValueError('Data features not provided '
Expand All @@ -167,96 +167,104 @@ def __init__(self, name, description, format=None,
self.qualities = _check_qualities(qualities)

if data_file is not None:
self.data_pickle_file = data_file.replace('.arff', '.pkl.py3')
self.data_pickle_file = self._data_arff_to_pickle(data_file)
else:
self.data_pickle_file = None

if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is "
"there and can be read.", self.data_file)
raise e

ARFF_DTYPES_TO_PD_DTYPE = {
'INTEGER': 'integer',
'REAL': 'floating',
'NUMERIC': 'floating',
'STRING': 'string'
}
attribute_dtype = {}
attribute_names = []
categories_names = {}
categorical = []
for name, type_ in data['attributes']:
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if (isinstance(type_, list)
and self.format.lower() == 'sparse_arff'):
try:
np.array(type_, dtype=np.float32)
except ValueError:
raise ValueError(
"Categorical data needs to be numeric when "
"using sparse ARFF."
)
# string can only be supported with pandas DataFrame
elif (type_ == 'STRING'
and self.format.lower() == 'sparse_arff'):
def _data_arff_to_pickle(self, data_file):
data_pickle_file = data_file.replace('.arff', '.pkl.py3')
if os.path.exists(data_pickle_file):
logger.debug("Data pickle file already exists.")
return data_pickle_file
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is "
"there and can be read.", data_file)
raise e

ARFF_DTYPES_TO_PD_DTYPE = {
'INTEGER': 'integer',
'REAL': 'floating',
'NUMERIC': 'floating',
'STRING': 'string'
}
attribute_dtype = {}
attribute_names = []
categories_names = {}
categorical = []
for name, type_ in data['attributes']:
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if (isinstance(type_, list)
and self.format.lower() == 'sparse_arff'):
try:
np.array(type_, dtype=np.float32)
except ValueError:
raise ValueError(
"Dataset containing strings is not supported "
"with sparse ARFF."
"Categorical data needs to be numeric when "
"using sparse ARFF."
)

# infer the dtype from the ARFF header
if isinstance(type_, list):
categorical.append(True)
categories_names[name] = type_
if len(type_) == 2:
type_norm = [cat.lower().capitalize()
for cat in type_]
if set(['True', 'False']) == set(type_norm):
categories_names[name] = [
True if cat == 'True' else False
for cat in type_norm
]
attribute_dtype[name] = 'boolean'
else:
attribute_dtype[name] = 'categorical'
# string can only be supported with pandas DataFrame
elif (type_ == 'STRING'
and self.format.lower() == 'sparse_arff'):
raise ValueError(
"Dataset containing strings is not supported "
"with sparse ARFF."
)

# infer the dtype from the ARFF header
if isinstance(type_, list):
categorical.append(True)
categories_names[name] = type_
if len(type_) == 2:
type_norm = [cat.lower().capitalize()
for cat in type_]
if set(['True', 'False']) == set(type_norm):
categories_names[name] = [
True if cat == 'True' else False
for cat in type_norm
]
attribute_dtype[name] = 'boolean'
else:
attribute_dtype[name] = 'categorical'
else:
categorical.append(False)
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
attribute_names.append(name)

if self.format.lower() == 'sparse_arff':
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()

elif self.format.lower() == 'arff':
X = pd.DataFrame(data['data'], columns=attribute_names)

col = []
for column_name in X.columns:
if attribute_dtype[column_name] in ('categorical',
'boolean'):
col.append(self._unpack_categories(
X[column_name], categories_names[column_name]))
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)

# Pickle the dataframe or the sparse matrix.
with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(int(self.dataset_id or -1), self.name,
self.data_pickle_file))
attribute_dtype[name] = 'categorical'
else:
categorical.append(False)
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
attribute_names.append(name)

if self.format.lower() == 'sparse_arff':
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()

elif self.format.lower() == 'arff':
X = pd.DataFrame(data['data'], columns=attribute_names)

col = []
for column_name in X.columns:
if attribute_dtype[column_name] in ('categorical',
'boolean'):
col.append(self._unpack_categories(
X[column_name], categories_names[column_name]))
else:
col.append(X[column_name])
X = pd.concat(col, axis=1)

# Pickle the dataframe or the sparse matrix.
with open(data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset {did}: {name} to file {path}"
.format(did=int(self.dataset_id or -1),
name=self.name,
path=data_pickle_file)
)
return data_pickle_file

def push_tag(self, tag):
"""Annotates this data set with a tag on the server.
Expand Down Expand Up @@ -394,13 +402,19 @@ def _unpack_categories(series, categories):
return pd.Series(col, index=series.index, dtype='category',
name=series.name)

def get_data(self, target=None,
include_row_id=False,
include_ignore_attributes=False,
return_categorical_indicator=False,
return_attribute_names=False,
dataset_format=None):
"""Returns dataset content as dataframes or sparse matrices.
def _download_data(self) -> None:
    """ Download ARFF data file to standard cache directory. Set `self.data_file`. """
    # Imported inside the method body to avoid a circular import at module load time.
    from .functions import _get_dataset_arff
    self.data_file = _get_dataset_arff(self)

def get_data(self, target: str = None,
include_row_id: bool = False,
include_ignore_attributes: bool = False,
return_categorical_indicator: bool = False,
return_attribute_names: bool = False,
dataset_format: str = None):
""" Returns dataset content as dataframes or sparse matrices.

Parameters
----------
Expand All @@ -416,10 +430,10 @@ def get_data(self, target=None,
categorical.
return_attribute_names : boolean (default=False)
Whether to return attribute names.
dataset_format : string
The format of returned dataset. If ``array``, the returned dataset
will be a NumPy array or a SciPy sparse matrix. If ``dataframe``,
the returned dataset will be a Pandas DataFrame or SparseDataFrame.
dataset_format : string, optional
The format of returned dataset.
If ``array``, the returned dataset will be a NumPy array or a SciPy sparse matrix.
If ``dataframe``, the returned dataset will be a Pandas DataFrame or SparseDataFrame.

Returns
-------
Expand All @@ -428,12 +442,11 @@ def get_data(self, target=None,
y : ndarray or series, shape (n_samples,)
Target column(s). Only returned if target is not None.
categorical_indicator : boolean ndarray
Mask that indicates categorical features. Only returned if
return_categorical_indicator is True.
Mask that indicates categorical features.
Only returned if return_categorical_indicator is True.
return_attribute_names : list of strings
List of attribute names. Returned only if return_attribute_names is
True.

List of attribute names.
Only returned if return_attribute_names is True.
"""
if dataset_format is None:
warn('The default of "dataset_format" will change from "array" to'
Expand All @@ -442,6 +455,11 @@ def get_data(self, target=None,

rval = []

if self.data_pickle_file is None:
if self.data_file is None:
self._download_data()
self.data_pickle_file = self._data_arff_to_pickle(self.data_file)

path = self.data_pickle_file
if not os.path.exists(path):
raise ValueError("Cannot find a pickle file for dataset %s at "
Expand Down Expand Up @@ -554,26 +572,10 @@ def retrieve_class_labels(self, target_name='class'):
-------
list
"""

# TODO improve performance, currently reads the whole file
# Should make a method that only reads the attributes
arffFileName = self.data_file

if self.format.lower() == 'arff':
return_type = arff.DENSE
elif self.format.lower() == 'sparse_arff':
return_type = arff.COO
else:
raise ValueError('Unknown data format %s' % self.format)

with io.open(arffFileName, encoding='utf8') as fh:
arffData = arff.ArffDecoder().decode(fh, return_type=return_type)

dataAttributes = dict(arffData['attributes'])
if target_name in dataAttributes:
return dataAttributes[target_name]
else:
return None
for feature in self.features.values():
if (feature.name == target_name) and (feature.data_type == 'nominal'):
return feature.nominal_values
return None

def get_features_by_type(self, data_type, exclude=None,
exclude_ignore_attributes=True,
Expand Down
Loading