Merge branch 'develop' into feature/upload-flow
Conflicts:
	tests/tasks/test_task_functions.py
mfeurer committed Oct 4, 2016
2 parents 0e2fd77 + 30a53f9 commit af83de7
Showing 10 changed files with 160,199 additions and 256 deletions.
79 changes: 49 additions & 30 deletions openml/datasets/dataset.py
@@ -10,6 +10,8 @@
import scipy.sparse
import xmltodict

from ..exceptions import PyOpenMLError

if sys.version_info[0] >= 3:
import pickle
else:
@@ -45,7 +47,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
row_id_attribute=None, ignore_attribute=None,
version_label=None, citation=None, tag=None, visibility=None,
original_data_url=None, paper_url=None, update_comment=None,
md5_checksum=None, data_file=None):
md5_checksum=None, data_file=None, features=None):
# Attributes received by querying the RESTful API
self.dataset_id = int(dataset_id) if dataset_id is not None else None
self.name = name
@@ -71,38 +73,41 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
self.update_comment = update_comment
self.md5_cheksum = md5_checksum
self.data_file = data_file
self.features = features

if data_file is not None:
self.data_pickle_file = data_file.replace('.arff', '.pkl')
if self._data_features_supported():
self.data_pickle_file = data_file.replace('.arff', '.pkl')

if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is there "
"and can be read.", self.data_file)
raise e

categorical = [False if type(type_) != list else True
for name, type_ in data['attributes']]
attribute_names = [name for name, type_ in data['attributes']]

if isinstance(data['data'], tuple):
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()
elif isinstance(data['data'], list):
X = np.array(data['data'], dtype=np.float32)
if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
raise Exception()

with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(self.dataset_id, self.name, self.data_pickle_file))
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is there "
"and can be read.", self.data_file)
raise e

categorical = [False if type(type_) != list else True
for name, type_ in data['attributes']]
attribute_names = [name for name, type_ in data['attributes']]

if isinstance(data['data'], tuple):
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()
elif isinstance(data['data'], list):
X = np.array(data['data'], dtype=np.float32)
else:
raise Exception()

with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(self.dataset_id, self.name, self.data_pickle_file))

def __eq__(self, other):
if type(other) != OpenMLDataset:
@@ -132,6 +137,9 @@ def _get_arff(self, format):
# 32 bit system...currently 120mb (just a little bit more than covtype)
import struct

if not self._data_features_supported():
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')

filename = self.data_file
bits = (8 * struct.calcsize("P"))
if bits != 64 and os.path.getsize(filename) > 120000000:
@@ -172,6 +180,9 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
"""
rval = []

if not self._data_features_supported():
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')

path = self.data_pickle_file
if not os.path.exists(path):
raise ValueError("Cannot find a ndarray file for dataset %s at"
@@ -336,3 +347,11 @@ def _to_xml(self):
xml_dataset += "<oml:{0}>{1}</oml:{0}>\n".format(prop, content)
xml_dataset += "</oml:data_set_description>"
return xml_dataset

def _data_features_supported(self):
if self.features is not None:
for feature in self.features['oml:feature']:
if feature['oml:data_type'] not in ['numeric', 'nominal']:
return False
return True
return True
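
A minimal sketch (not part of the diff) of the features structure that _data_features_supported expects, assuming the dict was produced by xmltodict.parse(features_xml)["oml:data_features"] as in openml/datasets/functions.py below; the feature names and types are illustrative only.

# Illustrative parsed features.xml content (names and types are made up).
features = {
    'oml:feature': [
        {'oml:index': '0', 'oml:name': 'sepal_length', 'oml:data_type': 'numeric'},
        {'oml:index': '1', 'oml:name': 'species', 'oml:data_type': 'nominal'},
        # A feature with 'oml:data_type': 'string' would make the check return False.
    ]
}

unsupported = [f['oml:name'] for f in features['oml:feature']
               if f['oml:data_type'] not in ('numeric', 'nominal')]
print(len(unsupported) == 0)  # prints True for the example above
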
28 changes: 23 additions & 5 deletions openml/datasets/functions.py
@@ -72,7 +72,8 @@ def _get_cached_dataset(dataset_id):
"""
description = _get_cached_dataset_description(dataset_id)
arff_file = _get_cached_dataset_arff(dataset_id)
dataset = _create_dataset_from_description(description, arff_file)
features = _get_cached_dataset_features(dataset_id)
dataset = _create_dataset_from_description(description, features, arff_file)

return dataset

@@ -93,6 +94,22 @@ def _get_cached_dataset_description(dataset_id):
raise OpenMLCacheException("Dataset description for dataset id %d not "
"cached" % dataset_id)

def _get_cached_dataset_features(dataset_id):
for cache_dir in [config.get_cache_directory(),
config.get_private_directory()]:
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
features_file = os.path.join(did_cache_dir, "features.xml")
try:
with io.open(features_file, encoding='utf8') as fh:
features_xml = fh.read()
except (IOError, OSError):
continue

return xmltodict.parse(features_xml)["oml:data_features"]

raise OpenMLCacheException("Dataset features for dataset id %d not "
"cached" % dataset_id)


def _get_cached_dataset_arff(dataset_id):
for cache_dir in [config.get_cache_directory(),
@@ -255,14 +272,14 @@ def get_dataset(dataset_id):
try:
description = _get_dataset_description(did_cache_dir, dataset_id)
arff_file = _get_dataset_arff(did_cache_dir, description)
# TODO not used yet, figure out what to do with them...
features = _get_dataset_features(did_cache_dir, dataset_id)
# TODO not used yet, figure out what to do with this...
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
except Exception as e:
_remove_dataset_cache_dir(did_cache_dir)
raise e

dataset = _create_dataset_from_description(description, arff_file)
dataset = _create_dataset_from_description(description, features, arff_file)
return dataset


@@ -463,7 +480,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
'Please do this manually!' % did_cache_dir)


def _create_dataset_from_description(description, arff_file):
def _create_dataset_from_description(description, features, arff_file):
"""Create a dataset object from a description dict.
Parameters
@@ -502,5 +519,6 @@ def _create_dataset_from_description(description, arff_file):
description.get("oml:paper_url"),
description.get("oml:update_comment"),
description.get("oml:md5_checksum"),
data_file=arff_file)
data_file=arff_file,
features=features)
return dataset
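
For orientation (not part of the diff): a hedged sketch of how the newly threaded-through features reach the dataset object, assuming the public openml.datasets.get_dataset entry point wraps the get_dataset function above and that get_data() with default arguments returns only the data matrix; the dataset id is illustrative.

import openml
from openml.exceptions import PyOpenMLError

try:
    dataset = openml.datasets.get_dataset(61)  # illustrative dataset id
    X = dataset.get_data()  # now fails up front for datasets with string features
except PyOpenMLError as err:
    # Raised when features.xml declares a data type other than numeric or nominal.
    print(err)
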
8 changes: 2 additions & 6 deletions openml/runs/__init__.py
@@ -1,8 +1,4 @@
from .run import OpenMLRun
from .functions import (run_task, get_run, list_runs, list_runs_by_flow,
list_runs_by_tag, list_runs_by_task,
list_runs_by_uploader, list_runs_by_filters)
from .functions import (run_task, get_run, list_runs)

__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'list_runs_by_flow',
'list_runs_by_tag', 'list_runs_by_task', 'list_runs_by_uploader',
'list_runs_by_filters']
__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs']
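
For orientation (not part of the diff): after this change only the names below are exported from openml.runs; the previous list_runs_by_* helpers are no longer part of the public surface, so callers presumably filter through list_runs itself.

# Minimal sketch of the surviving public imports after this commit.
from openml.runs import OpenMLRun, run_task, get_run, list_runs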
