Merge branch 'develop' into feature/upload-flow
Conflicts:
	tests/tasks/test_task_functions.py
mfeurer committed Oct 4, 2016
2 parents 0e2fd77 + 30a53f9 commit af83de7
Showing 10 changed files with 160,199 additions and 256 deletions.
79 changes: 49 additions & 30 deletions openml/datasets/dataset.py
@@ -10,6 +10,8 @@
import scipy.sparse
import xmltodict

from ..exceptions import PyOpenMLError

if sys.version_info[0] >= 3:
import pickle
else:
@@ -45,7 +47,7 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
row_id_attribute=None, ignore_attribute=None,
version_label=None, citation=None, tag=None, visibility=None,
original_data_url=None, paper_url=None, update_comment=None,
md5_checksum=None, data_file=None):
md5_checksum=None, data_file=None, features=None):
# Attributes received by querying the RESTful API
self.dataset_id = int(dataset_id) if dataset_id is not None else None
self.name = name
@@ -71,38 +73,41 @@ def __init__(self, dataset_id=None, name=None, version=None, description=None,
self.update_comment = update_comment
self.md5_cheksum = md5_checksum
self.data_file = data_file
self.features = features

if data_file is not None:
self.data_pickle_file = data_file.replace('.arff', '.pkl')
if self._data_features_supported():
self.data_pickle_file = data_file.replace('.arff', '.pkl')

if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is there "
"and can be read.", self.data_file)
raise e

categorical = [False if type(type_) != list else True
for name, type_ in data['attributes']]
attribute_names = [name for name, type_ in data['attributes']]

if isinstance(data['data'], tuple):
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()
elif isinstance(data['data'], list):
X = np.array(data['data'], dtype=np.float32)
if os.path.exists(self.data_pickle_file):
logger.debug("Data pickle file already exists.")
else:
raise Exception()

with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(self.dataset_id, self.name, self.data_pickle_file))
try:
data = self._get_arff(self.format)
except OSError as e:
logger.critical("Please check that the data file %s is there "
"and can be read.", self.data_file)
raise e

categorical = [False if type(type_) != list else True
for name, type_ in data['attributes']]
attribute_names = [name for name, type_ in data['attributes']]

if isinstance(data['data'], tuple):
X = data['data']
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix(
(X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()
elif isinstance(data['data'], list):
X = np.array(data['data'], dtype=np.float32)
else:
raise Exception()

with open(self.data_pickle_file, "wb") as fh:
pickle.dump((X, categorical, attribute_names), fh, -1)
logger.debug("Saved dataset %d: %s to file %s" %
(self.dataset_id, self.name, self.data_pickle_file))

def __eq__(self, other):
if type(other) != OpenMLDataset:
@@ -132,6 +137,9 @@ def _get_arff(self, format):
# 32 bit system...currently 120mb (just a little bit more than covtype)
import struct

if not self._data_features_supported():
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')

filename = self.data_file
bits = (8 * struct.calcsize("P"))
if bits != 64 and os.path.getsize(filename) > 120000000:
@@ -172,6 +180,9 @@ def get_data(self, target=None, target_dtype=int, include_row_id=False,
"""
rval = []

if not self._data_features_supported():
raise PyOpenMLError('Dataset not compatible, PyOpenML cannot handle string features')

path = self.data_pickle_file
if not os.path.exists(path):
raise ValueError("Cannot find a ndarray file for dataset %s at"
@@ -336,3 +347,11 @@ def _to_xml(self):
xml_dataset += "<oml:{0}>{1}</oml:{0}>\n".format(prop, content)
xml_dataset += "</oml:data_set_description>"
return xml_dataset

def _data_features_supported(self):
if self.features is not None:
for feature in self.features['oml:feature']:
if feature['oml:data_type'] not in ['numeric', 'nominal']:
return False
return True
return True
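
A minimal sketch (not part of the diff) of the features structure that _data_features_supported expects, assuming the dict was produced by xmltodict.parse(features_xml)["oml:data_features"] as in openml/datasets/functions.py below; the feature names and types are illustrative only.

# Illustrative parsed features.xml content (names and types are made up).
features = {
    'oml:feature': [
        {'oml:index': '0', 'oml:name': 'sepal_length', 'oml:data_type': 'numeric'},
        {'oml:index': '1', 'oml:name': 'species', 'oml:data_type': 'nominal'},
        # A feature with 'oml:data_type': 'string' would make the check return False.
    ]
}

unsupported = [f['oml:name'] for f in features['oml:feature']
               if f['oml:data_type'] not in ('numeric', 'nominal')]
print(len(unsupported) == 0)  # prints True for the example above
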
28 changes: 23 additions & 5 deletions openml/datasets/functions.py
@@ -72,7 +72,8 @@ def _get_cached_dataset(dataset_id):
"""
description = _get_cached_dataset_description(dataset_id)
arff_file = _get_cached_dataset_arff(dataset_id)
dataset = _create_dataset_from_description(description, arff_file)
features = _get_cached_dataset_features(dataset_id)
dataset = _create_dataset_from_description(description, features, arff_file)

return dataset

@@ -93,6 +94,22 @@ def _get_cached_dataset_description(dataset_id):
raise OpenMLCacheException("Dataset description for dataset id %d not "
"cached" % dataset_id)

def _get_cached_dataset_features(dataset_id):
for cache_dir in [config.get_cache_directory(),
config.get_private_directory()]:
did_cache_dir = os.path.join(cache_dir, "datasets", str(dataset_id))
features_file = os.path.join(did_cache_dir, "features.xml")
try:
with io.open(features_file, encoding='utf8') as fh:
features_xml = fh.read()
except (IOError, OSError):
continue

return xmltodict.parse(features_xml)["oml:data_features"]

raise OpenMLCacheException("Dataset features for dataset id %d not "
"cached" % dataset_id)


def _get_cached_dataset_arff(dataset_id):
for cache_dir in [config.get_cache_directory(),
@@ -255,14 +272,14 @@ def get_dataset(dataset_id):
try:
description = _get_dataset_description(did_cache_dir, dataset_id)
arff_file = _get_dataset_arff(did_cache_dir, description)
# TODO not used yet, figure out what to do with them...
features = _get_dataset_features(did_cache_dir, dataset_id)
# TODO not used yet, figure out what to do with this...
qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
except Exception as e:
_remove_dataset_cache_dir(did_cache_dir)
raise e

dataset = _create_dataset_from_description(description, arff_file)
dataset = _create_dataset_from_description(description, features, arff_file)
return dataset


@@ -463,7 +480,7 @@ def _remove_dataset_cache_dir(did_cache_dir):
'Please do this manually!' % did_cache_dir)


def _create_dataset_from_description(description, arff_file):
def _create_dataset_from_description(description, features, arff_file):
"""Create a dataset object from a description dict.
Parameters
@@ -502,5 +519,6 @@ def _create_dataset_from_description(description, arff_file):
description.get("oml:paper_url"),
description.get("oml:update_comment"),
description.get("oml:md5_checksum"),
data_file=arff_file)
data_file=arff_file,
features=features)
return dataset
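
For orientation (not part of the diff): a hedged sketch of how the newly threaded-through features reach the dataset object, assuming the public openml.datasets.get_dataset entry point wraps the get_dataset function above and that get_data() with default arguments returns only the data matrix; the dataset id is illustrative.

import openml
from openml.exceptions import PyOpenMLError

try:
    dataset = openml.datasets.get_dataset(61)  # illustrative dataset id
    X = dataset.get_data()  # now fails up front for datasets with string features
except PyOpenMLError as err:
    # Raised when features.xml declares a data type other than numeric or nominal.
    print(err)
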
8 changes: 2 additions & 6 deletions openml/runs/__init__.py
@@ -1,8 +1,4 @@
from .run import OpenMLRun
from .functions import (run_task, get_run, list_runs, list_runs_by_flow,
list_runs_by_tag, list_runs_by_task,
list_runs_by_uploader, list_runs_by_filters)
from .functions import (run_task, get_run, list_runs)

__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'list_runs_by_flow',
'list_runs_by_tag', 'list_runs_by_task', 'list_runs_by_uploader',
'list_runs_by_filters']
__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs']
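
For orientation (not part of the diff): after this change only the names below are exported from openml.runs; the previous list_runs_by_* helpers are no longer part of the public surface, so callers presumably filter through list_runs itself.

# Minimal sketch of the surviving public imports after this commit.
from openml.runs import OpenMLRun, run_task, get_run, list_runs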
