212 changes: 153 additions & 59 deletions openml/apiconnector.py
@@ -27,7 +27,7 @@

logger = logging.getLogger(__name__)

OPENML_URL = "http://openml.org"
OPENML_URL = "http://www.openml.org"


class OpenMLStatusChange(Warning):
@@ -54,6 +54,10 @@ class AuthentificationError(PyOpenMLError):
def __init__(self, message):
super(AuthentificationError, self).__init__(message)

class OpenMLCacheException(PyOpenMLError):
def __init__(self, message):
super(OpenMLCacheException, self).__init__(message)

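The new exception gives cache misses a dedicated type, so callers can fall back to the server instead of guessing from a generic IOError. A minimal sketch of the intended pattern (connector and did are placeholders for an APIConnector instance and a dataset id):

    try:
        description = connector._get_cached_dataset_description(did)
    except OpenMLCacheException:
        # Not cached locally (neither the public cache nor the private
        # directory); fall back to the server.
        return_code, xml = connector._perform_api_call(
            "openml.data.description", data_id=did)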

class APIConnector(object):
"""
@@ -99,6 +103,10 @@ class APIConnector(object):
`here <http://pieces.openpolitics.com
/2012/04/python-logging-best-practices/>`_.

private_directory : str, optional (default=None)
A local directory which can be accessed through the OpenML package.
Useful to access private datasets through the same interface.

Raises
------
ValueError
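The docstring above documents the new constructor argument; a minimal usage sketch (the path is a placeholder, not from this PR):

    import os
    from openml.apiconnector import APIConnector

    # Datasets placed under <private_directory>/datasets/<did>/ are found by
    # the same cache lookups as publicly downloaded ones.
    private_dir = os.path.expanduser("~/my_private_openml")  # hypothetical path
    connector = APIConnector(private_directory=private_dir)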
@@ -119,7 +127,7 @@ class APIConnector(object):
"""
def __init__(self, cache_directory=None, username=None, password=None,
server=None, verbosity=None, configure_logger=True,
authenticate=True):
authenticate=True, private_directory=None):
# The .openml directory is necessary, just try to create it (EAFP)
try:
os.mkdir(os.path.expanduser('~/.openml'))
@@ -139,6 +147,8 @@ def __init__(self, cache_directory=None, username=None, password=None,
self.config.set('FAKE_SECTION', 'server', server)
if verbosity is not None:
self.config.set('FAKE_SECTION', 'verbosity', verbosity)
if private_directory is not None:
self.config.set('FAKE_SECTION', 'private_directory', private_directory)

if configure_logger:
verbosity = self.config.getint('FAKE_SECTION', 'verbosity')
@@ -162,8 +172,18 @@ def __init__(self, cache_directory=None, username=None, password=None,
self.dataset_cache_dir = os.path.join(self.cache_dir, "datasets")
self.task_cache_dir = os.path.join(self.cache_dir, "tasks")

# Set up the private directory
self.private_directory = self.config.get('FAKE_SECTION',
'private_directory')
self._private_directory_datasets = os.path.join(
self.private_directory, "datasets")
self._private_directory_tasks = os.path.join(
self.private_directory, "tasks")

for dir_ in [self.cache_dir, self.dataset_cache_dir,
self.task_cache_dir]:
self.task_cache_dir, self.private_directory,
self._private_directory_datasets,
self._private_directory_tasks]:
if not os.path.exists(dir_) and not os.path.isdir(dir_):
os.mkdir(dir_)

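With the defaults added to _parse_config below, the constructor now ensures a directory layout roughly like this (a sketch; actual paths depend on the configuration):

    ~/.openml/cache/datasets/     # self.dataset_cache_dir
    ~/.openml/cache/tasks/        # self.task_cache_dir
    ~/.openml/private/datasets/   # self._private_directory_datasets
    ~/.openml/private/tasks/      # self._private_directory_tasks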
@@ -213,7 +233,8 @@ def _parse_config(self):
'password': '',
'server': OPENML_URL,
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache')}
'cachedir': os.path.expanduser('~/.openml/cache'),
'private_directory': os.path.expanduser('~/.openml/private')}

config_file = os.path.expanduser('~/.openml/config')
config = configparser.RawConfigParser(defaults=defaults)
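Constructor arguments override these file-based defaults through the FAKE_SECTION writes in __init__ above. A hypothetical override, e.g. for tests (values are placeholders; verbosity is passed as a string because RawConfigParser stores strings):

    # authenticate=False assumed appropriate when no credentials are set up.
    connector = APIConnector(verbosity="1", authenticate=False,
                             private_directory="/tmp/openml_private_test")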
@@ -243,26 +264,30 @@ def _parse_config(self):
# Local getters/accessors to the cache directory
def get_list_of_cached_datasets(self):
"""Return list with ids of all cached datasets"""
directory_content = os.listdir(self.dataset_cache_dir)
directory_content.sort()

# Find all dataset ids for which we have downloaded the dataset
# description
datasets = []
for directory_name in directory_content:
# First check if the directory name could be an OpenML dataset id
if not re.match(r"[0-9]*", directory_name):
continue

did = int(directory_name)
for dataset_cache_dir in [self.dataset_cache_dir,
self._private_directory_datasets]:
directory_content = os.listdir(dataset_cache_dir)
directory_content.sort()

directory_name = os.path.join(self.dataset_cache_dir,
directory_name)
dataset_directory_content = os.listdir(directory_name)
# Find all dataset ids for which we have downloaded the dataset
# description

if "dataset.arff" in dataset_directory_content and \
"description.xml" in dataset_directory_content:
datasets.append(did)
for directory_name in directory_content:
# First check if the directory name could be an OpenML dataset id
if not re.match(r"[0-9]*", directory_name):
continue

did = int(directory_name)

directory_name = os.path.join(dataset_cache_dir,
directory_name)
dataset_directory_content = os.listdir(directory_name)

if "dataset.arff" in dataset_directory_content and \
"description.xml" in dataset_directory_content:
datasets.append(did)

datasets.sort()
return datasets
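Both the public dataset cache and the private directory now feed this listing; a short usage sketch (connector is a placeholder instance):

    cached_dids = connector.get_list_of_cached_datasets()
    # e.g. [2, 61, 1590] -- whichever dataset ids are fully cached
    # (both dataset.arff and description.xml present), sorted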
@@ -281,57 +306,117 @@ def get_cached_datasets(self):

def get_cached_dataset(self, did):
# This code is slow...replace it with new API calls
description = self.download_dataset_description(did)
arff_file = self.download_dataset_arff(did, description=description)
description = self._get_cached_dataset_description(did)
arff_file = self._get_cached_dataset_arff(did)
dataset = self._create_dataset_from_description(description, arff_file)

return dataset

def get_cached_tasks(self):
directory_content = os.listdir(self.task_cache_dir)
directory_content.sort()
def _get_cached_dataset_description(self, did):
for dataset_cache_dir in [self.dataset_cache_dir,
self._private_directory_datasets]:
did_cache_dir = os.path.join(dataset_cache_dir, str(did))
description_file = os.path.join(did_cache_dir, "description.xml")

try:
with open(description_file) as fh:
dataset_xml = fh.read()
except (IOError, OSError):
continue

return xmltodict.parse(dataset_xml)["oml:data_set_description"]

raise OpenMLCacheException("Dataset description for did %d not "
"cached" % did)


def _get_cached_dataset_arff(self, did):
for dataset_cache_dir in [self.dataset_cache_dir,
self._private_directory_datasets]:
did_cache_dir = os.path.join(dataset_cache_dir, str(did))
output_file = os.path.join(did_cache_dir, "dataset.arff")

try:
# Opening the file (and closing it right away) doubles as an
# existence/readability check.
with open(output_file):
pass
return output_file
except (OSError, IOError):
continue

raise OpenMLCacheException("ARFF file for did %d not "
"cached" % did)


# Find all dataset ids for which we have downloaded the dataset
# description
def get_cached_tasks(self):
tasks = OrderedDict()
for filename in directory_content:
match = re.match(r"(tid)_([0-9]*)\.xml", filename)
if match:
tid = match.group(2)
tid = int(tid)
for task_cache_dir in [self.task_cache_dir,
self._private_directory_tasks]:

directory_content = os.listdir(task_cache_dir)
directory_content.sort()

# Find all dataset ids for which we have downloaded the dataset
# description

tasks[tid] = self.get_cached_task(tid)
for filename in directory_content:
match = re.match(r"(tid)_([0-9]*)\.xml", filename)
if match:
tid = match.group(2)
tid = int(tid)

tasks[tid] = self.get_cached_task(tid)

return tasks
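A sketch of the resulting mapping (the task id is a placeholder):

    tasks = connector.get_cached_tasks()  # OrderedDict mapping tid -> task
    task = tasks[1]                       # hypothetical cached task id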

def get_cached_task(self, tid):
task_file = os.path.join(self.task_cache_dir,
"tid_%d.xml" % int(tid))
for task_cache_dir in [self.task_cache_dir,
self._private_directory_tasks]:
task_file = os.path.join(task_cache_dir,
"tid_%d.xml" % int(tid))

with open(task_file) as fh:
task = self._create_task_from_xml(xml=fh.read())
return task
try:
with open(task_file) as fh:
task = self._create_task_from_xml(xml=fh.read())
return task
except (OSError, IOError):
continue

def get_cached_splits(self):
directory_content = os.listdir(self.task_cache_dir)
directory_content.sort()
print("Task ID", tid)
raise Exception()

def get_cached_splits(self):
splits = OrderedDict()
for filename in directory_content:
match = re.match(r"(tid)_([0-9]*)\.arff", filename)
if match:
tid = match.group(2)
tid = int(tid)
for task_cache_dir in [self.task_cache_dir,
self._private_directory_tasks]:
directory_content = os.listdir(task_cache_dir)
directory_content.sort()

splits[tid] = self.get_cached_split(tid)

for filename in directory_content:
match = re.match(r"(tid)_([0-9]*)\.arff", filename)
if match:
tid = match.group(2)
tid = int(tid)

splits[tid] = self.get_cached_split(tid)

return splits

def get_cached_split(self, tid):
split_file = os.path.join(self.task_cache_dir,
"tid_%d.arff" % int(tid))
split = OpenMLSplit.from_arff_file(split_file)
return split
for task_cache_dir in [self.task_cache_dir,
self._private_directory_tasks]:
try:
split_file = os.path.join(task_cache_dir,
"tid_%d.arff" % int(tid))
split = OpenMLSplit.from_arff_file(split_file)
return split

except (OSError, IOError):
continue

print("Task ID", tid)
raise Exception()

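Usage sketch for the split accessor (the tid is a placeholder):

    split = connector.get_cached_split(1)  # OpenMLSplit parsed from tid_1.arff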
############################################################################
# Remote getters/API calls to OpenML
@@ -462,14 +547,14 @@ def download_dataset_description(self, did):
description_file = os.path.join(did_cache_dir, "description.xml")

try:
with open(description_file) as fh:
dataset_xml = fh.read()
except (IOError, OSError):
return self._get_cached_dataset_description(did)
except OpenMLCacheException:
try:
return_code, dataset_xml = self._perform_api_call(
"openml.data.description", data_id=did)
except URLError as e:
except (URLError, UnicodeEncodeError) as e:
# TODO logger.debug
self._remove_dataset_cache_dir(did)
print(e)
raise e

Expand All @@ -481,6 +566,7 @@ def download_dataset_description(self, did):
"oml:data_set_description"]
except Exception as e:
# TODO logger.debug
self._remove_dataset_cache_dir(did)
print("Dataset ID", did)
raise e

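For reference, the parsed description is the nested dict produced by xmltodict; a sketch of typical access (field names other than oml:id are assumptions, not taken from this diff):

    description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
    did = description["oml:id"]         # used by _create_dataset_from_description
    name = description.get("oml:name")  # hypothetical field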
@@ -526,7 +612,7 @@ def download_dataset_features(self, did):
try:
return_code, features_xml = self._perform_api_call(
"openml.data.features", data_id=did)
except URLError as e:
except (URLError, UnicodeEncodeError) as e:
# TODO logger.debug
print(e)
raise e
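Every remote call in this diff widens its except clause to (URLError, UnicodeEncodeError). A plausible reason (an assumption about intent, not stated in the PR) is that under Python 2, non-ASCII text reaching the URL or POST-body encoding step raises UnicodeEncodeError before any network error can occur:

    # Minimal illustration of the failure mode being caught:
    u"caf\u00e9".encode("ascii")  # raises UnicodeEncodeError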
@@ -550,7 +636,7 @@ def download_dataset_qualities(self, did):
try:
return_code, qualities_xml = self._perform_api_call(
"openml.data.qualities", data_id=did)
except URLError as e:
except (URLError, UnicodeEncodeError) as e:
# TODO logger.debug
print(e)
raise e
@@ -575,6 +661,14 @@ def _create_dataset_cache_dir(self, did):
pass
return dataset_cache_dir

def _remove_dataset_cache_dir(self, did):
dataset_cache_dir = os.path.join(self.dataset_cache_dir, str(did))
try:
os.rmdir(dataset_cache_dir)
except (OSError, IOError):
# TODO add debug information
pass

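Note that os.rmdir above only removes an empty directory, so a cache directory already holding a partially written description.xml would survive this cleanup. A stricter hypothetical variant (explicitly not what this PR does) would be:

    import shutil
    shutil.rmtree(dataset_cache_dir, ignore_errors=True)  # removes contents too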
def _create_dataset_from_description(self, description, arff_file):
dataset = OpenMLDataset(
description["oml:id"],
@@ -680,7 +774,7 @@ def download_task(self, task_id):
try:
return_code, task_xml = self._perform_api_call(
"openml.task.search", task_id=task_id)
except URLError as e:
except (URLError, UnicodeEncodeError) as e:
print(e)
raise e

@@ -764,7 +858,7 @@ def _download_split(self, task, cache_file):
split_url = task.estimation_procedure["data_splits_url"]
try:
return_code, split_arff = self._read_url(split_url)
except URLError as e:
except (URLError, UnicodeEncodeError) as e:
print(e, split_url)
raise e
