Merge eff74f6 into b3262b6

openml · Apr 5, 2017 · 8a6f4a8 · 8a6f4a8
2 parents b3262b6 + eff74f6
commit 8a6f4a8
Show file tree

Hide file tree

Showing 14 changed files with 352 additions and 156 deletions.
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -3,9 +3,10 @@
 import requests
 import arff
 import warnings
+import xmltodict
 
 from . import config
-from .exceptions import OpenMLServerError
+from .exceptions import OpenMLServerError, OpenMLServerException
 
 
 def _perform_api_call(call, data=None, file_dictionary=None,
@@ -80,7 +81,7 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
     # 'gzip,deflate'
     response = requests.post(url, data=data, files=file_elements)
     if response.status_code != 200:
-        raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
+        raise _parse_server_exception(response)
     if 'Content-Encoding' not in response.headers or \
             response.headers['Content-Encoding'] != 'gzip':
         warnings.warn('Received uncompressed content from OpenML for %s.' % url)
@@ -97,8 +98,23 @@ def _read_url(url, data=None):
     response = requests.post(url, data=data)
 
     if response.status_code != 200:
-        raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
+        raise _parse_server_exception(response)
     if 'Content-Encoding' not in response.headers or \
             response.headers['Content-Encoding'] != 'gzip':
         warnings.warn('Received uncompressed content from OpenML for %s.' % url)
     return response.text
+
+def _parse_server_exception(response):
+    # OpenML has a sopisticated error system
+    # where information about failures is provided. try to parse this
+    try:
+        server_exception = xmltodict.parse(response.text)
+    except:
+        raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
+
+    code = int(server_exception['oml:error']['oml:code'])
+    message = server_exception['oml:error']['oml:message']
+    additional = None
+    if 'oml:additional_information' in server_exception['oml:error']:
+        additional = server_exception['oml:error']['oml:additional_information']
+    return OpenMLServerException(code, message, additional)
diff --git a/openml/config.py b/openml/config.py
@@ -36,6 +36,7 @@ def _setup():
     """
     global apikey
     global server
+    global avoid_duplicate_runs
     # read config file, create cache directory
     try:
         os.mkdir(os.path.expanduser('~/.openml'))
@@ -46,6 +47,7 @@ def _setup():
     apikey = config.get('FAKE_SECTION', 'apikey')
     server = config.get('FAKE_SECTION', 'server')
     cache_dir = config.get('FAKE_SECTION', 'cachedir')
+    avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
     set_cache_directory(cache_dir)
 
 
@@ -84,7 +86,8 @@ def _parse_config():
     defaults = {'apikey': apikey,
                 'server': server,
                 'verbosity': 0,
-                'cachedir': os.path.expanduser('~/.openml/cache')}
+                'cachedir': os.path.expanduser('~/.openml/cache'),
+                'avoid_duplicate_runs': 'True'}
 
     config_file = os.path.expanduser('~/.openml/config')
     config = configparser.RawConfigParser(defaults=defaults)

diff --git a/openml/exceptions.py b/openml/exceptions.py
@@ -2,13 +2,25 @@ class PyOpenMLError(Exception):
     def __init__(self, message):
         super(PyOpenMLError, self).__init__(message)
 
-
 class OpenMLServerError(PyOpenMLError):
-    """Server didn't respond 200."""
+    """class for when something is really wrong on the server
+       (result did not parse to dict), contains unparsed error."""
+
     def __init__(self, message):
         message = "OpenML Server error: " + message
         super(OpenMLServerError, self).__init__(message)
 
+#
+class OpenMLServerException(OpenMLServerError):
+    """exception for when the result of the server was
+       not 200 (e.g., listing call w/o results). """
+
+    def __init__(self, code, message, additional=None):
+        self.code = code
+        self.additional = additional
+        message = "OpenML Server exception: " + message
+        super(OpenMLServerException, self).__init__(message)
+
 
 class OpenMLCacheException(PyOpenMLError):
     """Dataset / task etc not found in cache"""

diff --git a/openml/flows/__init__.py b/openml/flows/__init__.py
@@ -1,6 +1,6 @@
 from .flow import OpenMLFlow
 from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
-from .functions import get_flow, list_flows
+from .functions import get_flow, list_flows, flow_exists
 
 __all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
-           'sklearn_to_flow', 'flow_to_sklearn']
+           'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
diff --git a/openml/flows/flow.py b/openml/flows/flow.py
@@ -340,58 +340,6 @@ def publish(self):
         self.flow_id = int(xmltodict.parse(return_value)['oml:upload_flow']['oml:id'])
         return self
 
-    def _ensure_flow_exists(self):
-        """ Checks if a flow exists for the given model and possibly creates it.
-
-        If the given flow exists on the server, the flow-id will simply
-        be returned. Otherwise it will be uploaded to the server.
-
-        Returns
-        -------
-        flow_id : int
-            Flow id on the server.
-        """
-        _, flow_id = _check_flow_exists(self.name, self.external_version)
-        # TODO add numpy and scipy version!
-
-        if int(flow_id) == -1:
-            flow = self.publish()
-            return int(flow.flow_id)
-
-        return int(flow_id)
-
-
-def _check_flow_exists(name, version):
-    """Retrieves the flow id of the flow uniquely identified by name+version.
-
-    Parameter
-    ---------
-    name : string
-        Name of the flow
-    version : string
-        Version information associated with flow.
-
-    Returns
-    -------
-    flow_exist : int
-        Flow id or -1 if the flow doesn't exist.
-
-    Notes
-    -----
-    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
-    """
-    if not (type(name) is str and len(name) > 0):
-        raise ValueError('Argument \'name\' should be a non-empty string')
-    if not (type(version) is str and len(version) > 0):
-        raise ValueError('Argument \'version\' should be a non-empty string')
-
-    xml_response = _perform_api_call("flow/exists",
-                                     data={'name': name, 'external_version': version})
-
-    xml_dict = xmltodict.parse(xml_response)
-    flow_id = xml_dict['oml:flow_exists']['oml:id']
-    return xml_response, flow_id
-
 
 def _add_if_nonempty(dic, key, value):
     if value is not None:

diff --git a/openml/flows/functions.py b/openml/flows/functions.py
@@ -1,4 +1,5 @@
 import xmltodict
+import six
 
 from openml._api_calls import _perform_api_call
 from . import OpenMLFlow, flow_to_sklearn
@@ -69,6 +70,41 @@ def list_flows(offset=None, size=None, tag=None):
     return _list_flows(api_call)
 
 
+def flow_exists(name, external_version):
+    """Retrieves the flow id of the flow uniquely identified by name + external_version.
+
+    Parameter
+    ---------
+    name : string
+        Name of the flow
+    version : string
+        Version information associated with flow.
+
+    Returns
+    -------
+    flow_exist : int
+        flow id iff exists, False otherwise
+
+    Notes
+    -----
+    see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
+    """
+    if not (isinstance(name, six.string_types) and len(name) > 0):
+        raise ValueError('Argument \'name\' should be a non-empty string')
+    if not (isinstance(name, six.string_types) and len(external_version) > 0):
+        raise ValueError('Argument \'version\' should be a non-empty string')
+
+    xml_response = _perform_api_call("flow/exists",
+                                     data={'name': name, 'external_version': external_version})
+
+    result_dict = xmltodict.parse(xml_response)
+    flow_id = int(result_dict['oml:flow_exists']['oml:id'])
+    if flow_id > 0:
+        return flow_id
+    else:
+        return False
+
+
 def _list_flows(api_call):
     # TODO add proper error handling here!
     xml_string = _perform_api_call(api_call)

diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -4,12 +4,14 @@
 import xmltodict
 import numpy as np
 import warnings
+import openml
 from sklearn.model_selection._search import BaseSearchCV
 
 from ..exceptions import PyOpenMLError
 from .. import config
-from ..flows import sklearn_to_flow
-from ..exceptions import OpenMLCacheException
+from ..flows import sklearn_to_flow, get_flow, flow_exists
+from ..setups import setup_exists
+from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..util import URLError
 from ..tasks.functions import _create_task_from_xml
 from .._api_calls import _perform_api_call
@@ -21,7 +23,7 @@
 
 
 
-def run_task(task, model):
+def run_task(task, model, avoid_duplicate_runs=True):
     """Performs a CV run on the dataset of the given task, using the split.
 
     Parameters
@@ -42,6 +44,19 @@ def run_task(task, model):
     # TODO move this into its onwn module. While it somehow belongs here, it
     # adds quite a lot of functionality which is better suited in other places!
     # TODO why doesn't this accept a flow as input? - this would make this more flexible!
+    flow = sklearn_to_flow(model)
+
+    # returns flow id if the flow exists on the server, False otherwise
+    flow_id = flow_exists(flow.name, flow.external_version)
+
+    # skips the run if it already exists and the user opts for this in the config file.
+    # also, if the flow is not present on the server, the check is not needed.
+    if avoid_duplicate_runs and flow_id:
+        flow = get_flow(flow_id)
+        setup_id = setup_exists(flow, model)
+        ids = _run_exists(task.task_id, setup_id)
+        if ids:
+            raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))
 
     dataset = task.get_dataset()
     X, Y = dataset.get_data(target=task.target_name)
@@ -55,19 +70,44 @@ def run_task(task, model):
     run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
     run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)
 
-    # now generate the flow
-    flow = sklearn_to_flow(model)
-    flow_id = flow._ensure_flow_exists()
-    if flow_id < 0:
-        print("No flow")
-        return 0, 2
-    config.logger.info(flow_id)
+    if flow_id == False:
+        # means the flow did not exists.
+        # As we could run it, publish it now
+        flow = flow.publish()
+    else:
+        # flow already existed, download it from server
+        # TODO (neccessary? is this a post condition of this function)
+        flow = get_flow(flow_id)
 
-    # attach the flow to the run
-    run.flow_id = flow_id
+    run.flow_id = flow.flow_id
+    config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))
 
     return run
 
+def _run_exists(task_id, setup_id):
+    '''
+    Checks whether a task/setup combination is already present on the server.
+
+    :param task_id: int
+    :param setup_id: int
+    :return: List of run ids iff these already exists on the server, False otherwise
+    '''
+    if setup_id <= 0:
+        # openml setups are in range 1-inf
+        return False
+
+    try:
+        result = list_runs(task=[task_id], setup=[setup_id])
+        if len(result) > 0:
+            return set(result.keys())
+        else:
+            return False
+    except OpenMLServerException as exception:
+        # error code 512 implies no results. This means the run does not exist yet
+        assert(exception.code == 512)
+        return False
+
+
 
 def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
                        predicted_probabilities, class_labels, model_classes_mapping):
@@ -275,27 +315,28 @@ def _create_run_from_xml(xml):
     evaluations = dict()
     detailed_evaluations = defaultdict(lambda: defaultdict(dict))
     evaluation_flows = dict()
-    for evaluation_dict in run['oml:output_data']['oml:evaluation']:
-        key = evaluation_dict['oml:name']
-        if 'oml:value' in evaluation_dict:
-            value = float(evaluation_dict['oml:value'])
-        elif 'oml:array_data' in evaluation_dict:
-            value = evaluation_dict['oml:array_data']
-        else:
-            raise ValueError('Could not find keys "value" or "array_data" '
-                             'in %s' % str(evaluation_dict.keys()))
-
-        if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
-            repeat = int(evaluation_dict['@repeat'])
-            fold = int(evaluation_dict['@fold'])
-            repeat_dict = detailed_evaluations[key]
-            fold_dict = repeat_dict[repeat]
-            fold_dict[fold] = value
-        else:
-            evaluations[key] = value
-            evaluation_flows[key] = flow_id
+    if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
+        for evaluation_dict in run['oml:output_data']['oml:evaluation']:
+            key = evaluation_dict['oml:name']
+            if 'oml:value' in evaluation_dict:
+                value = float(evaluation_dict['oml:value'])
+            elif 'oml:array_data' in evaluation_dict:
+                value = evaluation_dict['oml:array_data']
+            else:
+                raise ValueError('Could not find keys "value" or "array_data" '
+                                 'in %s' % str(evaluation_dict.keys()))
+
+            if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
+                repeat = int(evaluation_dict['@repeat'])
+                fold = int(evaluation_dict['@fold'])
+                repeat_dict = detailed_evaluations[key]
+                fold_dict = repeat_dict[repeat]
+                fold_dict[fold] = value
+            else:
+                evaluations[key] = value
+                evaluation_flows[key] = flow_id
 
-        evaluation_flows[key] = flow_id
+            evaluation_flows[key] = flow_id
 
     return OpenMLRun(run_id=run_id, uploader=uploader,
                      uploader_name=uploader_name, task_id=task_id,
@@ -325,7 +366,7 @@ def _get_cached_run(run_id):
                                    "cached" % run_id)
 
 
-def list_runs(offset=None, size=None, id=None, task=None,
+def list_runs(offset=None, size=None, id=None, task=None, setup=None,
               flow=None, uploader=None, tag=None):
     """List all runs matching all of the given filters.
 
@@ -342,6 +383,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
 
     task : list, optional
 
+    setup: list, optional
+
     flow : list, optional
 
     uploader : list, optional
@@ -363,6 +406,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
         api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
     if task is not None:
         api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
+    if setup is not None:
+        api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
     if flow is not None:
         api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
     if uploader is not None: