Skip to content

Commit

Permalink
Merge eff74f6 into b3262b6
Browse files Browse the repository at this point in the history
  • Loading branch information
janvanrijn committed Apr 5, 2017
2 parents b3262b6 + eff74f6 commit 8a6f4a8
Show file tree
Hide file tree
Showing 14 changed files with 352 additions and 156 deletions.
22 changes: 19 additions & 3 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
import requests
import arff
import warnings
import xmltodict

from . import config
from .exceptions import OpenMLServerError
from .exceptions import OpenMLServerError, OpenMLServerException


def _perform_api_call(call, data=None, file_dictionary=None,
Expand Down Expand Up @@ -80,7 +81,7 @@ def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
# 'gzip,deflate'
response = requests.post(url, data=data, files=file_elements)
if response.status_code != 200:
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
raise _parse_server_exception(response)
if 'Content-Encoding' not in response.headers or \
response.headers['Content-Encoding'] != 'gzip':
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
Expand All @@ -97,8 +98,23 @@ def _read_url(url, data=None):
response = requests.post(url, data=data)

if response.status_code != 200:
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)
raise _parse_server_exception(response)
if 'Content-Encoding' not in response.headers or \
response.headers['Content-Encoding'] != 'gzip':
warnings.warn('Received uncompressed content from OpenML for %s.' % url)
return response.text

def _parse_server_exception(response):
# OpenML has a sopisticated error system
# where information about failures is provided. try to parse this
try:
server_exception = xmltodict.parse(response.text)
except:
raise OpenMLServerError(('Status code: %d\n' % response.status_code) + response.text)

code = int(server_exception['oml:error']['oml:code'])
message = server_exception['oml:error']['oml:message']
additional = None
if 'oml:additional_information' in server_exception['oml:error']:
additional = server_exception['oml:error']['oml:additional_information']
return OpenMLServerException(code, message, additional)
5 changes: 4 additions & 1 deletion openml/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def _setup():
"""
global apikey
global server
global avoid_duplicate_runs
# read config file, create cache directory
try:
os.mkdir(os.path.expanduser('~/.openml'))
Expand All @@ -46,6 +47,7 @@ def _setup():
apikey = config.get('FAKE_SECTION', 'apikey')
server = config.get('FAKE_SECTION', 'server')
cache_dir = config.get('FAKE_SECTION', 'cachedir')
avoid_duplicate_runs = config.getboolean('FAKE_SECTION', 'avoid_duplicate_runs')
set_cache_directory(cache_dir)


Expand Down Expand Up @@ -84,7 +86,8 @@ def _parse_config():
defaults = {'apikey': apikey,
'server': server,
'verbosity': 0,
'cachedir': os.path.expanduser('~/.openml/cache')}
'cachedir': os.path.expanduser('~/.openml/cache'),
'avoid_duplicate_runs': 'True'}

config_file = os.path.expanduser('~/.openml/config')
config = configparser.RawConfigParser(defaults=defaults)
Expand Down
16 changes: 14 additions & 2 deletions openml/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,25 @@ class PyOpenMLError(Exception):
def __init__(self, message):
super(PyOpenMLError, self).__init__(message)


class OpenMLServerError(PyOpenMLError):
"""Server didn't respond 200."""
"""class for when something is really wrong on the server
(result did not parse to dict), contains unparsed error."""

def __init__(self, message):
message = "OpenML Server error: " + message
super(OpenMLServerError, self).__init__(message)

#
class OpenMLServerException(OpenMLServerError):
"""exception for when the result of the server was
not 200 (e.g., listing call w/o results). """

def __init__(self, code, message, additional=None):
self.code = code
self.additional = additional
message = "OpenML Server exception: " + message
super(OpenMLServerException, self).__init__(message)


class OpenMLCacheException(PyOpenMLError):
"""Dataset / task etc not found in cache"""
Expand Down
4 changes: 2 additions & 2 deletions openml/flows/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .flow import OpenMLFlow
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
from .functions import get_flow, list_flows
from .functions import get_flow, list_flows, flow_exists

__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow', 'list_flows',
'sklearn_to_flow', 'flow_to_sklearn']
'sklearn_to_flow', 'flow_to_sklearn', 'flow_exists']
52 changes: 0 additions & 52 deletions openml/flows/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,58 +340,6 @@ def publish(self):
self.flow_id = int(xmltodict.parse(return_value)['oml:upload_flow']['oml:id'])
return self

def _ensure_flow_exists(self):
""" Checks if a flow exists for the given model and possibly creates it.
If the given flow exists on the server, the flow-id will simply
be returned. Otherwise it will be uploaded to the server.
Returns
-------
flow_id : int
Flow id on the server.
"""
_, flow_id = _check_flow_exists(self.name, self.external_version)
# TODO add numpy and scipy version!

if int(flow_id) == -1:
flow = self.publish()
return int(flow.flow_id)

return int(flow_id)


def _check_flow_exists(name, version):
"""Retrieves the flow id of the flow uniquely identified by name+version.
Parameter
---------
name : string
Name of the flow
version : string
Version information associated with flow.
Returns
-------
flow_exist : int
Flow id or -1 if the flow doesn't exist.
Notes
-----
see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
"""
if not (type(name) is str and len(name) > 0):
raise ValueError('Argument \'name\' should be a non-empty string')
if not (type(version) is str and len(version) > 0):
raise ValueError('Argument \'version\' should be a non-empty string')

xml_response = _perform_api_call("flow/exists",
data={'name': name, 'external_version': version})

xml_dict = xmltodict.parse(xml_response)
flow_id = xml_dict['oml:flow_exists']['oml:id']
return xml_response, flow_id


def _add_if_nonempty(dic, key, value):
if value is not None:
Expand Down
36 changes: 36 additions & 0 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import xmltodict
import six

from openml._api_calls import _perform_api_call
from . import OpenMLFlow, flow_to_sklearn
Expand Down Expand Up @@ -69,6 +70,41 @@ def list_flows(offset=None, size=None, tag=None):
return _list_flows(api_call)


def flow_exists(name, external_version):
"""Retrieves the flow id of the flow uniquely identified by name + external_version.
Parameter
---------
name : string
Name of the flow
version : string
Version information associated with flow.
Returns
-------
flow_exist : int
flow id iff exists, False otherwise
Notes
-----
see http://www.openml.org/api_docs/#!/flow/get_flow_exists_name_version
"""
if not (isinstance(name, six.string_types) and len(name) > 0):
raise ValueError('Argument \'name\' should be a non-empty string')
if not (isinstance(name, six.string_types) and len(external_version) > 0):
raise ValueError('Argument \'version\' should be a non-empty string')

xml_response = _perform_api_call("flow/exists",
data={'name': name, 'external_version': external_version})

result_dict = xmltodict.parse(xml_response)
flow_id = int(result_dict['oml:flow_exists']['oml:id'])
if flow_id > 0:
return flow_id
else:
return False


def _list_flows(api_call):
# TODO add proper error handling here!
xml_string = _perform_api_call(api_call)
Expand Down
111 changes: 78 additions & 33 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import xmltodict
import numpy as np
import warnings
import openml
from sklearn.model_selection._search import BaseSearchCV

from ..exceptions import PyOpenMLError
from .. import config
from ..flows import sklearn_to_flow
from ..exceptions import OpenMLCacheException
from ..flows import sklearn_to_flow, get_flow, flow_exists
from ..setups import setup_exists
from ..exceptions import OpenMLCacheException, OpenMLServerException
from ..util import URLError
from ..tasks.functions import _create_task_from_xml
from .._api_calls import _perform_api_call
Expand All @@ -21,7 +23,7 @@



def run_task(task, model):
def run_task(task, model, avoid_duplicate_runs=True):
"""Performs a CV run on the dataset of the given task, using the split.
Parameters
Expand All @@ -42,6 +44,19 @@ def run_task(task, model):
# TODO move this into its onwn module. While it somehow belongs here, it
# adds quite a lot of functionality which is better suited in other places!
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
flow = sklearn_to_flow(model)

# returns flow id if the flow exists on the server, False otherwise
flow_id = flow_exists(flow.name, flow.external_version)

# skips the run if it already exists and the user opts for this in the config file.
# also, if the flow is not present on the server, the check is not needed.
if avoid_duplicate_runs and flow_id:
flow = get_flow(flow_id)
setup_id = setup_exists(flow, model)
ids = _run_exists(task.task_id, setup_id)
if ids:
raise PyOpenMLError("Run already exists in server. Run id(s): %s" %str(ids))

dataset = task.get_dataset()
X, Y = dataset.get_data(target=task.target_name)
Expand All @@ -55,19 +70,44 @@ def run_task(task, model):
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)

# now generate the flow
flow = sklearn_to_flow(model)
flow_id = flow._ensure_flow_exists()
if flow_id < 0:
print("No flow")
return 0, 2
config.logger.info(flow_id)
if flow_id == False:
# means the flow did not exists.
# As we could run it, publish it now
flow = flow.publish()
else:
# flow already existed, download it from server
# TODO (neccessary? is this a post condition of this function)
flow = get_flow(flow_id)

# attach the flow to the run
run.flow_id = flow_id
run.flow_id = flow.flow_id
config.logger.info('Executed Task %d with Flow id: %d' %(task.task_id, run.flow_id))

return run

def _run_exists(task_id, setup_id):
'''
Checks whether a task/setup combination is already present on the server.
:param task_id: int
:param setup_id: int
:return: List of run ids iff these already exists on the server, False otherwise
'''
if setup_id <= 0:
# openml setups are in range 1-inf
return False

try:
result = list_runs(task=[task_id], setup=[setup_id])
if len(result) > 0:
return set(result.keys())
else:
return False
except OpenMLServerException as exception:
# error code 512 implies no results. This means the run does not exist yet
assert(exception.code == 512)
return False



def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
predicted_probabilities, class_labels, model_classes_mapping):
Expand Down Expand Up @@ -275,27 +315,28 @@ def _create_run_from_xml(xml):
evaluations = dict()
detailed_evaluations = defaultdict(lambda: defaultdict(dict))
evaluation_flows = dict()
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
key = evaluation_dict['oml:name']
if 'oml:value' in evaluation_dict:
value = float(evaluation_dict['oml:value'])
elif 'oml:array_data' in evaluation_dict:
value = evaluation_dict['oml:array_data']
else:
raise ValueError('Could not find keys "value" or "array_data" '
'in %s' % str(evaluation_dict.keys()))

if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
repeat = int(evaluation_dict['@repeat'])
fold = int(evaluation_dict['@fold'])
repeat_dict = detailed_evaluations[key]
fold_dict = repeat_dict[repeat]
fold_dict[fold] = value
else:
evaluations[key] = value
evaluation_flows[key] = flow_id
if 'oml:output_data' in run and 'oml:evaluation' in run['oml:output_data']:
for evaluation_dict in run['oml:output_data']['oml:evaluation']:
key = evaluation_dict['oml:name']
if 'oml:value' in evaluation_dict:
value = float(evaluation_dict['oml:value'])
elif 'oml:array_data' in evaluation_dict:
value = evaluation_dict['oml:array_data']
else:
raise ValueError('Could not find keys "value" or "array_data" '
'in %s' % str(evaluation_dict.keys()))

if '@repeat' in evaluation_dict and '@fold' in evaluation_dict:
repeat = int(evaluation_dict['@repeat'])
fold = int(evaluation_dict['@fold'])
repeat_dict = detailed_evaluations[key]
fold_dict = repeat_dict[repeat]
fold_dict[fold] = value
else:
evaluations[key] = value
evaluation_flows[key] = flow_id

evaluation_flows[key] = flow_id
evaluation_flows[key] = flow_id

return OpenMLRun(run_id=run_id, uploader=uploader,
uploader_name=uploader_name, task_id=task_id,
Expand Down Expand Up @@ -325,7 +366,7 @@ def _get_cached_run(run_id):
"cached" % run_id)


def list_runs(offset=None, size=None, id=None, task=None,
def list_runs(offset=None, size=None, id=None, task=None, setup=None,
flow=None, uploader=None, tag=None):
"""List all runs matching all of the given filters.
Expand All @@ -342,6 +383,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
task : list, optional
setup: list, optional
flow : list, optional
uploader : list, optional
Expand All @@ -363,6 +406,8 @@ def list_runs(offset=None, size=None, id=None, task=None,
api_call += "/run/%s" % ','.join([str(int(i)) for i in id])
if task is not None:
api_call += "/task/%s" % ','.join([str(int(i)) for i in task])
if setup is not None:
api_call += "/setup/%s" % ','.join([str(int(i)) for i in setup])
if flow is not None:
api_call += "/flow/%s" % ','.join([str(int(i)) for i in flow])
if uploader is not None:
Expand Down
Loading

0 comments on commit 8a6f4a8

Please sign in to comment.