From 10940a2a2e69ca8ffef690ed1f94a3f2cd50a218 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Tue, 25 Apr 2017 15:52:50 +0200 Subject: [PATCH 01/16] seed functionality --- openml/runs/functions.py | 46 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 9cb08d7e5..970108342 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -24,7 +24,7 @@ -def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): +def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None): """Performs a CV run on the dataset of the given task, using the split. Parameters @@ -35,8 +35,13 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): a model which has a function fit(X,Y) and predict(X), all supervised estimators of scikit learn follow this definition of a model [1] [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) + avoid_duplicate_runs : bool + if this flag is set to True, the run will throw an error if the + setup/task combination is already present on the server. flow_tags : list(str) a list of tags that the flow should have at creation + seed: int + the models that are not seeded will get this seed Returns ------- @@ -48,6 +53,7 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): # TODO move this into its onwn module. While it somehow belongs here, it # adds quite a lot of functionality which is better suited in other places! # TODO why doesn't this accept a flow as input? - this would make this more flexible! 
+ model = _get_seeded_model(model, seed) flow = sklearn_to_flow(model) # returns flow id if the flow exists on the server, False otherwise @@ -111,6 +117,44 @@ def _run_exists(task_id, setup_id): assert(exception.code == 512) return False +def _get_seeded_model(model, seed=None): + '''Sets all the non-seeded components of a model with a seed. + + Parameters + ---------- + model : sklearn model + The model to be seeded + seed : int + The seed to initialize the RandomState with. Unseeded subcomponents + will be seeded with a random number from the RandomState. + + Returns + ------- + model : sklearn model + a version of the model where all (sub)components have + a seed + ''' + + rs = np.random.RandomState(seed) + model_params = model.get_params() + random_states = {} + for param_name in sorted(model_params): + if 'random_state' in param_name: + currentValue = model_params[param_name] + # important to draw the value at this point (and not in the if statement) + newValue = rs.randint(0, 2**16) + if currentValue is None: + random_states[param_name] = newValue + elif isinstance(currentValue, int): + # acceptable behaviour + pass + elif isinstance(currentValue, np.random.RandomState): + raise ValueError('Models initialized with a RandomState object are not supported. Please seed with an integer. ') + else: + raise ValueError('Models should be seeded with int or None (this should never happen). 
') + model.set_params(**random_states) + return model + def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, From 982898e0fd6c5ce11eb39e50195cf343fa3abcbb Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Tue, 25 Apr 2017 21:57:26 +0200 Subject: [PATCH 02/16] implemented setup data structure --- openml/runs/functions.py | 22 +++++- openml/setups/__init__.py | 4 +- openml/setups/functions.py | 82 ++++++++++++++++++++++- tests/test_setups/test_setup_functions.py | 16 +++++ 4 files changed, 117 insertions(+), 7 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 970108342..0959e980f 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -6,15 +6,13 @@ import warnings import sklearn import time -from sklearn.model_selection._search import BaseSearchCV from ..exceptions import PyOpenMLError from .. import config from ..flows import sklearn_to_flow, get_flow, flow_exists -from ..setups import setup_exists +from ..setups import setup_exists, initialize_model from ..exceptions import OpenMLCacheException, OpenMLServerException from ..util import URLError, version_complies -from ..tasks.functions import _create_task_from_xml from .._api_calls import _perform_api_call from .run import OpenMLRun, _get_version_information @@ -94,6 +92,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None): return run +def initialize_model_from_run(run_id): + ''' + Initialized a model based on a run_id (i.e., using the exact + same parameter settings) + + Parameters + ---------- + run_id : int + The Openml run_id + + Returns + ------- + model : sklearn model + the scikitlearn model with all parameters initailized + ''' + run = get_run(run_id) + return initialize_model(run.setup_id) + def _run_exists(task_id, setup_id): ''' Checks whether a task/setup combination is already present on the server. 
diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py index 27c884446..c29271ec1 100644 --- a/openml/setups/__init__.py +++ b/openml/setups/__init__.py @@ -1,3 +1,3 @@ -from .functions import setup_exists +from .functions import get_setup, setup_exists, initialize_model -__all__ = ['setup_exists'] \ No newline at end of file +__all__ = ['get_setup', 'setup_exists', 'initialize_model'] \ No newline at end of file diff --git a/openml/setups/functions.py b/openml/setups/functions.py index e9167d4cc..c26b83dbb 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -2,6 +2,7 @@ import xmltodict from collections import OrderedDict +from .setup import OpenMLSetup, OpenMLParameter def setup_exists(downloaded_flow, sklearn_model): ''' @@ -34,14 +35,91 @@ def setup_exists(downloaded_flow, sklearn_model): if setup_id > 0: return setup_id else: - return False; + return False + + +def get_setup(setup_id): + ''' + Downloads the setup (configuration) description from OpenML + and returns a structured object + + Parameters + ---------- + setup_id : int + The Openml setup_id + + Returns + ------- + OpenMLSetup + an initialized openml setup object + ''' + result = openml._api_calls._perform_api_call('/setup/%d' %setup_id) + result_dict = xmltodict.parse(result) + return _create_setup_from_xml(result_dict) + + +def initialize_model(setup_id): + ''' + Initialized a model based on a setup_id (i.e., using the exact + same parameter settings) + + Parameters + ---------- + setup_id : int + The Openml setup_id + + Returns + ------- + model : sklearn model + the scikitlearn model with all parameters initailized + ''' + + setup = get_setup(setup_id) + flow = openml.flows.get_flow(setup.flow_id) + sklearn_model = openml.flows.flow_to_sklearn(flow) + # print(sklearn_model.get_params()) + + raise ValueError('not implemented yet') def _to_dict(flow_id, openml_parameter_settings): + # for convenience, this function (ab)uses the run object. 
xml = OrderedDict() xml['oml:run'] = OrderedDict() xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml' xml['oml:run']['oml:flow_id'] = flow_id xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings - return xml \ No newline at end of file + return xml + +def _create_setup_from_xml(result_dict): + ''' + Turns an API xml result into a OpenMLSetup object + ''' + flow_id = int(result_dict['oml:setup_parameters']['oml:flow_id']) + parameters = {} + if 'oml:parameter' not in result_dict['oml:setup_parameters']: + parameters = None + else: + # basically all others + xml_parameters = result_dict['oml:setup_parameters']['oml:parameter'] + if isinstance(xml_parameters, dict): + id = int(xml_parameters['oml:id']) + parameters[id] = _create_setup_parameter_from_xml(xml_parameters) + elif isinstance(xml_parameters, list): + for xml_parameter in xml_parameters: + id = int(xml_parameter['oml:id']) + parameters[id] = _create_setup_parameter_from_xml(xml_parameter) + else: + raise ValueError('Expected None, list or dict, received someting else: %s' %str(type(xml_parameters))) + + return OpenMLSetup(flow_id, parameters) + +def _create_setup_parameter_from_xml(result_dict): + return OpenMLParameter(int(result_dict['oml:id']), + int(result_dict['oml:flow_id']), + result_dict['oml:full_name'], + result_dict['oml:parameter_name'], + result_dict['oml:data_type'], + result_dict['oml:default_value'], + result_dict['oml:value']) \ No newline at end of file diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 013f25168..be5a23c8f 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -72,3 +72,19 @@ def test_existing_setup_exists(self): setup_id = openml.setups.setup_exists(flow, bagging) self.assertEquals(setup_id, run.setup_id) + def test_setup_get(self): + # no setups in default test server + openml.config.server = 'https://www.openml.org/api/v1/xml/' + + # contains 
all special cases, 0 params, 1 param, n params. + # Non scikitlearn flows. + setups = [18, 19, 20, 118] + num_params = [8, 0, 3, 1] + + for idx in range(len(setups)): + current = openml.setups.get_setup(setups[idx]) + assert current.flow_id > 0 + if num_params[idx] == 0: + assert current.parameters is None + else: + assert len(current.parameters) == num_params[idx] From 8ceb2905ef6237b4fcca991ed6955c3669426610 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Tue, 25 Apr 2017 23:00:42 +0200 Subject: [PATCH 03/16] parsing parameter values into setup --- openml/setups/functions.py | 27 ++++++++++++++++++++++++--- openml/setups/setup.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 openml/setups/setup.py diff --git a/openml/setups/functions.py b/openml/setups/functions.py index c26b83dbb..adf8556a8 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -1,5 +1,6 @@ import openml import xmltodict +import copy from collections import OrderedDict from .setup import OpenMLSetup, OpenMLParameter @@ -73,13 +74,33 @@ def initialize_model(setup_id): model : sklearn model the scikitlearn model with all parameters initailized ''' + def get_flow_dict(_flow, identifier_trace): + flow_map = {_flow.flow_id: identifier_trace} + for identifier in _flow.components: + duplicate_trace = copy.deepcopy(identifier_trace) + duplicate_trace.append(identifier) + flow_map.update(get_flow_dict(_flow.components[identifier], duplicate_trace)) + return flow_map setup = get_setup(setup_id) flow = openml.flows.get_flow(setup.flow_id) sklearn_model = openml.flows.flow_to_sklearn(flow) - # print(sklearn_model.get_params()) - - raise ValueError('not implemented yet') + identifier_trace = get_flow_dict(flow, []) + print(sklearn_model.get_params()) + print(identifier_trace) + parameter_dict = {} + for param_id in setup.parameters: + parameter = setup.parameters[param_id] + if parameter.flow_id == flow.flow_id: + # TODO: 
parse value. If serialized object (e.g., steps, estimator), skip it (?) + parameter_dict[parameter.parameter_name] = parameter.value + else: + # TODO: parse value. If serialized object (e.g., steps, estimator), skip it (?) + # find my estimator path + parameter_name = '__'.join(identifier_trace[parameter.flow_id]) + "__" + parameter.parameter_name + parameter_dict[parameter_name] = parameter.value + print(parameter_dict) + sklearn_model.set_params(**parameter_dict) def _to_dict(flow_id, openml_parameter_settings): diff --git a/openml/setups/setup.py b/openml/setups/setup.py new file mode 100644 index 000000000..e3187ad8c --- /dev/null +++ b/openml/setups/setup.py @@ -0,0 +1,35 @@ + +class OpenMLSetup(object): + """Setup object (a.k.a. Configuration). + + Parameters + ---------- + flow_id : int + The flow that it is build upon + parameters : dict + The setting of the parameters + """ + + def __init__(self, flow_id, parameters): + self.flow_id = flow_id + self.parameters = parameters + + +class OpenMLParameter(object): + """Parameter object (used in setup). 
+ + Parameters + ---------- + flow_id : int + The flow that it is build upon + parameters : dict + The setting of the parameters + """ + def __init__(self, id, flow_id, full_name, parameter_name, data_type, default_value, value): + self.id = id + self.flow_id = flow_id + self.full_name = full_name + self.parameter_name = parameter_name + self.data_type = data_type + self.default_value = default_value + self.value = value From ae85e5bf6733c96540747d2e50f06beecb9fb8f7 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 13:30:10 +0200 Subject: [PATCH 04/16] update run with new parameter extraction procedure --- openml/runs/run.py | 86 ++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/openml/runs/run.py b/openml/runs/run.py index 9a0ed855e..0c845ef16 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -165,7 +165,7 @@ def _create_description_xml(self): return description_xml @staticmethod - def _parse_parameters(model, flow): + def _parse_parameters(model, server_flow): """Extracts all parameter settings from a model in OpenML format. Parameters @@ -176,50 +176,62 @@ def _parse_parameters(model, flow): openml flow object (containing flow ids, i.e., it has to be downloaded from the server) """ - if flow.flow_id is None: + if server_flow.flow_id is None: raise ValueError("The flow parameter needs to be downloaded from server") - python_param_settings = model.get_params() - openml_param_settings = [] - def get_flow_dict(_flow): flow_map = {_flow.name: _flow.flow_id} for subflow in _flow.components: flow_map.update(get_flow_dict(_flow.components[subflow])) return flow_map - flow_dict = get_flow_dict(flow) - - for param in python_param_settings: - if "__" in param: - # parameter of subflow. 
will be handled later - continue - if isinstance(python_param_settings[param], BaseEstimator): - # extract parameters of the subflow individually - subflow = flow.components[param] - openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow) - - # add parameter setting (in some cases also the subflow. Just because we can) - if param in flow.parameters.keys(): - param_dict = OrderedDict() - param_dict['oml:name'] = param - param_dict['oml:value'] = str(python_param_settings[param]) - param_dict['oml:component'] = flow_dict[flow.name] - openml_param_settings.append(param_dict) - else: - if flow.name.startswith("sklearn.pipeline.Pipeline"): - # tolerate - pass - elif flow.name.startswith("sklearn.pipeline.FeatureUnion"): - # tolerate - pass - elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"): - # tolerate - pass - else: - raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name)) - - return openml_param_settings + def extract_parameters(_flow, _param_dict): + _params = {} + for _param_name in _flow.parameters: + _current = OrderedDict() + _current['oml:name'] = _param_name + _current['oml:value'] = _flow.parameters[_param_name] + _current['oml:component'] = _param_dict[_flow.flow_id] + _params.append(_current) + for _identifier in _flow.components: + _params.update(extract_parameters(_flow.components[_identifier], _param_dict)) + + flow_dict = get_flow_dict(server_flow) + print(flow_dict) + local_flow = openml.flows.sklearn_to_flow(model) + + parameters = extract_parameters(local_flow, flow_dict) + # + # for param in python_param_settings: + # if "__" in param: + # # parameter of subflow. 
will be handled later + # continue + # if isinstance(python_param_settings[param], BaseEstimator): + # # extract parameters of the subflow individually + # subflow = flow.components[param] + # openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow) + # + # # add parameter setting (in some cases also the subflow. Just because we can) + # if param in flow.parameters.keys(): + # param_dict = OrderedDict() + # param_dict['oml:name'] = param + # param_dict['oml:value'] = str(python_param_settings[param]) + # param_dict['oml:component'] = flow_dict[flow.name] + # openml_param_settings.append(param_dict) + # else: + # if flow.name.startswith("sklearn.pipeline.Pipeline"): + # # tolerate + # pass + # elif flow.name.startswith("sklearn.pipeline.FeatureUnion"): + # # tolerate + # pass + # elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"): + # # tolerate + # pass + # else: + # raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name)) + + return parameters ################################################################################ # Functions which cannot be in runs/functions due to circular imports From 83662d78c075e17530c98f99d765ade1fbb89ef9 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 13:53:04 +0200 Subject: [PATCH 05/16] fixed merge conflict bug, reimplemented extract parameters from run (based on sklearn converter) --- openml/runs/functions.py | 2 +- openml/runs/run.py | 38 ++++---------------------------------- 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 021283818..f73af2cf4 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -10,7 +10,7 @@ from ..exceptions import PyOpenMLError from .. 
import config -from ..flows import sklearn_to_flow, get_flow, flow_exists +from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs from ..setups import setup_exists, initialize_model from ..exceptions import OpenMLCacheException, OpenMLServerException diff --git a/openml/runs/run.py b/openml/runs/run.py index 0c845ef16..e69717487 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -186,51 +186,21 @@ def get_flow_dict(_flow): return flow_map def extract_parameters(_flow, _param_dict): - _params = {} + _params = [] for _param_name in _flow.parameters: _current = OrderedDict() _current['oml:name'] = _param_name _current['oml:value'] = _flow.parameters[_param_name] - _current['oml:component'] = _param_dict[_flow.flow_id] + _current['oml:component'] = _param_dict[_flow.name] _params.append(_current) for _identifier in _flow.components: - _params.update(extract_parameters(_flow.components[_identifier], _param_dict)) + _params.extend(extract_parameters(_flow.components[_identifier], _param_dict)) + return _params flow_dict = get_flow_dict(server_flow) - print(flow_dict) local_flow = openml.flows.sklearn_to_flow(model) parameters = extract_parameters(local_flow, flow_dict) - # - # for param in python_param_settings: - # if "__" in param: - # # parameter of subflow. will be handled later - # continue - # if isinstance(python_param_settings[param], BaseEstimator): - # # extract parameters of the subflow individually - # subflow = flow.components[param] - # openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow) - # - # # add parameter setting (in some cases also the subflow. 
Just because we can) - # if param in flow.parameters.keys(): - # param_dict = OrderedDict() - # param_dict['oml:name'] = param - # param_dict['oml:value'] = str(python_param_settings[param]) - # param_dict['oml:component'] = flow_dict[flow.name] - # openml_param_settings.append(param_dict) - # else: - # if flow.name.startswith("sklearn.pipeline.Pipeline"): - # # tolerate - # pass - # elif flow.name.startswith("sklearn.pipeline.FeatureUnion"): - # # tolerate - # pass - # elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"): - # # tolerate - # pass - # else: - # raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name)) - return parameters ################################################################################ From 71b5efdcc1188029299fc1ed890442ee9b60c0d2 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 14:13:44 +0200 Subject: [PATCH 06/16] functionality to reconstruct a flow using a given set of parameters --- openml/setups/functions.py | 52 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index adf8556a8..c28650cfe 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -74,33 +74,37 @@ def initialize_model(setup_id): model : sklearn model the scikitlearn model with all parameters initailized ''' - def get_flow_dict(_flow, identifier_trace): - flow_map = {_flow.flow_id: identifier_trace} - for identifier in _flow.components: - duplicate_trace = copy.deepcopy(identifier_trace) - duplicate_trace.append(identifier) - flow_map.update(get_flow_dict(_flow.components[identifier], duplicate_trace)) - return flow_map + def _to_dict_of_dicts(_params): + # this subfunction transforms an openml setup object into + # a dict of dicts, structured: flow_id maps to dict of + # parameter_names mapping to parameter_value + _res = {} + for _param in _params: + if 
_param.flow_id not in _res: + _res[_param.flow_id] = {} + _res[_param.flow_id][_param.parameter_name] = _param.value + return _res + + def _reconstruct_flow(_flow, _params): + # sets the values of flow parameters (and subflows) to + # the specific values from a setup. _params is a dict of + # dicts, mapping from flow id to param name to param value + # (obtained by using the subfunction _to_dict_of_dicts) + for _param in _flow.parameters: + _flow.parameters[_param] = _params[_flow.flow_id][_param] + for _identifier in flow.components: + _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params) + return _flow setup = get_setup(setup_id) + parameters = _to_dict_of_dicts(setup.parameters) flow = openml.flows.get_flow(setup.flow_id) - sklearn_model = openml.flows.flow_to_sklearn(flow) - identifier_trace = get_flow_dict(flow, []) - print(sklearn_model.get_params()) - print(identifier_trace) - parameter_dict = {} - for param_id in setup.parameters: - parameter = setup.parameters[param_id] - if parameter.flow_id == flow.flow_id: - # TODO: parse value. If serialized object (e.g., steps, estimator), skip it (?) - parameter_dict[parameter.parameter_name] = parameter.value - else: - # TODO: parse value. If serialized object (e.g., steps, estimator), skip it (?) 
- # find my estimator path - parameter_name = '__'.join(identifier_trace[parameter.flow_id]) + "__" + parameter.parameter_name - parameter_dict[parameter_name] = parameter.value - print(parameter_dict) - sklearn_model.set_params(**parameter_dict) + + # now we 'abuse' the parameter object by passing in the + # parameters obtained from the setup + flow = _reconstruct_flow(flow, parameters) + + return openml.flows.flow_to_sklearn(flow) def _to_dict(flow_id, openml_parameter_settings): From af784362bb5ce0f251bb946fad30cef510d6507a Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 14:16:53 +0200 Subject: [PATCH 07/16] clarifications to _to_dict_of_dicts function --- openml/setups/functions.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index c28650cfe..58e09490a 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -80,9 +80,12 @@ def _to_dict_of_dicts(_params): # parameter_names mapping to parameter_value _res = {} for _param in _params: - if _param.flow_id not in _res: - _res[_param.flow_id] = {} - _res[_param.flow_id][_param.parameter_name] = _param.value + _flow_id = _params[_param].flow_id + _param_name = _params[_param].parameter_name + _param_value = _params[_param].value + if _flow_id not in _res: + _res[_flow_id] = {} + _res[_flow_id][_param_name] = _param_value return _res def _reconstruct_flow(_flow, _params): From e37bb93b766bd2f18cac4a6baca5f034182a0dc4 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 14:33:19 +0200 Subject: [PATCH 08/16] almost finished reinstatiating setups --- openml/setups/functions.py | 2 +- openml/testing.py | 1 + tests/test_runs/test_run_functions.py | 10 +++++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/openml/setups/functions.py b/openml/setups/functions.py index 58e09490a..83486840b 100644 --- a/openml/setups/functions.py +++ b/openml/setups/functions.py @@ -95,7 
+95,7 @@ def _reconstruct_flow(_flow, _params): # (obtained by using the subfunction _to_dict_of_dicts) for _param in _flow.parameters: _flow.parameters[_param] = _params[_flow.flow_id][_param] - for _identifier in flow.components: + for _identifier in _flow.components: _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params) return _flow diff --git a/openml/testing.py b/openml/testing.py index c5c11091a..e24adc92a 100644 --- a/openml/testing.py +++ b/openml/testing.py @@ -17,6 +17,7 @@ class TestBase(unittest.TestCase): def setUp(self): # This cache directory is checked in to git to simulate a populated # cache + self.maxDiff = None self.static_cache_dir = None static_cache_dir = os.path.dirname(os.path.abspath(inspect.getfile(self.__class__))) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index e4926491a..680f26539 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -25,7 +25,7 @@ class TestRun(TestBase): - def _perform_run(self, task_id, num_instances, clf): + def _perform_run(self, task_id, num_instances, clf, check_setup=True): task = openml.tasks.get_task(task_id) run = openml.runs.run_task(task, clf, openml.config.avoid_duplicate_runs) run_ = run.publish() @@ -34,6 +34,14 @@ def _perform_run(self, task_id, num_instances, clf): # check arff output self.assertEqual(len(run.data_content), num_instances) + + if check_setup: + run_id = run_.run_id + run_prime = openml.runs.get_run(run_id) + clf_prime = openml.setups.initialize_model(run_prime.setup_id) + self.assertEquals(clf.get_params(), clf_prime.get_params()) + # self.assertEquals(clf, clf_prime) + return run def test_run_regression_on_classif_task(self): From 08e2ae8bdc42977b0d8b441d7d891d2ea7f7200f Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Wed, 26 Apr 2017 15:15:46 +0200 Subject: [PATCH 09/16] implemented some sort of unit test. 
should be improved --- tests/test_runs/test_run_functions.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 680f26539..3087e10ac 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -2,6 +2,8 @@ import openml import openml.exceptions +import json + from openml.testing import TestBase from openml.runs.functions import _run_task_get_arffcontent @@ -39,7 +41,22 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True): run_id = run_.run_id run_prime = openml.runs.get_run(run_id) clf_prime = openml.setups.initialize_model(run_prime.setup_id) - self.assertEquals(clf.get_params(), clf_prime.get_params()) + + params_orig = clf.get_params() + params_serv = clf_prime.get_params() + self.assertEqual(params_orig.keys(), params_serv.keys()) + + for param in params_orig: + print("%s : %s" %(param, str(params_serv[param]))) + try: + value_orig = json.dumps(params_orig[param]) + value_serv = json.dumps(params_serv[param]) + self.assertEqual(value_orig, value_serv) + except TypeError: + # TODO: think of a different check + print('Object not json serializable') + + #self.assertEquals(clf.get_params(), clf_prime.get_params()) # self.assertEquals(clf, clf_prime) return run From b8ced46da474daaa3eec9550f2677dd3c12ef42a Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Thu, 27 Apr 2017 16:49:47 +0200 Subject: [PATCH 10/16] finalized instantiate setup check --- tests/test_runs/test_run_functions.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 3087e10ac..d8b1bfee3 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -2,7 +2,6 @@ import openml import openml.exceptions -import json from openml.testing import TestBase from 
openml.runs.functions import _run_task_get_arffcontent @@ -39,22 +38,13 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True): if check_setup: run_id = run_.run_id - run_prime = openml.runs.get_run(run_id) - clf_prime = openml.setups.initialize_model(run_prime.setup_id) - - params_orig = clf.get_params() - params_serv = clf_prime.get_params() - self.assertEqual(params_orig.keys(), params_serv.keys()) - - for param in params_orig: - print("%s : %s" %(param, str(params_serv[param]))) - try: - value_orig = json.dumps(params_orig[param]) - value_serv = json.dumps(params_serv[param]) - self.assertEqual(value_orig, value_serv) - except TypeError: - # TODO: think of a different check - print('Object not json serializable') + run_server = openml.runs.get_run(run_id) + clf_server = openml.setups.initialize_model(run_server.setup_id) + + flow_local = openml.flows.sklearn_to_flow(clf) + flow_server = openml.flows.sklearn_to_flow(clf_server) + + openml.flows.assert_flows_equal(flow_local, flow_server) #self.assertEquals(clf.get_params(), clf_prime.get_params()) # self.assertEquals(clf, clf_prime) From 363b381f726aa89daf2cefd76ac4e2a99f384375 Mon Sep 17 00:00:00 2001 From: "janvanrijn@gmail.com" Date: Tue, 2 May 2017 15:25:30 +0200 Subject: [PATCH 11/16] fix unit tests for setup --- openml/runs/run.py | 12 +++++++++--- tests/test_setups/test_setup_functions.py | 19 +++++++++---------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/openml/runs/run.py b/openml/runs/run.py index e69717487..727119ff1 100644 --- a/openml/runs/run.py +++ b/openml/runs/run.py @@ -185,13 +185,19 @@ def get_flow_dict(_flow): flow_map.update(get_flow_dict(_flow.components[subflow])) return flow_map - def extract_parameters(_flow, _param_dict): + def extract_parameters(_flow, _param_dict, _main_call=False, main_id=None): + # _flow is openml flow object, _param dict maps from flow name to flow id + # for the main call, the param dict can be overridden (useful for unit 
tests / sentinels) + # this way, for flows without subflows we do not have to rely on _param_dict _params = [] for _param_name in _flow.parameters: _current = OrderedDict() _current['oml:name'] = _param_name _current['oml:value'] = _flow.parameters[_param_name] - _current['oml:component'] = _param_dict[_flow.name] + if _main_call: + _current['oml:component'] = main_id + else: + _current['oml:component'] = _param_dict[_flow.name] _params.append(_current) for _identifier in _flow.components: _params.extend(extract_parameters(_flow.components[_identifier], _param_dict)) @@ -200,7 +206,7 @@ def extract_parameters(_flow, _param_dict): flow_dict = get_flow_dict(server_flow) local_flow = openml.flows.sklearn_to_flow(model) - parameters = extract_parameters(local_flow, flow_dict) + parameters = extract_parameters(local_flow, flow_dict, True, server_flow.flow_id) return parameters ################################################################################ diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 14144bd9a..17bea0de1 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -32,6 +32,7 @@ class TestRun(TestBase): def test_nonexisting_setup_exists(self): # first publish a non-existing flow sentinel = get_sentinel() + # because of the sentinel, we can not use flows that contain subflows dectree = DecisionTreeClassifier() flow = openml.flows.sklearn_to_flow(dectree) flow.name = 'TEST%s%s' % (sentinel, flow.name) @@ -45,26 +46,24 @@ def test_nonexisting_setup_exists(self): def test_existing_setup_exists(self): # first publish a nonexiting flow - bagging = BaggingClassifier(DecisionTreeClassifier(max_depth=5, - min_samples_split=3), - n_estimators=3, - max_samples=0.5) - flow = openml.flows.sklearn_to_flow(bagging) + + # because of the sentinel, we can not use flows that contain subflows + classif = DecisionTreeClassifier(max_depth=5, + min_samples_split=3) + flow 
= openml.flows.sklearn_to_flow(classif) flow.name = 'TEST%s%s' % (get_sentinel(), flow.name) - flow.components['base_estimator'].name = 'TEST%s%s' % ( - get_sentinel(), flow.components['base_estimator'].name) flow = flow.publish() flow = openml.flows.get_flow(flow.flow_id) # although the flow exists, we can be sure there are no # setups (yet) as it hasn't been ran - setup_id = openml.setups.setup_exists(flow, bagging) + setup_id = openml.setups.setup_exists(flow, classif) self.assertFalse(setup_id) # now run the flow on an easy task: task = openml.tasks.get_task(115) #diabetes - run = openml.runs.run_task(task, bagging) + run = openml.runs.run_task(task, classif) # spoof flow id, otherwise the sentinel is ignored run.flow_id = flow.flow_id run = run.publish() @@ -72,7 +71,7 @@ def test_existing_setup_exists(self): run = openml.runs.get_run(run.run_id) # execute the function we are interested in - setup_id = openml.setups.setup_exists(flow, bagging) + setup_id = openml.setups.setup_exists(flow, classif) self.assertEquals(setup_id, run.setup_id) def test_setup_get(self): From 3ae5f8e0b520bc8684536b9df883db99e6441db9 Mon Sep 17 00:00:00 2001 From: "janvanrijn@gmail.com" Date: Tue, 2 May 2017 18:26:40 +0200 Subject: [PATCH 12/16] requests from @mfeurer --- openml/runs/__init__.py | 2 +- openml/runs/functions.py | 2 ++ tests/test_runs/test_run_functions.py | 6 ++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py index 562327d34..a0735b4f1 100644 --- a/openml/runs/__init__.py +++ b/openml/runs/__init__.py @@ -1,4 +1,4 @@ from .run import OpenMLRun -from .functions import (run_task, get_run, list_runs, get_runs) +from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run) __all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs'] diff --git a/openml/runs/functions.py b/openml/runs/functions.py index f73af2cf4..8338519b5 100644 --- a/openml/runs/functions.py +++ 
b/openml/runs/functions.py @@ -160,6 +160,8 @@ def _get_seeded_model(model, seed=None): if 'random_state' in param_name: currentValue = model_params[param_name] # important to draw the value at this point (and not in the if statement) + # this way we guarantee that if a different set of subflows is seeded, + # the same number of the random generator is used newValue = rs.randint(0, 2**16) if currentValue is None: random_states[param_name] = newValue diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index d8b1bfee3..6c711f70a 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -37,6 +37,7 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True): self.assertEqual(len(run.data_content), num_instances) if check_setup: + # test the initialize setup function run_id = run_.run_id run_server = openml.runs.get_run(run_id) clf_server = openml.setups.initialize_model(run_server.setup_id) @@ -46,6 +47,11 @@ def _perform_run(self, task_id, num_instances, clf, check_setup=True): openml.flows.assert_flows_equal(flow_local, flow_server) + # and test the initialize setup from run function + clf_server2 = openml.runs.initialize_model_from_run(run_server.run_id) + flow_server2 = openml.flows.sklearn_to_flow(clf_server2) + openml.flows.assert_flows_equal(flow_local, flow_server2) + #self.assertEquals(clf.get_params(), clf_prime.get_params()) # self.assertEquals(clf, clf_prime) From 70475feea5a2225e479dcd01a1176cedab852a29 Mon Sep 17 00:00:00 2001 From: "janvanrijn@gmail.com" Date: Wed, 3 May 2017 13:38:56 +0200 Subject: [PATCH 13/16] added comment --- openml/runs/functions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openml/runs/functions.py b/openml/runs/functions.py index 8338519b5..0a211d7fc 100644 --- a/openml/runs/functions.py +++ b/openml/runs/functions.py @@ -137,6 +137,9 @@ def _run_exists(task_id, setup_id): def _get_seeded_model(model, seed=None): '''Sets all the 
non-seeded components of a model with a seed. + Models that are already seeded will maintain the seed. In + this case, only integer seeds are allowed (An exception + is thrown when a RandomState was used as seed) Parameters ---------- From 23aaf813e3dffd2389d24240f5e53dbc8094d1f1 Mon Sep 17 00:00:00 2001 From: "janvanrijn@gmail.com" Date: Wed, 3 May 2017 16:34:19 +0200 Subject: [PATCH 14/16] changed comments of setup, changed assertions --- openml/setups/setup.py | 19 +++++++++++++++---- tests/test_setups/test_setup_functions.py | 4 ++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/openml/setups/setup.py b/openml/setups/setup.py index e3187ad8c..d23893828 100644 --- a/openml/setups/setup.py +++ b/openml/setups/setup.py @@ -20,10 +20,21 @@ class OpenMLParameter(object): Parameters ---------- - flow_id : int - The flow that it is build upon - parameters : dict - The setting of the parameters + id : int + The input id from the openml database + flow_id : int + The flow to which this parameter is associated + full_name : str + The name of the flow and parameter combined + parameter_name : str + The name of the parameter + data_type : str + The datatype of the parameter. Generally unused for sklearn flows + default_value : str + The default value. For sklearn parameters, this is unknown and a + default value is selected arbitrarily + value : str + If the parameter was set, the value that it was set to. 
""" def __init__(self, id, flow_id, full_name, parameter_name, data_type, default_value, value): self.id = id diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 17bea0de1..10f18e321 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -87,6 +87,6 @@ def test_setup_get(self): current = openml.setups.get_setup(setups[idx]) assert current.flow_id > 0 if num_params[idx] == 0: - assert current.parameters is None + self.asserts(current.parameters is None) else: - assert len(current.parameters) == num_params[idx] + self.asserts(len(current.parameters) == num_params[idx]) From bd8ee242fab4a5cadb005eb5adb3cdac141a1481 Mon Sep 17 00:00:00 2001 From: "janvanrijn@gmail.com" Date: Wed, 3 May 2017 16:46:26 +0200 Subject: [PATCH 15/16] typo --- tests/test_setups/test_setup_functions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 10f18e321..f033a19b7 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -87,6 +87,6 @@ def test_setup_get(self): current = openml.setups.get_setup(setups[idx]) assert current.flow_id > 0 if num_params[idx] == 0: - self.asserts(current.parameters is None) + self.assertTrue(current.parameters is None) else: - self.asserts(len(current.parameters) == num_params[idx]) + self.assertTrue(len(current.parameters) == num_params[idx]) From 85472cc644a60bd4db9c40e23bba7f4fc709b486 Mon Sep 17 00:00:00 2001 From: Jan van Rijn Date: Thu, 4 May 2017 16:22:20 +0200 Subject: [PATCH 16/16] changes requested --- tests/test_setups/test_setup_functions.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index f033a19b7..99cff2ef7 100644 --- a/tests/test_setups/test_setup_functions.py +++ 
b/tests/test_setups/test_setup_functions.py @@ -38,8 +38,9 @@ def test_nonexisting_setup_exists(self): flow.name = 'TEST%s%s' % (sentinel, flow.name) flow.publish() - # although the flow exists, we can be sure there are no - # setups (yet) as it hasn't been ran + # although the flow exists (created as of previous statement), + # we can be sure there are no setups (yet) as it was just created + # and hasn't been run setup_id = openml.setups.setup_exists(flow, dectree) self.assertFalse(setup_id) @@ -87,6 +88,6 @@ def test_setup_get(self): current = openml.setups.get_setup(setups[idx]) assert current.flow_id > 0 if num_params[idx] == 0: - self.assertTrue(current.parameters is None) + self.assertIsNone(current.parameters) else: - self.assertTrue(len(current.parameters) == num_params[idx]) + self.assertEquals(len(current.parameters), num_params[idx])