openml · mfeurer · May 4, 2017 · Apr 25, 2017 · Apr 25, 2017 · Apr 25, 2017
diff --git a/openml/runs/__init__.py b/openml/runs/__init__.py
@@ -1,4 +1,4 @@
 from .run import OpenMLRun
-from .functions import (run_task, get_run, list_runs, get_runs)
+from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run)
 
 __all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs']
diff --git a/openml/runs/functions.py b/openml/runs/functions.py
@@ -6,15 +6,15 @@
 import warnings
 import sklearn
 import time
-from sklearn.model_selection._search import BaseSearchCV
 
 from ..exceptions import PyOpenMLError
 from .. import config
+
 from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
-from ..setups import setup_exists
+from ..setups import setup_exists, initialize_model
+
 from ..exceptions import OpenMLCacheException, OpenMLServerException
 from ..util import URLError, version_complies
-from ..tasks.functions import _create_task_from_xml
 from .._api_calls import _perform_api_call
 from .run import OpenMLRun, _get_version_information
 
@@ -24,7 +24,7 @@
 
 
 
-def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
+def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
     """Performs a CV run on the dataset of the given task, using the split.
 
     Parameters
@@ -35,8 +35,13 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
         a model which has a function fit(X,Y) and predict(X),
         all supervised estimators of scikit learn follow this definition of a model [1]
         [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
+    avoid_duplicate_runs : bool
+        if this flag is set to True, the run will throw an error if the
+        setup/task combination is already present on the server.
     flow_tags : list(str)
         a list of tags that the flow should have at creation
+    seed : int
+        seeds the RandomState from which every unseeded component draws its seed
 
     Returns
     -------
@@ -48,6 +53,7 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
     # TODO move this into its onwn module. While it somehow belongs here, it
     # adds quite a lot of functionality which is better suited in other places!
     # TODO why doesn't this accept a flow as input? - this would make this more flexible!
+    model = _get_seeded_model(model, seed)
     flow = sklearn_to_flow(model)
 
     # returns flow id if the flow exists on the server, False otherwise
@@ -88,6 +94,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
 
     return run
 
+def initialize_model_from_run(run_id):
+    '''
+    Initializes a model based on a run_id (i.e., using the exact
+    same parameter settings)
+
+    Parameters
+        ----------
+        run_id : int
+            The Openml run_id
+
+        Returns
+        -------
+        model : sklearn model
+            the scikit-learn model with all parameters initialized
+    '''
+    run = get_run(run_id)
+    return initialize_model(run.setup_id)
+
 def _run_exists(task_id, setup_id):
     '''
     Checks whether a task/setup combination is already present on the server.
@@ -111,6 +135,49 @@ def _run_exists(task_id, setup_id):
         assert(exception.code == 512)
         return False
 
+def _get_seeded_model(model, seed=None):
+    '''Sets all the non-seeded components of a model with a seed.
+       Models that are already seeded will maintain the seed. In
+       this case, only integer seeds are allowed (An exception
+       is thrown when a RandomState was used as seed)
+
+        Parameters
+        ----------
+        model : sklearn model
+            The model to be seeded
+        seed : int
+            The seed to initialize the RandomState with. Unseeded subcomponents
+            will be seeded with a random number from the RandomState.
+
+        Returns
+        -------
+        model : sklearn model
+            a version of the model where all (sub)components have
+            a seed
+    '''
+
+    rs = np.random.RandomState(seed)
+    model_params = model.get_params()
+    random_states = {}
+    for param_name in sorted(model_params):
+        if 'random_state' in param_name:
+            currentValue = model_params[param_name]
+            # important to draw the value at this point (and not in the if statement)
+            # this way we guarantee that if a different set of subflows is seeded,
+            # the same number of the random generator is used
+            newValue = rs.randint(0, 2**16)
+            if currentValue is None:
+                random_states[param_name] = newValue
+            elif isinstance(currentValue, int):
+                # acceptable behaviour
+                pass
+            elif isinstance(currentValue, np.random.RandomState):
+                raise ValueError('Models initialized with a RandomState object are not supported. Please seed with an integer. ')
+            else:
+                raise ValueError('Models should be seeded with int or None (this should never happen). ')
+            model.set_params(**random_states)
+    return model
+
 
 
 def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,

diff --git a/openml/runs/run.py b/openml/runs/run.py
@@ -165,7 +165,7 @@ def _create_description_xml(self):
         return description_xml
 
     @staticmethod
-    def _parse_parameters(model, flow):
+    def _parse_parameters(model, server_flow):
         """Extracts all parameter settings from a model in OpenML format.
 
         Parameters
@@ -176,50 +176,38 @@ def _parse_parameters(model, flow):
             openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
 
         """
-        if flow.flow_id is None:
+        if server_flow.flow_id is None:
             raise ValueError("The flow parameter needs to be downloaded from server")
 
-        python_param_settings = model.get_params()
-        openml_param_settings = []
-
         def get_flow_dict(_flow):
             flow_map = {_flow.name: _flow.flow_id}
             for subflow in _flow.components:
                 flow_map.update(get_flow_dict(_flow.components[subflow]))
             return flow_map
 
-        flow_dict = get_flow_dict(flow)
-
-        for param in python_param_settings:
-            if "__" in param:
-                # parameter of subflow. will be handled later
-                continue
-            if isinstance(python_param_settings[param], BaseEstimator):
-                # extract parameters of the subflow individually
-                subflow = flow.components[param]
-                openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow)
-
-            # add parameter setting (in some cases also the subflow. Just because we can)
-            if param in flow.parameters.keys():
-                param_dict = OrderedDict()
-                param_dict['oml:name'] = param
-                param_dict['oml:value'] = str(python_param_settings[param])
-                param_dict['oml:component'] = flow_dict[flow.name]
-                openml_param_settings.append(param_dict)
-            else:
-                if flow.name.startswith("sklearn.pipeline.Pipeline"):
-                    # tolerate
-                    pass
-                elif flow.name.startswith("sklearn.pipeline.FeatureUnion"):
-                    # tolerate
-                    pass
-                elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"):
-                    # tolerate
-                    pass
+        def extract_parameters(_flow, _param_dict, _main_call=False, main_id=None):
+            # _flow is openml flow object, _param dict maps from flow name to flow id
+            # for the main call, the param dict can be overridden (useful for unit tests / sentinels)
+            # this way, for flows without subflows we do not have to rely on _param_dict
+            _params = []
+            for _param_name in _flow.parameters:
+                _current = OrderedDict()
+                _current['oml:name'] = _param_name
+                _current['oml:value'] = _flow.parameters[_param_name]
+                if _main_call:
+                    _current['oml:component'] = main_id
                 else:
-                    raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name))
+                    _current['oml:component'] = _param_dict[_flow.name]
+                _params.append(_current)
+            for _identifier in _flow.components:
+                _params.extend(extract_parameters(_flow.components[_identifier], _param_dict))
+            return _params
+
+        flow_dict = get_flow_dict(server_flow)
+        local_flow = openml.flows.sklearn_to_flow(model)
 
-        return openml_param_settings
+        parameters = extract_parameters(local_flow, flow_dict, True, server_flow.flow_id)
+        return parameters
 
 ################################################################################
 # Functions which cannot be in runs/functions due to circular imports

diff --git a/openml/setups/__init__.py b/openml/setups/__init__.py
@@ -1,3 +1,3 @@
-from .functions import setup_exists
+from .functions import get_setup, setup_exists, initialize_model
 
-__all__ = ['setup_exists']
+__all__ = ['get_setup', 'setup_exists', 'initialize_model']
diff --git a/openml/setups/functions.py b/openml/setups/functions.py
@@ -1,7 +1,9 @@
 import openml
 import xmltodict
+import copy
 
 from collections import OrderedDict
+from .setup import OpenMLSetup, OpenMLParameter
 
 def setup_exists(downloaded_flow, sklearn_model):
     '''
@@ -34,14 +36,118 @@ def setup_exists(downloaded_flow, sklearn_model):
     if setup_id > 0:
         return setup_id
     else:
-        return False;
+        return False
+
+
+def get_setup(setup_id):
+    '''
+     Downloads the setup (configuration) description from OpenML
+     and returns a structured object
+
+    Parameters
+        ----------
+        setup_id : int
+            The Openml setup_id
+
+        Returns
+        -------
+        OpenMLSetup
+            an initialized openml setup object
+    '''
+    result = openml._api_calls._perform_api_call('/setup/%d' %setup_id)
+    result_dict = xmltodict.parse(result)
+    return _create_setup_from_xml(result_dict)
+
+
+def initialize_model(setup_id):
+    '''
+    Initializes a model based on a setup_id (i.e., using the exact
+    same parameter settings)
+
+    Parameters
+        ----------
+        setup_id : int
+            The Openml setup_id
+
+        Returns
+        -------
+        model : sklearn model
+            the scikit-learn model with all parameters initialized
+    '''
+    def _to_dict_of_dicts(_params):
+        # this subfunction transforms an openml setup object into
+        # a dict of dicts, structured: flow_id maps to dict of
+        # parameter_names mapping to parameter_value
+        _res = {}
+        for _param in _params:
+            _flow_id = _params[_param].flow_id
+            _param_name = _params[_param].parameter_name
+            _param_value = _params[_param].value
+            if _flow_id not in _res:
+                _res[_flow_id] = {}
+            _res[_flow_id][_param_name] = _param_value
+        return _res
+
+    def _reconstruct_flow(_flow, _params):
+        # sets the values of flow parameters (and subflows) to
+        # the specific values from a setup. _params is a dict of
+        # dicts, mapping from flow id to param name to param value
+        # (obtained by using the subfunction _to_dict_of_dicts)
+        for _param in _flow.parameters:
+            _flow.parameters[_param] = _params[_flow.flow_id][_param]
+        for _identifier in _flow.components:
+            _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params)
+        return _flow
+
+    setup = get_setup(setup_id)
+    parameters = _to_dict_of_dicts(setup.parameters)
+    flow = openml.flows.get_flow(setup.flow_id)
+
+    # now we 'abuse' the parameter object by passing in the
+    # parameters obtained from the setup
+    flow = _reconstruct_flow(flow, parameters)
+
+    return openml.flows.flow_to_sklearn(flow)
 
 
 def _to_dict(flow_id, openml_parameter_settings):
+    # for convenience, this function (ab)uses the run object.
     xml = OrderedDict()
     xml['oml:run'] = OrderedDict()
     xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
     xml['oml:run']['oml:flow_id'] = flow_id
     xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings
 
-    return xml
+    return xml
+
+def _create_setup_from_xml(result_dict):
+    '''
+     Turns an API XML result into an OpenMLSetup object
+    '''
+    flow_id = int(result_dict['oml:setup_parameters']['oml:flow_id'])
+    parameters = {}
+    if 'oml:parameter' not in result_dict['oml:setup_parameters']:
+        parameters = None
+    else:
+        # parameters present: handled as a dict (single entry) or a list (several)
+        xml_parameters = result_dict['oml:setup_parameters']['oml:parameter']
+        if isinstance(xml_parameters, dict):
+            id = int(xml_parameters['oml:id'])
+            parameters[id] = _create_setup_parameter_from_xml(xml_parameters)
+        elif isinstance(xml_parameters, list):
+            for xml_parameter in xml_parameters:
+                id = int(xml_parameter['oml:id'])
+                parameters[id] = _create_setup_parameter_from_xml(xml_parameter)
+        else:
+            raise ValueError('Expected None, list or dict, received someting else: %s' %str(type(xml_parameters)))
+
+    return OpenMLSetup(flow_id, parameters)
+
+def _create_setup_parameter_from_xml(result_dict):
+    return OpenMLParameter(int(result_dict['oml:id']),
+                           int(result_dict['oml:flow_id']),
+                           result_dict['oml:full_name'],
+                           result_dict['oml:parameter_name'],
+                           result_dict['oml:data_type'],
+                           result_dict['oml:default_value'],
+                           result_dict['oml:value'])