Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openml/runs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .run import OpenMLRun
from .functions import (run_task, get_run, list_runs, get_runs)
from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run)

# Public API of the runs package; every name imported from .functions above
# that is meant for users (including initialize_model_from_run) is listed here.
__all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs',
           'initialize_model_from_run']
75 changes: 71 additions & 4 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
import warnings
import sklearn
import time
from sklearn.model_selection._search import BaseSearchCV

from ..exceptions import PyOpenMLError
from .. import config

from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs
from ..setups import setup_exists
from ..setups import setup_exists, initialize_model

from ..exceptions import OpenMLCacheException, OpenMLServerException
from ..util import URLError, version_complies
from ..tasks.functions import _create_task_from_xml
from .._api_calls import _perform_api_call
from .run import OpenMLRun, _get_version_information

Expand All @@ -24,7 +24,7 @@



def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None):
"""Performs a CV run on the dataset of the given task, using the split.

Parameters
Expand All @@ -35,8 +35,13 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
a model which has a function fit(X,Y) and predict(X),
all supervised estimators of scikit learn follow this definition of a model [1]
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)
avoid_duplicate_runs : bool
if this flag is set to True, the run will throw an error if the
setup/task combination is already present on the server.
flow_tags : list(str)
a list of tags that the flow should have at creation
seed: int
the models that are not seeded will get this seed

Returns
-------
Expand All @@ -48,6 +53,7 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
# TODO move this into its own module. While it somehow belongs here, it
# adds quite a lot of functionality which is better suited in other places!
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
model = _get_seeded_model(model, seed)
flow = sklearn_to_flow(model)

# returns flow id if the flow exists on the server, False otherwise
Expand Down Expand Up @@ -88,6 +94,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):

return run

def initialize_model_from_run(run_id):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is neither used nor tested.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, I added tests.

'''
Initializes a model based on a run_id (i.e., using the exact
same parameter settings)

Parameters
----------
run_id : int
The Openml run_id

Returns
-------
model : sklearn model
the scikit-learn model with all parameters initialized
'''
run = get_run(run_id)
return initialize_model(run.setup_id)

def _run_exists(task_id, setup_id):
'''
Checks whether a task/setup combination is already present on the server.
Expand All @@ -111,6 +135,49 @@ def _run_exists(task_id, setup_id):
assert(exception.code == 512)
return False

def _get_seeded_model(model, seed=None):
'''Sets all the non-seeded components of a model with a seed.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should mention the restriction that one cannot use a random state in the pipelines.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure. One could argue that that is a restriction of the run_tasks function, as that is the function the user interacts with. Furthermore, that function is responsible for the check.

This function only adds seeds to unseeded models

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But it can raise an exception:

import numpy as np
import sklearn.ensemble

import openml


rf = sklearn.ensemble.RandomForestClassifier(
    random_state=np.random.RandomState(1))
openml.runs.functions._get_seeded_model(rf, 5)

But you're right, it should be documented in the run_tasks() function.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mea culpa, I will add it.

Models that are already seeded will maintain the seed. In
this case, only integer seeds are allowed (An exception
is thrown when a RandomState was used as seed)

Parameters
----------
model : sklearn model
The model to be seeded
seed : int
The seed to initialize the RandomState with. Unseeded subcomponents
will be seeded with a random number from the RandomState.

Returns
-------
model : sklearn model
a version of the model where all (sub)components have
a seed
'''

rs = np.random.RandomState(seed)
model_params = model.get_params()
random_states = {}
for param_name in sorted(model_params):
if 'random_state' in param_name:
currentValue = model_params[param_name]
# important to draw the value at this point (and not in the if statement)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, could you explain why? It's not clear to me from this.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added description.

I'm not sure if we really need this, but it seems like a nice property to respect.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k

# this way we guarantee that if a different set of subflows is seeded,
# the same number of the random generator is used
newValue = rs.randint(0, 2**16)
if currentValue is None:
random_states[param_name] = newValue
elif isinstance(currentValue, int):
# acceptable behaviour
pass
elif isinstance(currentValue, np.random.RandomState):
raise ValueError('Models initialized with a RandomState object are not supported. Please seed with an integer. ')
else:
raise ValueError('Models should be seeded with int or None (this should never happen). ')
model.set_params(**random_states)
return model



def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label,
Expand Down
58 changes: 23 additions & 35 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def _create_description_xml(self):
return description_xml

@staticmethod
def _parse_parameters(model, flow):
def _parse_parameters(model, server_flow):
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at this again I'm actually surprised that this is not called run_task, but only when publishing. But maybe this should be its own issue/PR.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate a bit? I don't really understand which / why.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I meant "that this is not called IN run_task"

"""Extracts all parameter settings from a model in OpenML format.

Parameters
Expand All @@ -176,50 +176,38 @@ def _parse_parameters(model, flow):
openml flow object (containing flow ids, i.e., it has to be downloaded from the server)

"""
if flow.flow_id is None:
if server_flow.flow_id is None:
raise ValueError("The flow parameter needs to be downloaded from server")

python_param_settings = model.get_params()
openml_param_settings = []

def get_flow_dict(_flow):
flow_map = {_flow.name: _flow.flow_id}
for subflow in _flow.components:
flow_map.update(get_flow_dict(_flow.components[subflow]))
return flow_map

flow_dict = get_flow_dict(flow)

for param in python_param_settings:
if "__" in param:
# parameter of subflow. will be handled later
continue
if isinstance(python_param_settings[param], BaseEstimator):
# extract parameters of the subflow individually
subflow = flow.components[param]
openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow)

# add parameter setting (in some cases also the subflow. Just because we can)
if param in flow.parameters.keys():
param_dict = OrderedDict()
param_dict['oml:name'] = param
param_dict['oml:value'] = str(python_param_settings[param])
param_dict['oml:component'] = flow_dict[flow.name]
openml_param_settings.append(param_dict)
else:
if flow.name.startswith("sklearn.pipeline.Pipeline"):
# tolerate
pass
elif flow.name.startswith("sklearn.pipeline.FeatureUnion"):
# tolerate
pass
elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"):
# tolerate
pass
def extract_parameters(_flow, _param_dict, _main_call=False, main_id=None):
# _flow is openml flow object, _param dict maps from flow name to flow id
# for the main call, the param dict can be overridden (useful for unit tests / sentinels)
# this way, for flows without subflows we do not have to rely on _param_dict
_params = []
for _param_name in _flow.parameters:
_current = OrderedDict()
_current['oml:name'] = _param_name
_current['oml:value'] = _flow.parameters[_param_name]
if _main_call:
_current['oml:component'] = main_id
else:
raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name))
_current['oml:component'] = _param_dict[_flow.name]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it once an ID, and once a name?

_params.append(_current)
for _identifier in _flow.components:
_params.extend(extract_parameters(_flow.components[_identifier], _param_dict))
return _params

flow_dict = get_flow_dict(server_flow)
local_flow = openml.flows.sklearn_to_flow(model)

return openml_param_settings
parameters = extract_parameters(local_flow, flow_dict, True, server_flow.flow_id)
return parameters

################################################################################
# Functions which cannot be in runs/functions due to circular imports
Expand Down
4 changes: 2 additions & 2 deletions openml/setups/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .functions import setup_exists
from .functions import get_setup, setup_exists, initialize_model

__all__ = ['setup_exists']
__all__ = ['get_setup', 'setup_exists', 'initialize_model']
110 changes: 108 additions & 2 deletions openml/setups/functions.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import openml
import xmltodict
import copy

from collections import OrderedDict
from .setup import OpenMLSetup, OpenMLParameter

def setup_exists(downloaded_flow, sklearn_model):
'''
Expand Down Expand Up @@ -34,14 +36,118 @@ def setup_exists(downloaded_flow, sklearn_model):
if setup_id > 0:
return setup_id
else:
return False;
return False


def get_setup(setup_id):
    '''
    Fetches the description of a setup (configuration) from the OpenML
    server and converts it into a structured object.

    Parameters
    ----------
    setup_id : int
        The OpenML setup_id; it is formatted into the API path with '%d'

    Returns
    -------
    OpenMLSetup
        the setup object built from the parsed server response
    '''
    api_path = '/setup/%d' % setup_id
    setup_xml = openml._api_calls._perform_api_call(api_path)
    # xmltodict turns the xml answer into nested dicts that
    # _create_setup_from_xml knows how to read
    return _create_setup_from_xml(xmltodict.parse(setup_xml))


def initialize_model(setup_id):
    '''
    Initializes a model based on a setup_id (i.e., using the exact
    same parameter settings)

    Parameters
    ----------
    setup_id : int
        The Openml setup_id

    Returns
    -------
    model : sklearn model
        the scikit-learn model with all parameters initialized
    '''
    def _to_dict_of_dicts(_params):
        # this subfunction transforms the parameters of an openml setup
        # object into a dict of dicts, structured: flow_id maps to dict of
        # parameter_names mapping to parameter_value
        _res = {}
        if _params is None:
            # _create_setup_from_xml stores None when the server response
            # contains no 'oml:parameter' entry; treat that as "no values"
            return _res
        for _param in _params.values():
            _res.setdefault(_param.flow_id, {})[_param.parameter_name] = _param.value
        return _res

    def _reconstruct_flow(_flow, _params):
        # sets the values of flow parameters (and subflows) to
        # the specific values from a setup. _params is a dict of
        # dicts, mapping from flow id to param name to param value
        # (obtained by using the subfunction _to_dict_of_dicts).
        # A parameter of the flow that is missing in _params raises KeyError.
        for _param in _flow.parameters:
            _flow.parameters[_param] = _params[_flow.flow_id][_param]
        for _identifier in _flow.components:
            _flow.components[_identifier] = _reconstruct_flow(_flow.components[_identifier], _params)
        return _flow

    setup = get_setup(setup_id)
    parameters = _to_dict_of_dicts(setup.parameters)
    flow = openml.flows.get_flow(setup.flow_id)

    # now we 'abuse' the parameter object by passing in the
    # parameters obtained from the setup
    flow = _reconstruct_flow(flow, parameters)

    return openml.flows.flow_to_sklearn(flow)


def _to_dict(flow_id, openml_parameter_settings):
# for convenience, this function (ab)uses the run object.
xml = OrderedDict()
xml['oml:run'] = OrderedDict()
xml['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
xml['oml:run']['oml:flow_id'] = flow_id
xml['oml:run']['oml:parameter_setting'] = openml_parameter_settings

return xml
return xml

def _create_setup_from_xml(result_dict):
    '''
    Turns an API xml result into a OpenMLSetup object

    Parameters
    ----------
    result_dict : dict
        the parsed server response; must contain the root element
        'oml:setup_parameters' with an 'oml:flow_id' entry and,
        optionally, an 'oml:parameter' entry (a single dict or a list
        of dicts)

    Returns
    -------
    OpenMLSetup
        its parameters are a dict mapping the integer 'oml:id' of each
        parameter to an OpenMLParameter, or None if the response holds
        no 'oml:parameter' entry

    Raises
    ------
    ValueError
        if 'oml:parameter' is neither a dict nor a list
    '''
    setup_xml = result_dict['oml:setup_parameters']
    flow_id = int(setup_xml['oml:flow_id'])

    if 'oml:parameter' not in setup_xml:
        parameters = None
    else:
        # basically all others
        xml_parameters = setup_xml['oml:parameter']
        if isinstance(xml_parameters, dict):
            # a single parameter arrives as a bare dict; handle it like a
            # list with one element
            xml_parameters = [xml_parameters]
        elif not isinstance(xml_parameters, list):
            raise ValueError('Expected None, list or dict, received someting else: %s' %str(type(xml_parameters)))

        parameters = {}
        for xml_parameter in xml_parameters:
            parameter_id = int(xml_parameter['oml:id'])
            parameters[parameter_id] = _create_setup_parameter_from_xml(xml_parameter)

    return OpenMLSetup(flow_id, parameters)

def _create_setup_parameter_from_xml(result_dict):
    '''
    Builds an OpenMLParameter from one parsed 'oml:parameter' entry.

    The ids ('oml:id' and 'oml:flow_id') are converted to int; the full
    name, parameter name, data type, default value and value are passed
    on as found in result_dict.
    '''
    parameter_id = int(result_dict['oml:id'])
    flow_id = int(result_dict['oml:flow_id'])
    # remaining fields, in the positional order OpenMLParameter is called with
    remaining_keys = ('oml:full_name', 'oml:parameter_name', 'oml:data_type',
                      'oml:default_value', 'oml:value')
    remaining_values = [result_dict[key] for key in remaining_keys]
    return OpenMLParameter(parameter_id, flow_id, *remaining_values)
Loading