-
-
Notifications
You must be signed in to change notification settings - Fork 270
Fix169 #241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix169 #241
Changes from all commits
10940a2
982898e
8ceb290
ae85e5b
7f59df2
83662d7
71b5efd
af78436
e37bb93
08e2ae8
280a515
6fdba65
b8ced46
363b381
3ae5f8e
70475fe
23aaf81
bd8ee24
85472cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,4 @@ | ||
| from .run import OpenMLRun | ||
| from .functions import (run_task, get_run, list_runs, get_runs) | ||
| from .functions import (run_task, get_run, list_runs, get_runs, initialize_model_from_run) | ||
|
|
||
| __all__ = ['OpenMLRun', 'run_task', 'get_run', 'list_runs', 'get_runs'] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,15 +6,15 @@ | |
| import warnings | ||
| import sklearn | ||
| import time | ||
| from sklearn.model_selection._search import BaseSearchCV | ||
|
|
||
| from ..exceptions import PyOpenMLError | ||
| from .. import config | ||
|
|
||
| from ..flows import sklearn_to_flow, get_flow, flow_exists, _check_n_jobs | ||
| from ..setups import setup_exists | ||
| from ..setups import setup_exists, initialize_model | ||
|
|
||
| from ..exceptions import OpenMLCacheException, OpenMLServerException | ||
| from ..util import URLError, version_complies | ||
| from ..tasks.functions import _create_task_from_xml | ||
| from .._api_calls import _perform_api_call | ||
| from .run import OpenMLRun, _get_version_information | ||
|
|
||
|
|
@@ -24,7 +24,7 @@ | |
|
|
||
|
|
||
|
|
||
| def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): | ||
| def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None, seed=None): | ||
| """Performs a CV run on the dataset of the given task, using the split. | ||
|
|
||
| Parameters | ||
|
|
@@ -35,8 +35,13 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): | |
| a model which has a function fit(X,Y) and predict(X), | ||
| all supervised estimators of scikit learn follow this definition of a model [1] | ||
| [1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html) | ||
| avoid_duplicate_runs : bool | ||
| if this flag is set to True, the run will throw an error if the | ||
| setup/task combination is already present on the server. | ||
| flow_tags : list(str) | ||
| a list of tags that the flow should have at creation | ||
| seed: int | ||
| the models that are not seeded will get this seed | ||
|
|
||
| Returns | ||
| ------- | ||
|
|
@@ -48,6 +53,7 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): | |
| # TODO move this into its onwn module. While it somehow belongs here, it | ||
| # adds quite a lot of functionality which is better suited in other places! | ||
| # TODO why doesn't this accept a flow as input? - this would make this more flexible! | ||
| model = _get_seeded_model(model, seed) | ||
| flow = sklearn_to_flow(model) | ||
|
|
||
| # returns flow id if the flow exists on the server, False otherwise | ||
|
|
@@ -88,6 +94,24 @@ def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None): | |
|
|
||
| return run | ||
|
|
||
| def initialize_model_from_run(run_id): | ||
| ''' | ||
| Initialized a model based on a run_id (i.e., using the exact | ||
| same parameter settings) | ||
|
|
||
| Parameters | ||
| ---------- | ||
| run_id : int | ||
| The Openml run_id | ||
|
|
||
| Returns | ||
| ------- | ||
| model : sklearn model | ||
| the scikitlearn model with all parameters initailized | ||
| ''' | ||
| run = get_run(run_id) | ||
| return initialize_model(run.setup_id) | ||
|
|
||
| def _run_exists(task_id, setup_id): | ||
| ''' | ||
| Checks whether a task/setup combination is already present on the server. | ||
|
|
@@ -111,6 +135,49 @@ def _run_exists(task_id, setup_id): | |
| assert(exception.code == 512) | ||
| return False | ||
|
|
||
| def _get_seeded_model(model, seed=None): | ||
| '''Sets all the non-seeded components of a model with a seed. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should mention the restriction that one cannot use a random state in the pipelines.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure. One could argue that that is a restriction of the run_tasks function, as that is the function the user interacts with. Furthermore, that function is responsible for the check. This function only adds seeds to unseeded models
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But it can raise an exception: But you're right, it should be documented in the
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mea culpa, i will add it |
||
| Models that are already seeded will maintain the seed. In | ||
| this case, only integer seeds are allowed (An exception | ||
| is thrown when a RandomState was used as seed) | ||
|
|
||
| Parameters | ||
| ---------- | ||
| model : sklearn model | ||
| The model to be seeded | ||
| seed : int | ||
| The seed to initialize the RandomState with. Unseeded subcomponents | ||
| will be seeded with a random number from the RandomState. | ||
|
|
||
| Returns | ||
| ------- | ||
| model : sklearn model | ||
| a version of the model where all (sub)components have | ||
| a seed | ||
| ''' | ||
|
|
||
| rs = np.random.RandomState(seed) | ||
| model_params = model.get_params() | ||
| random_states = {} | ||
| for param_name in sorted(model_params): | ||
| if 'random_state' in param_name: | ||
| currentValue = model_params[param_name] | ||
| # important to draw the value at this point (and not in the if statement) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, could you explain why? It's not clear to me from this.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added description. I'm not sure if we really need this, but seems nice property to respect.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. k |
||
| # this way we guarantee that if a different set of subflows is seeded, | ||
| # the same number of the random generator is used | ||
| newValue = rs.randint(0, 2**16) | ||
| if currentValue is None: | ||
| random_states[param_name] = newValue | ||
| elif isinstance(currentValue, int): | ||
| # acceptable behaviour | ||
| pass | ||
| elif isinstance(currentValue, np.random.RandomState): | ||
| raise ValueError('Models initialized with a RandomState object are not supported. Please seed with an integer. ') | ||
| else: | ||
| raise ValueError('Models should be seeded with int or None (this should never happen). ') | ||
| model.set_params(**random_states) | ||
| return model | ||
|
|
||
|
|
||
|
|
||
| def _prediction_to_row(rep_no, fold_no, row_id, correct_label, predicted_label, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -165,7 +165,7 @@ def _create_description_xml(self): | |
| return description_xml | ||
|
|
||
| @staticmethod | ||
| def _parse_parameters(model, flow): | ||
| def _parse_parameters(model, server_flow): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at this again I'm actually surprised that this is not called
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you elaborate a bit? I don really understand which / why
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, I meant "that this is not called IN |
||
| """Extracts all parameter settings from a model in OpenML format. | ||
|
|
||
| Parameters | ||
|
|
@@ -176,50 +176,38 @@ def _parse_parameters(model, flow): | |
| openml flow object (containing flow ids, i.e., it has to be downloaded from the server) | ||
|
|
||
| """ | ||
| if flow.flow_id is None: | ||
| if server_flow.flow_id is None: | ||
| raise ValueError("The flow parameter needs to be downloaded from server") | ||
|
|
||
| python_param_settings = model.get_params() | ||
| openml_param_settings = [] | ||
|
|
||
| def get_flow_dict(_flow): | ||
| flow_map = {_flow.name: _flow.flow_id} | ||
| for subflow in _flow.components: | ||
| flow_map.update(get_flow_dict(_flow.components[subflow])) | ||
| return flow_map | ||
|
|
||
| flow_dict = get_flow_dict(flow) | ||
|
|
||
| for param in python_param_settings: | ||
| if "__" in param: | ||
| # parameter of subflow. will be handled later | ||
| continue | ||
| if isinstance(python_param_settings[param], BaseEstimator): | ||
| # extract parameters of the subflow individually | ||
| subflow = flow.components[param] | ||
| openml_param_settings += OpenMLRun._parse_parameters(python_param_settings[param], subflow) | ||
|
|
||
| # add parameter setting (in some cases also the subflow. Just because we can) | ||
| if param in flow.parameters.keys(): | ||
| param_dict = OrderedDict() | ||
| param_dict['oml:name'] = param | ||
| param_dict['oml:value'] = str(python_param_settings[param]) | ||
| param_dict['oml:component'] = flow_dict[flow.name] | ||
| openml_param_settings.append(param_dict) | ||
| else: | ||
| if flow.name.startswith("sklearn.pipeline.Pipeline"): | ||
| # tolerate | ||
| pass | ||
| elif flow.name.startswith("sklearn.pipeline.FeatureUnion"): | ||
| # tolerate | ||
| pass | ||
| elif flow.name.startswith("sklearn.ensemble.voting_classifier.VotingClassifier"): | ||
| # tolerate | ||
| pass | ||
| def extract_parameters(_flow, _param_dict, _main_call=False, main_id=None): | ||
| # _flow is openml flow object, _param dict maps from flow name to flow id | ||
| # for the main call, the param dict can be overridden (useful for unit tests / sentinels) | ||
| # this way, for flows without subflows we do not have to rely on _param_dict | ||
| _params = [] | ||
| for _param_name in _flow.parameters: | ||
| _current = OrderedDict() | ||
| _current['oml:name'] = _param_name | ||
| _current['oml:value'] = _flow.parameters[_param_name] | ||
| if _main_call: | ||
| _current['oml:component'] = main_id | ||
| else: | ||
| raise ValueError("parameter %s not in flow description of flow %s" %(param,flow.name)) | ||
| _current['oml:component'] = _param_dict[_flow.name] | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is it once an ID, and once a name? |
||
| _params.append(_current) | ||
| for _identifier in _flow.components: | ||
| _params.extend(extract_parameters(_flow.components[_identifier], _param_dict)) | ||
| return _params | ||
|
|
||
| flow_dict = get_flow_dict(server_flow) | ||
| local_flow = openml.flows.sklearn_to_flow(model) | ||
|
|
||
| return openml_param_settings | ||
| parameters = extract_parameters(local_flow, flow_dict, True, server_flow.flow_id) | ||
| return parameters | ||
|
|
||
| ################################################################################ | ||
| # Functions which cannot be in runs/functions due to circular imports | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,3 @@ | ||
| from .functions import setup_exists | ||
| from .functions import get_setup, setup_exists, initialize_model | ||
|
|
||
| __all__ = ['setup_exists'] | ||
| __all__ = ['get_setup', 'setup_exists', 'initialize_model'] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is neither used nor tested.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, I added tests.