Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 39 additions & 11 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,15 @@
from ..util import URLError, version_complies
from ..tasks.functions import _create_task_from_xml
from .._api_calls import _perform_api_call
from .run import OpenMLRun
from .run import OpenMLRun, _get_version_information


# _get_version_information, _get_dict and _create_setup_string are in run.py to avoid
# circular imports



def run_task(task, model, avoid_duplicate_runs=True):
def run_task(task, model, avoid_duplicate_runs=True, flow_tags=None):
"""Performs a CV run on the dataset of the given task, using the split.

Parameters
Expand All @@ -35,13 +35,16 @@ def run_task(task, model, avoid_duplicate_runs=True):
a model which has a function fit(X,Y) and predict(X),
all supervised estimators of scikit learn follow this definition of a model [1]
[1](http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html)

flow_tags : list(str)
a list of tags that the flow should have at creation

Returns
-------
run : OpenMLRun
Result of the run.
"""
if flow_tags is not None and not isinstance(flow_tags, list):
raise ValueError("flow_tags should be list")
# TODO move this into its own module. While it somehow belongs here, it
# adds quite a lot of functionality which is better suited in other places!
# TODO why doesn't this accept a flow as input? - this would make this more flexible!
Expand All @@ -66,8 +69,10 @@ def run_task(task, model, avoid_duplicate_runs=True):
raise ValueError('The task has no class labels. This method currently '
'only works for tasks with class labels.')

run_environment = _get_version_information()
tags = ['openml-python', run_environment[1]]
# execute the run
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model)
run = OpenMLRun(task_id=task.task_id, flow_id=None, dataset_id=dataset.dataset_id, model=model, tags=tags)
run.data_content, run.trace_content, run.trace_attributes = _run_task_get_arffcontent(model, task, class_labels)

if flow_id == False:
Expand Down Expand Up @@ -176,18 +181,20 @@ def _run_task_get_arffcontent(model, task, class_labels):
if version_complies(3, 3):
modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration

if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
model_classes = model_fold.best_estimator_.classes_
else:
model_classes = model_fold.classes_
except AttributeError as e:
# typically happens when training a regressor on classification task
raise PyOpenMLError(str(e))

# extract trace
if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
model_classes = model_fold.best_estimator_.classes_
else:
model_classes = model_fold.classes_

if version_complies(3, 3):
modelpredict_starttime = time.process_time()

ProbaY = model_fold.predict_proba(testX)
PredY = model_fold.predict(testX)
if version_complies(3, 3):
Expand Down Expand Up @@ -215,6 +222,12 @@ def _run_task_get_arffcontent(model, task, class_labels):


def _extract_arfftrace(model, rep_no, fold_no):
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
raise ValueError('model should be instance of'\
' sklearn.model_selection._search.BaseSearchCV')
if not hasattr(model, 'cv_results_'):
raise ValueError('model should contain `cv_results_`')

arff_tracecontent = []
for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
# we use the string values for True and False, as it is defined in this way by the OpenML server
Expand All @@ -230,6 +243,12 @@ def _extract_arfftrace(model, rep_no, fold_no):
return arff_tracecontent

def _extract_arfftrace_attributes(model):
if not isinstance(model, sklearn.model_selection._search.BaseSearchCV):
raise ValueError('model should be instance of'\
' sklearn.model_selection._search.BaseSearchCV')
if not hasattr(model, 'cv_results_'):
raise ValueError('model should contain `cv_results_`')

# attributes that will be in trace arff, regardless of the model
trace_attributes = [('repeat', 'NUMERIC'),
('fold', 'NUMERIC'),
Expand Down Expand Up @@ -391,6 +410,15 @@ def _create_run_from_xml(xml):
evaluation_flows[key] = flow_id

evaluation_flows[key] = flow_id
tags = None
if 'oml:tag' in run:
if isinstance(run['oml:tag'], str):
tags = [run['oml:tag']]
elif isinstance(run['oml:tag'], list):
tags = run['oml:tag']
else:
raise ValueError('Received not string and non list as tag item')


return OpenMLRun(run_id=run_id, uploader=uploader,
uploader_name=uploader_name, task_id=task_id,
Expand All @@ -401,7 +429,7 @@ def _create_run_from_xml(xml):
parameter_settings=parameters,
dataset_id=dataset_id, predictions_url=predictions_url,
evaluations=evaluations,
detailed_evaluations=detailed_evaluations)
detailed_evaluations=detailed_evaluations, tags=tags)


def _get_cached_run(run_id):
Expand Down
5 changes: 2 additions & 3 deletions openml/runs/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
self.flow = flow
self.run_id = run_id
self.model = model
self.tags = tags

def _generate_arff_dict(self):
"""Generates the arff dictionary for uploading predictions to the server.
Expand Down Expand Up @@ -142,7 +143,6 @@ def _create_description_xml(self):
xml_string : string
XML description of run.
"""
run_environment = _get_version_information()

# TODO: don't we have flow object in data structure? Use this one
downloaded_flow = openml.flows.get_flow(self.flow_id)
Expand All @@ -155,13 +155,12 @@ def _create_description_xml(self):
# ' ', '_').replace('/', '-').replace(':', '.')
# tags = run_environment + [well_formatted_time] + ['run_task'] + \
# [self.model.__module__ + "." + self.model.__class__.__name__]
tags = ['openml-python', run_environment[1]]
description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
setup_string=_create_setup_string(self.model),
parameter_settings=openml_param_settings,
error_message=self.error_message,
detailed_evaluations=self.detailed_evaluations,
tags=tags)
tags=self.tags)
description_xml = xmltodict.unparse(description, pretty=True)
return description_xml

Expand Down
4 changes: 3 additions & 1 deletion tests/test_flows/test_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ def test_sklearn_to_upload_to_flow(self):
estimator=model, param_distributions=parameter_grid, cv=cv)
rs.fit(X, y)
flow = openml.flows.sklearn_to_flow(rs)
flow.tags.extend(['openml-python', 'unittest'])

# Add the sentinel to all name strings in all subflows. Adds it to
# name to make it easier in the web gui to see that the flow is only
Expand Down Expand Up @@ -281,5 +282,6 @@ def test_sklearn_to_upload_to_flow(self):
% sentinel

self.assertEqual(new_flow.name, fixture_name)

self.assertTrue('openml-python' in new_flow.tags)
self.assertTrue('unittest' in new_flow.tags)
new_flow.model.fit(X, y)
10 changes: 8 additions & 2 deletions tests/test_runs/test_run_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_run_regression_on_classif_task(self):

clf = LinearRegression()
task = openml.tasks.get_task(task_id)
self.assertRaises(openml.exceptions.PyOpenMLError, openml.runs.run_task,
self.assertRaises(AttributeError, openml.runs.run_task,
task=task, model=clf, avoid_duplicate_runs=False)

@mock.patch('openml.flows.sklearn_to_flow')
Expand All @@ -60,7 +60,10 @@ def test_run_diabetes(self):
num_instances = 768

clf = LogisticRegression()
self._perform_run(task_id,num_instances, clf)
res = self._perform_run(task_id,num_instances, clf)

downloaded = openml.runs.get_run(res.run_id)
assert('openml-python' in downloaded.tags)

def test_run_optimize_randomforest_iris(self):
task_id = 115
Expand All @@ -80,6 +83,7 @@ def test_run_optimize_randomforest_iris(self):
n_iter=num_iterations)

run = self._perform_run(task_id, num_instances, random_search)
print(run.trace_content)
self.assertEqual(len(run.trace_content), num_iterations * num_folds)

def test_run_optimize_bagging_iris(self):
Expand Down Expand Up @@ -166,6 +170,8 @@ def test_get_run(self):
(8, 0.56759),
(9, 0.64621)]:
self.assertEqual(run.detailed_evaluations['f_measure'][0][i], value)
assert('weka' in run.tags)
assert('stacking' in run.tags)

def _check_run(self, run):
self.assertIsInstance(run, dict)
Expand Down