diff --git a/openml/runs/functions.py b/openml/runs/functions.py
index 64cfbe7b1..43bbf05be 100644
--- a/openml/runs/functions.py
+++ b/openml/runs/functions.py
@@ -5,13 +5,16 @@
 import numpy as np
 import warnings
 import sklearn
+import time
+from sklearn.model_selection._search import BaseSearchCV
+from collections import defaultdict
 
 from ..exceptions import PyOpenMLError
 from .. import config
 from ..flows import sklearn_to_flow, get_flow, flow_exists
 from ..setups import setup_exists
 from ..exceptions import OpenMLCacheException, OpenMLServerException
-from ..util import URLError
+from ..util import URLError, version_complies
 from ..tasks.functions import _create_task_from_xml
 from .._api_calls import _perform_api_call
 from .run import OpenMLRun
@@ -68,7 +70,6 @@ def run_task(task, model, avoid_duplicate_runs=True):
     run = OpenMLRun(task_id=task.task_id, flow_id=None,
                     dataset_id=dataset.dataset_id, model=model)
-    run.data_content, run.trace_content, run.trace_attributes = _run_task_get_arffcontent(model, task, class_labels)
-
+    run.data_content, run.trace_content, run.trace_attributes, run.detailed_evaluations = _run_task_get_arffcontent(model, task, class_labels)
     if flow_id == False:
         # means the flow did not exists. As we could run it, publish it now
         flow = flow.publish()
@@ -151,6 +152,7 @@ def _run_task_get_arffcontent(model, task, class_labels):
     X, Y = task.get_X_and_y()
     arff_datacontent = []
     arff_tracecontent = []
+    user_defined_measures = defaultdict(lambda: defaultdict(dict))
     rep_no = 0
     # TODO use different iterator to only provide a single iterator (less
@@ -166,8 +168,15 @@
             testY = Y[test_indices]
 
             try:
+                # for measuring runtime. Only available since Python 3.3
+                if version_complies(3, 3):
+                    modelfit_starttime = time.process_time()
                 model_fold.fit(trainX, trainY)
+                if version_complies(3, 3):
+                    modelfit_duration = (time.process_time() - modelfit_starttime) * 1000
+                    user_defined_measures['usercpu_time_millis_training'][rep_no][fold_no] = modelfit_duration
+
                 if isinstance(model_fold, sklearn.model_selection._search.BaseSearchCV):
                     arff_tracecontent.extend(_extract_arfftrace(model_fold, rep_no, fold_no))
                     model_classes = model_fold.best_estimator_.classes_
@@ -177,8 +186,15 @@
                 # typically happens when training a regressor on classification task
                 raise PyOpenMLError(str(e))
 
+            if version_complies(3, 3):
+                modelpredict_starttime = time.process_time()
             ProbaY = model_fold.predict_proba(testX)
             PredY = model_fold.predict(testX)
+            if version_complies(3, 3):
+                modelpredict_duration = (time.process_time() - modelpredict_starttime) * 1000
+                user_defined_measures['usercpu_time_millis_testing'][rep_no][fold_no] = modelpredict_duration
+                user_defined_measures['usercpu_time_millis'][rep_no][fold_no] = modelfit_duration + modelpredict_duration
+
             if ProbaY.shape[1] != len(class_labels):
                 warnings.warn("Repeat %d Fold %d: estimator only predicted for %d/%d classes!"
                               %(rep_no, fold_no, ProbaY.shape[1], len(class_labels)))
@@ -195,7 +211,6 @@
     else:
         arff_tracecontent = None
         arff_trace_attributes = None
-
-    return arff_datacontent, arff_tracecontent, arff_trace_attributes
+    return arff_datacontent, arff_tracecontent, arff_trace_attributes, user_defined_measures
@@ -397,7 +412,7 @@ def _get_cached_run(run_id):
         run_file = os.path.join(run_cache_dir, "run_%d.xml" % int(run_id))
         with io.open(run_file, encoding='utf8') as fh:
-            run = _create_task_from_xml(xml=fh.read())
+            run = _create_run_from_xml(xml=fh.read())
         return run
     except (OSError, IOError):
diff --git a/openml/runs/run.py
index 8a21a5ff3..d667f468e 100644
--- a/openml/runs/run.py
+++ b/openml/runs/run.py
@@ -151,14 +151,16 @@ def _create_description_xml(self):
         # as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
         # so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
-        well_formatted_time = time.strftime("%c").replace(
-            ' ', '_').replace('/', '-').replace(':', '.')
-        tags = run_environment + [well_formatted_time] + ['run_task'] + \
-            [self.model.__module__ + "." + self.model.__class__.__name__]
+        # well_formatted_time = time.strftime("%c").replace(
+        #     ' ', '_').replace('/', '-').replace(':', '.')
+        # tags = run_environment + [well_formatted_time] + ['run_task'] + \
+        #     [self.model.__module__ + "." 
+ self.model.__class__.__name__] + tags = ['openml-python', run_environment[1]] description = _to_dict(taskid=self.task_id, flow_id=self.flow_id, setup_string=_create_setup_string(self.model), parameter_settings=openml_param_settings, error_message=self.error_message, + detailed_evaluations=self.detailed_evaluations, tags=tags) description_xml = xmltodict.unparse(description, pretty=True) return description_xml @@ -247,7 +249,7 @@ def _get_version_information(): return [python_version, sklearn_version, numpy_version, scipy_version] -def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags): +def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, tags=None, detailed_evaluations=None): """ Creates a dictionary corresponding to the desired xml desired by openML Parameters @@ -274,11 +276,17 @@ def _to_dict(taskid, flow_id, setup_string, error_message, parameter_settings, t if error_message is not None: description['oml:run']['oml:error_message'] = error_message description['oml:run']['oml:parameter_setting'] = parameter_settings - description['oml:run']['oml:tag'] = tags # Tags describing the run - # description['oml:run']['oml:output_data'] = 0; - # all data that was output of this run, which can be evaluation scores - # (though those are also calculated serverside) - # must be of special data type + if tags is not None: + description['oml:run']['oml:tag'] = tags # Tags describing the run + if detailed_evaluations is not None: + description['oml:run']['oml:output_data'] = dict() + description['oml:run']['oml:output_data']['oml:evaluation'] = list() + for measure in detailed_evaluations: + for repeat in detailed_evaluations[measure]: + for fold, value in detailed_evaluations[measure][repeat].items(): + current = OrderedDict([('@repeat', str(repeat)), ('@fold', str(fold)), + ('oml:name', measure), ('oml:value', str(value))]) + description['oml:run']['oml:output_data']['oml:evaluation'].append(current) return 
def version_complies(major, minor=None):
    """Return True if the running Python interpreter is at least *major*.*minor*.

    Parameters
    ----------
    major : int
        Required major version.
    minor : int, optional
        Required minor version. When omitted, any minor release of
        *major* (or any newer major) complies.

    Returns
    -------
    bool
        True if ``sys.version_info`` meets the requirement.
    """
    # NOTE(review): relies on `sys` being importable at module level in
    # openml/util.py -- the visible hunk does not show the import; confirm.
    # sys.version_info is a (major, minor, micro, ...) tuple; Python's
    # element-wise tuple ordering expresses the whole three-branch
    # comparison in one line.  minor=None degrades to minor=0, i.e.
    # "any minor of this major".
    return sys.version_info[:2] >= (major, minor if minor is not None else 0)