
Commit

Merge branch 'feature/upload-run' of ssh://github.com/openml/openml-python into feature/upload-run
mfeurer committed Jan 31, 2017
2 parents 31bf79e + b250547 commit 90a88cc
Showing 5 changed files with 197 additions and 28 deletions.
2 changes: 1 addition & 1 deletion openml/flows/__init__.py
@@ -1,6 +1,6 @@
from .flow import OpenMLFlow
from .sklearn_converter import sklearn_to_flow, flow_to_sklearn
from .functions import get_flow
from .functions import get_flow, get_flow_dict

__all__ = ['OpenMLFlow', 'create_flow_from_model', 'get_flow',
'sklearn_to_flow', 'flow_to_sklearn']
16 changes: 16 additions & 0 deletions openml/flows/functions.py
@@ -26,3 +26,19 @@ def get_flow(flow_id):
flow.model = flow_to_sklearn(flow)

return flow


def get_flow_dict(flow):
"""Returns a dictionary with keys flow name and values flow id.
Parameters
----------
flow : OpenMLFlow
"""
if flow.flow_id is None:
raise PyOpenMLError(
"Can only invoke function 'get_flow_map' on a server downloaded flow. ")
flow_map = {flow.name: flow.flow_id}
for subflow in flow.components:
flow_map.update(get_flow_dict(flow.components[subflow]))

return flow_map
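
For orientation (not part of this commit), a minimal usage sketch of the new get_flow_dict helper; the flow id 100 below is purely a placeholder:

import openml
from openml.flows import get_flow_dict

# Download a flow that already has server-assigned ids (100 is a made-up id).
flow = openml.flows.get_flow(100)

# Map the flow's name and the names of all its subflows (components) to their flow ids.
name_to_id = get_flow_dict(flow)
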
28 changes: 21 additions & 7 deletions openml/runs/functions.py
@@ -2,6 +2,7 @@
import io
import os
import xmltodict
from sklearn.model_selection._search import BaseSearchCV

from .. import config
from ..flows import sklearn_to_flow
@@ -56,19 +57,16 @@ def run_task(task, model):
'only works for tasks with class labels.')

run = OpenMLRun(task_id=task.task_id, flow_id=flow_id,
dataset_id=dataset.dataset_id)
run.data_content = _run_task_get_arffcontent(model, task, class_labels)
dataset_id=dataset.dataset_id, model=model)
run.data_content, run.trace_content = _run_task_get_arffcontent(model, task, class_labels)

# The model will not be uploaded at the moment, but used to get the
# hyperparameter values when uploading the run
X, Y = task.get_X_and_y()
run.model = model.fit(X, Y)
return run


def _run_task_get_arffcontent(model, task, class_labels):
X, Y = task.get_X_and_y()
arff_datacontent = []
arff_tracecontent = []

rep_no = 0
# TODO use different iterator to only provide a single iterator (less
@@ -83,6 +81,19 @@ def _run_task_get_arffcontent(model, task, class_labels):
testY = Y[test_indices]

model.fit(trainX, trainY)
if isinstance(model, BaseSearchCV):
for itt_no in range(0, len(model.cv_results_['mean_test_score'])):
# the OpenML server expects the 'selected' flag as the string values 'true' and 'false'
selected = 'false'
if itt_no == model.best_index_:
selected = 'true'
test_score = model.cv_results_['mean_test_score'][itt_no]
arff_line = [rep_no, fold_no, itt_no, test_score, selected]
for key in model.cv_results_:
if key.startswith("param_"):
arff_line.append(str(model.cv_results_[key][itt_no]))
arff_tracecontent.append(arff_line)

ProbaY = model.predict_proba(testX)
PredY = model.predict(testX)

@@ -96,7 +107,10 @@ def _run_task_get_arffcontent(model, task, class_labels):
fold_no = fold_no + 1
rep_no = rep_no + 1

return arff_datacontent
if not isinstance(model, BaseSearchCV):
arff_tracecontent = None

return arff_datacontent, arff_tracecontent


def get_runs(run_ids):
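
A hedged sketch of how the updated run_task behaves with a hyperparameter search model; the task id and the small search space below are illustrative, mirroring the tests added later in this commit:

import openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

task = openml.tasks.get_task(10107)  # iris task, also used in the tests below
search = RandomizedSearchCV(
    RandomForestClassifier(n_estimators=10),
    {"max_depth": [3, None], "criterion": ["gini", "entropy"]},
    n_iter=5)
run = openml.runs.run_task(task, search)
# run.data_content holds one row per predicted instance;
# run.trace_content holds one row per (repeat, fold, iteration) of the search,
# or None when the model is not a BaseSearchCV.
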
117 changes: 101 additions & 16 deletions openml/runs/run.py
@@ -4,10 +4,13 @@

import arff
import xmltodict
from sklearn.base import BaseEstimator
from sklearn.model_selection._search import BaseSearchCV

import openml
from ..tasks import get_task
from .._api_calls import _perform_api_call

from ..exceptions import PyOpenMLError

class OpenMLRun(object):
"""OpenML Run: result of running a model on an openml dataset.
@@ -17,10 +20,10 @@ class OpenMLRun(object):
FIXME
"""
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
files=None, setup_id=None, tags=None, uploader=None, uploader_name=None,
evaluations=None, detailed_evaluations=None,
data_content=None, model=None, task_type=None,
data_content=None, trace_content=None, model=None, task_type=None,
task_evaluation_measure=None, flow_name=None,
parameter_settings=None, predictions_url=None, task=None,
flow=None, run_id=None):
@@ -39,12 +42,14 @@ def __init__(self, task_id, flow_id, dataset_id, setup_string=None,
self.evaluations = evaluations
self.detailed_evaluations = detailed_evaluations
self.data_content = data_content
self.trace_content = trace_content
self.task = task
self.flow = flow
self.run_id = run_id
self.model = model

def _generate_arff_dict(self):
"""Generates the arff dictionary for upload to the server.
"""Generates the arff dictionary for uploading predictions to the server.
Assumes that the run has been executed.
@@ -74,6 +79,49 @@ def _generate_arff_dict(self):
arff_dict['relation'] = 'openml_task_' + str(task.task_id) + '_predictions'
return arff_dict

def _generate_trace_arff_dict(self, model):
"""Generates the arff dictionary for uploading predictions to the server.
Assumes that the run has been executed.
Returns
-------
arff_dict : dict
Dictionary representation of the ARFF file that will be uploaded.
Contains information about the optimization trace.
"""
if self.trace_content is None:
raise ValueError('No trace content available.')
if not isinstance(model, BaseSearchCV):
raise PyOpenMLError('Cannot generate trace on provided classifier. (This should never happen.)')

arff_dict = {}
arff_dict['attributes'] = [('repeat', 'NUMERIC'),
('fold', 'NUMERIC'),
('iteration', 'NUMERIC'),
('evaluation', 'NUMERIC'),
('selected', ['true', 'false'])]
for key in model.cv_results_:
if key.startswith("param_"):
type = 'STRING'
if all(isinstance(i, (bool)) for i in model.cv_results_[key]):
type = ['True', 'False']
elif all(isinstance(i, (int, float)) for i in model.cv_results_[key]):
type = 'NUMERIC'
else:
values = list(set(model.cv_results_[key])) # unique values
if len(values) < 100: # arbitrary number. make it an option?
type = [str(i) for i in values]
print(key + ": " + str(type))

attribute = ("parameter_" + key[6:], type)
arff_dict['attributes'].append(attribute)

arff_dict['data'] = self.trace_content
arff_dict['relation'] = 'openml_task_' + str(self.task_id) + '_predictions'

return arff_dict

def publish(self):
"""Publish a run to the OpenML server.
@@ -84,10 +132,18 @@ def publish(self):
-------
self : OpenMLRun
"""
if self.model is None:
raise PyOpenMLError("OpenMLRun object does not contain a model. (This should never happen.)")

predictions = arff.dumps(self._generate_arff_dict())
description_xml = self._create_description_xml()
file_elements = {'predictions': ("predictions.csv", predictions),

file_elements = {'predictions': ("predictions.arff", predictions),
'description': ("description.xml", description_xml)}
if self.trace_content is not None:
trace_arff = arff.dumps(self._generate_trace_arff_dict(self.model))
file_elements['trace'] = ("trace.arff", trace_arff)

return_code, return_value = _perform_api_call(
"/run/", file_elements=file_elements)
run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
@@ -104,7 +160,11 @@ def _create_description_xml(self):
"""
run_environment = _get_version_information()

parameter_settings = self.model.get_params()
# TODO: don't we have flow object in data structure? Use this one
downloaded_flow = openml.flows.get_flow(self.flow_id)

openml_param_settings = _parse_parameters(self.model, downloaded_flow)

# as a tag, it must be of the form ([a-zA-Z0-9_\-\.])+
# so we format time from 'mm/dd/yy hh:mm:ss' to 'mm-dd-yy_hh.mm.ss'
well_formatted_time = time.strftime("%c").replace(
Expand All @@ -113,11 +173,44 @@ def _create_description_xml(self):
[self.model.__module__ + "." + self.model.__class__.__name__]
description = _to_dict(taskid=self.task_id, flow_id=self.flow_id,
setup_string=_create_setup_string(self.model),
parameter_settings=parameter_settings,
parameter_settings=openml_param_settings,
tags=tags)
description_xml = xmltodict.unparse(description, pretty=True)
return description_xml

def _parse_parameters(model, flow):
"""Extracts all parameter settings from an model in OpenML format.
Parameters
----------
model
the fitted scikit-learn model
flow
openml flow object (containing flow ids, i.e., it has to be downloaded from the server)
"""
python_param_settings = model.get_params()
openml_param_settings = []
flow_dict = openml.flows.get_flow_dict(flow)

for param in python_param_settings:
if "__" in param:
# parameter of subflow. will be handled later
continue
if isinstance(python_param_settings[param], BaseEstimator):
# extract parameters of the subflow individually
subflow = flow.components[param]
openml_param_settings += _parse_parameters(python_param_settings[param], subflow)

# add parameter setting (also the subflow. Just because we can)
param_dict = OrderedDict()
param_dict['oml:name'] = param
param_dict['oml:value'] = str(python_param_settings[param])
param_dict['oml:component'] = flow_dict[flow.name]
openml_param_settings.append(param_dict)

return openml_param_settings

################################################################################
# Functions which cannot be in runs/functions due to circular imports

@@ -169,15 +262,7 @@ def _to_dict(taskid, flow_id, setup_string, parameter_settings, tags):
description['oml:run']['@xmlns:oml'] = 'http://openml.org/openml'
description['oml:run']['oml:task_id'] = taskid
description['oml:run']['oml:flow_id'] = flow_id

params = []
for k, v in parameter_settings.items():
param_dict = OrderedDict()
param_dict['oml:name'] = k
param_dict['oml:value'] = ('None' if v is None else v)
params.append(param_dict)

description['oml:run']['oml:parameter_setting'] = params
description['oml:run']['oml:parameter_setting'] = parameter_settings
description['oml:run']['oml:tag'] = tags # Tags describing the run
# description['oml:run']['oml:output_data'] = 0;
# all data that was output of this run, which can be evaluation scores
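
As a rough illustration (names, values, and the component id are made up), each entry that _parse_parameters produces, and that ends up under oml:parameter_setting in the run description, is an OrderedDict of the following shape:

from collections import OrderedDict

# One entry per hyperparameter; 'oml:component' is the id of the (sub)flow the
# parameter belongs to, looked up via openml.flows.get_flow_dict.
openml_param_settings = [
    OrderedDict([('oml:name', 'C'),
                 ('oml:value', '1.0'),
                 ('oml:component', 1234)]),
    OrderedDict([('oml:name', 'penalty'),
                 ('oml:value', 'l2'),
                 ('oml:component', 1234)]),
]
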
62 changes: 58 additions & 4 deletions tests/runs/test_run_functions.py
@@ -1,18 +1,68 @@
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import openml
import openml.exceptions
from openml.testing import TestBase


class TestRun(TestBase):
def test_run_iris(self):
task = openml.tasks.get_task(10107)
clf = LogisticRegression()

def _perform_run(self, task_id, num_instances, clf):
task = openml.tasks.get_task(task_id)
run = openml.runs.run_task(task, clf)
run_ = run.publish()
self.assertEqual(run_, run)
self.assertIsInstance(run.dataset_id, int)

# check arff output
self.assertEqual(len(run.data_content), num_instances)
return run


def test_run_iris(self):
task_id = 10107
num_instances = 150

clf = LogisticRegression()
self._perform_run(task_id,num_instances, clf)


def test_run_optimize_randomforest_iris(self):
task_id = 10107
num_instances = 150
num_folds = 10
num_iterations = 5

clf = RandomForestClassifier(n_estimators=10)
param_dist = {"max_depth": [3, None],
"max_features": [1,2,3,4],
"min_samples_split": [2,3,4,5,6,7,8,9,10],
"min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
random_search = RandomizedSearchCV(clf, param_dist,n_iter=num_iterations)

run = self._perform_run(task_id,num_instances, random_search)
self.assertEqual(len(run.trace_content), num_iterations * num_folds)

def test_run_optimize_bagging_iris(self):
task_id = 10107
num_instances = 150
num_folds = 10
num_iterations = 36 # (num values for C times gamma)

task = openml.tasks.get_task(task_id)
bag = BaggingClassifier(base_estimator=SVC())
param_dist = {"base_estimator__C": [0.001, 0.01, 0.1, 1, 10, 100],
"base_estimator__gamma": [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(bag, param_dist)

run = self._perform_run(task_id, num_instances, grid_search)
self.assertEqual(len(run.trace_content), num_iterations * num_folds)


def test__run_task_get_arffcontent(self):
task = openml.tasks.get_task(1939)
class_labels = task.class_labels
@@ -24,9 +74,13 @@ def test__run_task_get_arffcontent(self):
clf, task, class_labels)

clf = SGDClassifier(loss='log', random_state=1)
arff_datacontent = openml.runs.functions._run_task_get_arffcontent(
arff_datacontent, arff_tracecontent = openml.runs.functions._run_task_get_arffcontent(
clf, task, class_labels)
# predictions
self.assertIsInstance(arff_datacontent, list)
# trace. SGD does not produce any
self.assertIsInstance(arff_tracecontent, type(None))

# 10 times 10 fold CV of 150 samples
self.assertEqual(len(arff_datacontent), 1500)
for arff_line in arff_datacontent:
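
Finally, a hedged sketch of how the optimization trace of such a search-based run could be serialized by hand, mirroring what publish() does internally; the task id and search space are placeholders taken from the tests above:

import arff  # liac-arff, the same package used in openml/runs/run.py
import openml
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

task = openml.tasks.get_task(10107)
search = GridSearchCV(SVC(), {"C": [0.1, 1, 10]})
run = openml.runs.run_task(task, search)

trace_dict = run._generate_trace_arff_dict(run.model)
trace_arff = arff.dumps(trace_dict)  # contents of the trace.arff file that publish() uploads
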
