Skip to content

Commit

Permalink
Merge 8a6deb5 into 06394cd
Browse files Browse the repository at this point in the history
  • Loading branch information
amueller committed Nov 9, 2016
2 parents 06394cd + 8a6deb5 commit cbbcaa2
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 132 deletions.
161 changes: 63 additions & 98 deletions openml/flows/flow.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
from collections import OrderedDict
import re

import six
import xmltodict
Expand All @@ -14,13 +13,14 @@ class OpenMLFlow(object):
:meth:`openml.flows.create_flow_from_model`. Using this helper function
ensures that all relevant fields are filled in.
Implements https://github.com/openml/website/blob/master/openml_OS/views/pages/api_new/v1/xsd/openml.implementation.upload.xsd.
Implements https://github.com/openml/website/blob/master/openml_OS/ \
views/pages/api_new/v1/xsd/openml.implementation.upload.xsd.
Parameters
----------
name : str
Name of the flow. Is used together with the attribute `external_version`
as a unique identifier of the flow.
Name of the flow. Is used together with the attribute
`external_version` as a unique identifier of the flow.
description : str
Human-readable description of the flow (free text).
model : object
Expand All @@ -38,8 +38,8 @@ class OpenMLFlow(object):
toolbox plugin can take care of casting the parameter default value to
the correct type.
parameters_meta_info : OrderedDict
Mapping from parameter name to `dict`. Stores additional information for
each parameter. Required keys are `data_type` and `description`.
Mapping from parameter name to `dict`. Stores additional information
for each parameter. Required keys are `data_type` and `description`.
external_version : str
Version number of the software the flow is implemented in. Is used
together with the attribute `name` as a unique identifier of the flow.
Expand Down Expand Up @@ -76,15 +76,16 @@ class OpenMLFlow(object):
def __init__(self, name, description, model, components, parameters,
parameters_meta_info, external_version, tags, language,
dependencies, binary_url=None, binary_format=None,
binary_md5=None, uploader=None, upload_date=None, flow_id=None,
version=None):
binary_md5=None, uploader=None, upload_date=None,
flow_id=None, version=None):
self.name = name
self.description = description
self.model = model

for variable, variable_name in [[components, 'components'],
[parameters, 'parameters'],
[parameters_meta_info, 'parameters_meta_info']]:
for variable, variable_name in [
[components, 'components'],
[parameters, 'parameters'],
[parameters_meta_info, 'parameters_meta_info']]:
if not isinstance(variable, OrderedDict):
raise TypeError('%s must be of type OrderedDict, '
'but is %s.' % (variable_name, type(variable)))
Expand Down Expand Up @@ -154,38 +155,32 @@ def _to_dict(self):
Flow represented as OrderedDict.
"""
flow_dict = OrderedDict()
flow_dict['oml:flow'] = OrderedDict()
flow_dict['oml:flow']['@xmlns:oml'] = 'http://openml.org/openml'
if self.flow_id is not None:
flow_dict['oml:flow']['oml:id'] = self.flow_id
if self.uploader is not None:
flow_dict['oml:flow']['oml:uploader'] = self.uploader
flow_dict['oml:flow']['oml:name'] = self._get_name()
if self.version is not None:
flow_dict['oml:flow']['oml:version'] = self.version
flow_dict['oml:flow']['oml:external_version'] = self.external_version
flow_dict['oml:flow']['oml:description'] = self.description
if self.upload_date is not None:
flow_dict['oml:flow']['oml:upload_date'] = self.upload_date
if self.language is not None:
flow_dict['oml:flow']['oml:language'] = self.language
if self.dependencies is not None:
flow_dict['oml:flow']['oml:dependencies'] = self.dependencies
flow_container = OrderedDict()
flow_dict = OrderedDict([('@xmlns:oml', 'http://openml.org/openml')])
flow_container['oml:flow'] = flow_dict
_add_if_nonempty(flow_dict, 'oml:id', self.flow_id)

for required in ["name", "external_version"]:
if getattr(self, required) is None:
raise ValueError("self.{} is required but None".format(
required))
for attribute in ["uploader", "name", "version", "external_version",
"description", "upload_date", "language",
"dependencies"]:
_add_if_nonempty(flow_dict, 'oml:{}'.format(attribute),
getattr(self, attribute))

flow_parameters = []
for key in self.parameters:
param_dict = OrderedDict()
param_dict['oml:name'] = key
meta_info = self.parameters_meta_info[key]

if self.parameters_meta_info[key]['data_type'] is not None:
param_dict['oml:data_type'] = self.parameters_meta_info[key].\
get('data_type')

_add_if_nonempty(param_dict, 'oml:data_type',
meta_info['data_type'])
param_dict['oml:default_value'] = self.parameters[key]
if self.parameters_meta_info[key]['description'] is not None:
param_dict['oml:description'] = self.parameters_meta_info[key].\
get('description')
_add_if_nonempty(param_dict, 'oml:description',
meta_info['description'])

for key_, value in param_dict.items():
if key_ is not None and not isinstance(key_, six.string_types):
Expand All @@ -199,7 +194,7 @@ def _to_dict(self):

flow_parameters.append(param_dict)

flow_dict['oml:flow']['oml:parameter'] = flow_parameters
flow_dict['oml:parameter'] = flow_parameters

components = []
for key in self.components:
Expand All @@ -218,17 +213,13 @@ def _to_dict(self):

components.append(component_dict)

flow_dict['oml:flow']['oml:component'] = components
flow_dict['oml:flow']['oml:tag'] = self.tags

if self.binary_url is not None:
flow_dict['oml:flow']['oml:binary_url'] = self.binary_url
if self.binary_format is not None:
flow_dict['oml:flow']['oml:binary_format'] = self.binary_format
if self.binary_md5 is not None:
flow_dict['oml:flow']['oml:binary_md5'] = self.binary_md5
flow_dict['oml:component'] = components
flow_dict['oml:tag'] = self.tags
for attribute in ["binary_url", "binary_format", "binary_md5"]:
_add_if_nonempty(flow_dict, 'oml:{}'.format(attribute),
getattr(self, attribute))

return flow_dict
return flow_container

@classmethod
def _from_xml(cls, xml_dict):
Expand Down Expand Up @@ -262,7 +253,8 @@ def _from_xml(cls, xml_dict):

# has to be converted to an int if present and cannot be parsed in the
# two loops above
arguments['flow_id'] = int(dic['oml:id']) if 'oml:id' in dic else None
arguments['flow_id'] = (int(dic['oml:id']) if dic.get("oml:id")
is not None else None)

# Now parse parts of a flow which can occur multiple times like
# parameters, components (subflows) and tags. These can't be tackled
Expand Down Expand Up @@ -330,51 +322,23 @@ def __eq__(self, other):
Two flows are equal if all their keys which are not set by the server
are equal, as well as all their parameters and components.
"""
if isinstance(other, self.__class__):
this_dict = self.__dict__.copy()
this_parameters = this_dict['parameters']
del this_dict['parameters']
this_components = this_dict['components']
del this_dict['components']
del this_dict['model']

other_dict = other.__dict__.copy()
other_parameters = other_dict['parameters']
del other_dict['parameters']
other_components = other_dict['components']
del other_dict['components']
del other_dict['model']

# Name is actually not generated by the server, but it will be
# tested further down with a getter (allows mocking in the tests)
generated_by_the_server = ['name', 'flow_id', 'uploader', 'version',
'upload_date', 'source_url',
'binary_url', 'source_format',
'binary_format', 'source_md5',
'binary_md5']
for field in generated_by_the_server:
if field in this_dict:
del this_dict[field]
if field in other_dict:
del other_dict[field]
equal = this_dict == other_dict
equal_name = self._get_name() == other._get_name()

parameters_equal = \
this_parameters.keys() == other_parameters.keys() and \
all([this_parameter == other_parameter
for this_parameter, other_parameter in
zip(this_parameters.values(),
other_parameters.values())])
components_equal = \
this_components.keys() == other_components.keys() and \
all([this_component == other_component
for this_component, other_component in
zip(this_components.values(),
other_components.values())])

return parameters_equal and components_equal and equal and equal_name
return NotImplemented
if not isinstance(other, self.__class__):
return NotImplemented

# Name is actually not generated by the server, but it will be
# tested further down with a getter (allows mocking in the tests)
generated_by_the_server = ['name', 'flow_id', 'uploader', 'version',
'upload_date', 'source_url',
'binary_url', 'source_format',
'binary_format', 'source_md5',
'binary_md5', 'model']

for key in set(self.__dict__.keys()).union(other.__dict__.keys()):
if key in generated_by_the_server:
continue
if getattr(self, key, None) != getattr(other, key, None):
return False
return True

def publish(self):
"""Publish flow to OpenML server.
Expand Down Expand Up @@ -405,7 +369,7 @@ def _ensure_flow_exists(self):
"""
import sklearn
flow_version = 'sklearn_' + sklearn.__version__
_, _, flow_id = _check_flow_exists(self._get_name(), flow_version)
_, _, flow_id = _check_flow_exists(self.name, flow_version)
# TODO add numpy and scipy version!

if int(flow_id) == -1:
Expand All @@ -417,10 +381,6 @@ def _ensure_flow_exists(self):

return int(flow_id)

def _get_name(self):
"""Helper function. Can be mocked for testing."""
return self.name


def _check_flow_exists(name, version):
"""Retrieves the flow id of the flow uniquely identified by name+version.
Expand Down Expand Up @@ -455,3 +415,8 @@ def _check_flow_exists(name, version):
xml_dict = xmltodict.parse(xml_response)
flow_id = xml_dict['oml:flow_exists']['oml:id']
return return_code, xml_response, flow_id


def _add_if_nonempty(dic, key, value):
if value is not None:
dic[key] = value
10 changes: 5 additions & 5 deletions openml/flows/sklearn_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,10 @@ def _serialize_model(model):
# when deserializing the parameter
sub_component_identifier = k + '__' + identifier
sub_components[sub_component_identifier] = sub_component
component_reference = \
{'oml:serialized_object': 'component_reference',
'value': {'key': sub_component_identifier,
'step_name': identifier}}
component_reference = OrderedDict()
component_reference['oml:serialized_object'] = 'component_reference'
component_reference['value'] = OrderedDict(
key=sub_component_identifier, step_name=identifier)
parameter_value.append(component_reference)

if isinstance(rval, tuple):
Expand Down Expand Up @@ -262,7 +262,7 @@ def _serialize_model(model):

def _deserialize_model(flow, **kwargs):

model_name = flow._get_name()
model_name = flow.name
# Remove everything after the first bracket, it is not necessary for
# creating the current flow
pos = model_name.find('(')
Expand Down
36 changes: 7 additions & 29 deletions tests/flows/test_flow.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import collections
import hashlib
import re
import sys
import time
import unittest

import xmltodict

import scipy.stats
import sklearn
import sklearn.datasets
import sklearn.decomposition
import sklearn.dummy
Expand All @@ -22,11 +22,6 @@
from openml._api_calls import _perform_api_call
import openml

if sys.version_info[0] >= 3:
from unittest import mock
else:
import mock


class TestFlow(TestBase):

Expand Down Expand Up @@ -106,8 +101,7 @@ def test_to_xml_from_xml(self):
self.assertEqual(new_flow, flow)
self.assertIsNot(new_flow, flow)

@mock.patch.object(openml.OpenMLFlow, '_get_name', autospec=True)
def test_publish_flow(self, name_mock):
def test_publish_flow(self):
# Create a unique prefix for the flow. Necessary because the flow is
# identified by its name and external version online. Having a unique
# name allows us to publish the same flow in each test run
Expand All @@ -126,13 +120,12 @@ def test_publish_flow(self, name_mock):
language='English',
dependencies=''
)
name_mock.return_value = 'TEST%s%s' % (sentinel, flow.name)
flow.name = 'TEST%s%s' % (sentinel, flow.name)

flow.publish()
self.assertIsInstance(flow.flow_id, int)

@mock.patch.object(openml.OpenMLFlow, '_get_name', autospec=True)
def test_sklearn_to_upload_to_flow(self, name_mock):
def test_sklearn_to_upload_to_flow(self):
iris = sklearn.datasets.load_iris()
X = iris.data
y = iris.target
Expand All @@ -144,12 +137,6 @@ def test_sklearn_to_upload_to_flow(self, name_mock):
md5.update(str(time.time()).encode('utf-8'))
sentinel = md5.hexdigest()[:10]
sentinel = 'TEST%s' % sentinel
def side_effect(self):
if sentinel in self.name:
return self.name
else:
return '%s%s' % (sentinel, self.name)
name_mock.side_effect = side_effect

# Test a more complicated flow
ohe = sklearn.preprocessing.OneHotEncoder(categorical_features=[1])
Expand All @@ -171,22 +158,14 @@ def side_effect(self):
estimator=model, param_distributions=parameter_grid, cv=cv)
rs.fit(X, y)
flow = openml.flows.sklearn_to_flow(rs)
flow.external_version = sentinel + flow.external_version

flow.publish()
self.assertIsInstance(flow.flow_id, int)

# Check whether we can load the flow again
# Remove the sentinel from the name again so that we can reinstantiate
# the object again
def side_effect(self):
if sentinel in self.name:
name = self.name.replace(sentinel, '')
return name
else:
return self.name
name_mock.side_effect = side_effect

name_mock.side_effect = side_effect
new_flow = openml.flows.get_flow(flow_id=flow.flow_id)

local_xml = flow._to_xml()
Expand All @@ -205,12 +184,10 @@ def side_effect(self):
server_xml = server_xml.replace(' ', '').replace('\t', '').strip().replace('\n\n', '\n').replace('"', '"')
server_xml = re.sub(r'^$', '', server_xml)


self.assertEqual(server_xml, local_xml)

self.assertEqual(new_flow, flow)
self.assertIsNot(new_flow, flow)
new_flow.model.fit(X, y)

fixture_name = 'sklearn.model_selection._search.RandomizedSearchCV(' \
'sklearn.model_selection._split.StratifiedKFold,' \
Expand All @@ -223,5 +200,6 @@ def side_effect(self):
'sklearn.ensemble.weight_boosting.AdaBoostClassifier(' \
'sklearn.tree.tree.DecisionTreeClassifier)))'

self.assertEqual(new_flow._get_name(), fixture_name)
self.assertEqual(new_flow.name, fixture_name)

new_flow.model.fit(X, y)

0 comments on commit cbbcaa2

Please sign in to comment.