Skip to content

Commit

Permalink
ADD propagate external version of subflows to parent flow
Browse files Browse the repository at this point in the history
  • Loading branch information
mfeurer committed Jan 26, 2017
1 parent f2c7a42 commit dfff969
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 8 deletions.
24 changes: 18 additions & 6 deletions openml/flows/sklearn_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,6 @@ def _serialize_model(model):

# Create a flow name, which contains all components in brackets, for
# example RandomizedSearchCV(Pipeline(StandardScaler,AdaBoostClassifier(DecisionTreeClassifier)),StandardScaler,AdaBoostClassifier(DecisionTreeClassifier))
# TODO the name above is apparently wrong, I need to test and check this
class_name = model.__module__ + "." + model.__class__.__name__

# will be part of the name (in brackets)
Expand All @@ -274,7 +273,24 @@ def _serialize_model(model):
else:
name = class_name

external_version = _get_external_version_info()
# Get the external versions of all sub-components
model_package_name = model.__module__.split('.')[0]
module = importlib.import_module(model_package_name)
model_package_version_number = module.__version__
external_version = '%s==%s' % (model_package_name, model_package_version_number)

external_versions = set()
external_versions.add(external_version)
to_visit_stack = []
to_visit_stack.extend(sub_components.values())
while len(to_visit_stack) > 0:
visitee = to_visit_stack.pop()
for external_version in visitee.external_version.split(','):
external_versions.add(external_version)
to_visit_stack.extend(visitee.components.values())
external_versions = list(sorted(external_versions))
external_version = ','.join(external_versions)

flow = OpenMLFlow(name=name,
class_name=class_name,
description='Automatically created sub-component.',
Expand Down Expand Up @@ -470,7 +486,3 @@ def _deserialize_cross_validator(value, **kwargs):
for parameter in parameters:
parameters[parameter] = flow_to_sklearn(parameters[parameter])
return model_class(**parameters)


def _get_external_version_info():
    """Build the external version tag for flows created from scikit-learn.

    Returns
    -------
    str
        The literal prefix ``'sklearn_'`` followed by ``sklearn.__version__``.
    """
    prefix = 'sklearn_'
    installed_version = sklearn.__version__
    return prefix + installed_version
3 changes: 3 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Dummy version number for the ``tests`` package itself. Mock classes that
# are defined inside the test files have ``tests`` as the top-level package
# of their module, so this gives that parent package a ``__version__`` to
# report.
__version__ = '0.1'
1 change: 1 addition & 0 deletions tests/flows/dummy_learn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Version of the dummy package. Kept as a string (not a float) so it behaves
# like a real package version; it still formats as 'dummy_learn==1.0'.
__version__ = '1.0'
12 changes: 12 additions & 0 deletions tests/flows/dummy_learn/dummy_forest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
class DummyRegressor(object):
    """Bare-bones estimator stand-in used by the flow tests.

    It offers ``fit``, ``predict``, ``get_params`` and ``set_params`` with
    trivial behaviour and keeps no state.
    """

    def fit(self, X, y):
        """Ignore the training data and hand back this instance."""
        return self

    def predict(self, X):
        """Return ``X[:, 0]``, i.e. whatever ``X`` yields for that index."""
        first_column = X[:, 0]
        return first_column

    def get_params(self, deep=False):
        """Report no parameters: always a fresh empty dict."""
        return dict()

    def set_params(self, params):
        """Accept ``params`` without using it; returns ``None``."""
        return
26 changes: 24 additions & 2 deletions tests/flows/test_sklearn.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from collections import OrderedDict
import json
import os
import sys
import unittest

import numpy as np
Expand Down Expand Up @@ -61,6 +63,7 @@ def test_serialize_model(self):
serialization = sklearn_to_flow(model)

self.assertEqual(serialization.name, fixture_name)
self.assertEqual(serialization.class_name, fixture_name)
self.assertEqual(serialization.description, fixture_description)
self.assertEqual(serialization.parameters, fixture_parameters)

Expand All @@ -78,16 +81,21 @@ def test_serialize_model_with_subcomponent(self):

fixture_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier' \
'(base_estimator=sklearn.tree.tree.DecisionTreeClassifier)'
fixture_class_name = 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'
fixture_description = 'Automatically created sub-component.'
fixture_subcomponent_class_name = 'sklearn.tree.tree.DecisionTreeClassifier'

serialization = sklearn_to_flow(model)

self.assertEqual(serialization.name, fixture_name)
self.assertEqual(serialization.class_name, fixture_class_name)
self.assertEqual(serialization.description, fixture_description)
self.assertEqual(serialization.parameters['algorithm'], '"SAMME.R"')
self.assertIsInstance(serialization.parameters['base_estimator'], str)
self.assertEqual(serialization.parameters['learning_rate'], '1.0')
self.assertEqual(serialization.parameters['n_estimators'], '100')
self.assertEqual(serialization.components['base_estimator'].class_name,
fixture_subcomponent_class_name)

new_model = flow_to_sklearn(serialization)

Expand Down Expand Up @@ -403,7 +411,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
"\('pca2', PCA\(copy=True, iterated_power='auto', " \
"n_components=None, random_state=None,\n" \
" svd_solver='auto', tol=0.0, whiten=False\)\)\)\)."
#self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline)
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline)

fu = sklearn.pipeline.FeatureUnion((('pca1', pca), ('pca2', pca2)))
fixture = "Found a second occurence of component sklearn.decomposition.pca.PCA when trying to serialize " \
Expand All @@ -416,7 +424,7 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
" n_components=None, random_state=None,\n" \
" svd_solver='auto', tol=0.0, whiten=False\)\)\),\n" \
" transformer_weights=None\)."
#self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, fu)
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, fu)

fs = sklearn.feature_selection.SelectKBest()
fu2 = sklearn.pipeline.FeatureUnion((('pca1', pca), ('fs', fs)))
Expand All @@ -435,3 +443,17 @@ def test_error_on_adding_component_multiple_times_to_flow(self):
" n_components=None, random_state=None,\n" \
" svd_solver='auto', tol=0.0, whiten=False\)\)\)\)."
self.assertRaisesRegexp(ValueError, fixture, sklearn_to_flow, pipeline2)

def test_subflow_version_change(self):
    """A version change in a sub-component's package must reach the parent.

    A pipeline is built from a scikit-learn PCA and a ``DummyRegressor`` from
    the local ``dummy_learn`` package. The flow's ``external_version`` is
    checked before and after ``dummy_learn.__version__`` is changed.
    """
    this_directory = os.path.dirname(os.path.abspath(__file__))
    # Make ``dummy_learn`` importable, but undo the path change afterwards so
    # other tests do not see a modified ``sys.path``.
    if this_directory not in sys.path:
        sys.path.append(this_directory)
        self.addCleanup(sys.path.remove, this_directory)
    import dummy_learn
    import dummy_learn.dummy_forest
    # The test overwrites the package version below; restore it on exit so a
    # second run in the same process starts from the original value.
    self.addCleanup(setattr, dummy_learn, '__version__',
                    dummy_learn.__version__)

    # Compare against the installed scikit-learn instead of a pinned release,
    # otherwise the test breaks on every scikit-learn upgrade.
    sklearn_version = 'sklearn==%s' % sklearn.__version__

    pca = sklearn.decomposition.PCA()
    dummy = dummy_learn.dummy_forest.DummyRegressor()
    pipeline = sklearn.pipeline.Pipeline((('pca', pca), ('dummy', dummy)))
    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.external_version,
                     'dummy_learn==1.0,' + sklearn_version)
    dummy_learn.__version__ = '1.1.0'
    flow = sklearn_to_flow(pipeline)
    self.assertEqual(flow.external_version,
                     'dummy_learn==1.1.0,' + sklearn_version)

0 comments on commit dfff969

Please sign in to comment.