Fixes #18, add converters for xgboost (#192)

* remove unnecessary print, add quote around filenames in some places * replaces as_matrix by values (pandas warnings) * changes variable name to avoid getting warnings about invalid names * better consistency for converted, allows targeted onnx version to be None * Revert "better consistency for converted, allows targeted onnx version to be None" This reverts commit e257ca1. * handle the comparison of ONNX versions in only one place * fix bug with OneHotEncoder and scikit-learn 0.20 * release the constraint on scikit-learn (0.20.0 allowed) * fix one type issue for Python 2.7 * add documentation to compare_strict_version * Fixes #151, BernoulliNB converter * Removes unused nodes in graph * Addresses issue #143, enables build with keras 2.1.2 * Revert modifications due to a wrong merge * update keras version * Disable test on keras/mobilenet as it does not work * add unit test for xception (failing) * remove duplicate install * skip unit test if not installed (tensorflow still not available on python 3.7) * Fix when keras is not available * Fix missing import * Update test_single_operator_with_cntk_backend.py * Set up CI with Azure Pipelines * Update azure pipeline * Skip a unit test if tensorflow is not installed * merge * missing import * Revert "Merge branch 'master' of https://github.com/onnx/onnxmltools" This reverts commit 178e763, reversing changes made to 1a617ef. * revert changes * Revert changes * \r * \r * first step in the migration of xgboost code * XGBoost regression works * Finalize xgboost converter * Update README.md * Add function has_tensorflow * Update test_single_operator_with_cntk_backend.py * better design for a unit test * update xgboost classifier * Delete test_keras_xception.py * Delete requirements-deep.txt * Delete test_keras_modebilenetv2.py * less spaces * lower precision for xgboost comparison tests * disable xgboost testing on python 2
onnx · Feb 14, 2019 · eef63ee · eef63ee
1 parent 30d5fcf
commit eef63ee
Show file tree

Hide file tree

Showing 24 changed files with 610 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -32,6 +32,8 @@ This package relies on ONNX, NumPy, and ProtoBuf. If you are converting a model
 2. CoreMLTools
 3. Keras (version 2.0.8 or higher) with the corresponding Tensorflow version
 4. LightGBM (scikit-learn interface)
+5. XGBoost (scikit-learn interface)
+6. libsvm
 
 # Examples
 If you want the converted ONNX model to be compatible with a certain ONNX version, please specify the target_opset parameter upon invoking the convert function. The following Keras model conversion example demonstrates this below. You can identify the mapping from ONNX Operator Sets (referred to as opsets) to ONNX releases in the [versioning documentation](https://github.com/onnx/onnx/blob/master/docs/Versioning.md#released-versions). 

diff --git a/onnxmltools/convert/__init__.py b/onnxmltools/convert/__init__.py
@@ -9,3 +9,5 @@
 from .main import convert_libsvm
 from .main import convert_lightgbm
 from .main import convert_sklearn
+from .main import convert_xgboost
+
diff --git a/onnxmltools/convert/common/_container.py b/onnxmltools/convert/common/_container.py
@@ -92,6 +92,10 @@ class LightGbmModelContainer(CommonSklearnModelContainer):
     pass
 
 
+class XGBoostModelContainer(CommonSklearnModelContainer):
+    pass
+
+
 class KerasModelContainer(RawModelContainer):
 
     def __init__(self, keras_model):

diff --git a/onnxmltools/convert/common/data_types.py b/onnxmltools/convert/common/data_types.py
@@ -15,6 +15,10 @@ def __init__(self, shape=None, doc_string=''):
 
     def to_onnx_type(self):
         raise NotImplementedError()
+
+    def __repr__(self):
+        name = self.__class__.__name__
+        return "{}({}, '{}')".format(name, self.shape, self.doc_string)
 
 
 class Int64Type(DataType):

diff --git a/onnxmltools/convert/common/interface.py b/onnxmltools/convert/common/interface.py
@@ -10,6 +10,7 @@
 import abc
 import six
 
+
 @six.add_metaclass(abc.ABCMeta)
 class ModelContainer:
     __metaclass = abc.ABCMeta
@@ -41,6 +42,7 @@ def add_node(self, op_type, inputs, outputs, op_domain='', op_version=1, **attrs
         """
         return
 
+
 @six.add_metaclass(abc.ABCMeta)
 class OperatorBase:
     __metaclass__ = abc.ABCMeta
@@ -77,6 +79,7 @@ def original_operator(self):
         """
         pass
 
+
 @six.add_metaclass(abc.ABCMeta)
 class ScopeBase:
     __metaclass__ = abc.ABCMeta

diff --git a/onnxmltools/convert/common/optimizer.py b/onnxmltools/convert/common/optimizer.py
@@ -117,7 +117,8 @@ def build_from_onnx(onnx_nodes, nchw_inputs, inputs, outputs):
             ln = LinkedNode(o_)
             view.append(ln)
             for var_ in o_.output:
-                assert var_map.get(var_) is None
+                if var_map.get(var_) is not None:
+                    raise RuntimeError("Duplicated output name (accross all nodes) '{0}'".format(var_))
                 var_map[var_] = ln
 
         additional_nodes = []

diff --git a/onnxmltools/convert/common/utils.py b/onnxmltools/convert/common/utils.py
@@ -120,7 +120,7 @@ def xgboost_installed():
     try:
         _LIB.XGBoosterDumpModelEx
     except AttributeError:
-        # The version is now recent enough even though it is version 0.6.
+        # The version is not recent enough even though it is version 0.6.
         # You need to install xgboost from github and not from pypi.
         return False
     from xgboost import __version__
@@ -290,7 +290,7 @@ def check_input_and_output_numbers(operator, input_count_range=None, output_coun
     if max_output_count is not None and len(operator.outputs) > max_output_count:
         raise RuntimeError(
             'For operator %s (type: %s), at most %s outputs(s) is(are) supported but we got %s output(s) which are %s' \
-            % (operator.full_name, operator.type, max_output_count, len(operator.outputs), operator.outputs_full_names))
+            % (operator.full_name, operator.type, max_output_count, len(operator.outputs), operator.output_full_names))
 
 
 def check_input_and_output_types(operator, good_input_types=None, good_output_types=None):

diff --git a/onnxmltools/convert/lightgbm/_parse.py b/onnxmltools/convert/lightgbm/_parse.py
@@ -21,7 +21,7 @@ def _get_lightgbm_operator_name(model_type):
     '''
     Get operator name of the input argument
 
-    :param model_type:  A scikit-learn object (e.g., SGDClassifier and Binarizer)
+    :param model_type:  A lightgbm object.
     :return: A string which stands for the type of the input model in our conversion framework
     '''
     if model_type not in lightgbm_operator_name_map:
@@ -60,7 +60,7 @@ def _parse_lightgbm(scope, model, inputs):
     This is a delegate function. It doesn't nothing but invoke the correct parsing function according to the input
     model's type.
     :param scope: Scope object
-    :param model: A scikit-learn object (e.g., OneHotEncoder and LogisticRegression)
+    :param model: A lightgbm object
     :param inputs: A list of variables
     :return: The output variables produced by the input model
     '''

diff --git a/onnxmltools/convert/main.py b/onnxmltools/convert/main.py
@@ -63,3 +63,13 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='', target_
     from skl2onnx.convert import convert_sklearn as convert_skl2onnx
     return convert_skl2onnx(model, name, initial_types, doc_string, target_opset,
                    custom_conversion_functions, custom_shape_calculators)
+
+
+def convert_xgboost(*args, **kwargs):
+    if not utils.xgboost_installed():
+        raise RuntimeError('xgboost is not installed. Please install xgboost to use this feature.')
+
+    from .xgboost.convert import convert
+    return convert(*args, **kwargs)
+
+
diff --git a/onnxmltools/convert/xgboost/__init__.py b/onnxmltools/convert/xgboost/__init__.py
@@ -0,0 +1,7 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+from .convert import convert
diff --git a/onnxmltools/convert/xgboost/_parse.py b/onnxmltools/convert/xgboost/_parse.py
@@ -0,0 +1,91 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+from ..common._container import XGBoostModelContainer
+from ..common._topology import *
+
+from xgboost import XGBRegressor, XGBClassifier
+
+xgboost_classifier_list = [XGBClassifier]
+
+# Associate types with our operator names.
+xgboost_operator_name_map = {XGBClassifier: 'XGBClassifier',
+                              XGBRegressor: 'XGBRegressor'}
+
+
+def _get_xgboost_operator_name(model_type):
+    '''
+    Get operator name of the input argument
+
+    :param model_type:  A xgboost object.
+    :return: A string which stands for the type of the input model in our conversion framework
+    '''
+    if model_type not in xgboost_operator_name_map:
+        raise ValueError("No proper operator name found for '%s'" % model_type)
+    return xgboost_operator_name_map[model_type]
+
+
+def _parse_xgboost_simple_model(scope, model, inputs):
+    '''
+    This function handles all non-pipeline models.
+
+    :param scope: Scope object
+    :param model: A xgboost object
+    :param inputs: A list of variables
+    :return: A list of output variables which will be passed to next stage
+    '''
+    this_operator = scope.declare_local_operator(_get_xgboost_operator_name(type(model)), model)
+    this_operator.inputs = inputs
+
+    if type(model) in xgboost_classifier_list:
+        # For classifiers, we may have two outputs, one for label and the other one for probabilities of all classes.
+        # Notice that their types here are not necessarily correct and they will be fixed in shape inference phase
+        label_variable = scope.declare_local_variable('label', FloatTensorType())
+        probability_map_variable = scope.declare_local_variable('probabilities', FloatTensorType())
+        this_operator.outputs.append(label_variable)
+        this_operator.outputs.append(probability_map_variable)
+    else:
+        # We assume that a non-classifier xgboost model (e.g., XGBRegressor) produces a single float tensor.
+        variable = scope.declare_local_variable('variable', FloatTensorType())
+        this_operator.outputs.append(variable)
+    return this_operator.outputs
+
+
+def _parse_xgboost(scope, model, inputs):
+    '''
+    This is a delegate function. It does nothing but forward the call to _parse_xgboost_simple_model, which is
+    the only parsing function this module currently provides for xgboost models.
+    :param scope: Scope object
+    :param model: A xgboost object
+    :param inputs: A list of variables
+    :return: The output variables produced by the input model
+    '''
+    return _parse_xgboost_simple_model(scope, model, inputs)
+
+
+def parse_xgboost(model, initial_types=None, target_opset=None,
+                   custom_conversion_functions=None, custom_shape_calculators=None):
+
+    raw_model_container = XGBoostModelContainer(model)
+    topology = Topology(raw_model_container,
+                        initial_types=initial_types, target_opset=target_opset,
+                        custom_conversion_functions=custom_conversion_functions,
+                        custom_shape_calculators=custom_shape_calculators)
+    scope = topology.declare_scope('__root__')
+
+    inputs = []
+    for var_name, initial_type in initial_types:
+        inputs.append(scope.declare_local_variable(var_name, initial_type))
+
+    for variable in inputs:
+        raw_model_container.add_input(variable)
+
+    outputs = _parse_xgboost(scope, model, inputs)
+
+    for variable in outputs:
+        raw_model_container.add_output(variable)
+
+    return topology
diff --git a/onnxmltools/convert/xgboost/common.py b/onnxmltools/convert/xgboost/common.py
@@ -0,0 +1,16 @@
+"""
+Helper function shared by the converters and shape calculators.
+"""
+
+def get_xgb_params(xgb_node):
+    """
+    Retrieves parameters of a model.
+    """
+    if hasattr(xgb_node, 'kwargs'):
+        # XGBoost >= 0.7
+        params = xgb_node.get_xgb_params()
+    else:
+        # XGBoost < 0.7
+        params = xgb_node.__dict__
+
+    return params        
diff --git a/onnxmltools/convert/xgboost/convert.py b/onnxmltools/convert/xgboost/convert.py
@@ -0,0 +1,44 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+
+from uuid import uuid4
+from ...proto import onnx, get_opset_number_from_onnx
+from ..common._topology import convert_topology
+from ._parse import parse_xgboost
+
+# Invoke the registration of all our converters and shape calculators
+# from . import shape_calculators
+from . import operator_converters, shape_calculators
+
+
+def convert(model, name=None, initial_types=None, doc_string='', target_opset=None,
+            targeted_onnx=onnx.__version__, custom_conversion_functions=None,
+            custom_shape_calculators=None):
+    '''
+    This function produces an equivalent ONNX model of the given xgboost model.
+
+    :param model: A xgboost model
+    :param initial_types: a python list. Each element is a tuple of a variable name and a type defined in data_types.py
+    :param name: The name of the graph (type: GraphProto) in the produced ONNX model (type: ModelProto)
+    :param doc_string: A string attached onto the produced ONNX model
+    :param target_opset: number, for example, 7 for ONNX 1.2, and 8 for ONNX 1.3.
+    :param targeted_onnx: A string (for example, '1.1.2' and '1.2') used to specify the targeted ONNX version of the
+        produced model. If ONNXMLTools cannot find a compatible ONNX python package, an error may be thrown.
+    :param custom_conversion_functions: a dictionary for specifying the user customized conversion function
+    :param custom_shape_calculators: a dictionary for specifying the user customized shape calculator
+    :return: An ONNX model (type: ModelProto) which is equivalent to the input xgboost model
+    '''
+    if initial_types is None:
+        raise ValueError('Initial types are required. See usage of convert(...) in \
+                           onnxmltools.convert.xgboost.convert for details')
+    if name is None:
+        name = str(uuid4().hex)
+
+    target_opset = target_opset if target_opset else get_opset_number_from_onnx()
+    topology = parse_xgboost(model, initial_types, target_opset, custom_conversion_functions, custom_shape_calculators)
+    topology.compile()
+    onnx_model = convert_topology(topology, name, doc_string, target_opset, targeted_onnx)
+    return onnx_model