Add an example to create a custom converter for a NMF transformer (#167)

add an example on how to create an ONNX function for a NMF decomposition
onnx · Jun 12, 2019 · bf5311a · bf5311a
1 parent f427759
commit bf5311a
Show file tree

Hide file tree

Showing 11 changed files with 348 additions and 43 deletions.
diff --git a/docs/api_summary.rst b/docs/api_summary.rst
@@ -12,8 +12,35 @@ in *scikit-onnx*.
 Converters
 ==========
 
+Both functions convert a *scikit-learn* model into ONNX.
+The first one lets the user manually 
+define the input's name and types. The second one
+infers this information from the training data.
+These two functions are the main entry points to convert a model.
+The rest of the API is needed if a model has no converter
+implemented in this package. A new converter then has to be
+registered, whether it is imported from another package
+or created from scratch.
+
 .. autofunction:: skl2onnx.convert_sklearn
 
+.. autofunction:: skl2onnx.to_onnx
+
+Register a new converter
+========================
+
+If a model has no converter
+implemented in this package, a new converter then has to be
+registered, whether it is imported from another package
+or created from scratch. Section :ref:`l-converter-list`
+lists all available converters.
+
+.. autofunction:: skl2onnx.supported_converters
+
+.. autofunction:: skl2onnx.update_registered_converter
+
+.. autofunction:: skl2onnx.update_registered_parser
+
 Manipulate ONNX graphs
 ======================
 
@@ -25,15 +52,6 @@ Manipulate ONNX graphs
 
 .. autofunction:: skl2onnx.helpers.onnx_helper.save_onnx_model
 
-Register a new converter
-========================
-
-.. autofunction:: skl2onnx.supported_converters
-
-.. autofunction:: skl2onnx.update_registered_converter
-
-.. autofunction:: skl2onnx.update_registered_parser
-
 Parsers
 =======
 

diff --git a/docs/examples/plot_nmf.py b/docs/examples/plot_nmf.py
@@ -0,0 +1,155 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""
+Custom Operator for NMF Decomposition
+=====================================
+
+`NMF <https://scikit-learn.org/stable/modules/generated/
+sklearn.decomposition.NMF.html>`_ factorizes an input matrix
+into two matrices *W, H* of rank *k* so that :math:`WH \\sim M`.
+:math:`M=(m_{ij})` may be a binary matrix where *i* is a user
+and *j* a product they bought. The prediction
+function depends on whether the recommendation is
+needed for an existing user or a new user.
+This example addresses the first case.
+
+The second case is more complex as it theoretically
+requires the estimation of a new matrix *W* with a
+gradient descent.
+
+.. contents::
+    :local:
+
+Building a simple model
++++++++++++++++++++++++
+
+"""
+
+import os
+import skl2onnx
+import onnxruntime
+import sklearn
+from sklearn.decomposition import NMF
+import numpy as np
+import matplotlib.pyplot as plt
+from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
+import onnx
+from skl2onnx.algebra.onnx_ops import (
+    OnnxArrayFeatureExtractor, OnnxMul, OnnxReduceSum)
+from skl2onnx.common.data_types import FloatTensorType
+from onnxruntime import InferenceSession
+
+
+# Training matrix: five identical rows [1, 0, 0, 0] with the
+# identity matrix added on top of the first four rows.
+mat = np.array([[1, 0, 0, 0]] * 5, dtype=np.float64)
+mat[:mat.shape[1], :] += np.identity(mat.shape[1])
+
+# Factorize the matrix and rebuild it from the two factors.
+mod = NMF(n_components=2)
+W = mod.fit_transform(mat)
+H = mod.components_
+pred = mod.inverse_transform(W)
+
+print("original predictions")
+# One triple (row, column, predicted value) per cell.
+exp = [(i, j, pred[i, j])
+       for i in range(mat.shape[0])
+       for j in range(mat.shape[1])]
+
+print(exp)
+
+#######################
+# Let's rewrite the prediction in a way that is closer
+# to the function we need to convert into ONNX.
+
+
+def predict(W, H, row_index, col_index):
+    """
+    Returns the prediction for cell *(row_index, col_index)*:
+    the dot product of row *row_index* of *W* and
+    column *col_index* of *H*.
+    """
+    w_row = W[row_index, :]
+    h_col = H[:, col_index]
+    return np.dot(w_row, h_col)
+
+
+# Same triples (row, column, value) as above, computed with *predict*.
+got = [(i, j, predict(W, H, i, j))
+       for i in range(mat.shape[0])
+       for j in range(mat.shape[1])]
+
+print(got)
+
+
+#################################
+# Conversion into ONNX
+# ++++++++++++++++++++
+#
+# There is no implemented converter for
+# `NMF <https://scikit-learn.org/stable/modules/generated/
+# sklearn.decomposition.NMF.html>`_ as the function we plan
+# to convert is neither a transformer nor a predictor.
+# The following converter does not need to be registered,
+# it just creates an ONNX graph equivalent to function
+# *predict* implemented above.
+
+
+def nmf_to_onnx(W, H):
+    """
+    Converts a NMF described by matrices *W*, *H*
+    (*WH* approximates the training data *M*)
+    into an ONNX function which takes two indices *(i, j)*
+    and returns the prediction for it. It assumes
+    these indices apply to the training data.
+    """
+    # NOTE(review): ArrayFeatureExtractor is documented to select
+    # along the last axis, which would explain why W is transposed
+    # while H is used as is - confirm against the ONNX-ML operator doc.
+    col = OnnxArrayFeatureExtractor(H, 'col')
+    row = OnnxArrayFeatureExtractor(W.T, 'row')
+    # Element-wise product followed by a sum, i.e. a dot product.
+    dot = OnnxMul(col, row)
+    res = OnnxReduceSum(dot, output_names="rec")
+    # Sample int64 array, presumably only used to declare
+    # the type of both inputs 'col' and 'row'.
+    indices_type = np.array([0], dtype=np.int64)
+    onx = res.to_onnx(inputs={'col': indices_type,
+                              'row': indices_type},
+                      outputs=[('rec', FloatTensorType((1, 1)))])
+    return onx
+
+
+model_onnx = nmf_to_onnx(W, H)
+print(model_onnx)
+
+########################################
+# Let's compute predictions with it.
+
+# The inference session is created from the serialized ONNX model.
+sess = InferenceSession(model_onnx.SerializeToString())
+
+
+def predict_onnx(sess, row_indices, col_indices):
+    """
+    Feeds *col_indices* and *row_indices* to inputs ``'col'`` and
+    ``'row'`` of session *sess* and returns what ``sess.run`` returns.
+    """
+    feeds = {'col': col_indices, 'row': row_indices}
+    return sess.run(None, feeds)
+
+
+# Query the ONNX model once per cell of the training matrix.
+n_rows, n_cols = mat.shape
+onnx_preds = []
+for i in range(n_rows):
+    row_indices = np.array([i], dtype=np.int64)
+    for j in range(n_cols):
+        col_indices = np.array([j], dtype=np.int64)
+        pred = predict_onnx(sess, row_indices, col_indices)[0]
+        onnx_preds.append((i, j, pred[0, 0]))
+
+print(onnx_preds)
+
+
+###################################
+# The ONNX graph looks like the following.
+pydot_graph = GetPydotGraph(
+    model_onnx.graph, name=model_onnx.graph.name,
+    rankdir="TB", node_producer=GetOpNodeProducer("docstring"))
+# Dump the graph in DOT format, then call the ``dot`` command line
+# on it; the image read below is the PNG this command is expected
+# to produce next to the DOT file.
+pydot_graph.write_dot("graph_nmf.dot")
+# NOTE(review): the exit status of ``os.system`` is ignored, so a
+# missing ``dot`` executable only shows up when ``plt.imread`` fails.
+os.system('dot -O -Tpng graph_nmf.dot')
+image = plt.imread("graph_nmf.dot.png")
+plt.imshow(image)
+plt.axis('off')
+
+#################################
+# **Versions used for this example**
+
+print("numpy:", np.__version__)
+print("scikit-learn:", sklearn.__version__)
+print("onnx: ", onnx.__version__)
+print("onnxruntime: ", onnxruntime.__version__)
+print("skl2onnx: ", skl2onnx.__version__)
diff --git a/docs/index.rst b/docs/index.rst
@@ -26,7 +26,6 @@ toolkits into `ONNX <https://onnx.ai>`_.
     pipeline
     parameterized
     supported
-    onnx_ops
 
 **Issues, questions**
 

diff --git a/docs/onnx_ops.rst b/docs/onnx_ops.rst
diff --git a/docs/supported.rst b/docs/supported.rst
@@ -15,6 +15,8 @@ implements *to_onnx* methods.
 .. contents::
     :local:
 
+.. _l-converter-list:
+
 Covered Converters
 ==================
 
@@ -37,3 +39,20 @@ Pipeline
 .. autoclass:: skl2onnx.algebra.sklearn_ops.OnnxSklearnFeatureUnion
     :members: to_onnx, to_onnx_operator, onnx_parser, onnx_shape_calculator, onnx_converter
 
+Available ONNX operators
+========================
+
+*skl2onnx* maps every ONNX operator to a class that is
+easy to insert into a graph. These operators get
+dynamically added and the list depends on the installed
+*ONNX* package. The documentation for these operators
+can be found on github: `ONNX Operators.md
+<https://github.com/onnx/onnx/blob/master/docs/Operators.md>`_
+and `ONNX-ML Operators
+<https://github.com/onnx/onnx/blob/master/docs/Operators-ml.md>`_.
+Combined with `onnxruntime <https://github.com/Microsoft/onnxruntime>`_,
+the mapping makes it easy to check the output
+of the *ONNX* operators on any data as shown
+in example :ref:`l-onnx-operators`.
+
+.. supported-onnx-ops::
diff --git a/skl2onnx/algebra/type_helper.py b/skl2onnx/algebra/type_helper.py
@@ -6,10 +6,13 @@
 import numpy as np
 from ..proto import TensorProto, ValueInfoProto, onnx_proto
 from ..common._topology import Variable
-from ..common.data_types import FloatTensorType, Int64TensorType
-from ..common.data_types import StringTensorType
-from ..common.data_types import Int32TensorType, DoubleTensorType
-from ..common.data_types import BooleanTensorType
+from ..common.data_types import (
+    BooleanTensorType,
+    DoubleTensorType, FloatTensorType,
+    Int64Type,
+    Int64TensorType, Int32TensorType,
+    StringTensorType
+)
 
 
 def _guess_type_proto(data_type, dims):
@@ -39,16 +42,18 @@ def _guess_type(given_type):
     if isinstance(given_type, np.ndarray):
         if given_type.dtype == np.float32:
             return FloatTensorType(given_type.shape)
+        elif given_type.dtype == np.int32:
+            return Int32TensorType(given_type.shape)
         elif given_type.dtype == np.int64:
             return Int64TensorType(given_type.shape)
-        elif given_type.dtype == np.str:
+        elif given_type.dtype == np.str or str(given_type.dtype) in ('<U1', ):
             return StringTensorType(given_type.shape)
         else:
             raise NotImplementedError(
                 "Unsupported type '{}'. Double should "
                 "be converted into single floats.".format(given_type.dtype))
     elif isinstance(given_type, (FloatTensorType, Int64TensorType,
-                                 StringTensorType)):
+                                 Int32TensorType, StringTensorType)):
         return given_type
     elif isinstance(given_type, Variable):
         return given_type.type
@@ -60,6 +65,8 @@ def _guess_type(given_type):
         dims = [ttype.shape.dim[i].dim_value
                 for i in range(len(ttype.shape.dim))]
         return _guess_type_proto(ttype.elem_type, dims)
+    elif isinstance(given_type, np.int64):
+        return Int64Type()
     else:
         raise NotImplementedError(
             "Unsupported type '{}'. You may raise an issue "

diff --git a/skl2onnx/common/data_types.py b/skl2onnx/common/data_types.py
@@ -7,11 +7,10 @@
 from ..proto import TensorProto, onnx_proto
 from onnxconverter_common.data_types import DataType, Int64Type, FloatType  # noqa
 from onnxconverter_common.data_types import StringType, TensorType  # noqa
-from onnxconverter_common.data_types import Int64TensorType
-from onnxconverter_common.data_types import Int32TensorType, BooleanTensorType
-from onnxconverter_common.data_types import FloatTensorType, StringTensorType
-from onnxconverter_common.data_types import DoubleTensorType
-from onnxconverter_common.data_types import DictionaryType, SequenceType  # noqa
+from onnxconverter_common.data_types import (  # noqa
+    Int64TensorType, Int32TensorType, BooleanTensorType,
+    FloatTensorType, StringTensorType, DoubleTensorType,
+    DictionaryType, SequenceType)
 from onnxconverter_common.data_types import find_type_conversion, onnx_built_with_ml  # noqa
 
 

diff --git a/skl2onnx/operator_converters/text_vectoriser.py b/skl2onnx/operator_converters/text_vectoriser.py
@@ -178,7 +178,7 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
             default_separators = options['sep']
     else:
         if options['sep'] != 'DEFAULT':
-            raise RuntimeError("Option sep has not effect "
+            raise RuntimeError("Option sep has no effect "
                                "if analyser != 'word'.")
         regex = options['regex'] if options['regex'] else '.'
         default_separators = None

diff --git a/tests/test_algebra_test_helper.py b/tests/test_algebra_test_helper.py
@@ -0,0 +1,40 @@
+import unittest
+import numpy as np
+from skl2onnx.algebra.type_helper import _guess_type
+from skl2onnx.common.data_types import (
+    FloatTensorType, Int64TensorType,
+    Int32TensorType, StringTensorType
+)
+
+
+class TestAlgebraTestHelper(unittest.TestCase):
+    """Checks how ``_guess_type`` maps numpy arrays to tensor types."""
+
+    def test_guess_type(self):
+        """
+        Arrays of a supported dtype are mapped to the matching
+        tensor type, float64 arrays raise ``NotImplementedError``.
+        """
+        # ``np.str`` was a plain alias of the builtin ``str`` which
+        # is deprecated and removed from recent numpy releases:
+        # the builtin is used directly.
+        dtypes = [
+            (np.int32, Int32TensorType),
+            (np.int64, Int64TensorType),
+            (np.float32, FloatTensorType),
+            (str, StringTensorType)
+        ]
+        for dtype, exp in dtypes:
+            if dtype is str:
+                # Strings cannot be zero-initialized, the array
+                # is filled with empty strings instead.
+                mat = np.empty((3, 3), dtype=dtype)
+                mat[:, :] = ""
+            else:
+                mat = np.zeros((3, 3), dtype=dtype)
+            res = _guess_type(mat)
+            # unittest assertions are not stripped by ``python -O``.
+            self.assertIsInstance(res, exp)
+
+        dtypes = [np.float64]
+        for dtype in dtypes:
+            mat = np.zeros((3, 3), dtype=dtype)
+            # Doubles are not supported and must be rejected.
+            with self.assertRaises(NotImplementedError):
+                _guess_type(mat)
+
+
+if __name__ == "__main__":
+    unittest.main()