Update TfIdf converter to reflect changes in Tokenizer specifications (#178)

Update the TfIdf converter after a specification change in the Tokenizer.
xadupre committed Jun 14, 2019
1 parent 4200f11 commit 4eb0217
Showing 6 changed files with 84 additions and 66 deletions.
6 changes: 4 additions & 2 deletions skl2onnx/convert.py
@@ -87,7 +87,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
::
-extra = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?',
+                                          ',', ';', ':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(model, "tfidf",
initial_types=[("input", StringTensorType([1, 1]))],
options=extra)
@@ -97,7 +98,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
::
-extra = {id(model): {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+extra = {id(model): {"separators": [' ', '.', '\\\\?', ',', ';',
+                                    ':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(pipeline, "pipeline-with-2-tfidf",
initial_types=[("input", StringTensorType([1, 1]))],
options=extra)
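The rename from *sep* to *separators* also changes the semantics: each separator is now a regular expression, so regex metacharacters such as ``.``, ``?``, ``(`` and ``)`` must be escaped (or wrapped in a character class like ``[.]``). A minimal end-to-end sketch of the new call, with an illustrative corpus and model::

    from sklearn.feature_extraction.text import TfidfVectorizer
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import StringTensorType

    corpus = ["Is this the first document?", "This is the first document."]
    model = TfidfVectorizer().fit(corpus)

    # Separators are regular expressions now, hence the escaping.
    extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\?', ',', ';',
                                              ':', '\\!', '\\(', '\\)']}}
    model_onnx = convert_sklearn(
        model, "tfidf",
        initial_types=[("input", StringTensorType([1, 1]))],
        options=extra)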
47 changes: 27 additions & 20 deletions skl2onnx/operator_converters/text_vectoriser.py
@@ -79,32 +79,35 @@ def _intelligent_split(text, op, tokenizer, existing):
def convert_sklearn_text_vectorizer(scope, operator, container):
"""
Converters for class
-`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
+`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/
+sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
The current implementation is a work in progress and the ONNX version
does not produce the exact same results. The converter lets the user
change some of its parameters.
Additional options
------------------
-regex: string
+tokenexp: string
+The default will change to true in version 1.6.0.
The tokenizer splits into words using this regular
expression or the regular expression specified by
*scikit-learn* if the value is an empty string.
See also note below.
Default value: None
-sep: list of separators
+separators: list of separators
These separators are used to split a string into words.
-Option *sep* is ignored if option *regex* is not None.
-Default value: ``[' ', '.', '?', ',', ';', ':', '!']``.
+Option *separators* is ignored if option *tokenexp* is not None.
+Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.
Example (from :ref:`l-example-tfidfvectorizer`):
::
-seps = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')',
-                                  '\\n', '"', "'", "-", "[", "]", "@"]}}
+seps = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?', ',', ';',
+                                         ':', '!', '\\\\(', '\\\\)',
+                                         '\\n', '\\\\"', "'", "-",
+                                         "\\\\[", "\\\\]", "@"]}}
model_onnx = convert_sklearn(pipeline, "tfidf",
initial_types=[("input", StringTensorType([1, 2]))],
options=seps)
@@ -127,6 +130,10 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
Regular expression ``[^\\\\\\\\n]`` is used to split
a sentence into characters (and not words) if ``analyzer=='char'``.
The mode ``analyzer=='char_wb'`` is not implemented.
+.. versionchanged:: 1.6
+    Parameters have been renamed: *sep* into *separators*,
+    *regex* into *tokenexp*.
+````
""" # noqa
@@ -147,40 +154,40 @@
"https://github.com/onnx/sklearn-onnx/issues.")

options = container.get_options(
-op, dict(sep="DEFAULT",
-         regex=None))
-if set(options) != {'sep', 'regex'}:
+op, dict(separators="DEFAULT",
+         tokenexp=None))
+if set(options) != {'separators', 'tokenexp'}:
raise RuntimeError("Unknown option {} for {}".format(
-set(options) - {'sep'}, type(op)))
+set(options) - {'separators'}, type(op)))

if op.analyzer == 'word':
default_pattern = '(?u)\\b\\w\\w+\\b'
-if options['sep'] == "DEFAULT" and options['regex'] is None:
+if options['separators'] == "DEFAULT" and options['tokenexp'] is None:
warnings.warn("Converter for TfidfVectorizer will use "
"scikit-learn regular expression by default "
"in version 1.6.",
DeprecationWarning)
-default_separators = [' ', '.', '?', ',', ';', ':', '!']
+default_separators = [' ', '.', '\\?', ',', ';', ':', '\\!']
regex = op.token_pattern
if regex == default_pattern:
regex = '[a-zA-Z0-9_]+'
default_separators = None
-elif options['regex'] is not None:
-    if options['regex']:
-        regex = options['regex']
+elif options['tokenexp'] is not None:
+    if options['tokenexp']:
+        regex = options['tokenexp']
else:
regex = op.token_pattern
if regex == default_pattern:
regex = '[a-zA-Z0-9_]+'
default_separators = None
else:
regex = None
-default_separators = options['sep']
+default_separators = options['separators']
else:
-if options['sep'] != 'DEFAULT':
-    raise RuntimeError("Option sep has no effect "
+if options['separators'] != 'DEFAULT':
+    raise RuntimeError("Option separators has no effect "
"if analyser != 'word'.")
-regex = options['tokenexp'] if options['tokenexp'] else '.'
+regex = options['tokenexp'] if options['tokenexp'] else '.'
default_separators = None
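The substitution of scikit-learn's default ``token_pattern`` above deserves a note: ``(?u)\\b\\w\\w+\\b`` relies on word boundaries and Unicode-aware ``\\w``, which the converter replaces with a plain ASCII character class, presumably because the ONNX Tokenizer cannot evaluate ``\\b``. The two patterns are close but not identical, one reason the docstring warns that results are not exactly the same::

    import re

    default_pattern = '(?u)\\b\\w\\w+\\b'  # scikit-learn default: 2+ word chars
    onnx_pattern = '[a-zA-Z0-9_]+'         # converter substitute: 1+ ASCII chars

    text = "ab c def_9"
    print(re.findall(default_pattern, text))  # ['ab', 'def_9']
    print(re.findall(onnx_pattern, text))     # ['ab', 'c', 'def_9']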

if op.preprocessor is not None:
4 changes: 2 additions & 2 deletions tests/test_sklearn_count_vectorizer_converter_bug.py
@@ -30,7 +30,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):

extra = {
CountVectorizer: {
"sep": ["ZZZZ"]
"separators": ["ZZZZ"]
}
}

@@ -46,7 +46,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):
corpus, vect, model_onnx,
basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__) <= "
"StrictVersion('0.3.0')")
"StrictVersion('0.4.0')")

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
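Besides the option rename, the tests relax the runtime guard from onnxruntime 0.3.0 to 0.4.0. If the test helper evaluates ``allow_failure`` as a plain condition string (an assumption about the harness, not shown in this diff), the bumped guard amounts to::

    from distutils.version import StrictVersion
    import onnxruntime

    # Failures are tolerated on onnxruntime releases up to and including 0.4.0.
    tolerate = StrictVersion(onnxruntime.__version__) <= StrictVersion('0.4.0')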
20 changes: 11 additions & 9 deletions tests/test_sklearn_documentation.py
@@ -70,7 +70,9 @@ def test_pipeline_tfidf(self):
tfi.fit(tdata.ravel())
extra = {
TfidfVectorizer: {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -114,22 +116,22 @@ def test_pipeline_tfidf_pipeline_minmax(self):
pipeline.fit(train_data[:300])
extra = {
TfidfVectorizer: {
"sep": [
"separators": [
" ",
".",
"?",
"[.]",
"\\?",
",",
";",
":",
"!",
"(",
")",
"\\!",
"\\(",
"\\)",
"\n",
'"',
"'",
"-",
"[",
"]",
"\\[",
"\\]",
"@",
]
}
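Two escaping styles appear in the updated separator lists: a character class (``[.]``) and a backslash escape (``\\.``). Both denote a literal dot once the separator is read as a regular expression; a quick check::

    import re

    assert re.match('[.]', '.') and re.match('\\.', '.')
    assert re.match('[.]', 'a') is None  # no longer matches arbitrary characters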
35 changes: 19 additions & 16 deletions tests/test_sklearn_tfidf_vectorizer_converter.py
@@ -14,7 +14,7 @@
class TestSklearnTfidfVectorizer(unittest.TestCase):

def get_options(self):
return {TfidfVectorizer: {"regex": None}}
return {TfidfVectorizer: {"tokenexp": None}}

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -38,7 +38,7 @@ def test_model_tfidf_vectorizer11(self):
model_onnx,
basename="SklearnTfidfVectorizer11-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -64,8 +64,8 @@ def test_model_tfidf_vectorizer11_empty_string_case1(self):
corpus[2:], vect, model_onnx,
basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
"OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__) <= "
"StrictVersion('0.3.0')")
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.4.0')")

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -90,7 +90,7 @@ def test_model_tfidf_vectorizer11_empty_string_case2(self):
model_onnx,
basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -121,7 +121,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self):
model_onnx,
basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -146,7 +146,7 @@ def test_model_tfidf_vectorizer22(self):
model_onnx,
basename="SklearnTfidfVectorizer22-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -166,7 +166,7 @@ def test_model_tfidf_vectorizer21(self):
model_onnx,
basename="SklearnTfidfVectorizer22S-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -191,7 +191,7 @@ def test_model_tfidf_vectorizer12(self):
model_onnx,
basename="SklearnTfidfVectorizer22-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -215,7 +215,7 @@ def test_model_tfidf_vectorizer12_normL1(self):
model_onnx,
basename="SklearnTfidfVectorizer22L1-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -240,7 +240,7 @@ def test_model_tfidf_vectorizer12_normL2(self):
model_onnx,
basename="SklearnTfidfVectorizer22L2-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -265,7 +265,7 @@ def test_model_tfidf_vectorizer13(self):
model_onnx,
basename="SklearnTfidfVectorizer13-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -282,7 +282,9 @@ def test_model_tfidf_vectorizer11parenthesis_class(self):
vect.fit(corpus.ravel())
extra = {
TfidfVectorizer: {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -333,7 +335,9 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):

extra = {
id(vect): {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -351,8 +355,7 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):
model_onnx,
basename="SklearnTfidfVectorizer11ParenthesisId-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.4.0')",
)
" <= StrictVersion('0.4.0')")


if __name__ == "__main__":
