From 4eb0217d4f009fe2b05a2ee4dd9d4b8988910caa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= <xadupre@users.noreply.github.com>
Date: Fri, 14 Jun 2019 15:49:45 +0200
Subject: [PATCH] Update TfIdf converter to reflect changes in Tokenizer
 specifications (#178)

Update converter for TfIdf after a change of spec in Tokenizer
---
 skl2onnx/convert.py                           |  6 ++-
 .../operator_converters/text_vectoriser.py    | 47 +++++++++++--------
 ..._sklearn_count_vectorizer_converter_bug.py |  4 +-
 tests/test_sklearn_documentation.py           | 20 ++++----
 ...test_sklearn_tfidf_vectorizer_converter.py | 35 +++++++-------
 ...klearn_tfidf_vectorizer_converter_regex.py | 38 ++++++++-------
 6 files changed, 84 insertions(+), 66 deletions(-)

diff --git a/skl2onnx/convert.py b/skl2onnx/convert.py
index 558c0c340..9d825ebda 100644
--- a/skl2onnx/convert.py
+++ b/skl2onnx/convert.py
@@ -87,7 +87,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
 
     ::
 
-        extra = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+        extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?',
+                    ',', ';', ':', '\\\\!', '\\\\(', '\\\\)']}}
         model_onnx = convert_sklearn(model, "tfidf",
                                      initial_types=[("input", StringTensorType([1, 1]))],
                                      options=extra)
@@ -97,7 +98,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
 
     ::
 
-        extra = {id(model): {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+        extra = {id(model): {"separators": [' ', '[.]', '\\\\?', ',', ';',
+                    ':', '\\\\!', '\\\\(', '\\\\)']}}
         model_onnx = convert_sklearn(pipeline, "pipeline-with-2-tfidf",
                                      initial_types=[("input", StringTensorType([1, 1]))],
                                      options=extra)
diff --git a/skl2onnx/operator_converters/text_vectoriser.py b/skl2onnx/operator_converters/text_vectoriser.py
index b09295158..3362ecf91 100644
--- a/skl2onnx/operator_converters/text_vectoriser.py
+++ b/skl2onnx/operator_converters/text_vectoriser.py
@@ -79,7 +79,8 @@ def _intelligent_split(text, op, tokenizer, existing):
 def convert_sklearn_text_vectorizer(scope, operator, container):
     """
     Converters for class
-    `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
+    `TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/
+    sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
     The current implementation is a work in progress and the ONNX version
     does not produce the exact same results. The converter lets the user
     change some of its parameters.
@@ -87,24 +88,26 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
     Additional options
     ------------------
 
-    regex: string
+    tokenexp: string
         The default will change to true in version 1.6.0.
         The tokenizer splits into words using this regular
         expression or the regular expression specified by
         *scikit-learn* is the value is an empty string.
         See also note below.
         Default value: None
-    sep: list of separators
+    separators: list of separators
         These separators are used to split a string into words.
-        Options *sep* is ignore if options *regex* is not None.
-        Default value: ``[' ', '.', '?', ',', ';', ':', '!']``.
+        Option *separators* is ignored if option *tokenexp* is not None.
+        Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.
 
     Example (from :ref:`l-example-tfidfvectorizer`):
 
     ::
 
-        seps = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')',
-                                           '\\n', '"', "'", "-", "[", "]", "@"]}}
+        seps = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?', ',', ';',
+                                                 ':', '!', '\\\\(', '\\\\)',
+                                                 '\\n', '\\\\"', "'", "-",
+                                                 "\\\\[", "\\\\]", "@"]}}
         model_onnx = convert_sklearn(pipeline, "tfidf",
                                      initial_types=[("input", StringTensorType([1, 2]))],
                                      options=seps)
@@ -127,6 +130,10 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
     Regular expression ``[^\\\\\\\\n]`` is used to split
     a sentance into character (and not works) if ``analyser=='char'``.
     The mode ``analyser=='char_wb'`` is not implemented.
+    
+    .. versionchanged:: 1.6
+        Parameters have been renamed: *sep* into *separators*,
+        *regex* into *tokenexp*.
     ````
     
     """ # noqa
@@ -147,27 +154,27 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
             "https://github.com/onnx/sklearn-onnx/issues.")
 
     options = container.get_options(
-            op, dict(sep="DEFAULT",
-                     regex=None))
-    if set(options) != {'sep', 'regex'}:
+            op, dict(separators="DEFAULT",
+                     tokenexp=None))
+    if set(options) != {'separators', 'tokenexp'}:
         raise RuntimeError("Unknown option {} for {}".format(
-                                set(options) - {'sep'}, type(op)))
+                                set(options) - {'separators'}, type(op)))
 
     if op.analyzer == 'word':
         default_pattern = '(?u)\\b\\w\\w+\\b'
-        if options['sep'] == "DEFAULT" and options['regex'] is None:
+        if options['separators'] == "DEFAULT" and options['tokenexp'] is None:
             warnings.warn("Converter for TfidfVectorizer will use "
                           "scikit-learn regular expression by default "
                           "in version 1.6.",
                           DeprecationWarning)
-            default_separators = [' ', '.', '?', ',', ';', ':', '!']
+            default_separators = [' ', '.', '\\?', ',', ';', ':', '\\!']
             regex = op.token_pattern
             if regex == default_pattern:
                 regex = '[a-zA-Z0-9_]+'
             default_separators = None
-        elif options['regex'] is not None:
-            if options['regex']:
-                regex = options['regex']
+        elif options['tokenexp'] is not None:
+            if options['tokenexp']:
+                regex = options['tokenexp']
             else:
                 regex = op.token_pattern
                 if regex == default_pattern:
@@ -175,12 +182,12 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
             default_separators = None
         else:
             regex = None
-            default_separators = options['sep']
+            default_separators = options['separators']
     else:
-        if options['sep'] != 'DEFAULT':
-            raise RuntimeError("Option sep has no effect "
+        if options['separators'] != 'DEFAULT':
+            raise RuntimeError("Option separators has no effect "
                                "if analyser != 'word'.")
-        regex = options['regex'] if options['regex'] else '.'
+        regex = options['tokenexp'] if options['tokenexp'] else '.'
         default_separators = None
 
     if op.preprocessor is not None:
diff --git a/tests/test_sklearn_count_vectorizer_converter_bug.py b/tests/test_sklearn_count_vectorizer_converter_bug.py
index 1fb608fe1..c4433c7f1 100644
--- a/tests/test_sklearn_count_vectorizer_converter_bug.py
+++ b/tests/test_sklearn_count_vectorizer_converter_bug.py
@@ -30,7 +30,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):
 
         extra = {
             CountVectorizer: {
-                "sep": ["ZZZZ"]
+                "separators": ["ZZZZ"]
             }
         }
 
@@ -46,7 +46,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
diff --git a/tests/test_sklearn_documentation.py b/tests/test_sklearn_documentation.py
index 859f20773..3e32cd26d 100644
--- a/tests/test_sklearn_documentation.py
+++ b/tests/test_sklearn_documentation.py
@@ -70,7 +70,9 @@ def test_pipeline_tfidf(self):
         tfi.fit(tdata.ravel())
         extra = {
             TfidfVectorizer: {
-                "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
+                "separators": [
+                    " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
+                ]
             }
         }
         model_onnx = convert_sklearn(
@@ -114,22 +116,22 @@ def test_pipeline_tfidf_pipeline_minmax(self):
         pipeline.fit(train_data[:300])
         extra = {
             TfidfVectorizer: {
-                "sep": [
+                "separators": [
                     " ",
-                    ".",
-                    "?",
+                    "[.]",
+                    "\\?",
                     ",",
                     ";",
                     ":",
-                    "!",
-                    "(",
-                    ")",
+                    "\\!",
+                    "\\(",
+                    "\\)",
                     "\n",
                     '"',
                     "'",
                     "-",
-                    "[",
-                    "]",
+                    "\\[",
+                    "\\]",
                     "@",
                 ]
             }
diff --git a/tests/test_sklearn_tfidf_vectorizer_converter.py b/tests/test_sklearn_tfidf_vectorizer_converter.py
index d5da21178..743080691 100644
--- a/tests/test_sklearn_tfidf_vectorizer_converter.py
+++ b/tests/test_sklearn_tfidf_vectorizer_converter.py
@@ -14,7 +14,7 @@
 class TestSklearnTfidfVectorizer(unittest.TestCase):
 
     def get_options(self):
-        return {TfidfVectorizer: {"regex": None}}
+        return {TfidfVectorizer: {"tokenexp": None}}
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -38,7 +38,7 @@ def test_model_tfidf_vectorizer11(self):
             model_onnx,
             basename="SklearnTfidfVectorizer11-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -64,8 +64,8 @@ def test_model_tfidf_vectorizer11_empty_string_case1(self):
             corpus[2:], vect, model_onnx,
             basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
                      "OneOff-SklCol",
-            allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+            allow_failure="StrictVersion(onnxruntime.__version__)"
+                          " <= StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -90,7 +90,7 @@ def test_model_tfidf_vectorizer11_empty_string_case2(self):
             model_onnx,
             basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -121,7 +121,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self):
             model_onnx,
             basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -146,7 +146,7 @@ def test_model_tfidf_vectorizer22(self):
             model_onnx,
             basename="SklearnTfidfVectorizer22-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -166,7 +166,7 @@ def test_model_tfidf_vectorizer21(self):
             model_onnx,
             basename="SklearnTfidfVectorizer22S-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -191,7 +191,7 @@ def test_model_tfidf_vectorizer12(self):
             model_onnx,
             basename="SklearnTfidfVectorizer22-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -215,7 +215,7 @@ def test_model_tfidf_vectorizer12_normL1(self):
             model_onnx,
             basename="SklearnTfidfVectorizer22L1-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -240,7 +240,7 @@ def test_model_tfidf_vectorizer12_normL2(self):
             model_onnx,
             basename="SklearnTfidfVectorizer22L2-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -265,7 +265,7 @@ def test_model_tfidf_vectorizer13(self):
             model_onnx,
             basename="SklearnTfidfVectorizer13-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.3.0')",
+                          " <= StrictVersion('0.4.0')",
         )
 
     @unittest.skipIf(
@@ -282,7 +282,9 @@ def test_model_tfidf_vectorizer11parenthesis_class(self):
         vect.fit(corpus.ravel())
         extra = {
             TfidfVectorizer: {
-                "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
+                "separators": [
+                    " ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
+                ]
             }
         }
         model_onnx = convert_sklearn(
@@ -333,7 +335,9 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):
 
         extra = {
             id(vect): {
-                "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
+                "separators": [
+                    " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
+                ]
             }
         }
         model_onnx = convert_sklearn(
@@ -351,8 +355,7 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):
             model_onnx,
             basename="SklearnTfidfVectorizer11ParenthesisId-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__)"
-                          " <= StrictVersion('0.4.0')",
-        )
+                          " <= StrictVersion('0.4.0')")
 
 
 if __name__ == "__main__":
diff --git a/tests/test_sklearn_tfidf_vectorizer_converter_regex.py b/tests/test_sklearn_tfidf_vectorizer_converter_regex.py
index 7a89ab20b..bb40d362b 100644
--- a/tests/test_sklearn_tfidf_vectorizer_converter_regex.py
+++ b/tests/test_sklearn_tfidf_vectorizer_converter_regex.py
@@ -14,7 +14,7 @@
 class TestSklearnTfidfVectorizerRegex(unittest.TestCase):
 
     def get_options(self):
-        return {TfidfVectorizer: {"regex": ""}}
+        return {TfidfVectorizer: {"tokenexp": ""}}
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -36,7 +36,7 @@ def test_model_tfidf_vectorizer11(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer11Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -59,7 +59,7 @@ def test_model_tfidf_vectorizer11_word4(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer11Regex4-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -82,7 +82,7 @@ def test_model_tfidf_vectorizer11_empty_string(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer11EmptyStringRegex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) "
-                          "<= StrictVersion('0.3.0')")
+                          "<= StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -110,7 +110,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -132,7 +132,7 @@ def test_model_tfidf_vectorizer22(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer22Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -152,7 +152,7 @@ def test_model_tfidf_vectorizer12(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer12SRegex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -174,7 +174,7 @@ def test_model_tfidf_vectorizer122(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer12Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -195,7 +195,7 @@ def test_model_tfidf_vectorizer12_normL1(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer12L1Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -217,7 +217,7 @@ def test_model_tfidf_vectorizer12_normL2(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer12L2Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -239,7 +239,7 @@ def test_model_tfidf_vectorizer13(self):
             corpus, vect, model_onnx,
             basename="SklearnTfidfVectorizer13Regex-OneOff-SklCol",
             allow_failure="StrictVersion(onnxruntime.__version__) <= "
-                          "StrictVersion('0.3.0')")
+                          "StrictVersion('0.4.0')")
 
     @unittest.skipIf(
         StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -253,9 +253,11 @@ def test_model_tfidf_vectorizer11parenthesis_class(self):
         ]).reshape((4, 1))
         vect = TfidfVectorizer(ngram_range=(1, 1), norm=None)
         vect.fit(corpus.ravel())
-        extra = {TfidfVectorizer: {'sep': [' ', '.', '?', ',', ';',
-                                           ':', '!', '(', ')'],
-                                   'regex': None}}
+        extra = {TfidfVectorizer: {'separators': [
+                                        ' ', '[.]', '\\?', ',', ';',
+                                        ':', '\\!', '\\(', '\\)'
+                                    ],
+                                   'tokenexp': None}}
         model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                      [('input', StringTensorType([1, 1]))],
                                      options=extra)
@@ -292,9 +294,11 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):
         except RuntimeError:
             pass
 
-        extra = {id(vect): {"sep": [' ', '.', '?', ',', ';',
-                                    ':', '!', '(', ')'],
-                            "regex": None}}
+        extra = {id(vect): {"separators": [
+                                ' ', '[.]', '\\?', ',', ';', ':',
+                                '\\!', '\\(', '\\)'
+                            ],
+                            "tokenexp": None}}
         model_onnx = convert_sklearn(vect, 'TfidfVectorizer',
                                      [('input', StringTensorType([1, 1]))],
                                      options=extra)