From 4eb0217d4f009fe2b05a2ee4dd9d4b8988910caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Fri, 14 Jun 2019 15:49:45 +0200 Subject: [PATCH] Update TfIdf converter to reflect changes in Tokenizer specifications (#178) Update converter for TfIdf after a change of spec in Tokenizer --- skl2onnx/convert.py | 6 ++- .../operator_converters/text_vectoriser.py | 47 +++++++++++-------- ..._sklearn_count_vectorizer_converter_bug.py | 4 +- tests/test_sklearn_documentation.py | 20 ++++---- ...test_sklearn_tfidf_vectorizer_converter.py | 35 +++++++------- ...klearn_tfidf_vectorizer_converter_regex.py | 38 ++++++++------- 6 files changed, 84 insertions(+), 66 deletions(-) diff --git a/skl2onnx/convert.py b/skl2onnx/convert.py index 558c0c340..9d825ebda 100644 --- a/skl2onnx/convert.py +++ b/skl2onnx/convert.py @@ -87,7 +87,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='', :: - extra = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}} + extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?', + ',', ';', ':', '\\\\!', '\\\\(', '\\\\)']}} model_onnx = convert_sklearn(model, "tfidf", initial_types=[("input", StringTensorType([1, 1]))], options=extra) @@ -97,7 +98,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='', :: - extra = {id(model): {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}} + extra = {id(model): {"separators": [' ', '.', '\\\\?', ',', ';', + ':', '\\\\!', '\\\\(', '\\\\)']}} model_onnx = convert_sklearn(pipeline, "pipeline-with-2-tfidf", initial_types=[("input", StringTensorType([1, 1]))], options=extra) diff --git a/skl2onnx/operator_converters/text_vectoriser.py b/skl2onnx/operator_converters/text_vectoriser.py index b09295158..3362ecf91 100644 --- a/skl2onnx/operator_converters/text_vectoriser.py +++ b/skl2onnx/operator_converters/text_vectoriser.py @@ -79,7 +79,8 @@ def _intelligent_split(text, op, tokenizer, existing): def convert_sklearn_text_vectorizer(scope, operator, container): """ Converters for class - `TfidfVectorizer `_. + `TfidfVectorizer `_. The current implementation is a work in progress and the ONNX version does not produce the exact same results. The converter lets the user change some of its parameters. @@ -87,24 +88,26 @@ def convert_sklearn_text_vectorizer(scope, operator, container): Additional options ------------------ - regex: string + tokenexp: string The default will change to true in version 1.6.0. The tokenizer splits into words using this regular expression or the regular expression specified by *scikit-learn* is the value is an empty string. See also note below. Default value: None - sep: list of separators + separators: list of separators These separators are used to split a string into words. - Options *sep* is ignore if options *regex* is not None. - Default value: ``[' ', '.', '?', ',', ';', ':', '!']``. + Options *separators* is ignore if options *tokenexp* is not None. + Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``. Example (from :ref:`l-example-tfidfvectorizer`): :: - seps = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')', - '\\n', '"', "'", "-", "[", "]", "@"]}} + seps = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?', ',', ';', + ':', '!', '\\\\(', '\\\\)', + '\\n', '\\\\"', "'", "-", + "\\\\[", "\\\\]", "@"]}} model_onnx = convert_sklearn(pipeline, "tfidf", initial_types=[("input", StringTensorType([1, 2]))], options=seps) @@ -127,6 +130,10 @@ def convert_sklearn_text_vectorizer(scope, operator, container): Regular expression ``[^\\\\\\\\n]`` is used to split a sentance into character (and not works) if ``analyser=='char'``. The mode ``analyser=='char_wb'`` is not implemented. + + .. versionchanged:: 1.6 + Parameters have been renamed: *sep* into *separators*, + *regex* into *tokenexp*. ```` """ # noqa @@ -147,27 +154,27 @@ def convert_sklearn_text_vectorizer(scope, operator, container): "https://github.com/onnx/sklearn-onnx/issues.") options = container.get_options( - op, dict(sep="DEFAULT", - regex=None)) - if set(options) != {'sep', 'regex'}: + op, dict(separators="DEFAULT", + tokenexp=None)) + if set(options) != {'separators', 'tokenexp'}: raise RuntimeError("Unknown option {} for {}".format( - set(options) - {'sep'}, type(op))) + set(options) - {'separators'}, type(op))) if op.analyzer == 'word': default_pattern = '(?u)\\b\\w\\w+\\b' - if options['sep'] == "DEFAULT" and options['regex'] is None: + if options['separators'] == "DEFAULT" and options['tokenexp'] is None: warnings.warn("Converter for TfidfVectorizer will use " "scikit-learn regular expression by default " "in version 1.6.", DeprecationWarning) - default_separators = [' ', '.', '?', ',', ';', ':', '!'] + default_separators = [' ', '.', '\\?', ',', ';', ':', '\\!'] regex = op.token_pattern if regex == default_pattern: regex = '[a-zA-Z0-9_]+' default_separators = None - elif options['regex'] is not None: - if options['regex']: - regex = options['regex'] + elif options['tokenexp'] is not None: + if options['tokenexp']: + regex = options['tokenexp'] else: regex = op.token_pattern if regex == default_pattern: @@ -175,12 +182,12 @@ def convert_sklearn_text_vectorizer(scope, operator, container): default_separators = None else: regex = None - default_separators = options['sep'] + default_separators = options['separators'] else: - if options['sep'] != 'DEFAULT': - raise RuntimeError("Option sep has no effect " + if options['separators'] != 'DEFAULT': + raise RuntimeError("Option separators has no effect " "if analyser != 'word'.") - regex = options['regex'] if options['regex'] else '.' + regex = options['tokenexp'] if options['tokenexp'] else '.' default_separators = None if op.preprocessor is not None: diff --git a/tests/test_sklearn_count_vectorizer_converter_bug.py b/tests/test_sklearn_count_vectorizer_converter_bug.py index 1fb608fe1..c4433c7f1 100644 --- a/tests/test_sklearn_count_vectorizer_converter_bug.py +++ b/tests/test_sklearn_count_vectorizer_converter_bug.py @@ -30,7 +30,7 @@ def test_model_count_vectorizer_custom_tokenizer(self): extra = { CountVectorizer: { - "sep": ["ZZZZ"] + "separators": ["ZZZZ"] } } @@ -46,7 +46,7 @@ def test_model_count_vectorizer_custom_tokenizer(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), diff --git a/tests/test_sklearn_documentation.py b/tests/test_sklearn_documentation.py index 859f20773..3e32cd26d 100644 --- a/tests/test_sklearn_documentation.py +++ b/tests/test_sklearn_documentation.py @@ -70,7 +70,9 @@ def test_pipeline_tfidf(self): tfi.fit(tdata.ravel()) extra = { TfidfVectorizer: { - "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"] + "separators": [ + " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)" + ] } } model_onnx = convert_sklearn( @@ -114,22 +116,22 @@ def test_pipeline_tfidf_pipeline_minmax(self): pipeline.fit(train_data[:300]) extra = { TfidfVectorizer: { - "sep": [ + "separators": [ " ", - ".", - "?", + "[.]", + "\\?", ",", ";", ":", - "!", - "(", - ")", + "\\!", + "\\(", + "\\)", "\n", '"', "'", "-", - "[", - "]", + "\\[", + "\\]", "@", ] } diff --git a/tests/test_sklearn_tfidf_vectorizer_converter.py b/tests/test_sklearn_tfidf_vectorizer_converter.py index d5da21178..743080691 100644 --- a/tests/test_sklearn_tfidf_vectorizer_converter.py +++ b/tests/test_sklearn_tfidf_vectorizer_converter.py @@ -14,7 +14,7 @@ class TestSklearnTfidfVectorizer(unittest.TestCase): def get_options(self): - return {TfidfVectorizer: {"regex": None}} + return {TfidfVectorizer: {"tokenexp": None}} @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -38,7 +38,7 @@ def test_model_tfidf_vectorizer11(self): model_onnx, basename="SklearnTfidfVectorizer11-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -64,8 +64,8 @@ def test_model_tfidf_vectorizer11_empty_string_case1(self): corpus[2:], vect, model_onnx, basename="SklearnTfidfVectorizer11EmptyStringSepCase1-" "OneOff-SklCol", - allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + allow_failure="StrictVersion(onnxruntime.__version__)" + " <= StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -90,7 +90,7 @@ def test_model_tfidf_vectorizer11_empty_string_case2(self): model_onnx, basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -121,7 +121,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self): model_onnx, basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -146,7 +146,7 @@ def test_model_tfidf_vectorizer22(self): model_onnx, basename="SklearnTfidfVectorizer22-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -166,7 +166,7 @@ def test_model_tfidf_vectorizer21(self): model_onnx, basename="SklearnTfidfVectorizer22S-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -191,7 +191,7 @@ def test_model_tfidf_vectorizer12(self): model_onnx, basename="SklearnTfidfVectorizer22-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -215,7 +215,7 @@ def test_model_tfidf_vectorizer12_normL1(self): model_onnx, basename="SklearnTfidfVectorizer22L1-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -240,7 +240,7 @@ def test_model_tfidf_vectorizer12_normL2(self): model_onnx, basename="SklearnTfidfVectorizer22L2-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -265,7 +265,7 @@ def test_model_tfidf_vectorizer13(self): model_onnx, basename="SklearnTfidfVectorizer13-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.3.0')", + " <= StrictVersion('0.4.0')", ) @unittest.skipIf( @@ -282,7 +282,9 @@ def test_model_tfidf_vectorizer11parenthesis_class(self): vect.fit(corpus.ravel()) extra = { TfidfVectorizer: { - "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"] + "separators": [ + " ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)" + ] } } model_onnx = convert_sklearn( @@ -333,7 +335,9 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self): extra = { id(vect): { - "sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"] + "separators": [ + " ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)" + ] } } model_onnx = convert_sklearn( @@ -351,8 +355,7 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self): model_onnx, basename="SklearnTfidfVectorizer11ParenthesisId-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__)" - " <= StrictVersion('0.4.0')", - ) + " <= StrictVersion('0.4.0')") if __name__ == "__main__": diff --git a/tests/test_sklearn_tfidf_vectorizer_converter_regex.py b/tests/test_sklearn_tfidf_vectorizer_converter_regex.py index 7a89ab20b..bb40d362b 100644 --- a/tests/test_sklearn_tfidf_vectorizer_converter_regex.py +++ b/tests/test_sklearn_tfidf_vectorizer_converter_regex.py @@ -14,7 +14,7 @@ class TestSklearnTfidfVectorizerRegex(unittest.TestCase): def get_options(self): - return {TfidfVectorizer: {"regex": ""}} + return {TfidfVectorizer: {"tokenexp": ""}} @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -36,7 +36,7 @@ def test_model_tfidf_vectorizer11(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer11Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -59,7 +59,7 @@ def test_model_tfidf_vectorizer11_word4(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer11Regex4-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -82,7 +82,7 @@ def test_model_tfidf_vectorizer11_empty_string(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer11EmptyStringRegex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) " - "<= StrictVersion('0.3.0')") + "<= StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -110,7 +110,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer11OutVocabRegex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -132,7 +132,7 @@ def test_model_tfidf_vectorizer22(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer22Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -152,7 +152,7 @@ def test_model_tfidf_vectorizer12(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer12SRegex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -174,7 +174,7 @@ def test_model_tfidf_vectorizer122(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer12Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -195,7 +195,7 @@ def test_model_tfidf_vectorizer12_normL1(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer12L1Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -217,7 +217,7 @@ def test_model_tfidf_vectorizer12_normL2(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer12L2Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -239,7 +239,7 @@ def test_model_tfidf_vectorizer13(self): corpus, vect, model_onnx, basename="SklearnTfidfVectorizer13Regex-OneOff-SklCol", allow_failure="StrictVersion(onnxruntime.__version__) <= " - "StrictVersion('0.3.0')") + "StrictVersion('0.4.0')") @unittest.skipIf( StrictVersion(onnx.__version__) < StrictVersion("1.4.1"), @@ -253,9 +253,11 @@ def test_model_tfidf_vectorizer11parenthesis_class(self): ]).reshape((4, 1)) vect = TfidfVectorizer(ngram_range=(1, 1), norm=None) vect.fit(corpus.ravel()) - extra = {TfidfVectorizer: {'sep': [' ', '.', '?', ',', ';', - ':', '!', '(', ')'], - 'regex': None}} + extra = {TfidfVectorizer: {'separators': [ + ' ', '[.]', '\\?', ',', ';', + ':', '\\!', '\\(', '\\)' + ], + 'tokenexp': None}} model_onnx = convert_sklearn(vect, 'TfidfVectorizer', [('input', StringTensorType([1, 1]))], options=extra) @@ -292,9 +294,11 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self): except RuntimeError: pass - extra = {id(vect): {"sep": [' ', '.', '?', ',', ';', - ':', '!', '(', ')'], - "regex": None}} + extra = {id(vect): {"separators": [ + ' ', '[.]', '\\?', ',', ';', ':', + '\\!', '\\(', '\\)' + ], + "tokenexp": None}} model_onnx = convert_sklearn(vect, 'TfidfVectorizer', [('input', StringTensorType([1, 1]))], options=extra)