Update TfIdf converter to reflect changes in Tokenizer specifications (#178)

Update the TfIdf converter after a specification change in the Tokenizer.
xadupre committed Jun 14, 2019
1 parent 4200f11 commit 4eb0217
Showing 6 changed files with 84 additions and 66 deletions.
6 changes: 4 additions & 2 deletions skl2onnx/convert.py
@@ -87,7 +87,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
::
-extra = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?',
+                                          ',', ';', ':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(model, "tfidf",
initial_types=[("input", StringTensorType([1, 1]))],
options=extra)
@@ -97,7 +98,8 @@ def convert_sklearn(model, name=None, initial_types=None, doc_string='',
::
-extra = {id(model): {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')']}}
+extra = {id(model): {"separators": [' ', '.', '\\\\?', ',', ';',
+                                    ':', '\\\\!', '\\\\(', '\\\\)']}}
model_onnx = convert_sklearn(pipeline, "pipeline-with-2-tfidf",
initial_types=[("input", StringTensorType([1, 1]))],
options=extra)
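The rename from *sep* to *separators* also changes the semantics: each separator is now a regular expression, so regex metacharacters such as ``.``, ``?``, ``(`` and ``)`` must be escaped (or wrapped in a character class like ``[.]``). A minimal end-to-end sketch of the new call, with an illustrative corpus and model::

    from sklearn.feature_extraction.text import TfidfVectorizer
    from skl2onnx import convert_sklearn
    from skl2onnx.common.data_types import StringTensorType

    corpus = ["Is this the first document?", "This is the first document."]
    model = TfidfVectorizer().fit(corpus)

    # Separators are regular expressions now, hence the escaping.
    extra = {TfidfVectorizer: {"separators": [' ', '[.]', '\\?', ',', ';',
                                              ':', '\\!', '\\(', '\\)']}}
    model_onnx = convert_sklearn(
        model, "tfidf",
        initial_types=[("input", StringTensorType([1, 1]))],
        options=extra)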
47 changes: 27 additions & 20 deletions skl2onnx/operator_converters/text_vectoriser.py
@@ -79,32 +79,35 @@ def _intelligent_split(text, op, tokenizer, existing):
def convert_sklearn_text_vectorizer(scope, operator, container):
"""
Converters for class
-`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
+`TfidfVectorizer <https://scikit-learn.org/stable/modules/generated/
+sklearn.feature_extraction.text.TfidfVectorizer.html>`_.
The current implementation is a work in progress and the ONNX version
does not produce the exact same results. The converter lets the user
change some of its parameters.
Additional options
------------------
-regex: string
+tokenexp: string
+The default will change to true in version 1.6.0.
The tokenizer splits into words using this regular
expression or the regular expression specified by
*scikit-learn* if the value is an empty string.
See also note below.
Default value: None
-sep: list of separators
+separators: list of separators
These separators are used to split a string into words.
-Option *sep* is ignored if option *regex* is not None.
-Default value: ``[' ', '.', '?', ',', ';', ':', '!']``.
+Option *separators* is ignored if option *tokenexp* is not None.
+Default value: ``[' ', '[.]', '\\\\?', ',', ';', ':', '\\\\!']``.
Example (from :ref:`l-example-tfidfvectorizer`):
::
-seps = {TfidfVectorizer: {"sep": [' ', '.', '?', ',', ';', ':', '!', '(', ')',
-                                  '\\n', '"', "'", "-", "[", "]", "@"]}}
+seps = {TfidfVectorizer: {"separators": [' ', '[.]', '\\\\?', ',', ';',
+                                         ':', '!', '\\\\(', '\\\\)',
+                                         '\\n', '\\\\"', "'", "-",
+                                         "\\\\[", "\\\\]", "@"]}}
model_onnx = convert_sklearn(pipeline, "tfidf",
initial_types=[("input", StringTensorType([1, 2]))],
options=seps)
@@ -127,6 +130,10 @@ def convert_sklearn_text_vectorizer(scope, operator, container):
Regular expression ``[^\\\\\\\\n]`` is used to split
a sentence into characters (and not words) if ``analyzer=='char'``.
The mode ``analyzer=='char_wb'`` is not implemented.
+.. versionchanged:: 1.6
+    Parameters have been renamed: *sep* into *separators*,
+    *regex* into *tokenexp*.
+````
""" # noqa
@@ -147,40 +154,40 @@
"https://github.com/onnx/sklearn-onnx/issues.")

options = container.get_options(
-op, dict(sep="DEFAULT",
-         regex=None))
-if set(options) != {'sep', 'regex'}:
+op, dict(separators="DEFAULT",
+         tokenexp=None))
+if set(options) != {'separators', 'tokenexp'}:
raise RuntimeError("Unknown option {} for {}".format(
-set(options) - {'sep'}, type(op)))
+set(options) - {'separators'}, type(op)))

if op.analyzer == 'word':
default_pattern = '(?u)\\b\\w\\w+\\b'
-if options['sep'] == "DEFAULT" and options['regex'] is None:
+if options['separators'] == "DEFAULT" and options['tokenexp'] is None:
warnings.warn("Converter for TfidfVectorizer will use "
"scikit-learn regular expression by default "
"in version 1.6.",
DeprecationWarning)
-default_separators = [' ', '.', '?', ',', ';', ':', '!']
+default_separators = [' ', '.', '\\?', ',', ';', ':', '\\!']
regex = op.token_pattern
if regex == default_pattern:
regex = '[a-zA-Z0-9_]+'
default_separators = None
-elif options['regex'] is not None:
-    if options['regex']:
-        regex = options['regex']
+elif options['tokenexp'] is not None:
+    if options['tokenexp']:
+        regex = options['tokenexp']
else:
regex = op.token_pattern
if regex == default_pattern:
regex = '[a-zA-Z0-9_]+'
default_separators = None
else:
regex = None
-default_separators = options['sep']
+default_separators = options['separators']
else:
-if options['sep'] != 'DEFAULT':
-    raise RuntimeError("Option sep has no effect "
+if options['separators'] != 'DEFAULT':
+    raise RuntimeError("Option separators has no effect "
"if analyser != 'word'.")
-regex = options['tokenexp'] if options['tokenexp'] else '.'
+regex = options['tokenexp'] if options['tokenexp'] else '.'
default_separators = None
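The substitution of scikit-learn's default ``token_pattern`` above deserves a note: ``(?u)\\b\\w\\w+\\b`` relies on word boundaries and Unicode-aware ``\\w``, which the converter replaces with a plain ASCII character class, presumably because the ONNX Tokenizer cannot evaluate ``\\b``. The two patterns are close but not identical, one reason the docstring warns that results are not exactly the same::

    import re

    default_pattern = '(?u)\\b\\w\\w+\\b'  # scikit-learn default: 2+ word chars
    onnx_pattern = '[a-zA-Z0-9_]+'         # converter substitute: 1+ ASCII chars

    text = "ab c def_9"
    print(re.findall(default_pattern, text))  # ['ab', 'def_9']
    print(re.findall(onnx_pattern, text))     # ['ab', 'c', 'def_9']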

if op.preprocessor is not None:
4 changes: 2 additions & 2 deletions tests/test_sklearn_count_vectorizer_converter_bug.py
@@ -30,7 +30,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):

extra = {
CountVectorizer: {
"sep": ["ZZZZ"]
"separators": ["ZZZZ"]
}
}

@@ -46,7 +46,7 @@ def test_model_count_vectorizer_custom_tokenizer(self):
corpus, vect, model_onnx,
basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__) <= "
"StrictVersion('0.3.0')")
"StrictVersion('0.4.0')")

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
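Besides the option rename, the tests relax the runtime guard from onnxruntime 0.3.0 to 0.4.0. If the test helper evaluates ``allow_failure`` as a plain condition string (an assumption about the harness, not shown in this diff), the bumped guard amounts to::

    from distutils.version import StrictVersion
    import onnxruntime

    # Failures are tolerated on onnxruntime releases up to and including 0.4.0.
    tolerate = StrictVersion(onnxruntime.__version__) <= StrictVersion('0.4.0')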
20 changes: 11 additions & 9 deletions tests/test_sklearn_documentation.py
@@ -70,7 +70,9 @@ def test_pipeline_tfidf(self):
tfi.fit(tdata.ravel())
extra = {
TfidfVectorizer: {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -114,22 +116,22 @@ def test_pipeline_tfidf_pipeline_minmax(self):
pipeline.fit(train_data[:300])
extra = {
TfidfVectorizer: {
"sep": [
"separators": [
" ",
".",
"?",
"[.]",
"\\?",
",",
";",
":",
"!",
"(",
")",
"\\!",
"\\(",
"\\)",
"\n",
'"',
"'",
"-",
"[",
"]",
"\\[",
"\\]",
"@",
]
}
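Two escaping styles appear in the updated separator lists: a character class (``[.]``) and a backslash escape (``\\.``). Both denote a literal dot once the separator is read as a regular expression; a quick check::

    import re

    assert re.match('[.]', '.') and re.match('\\.', '.')
    assert re.match('[.]', 'a') is None  # no longer matches arbitrary characters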
35 changes: 19 additions & 16 deletions tests/test_sklearn_tfidf_vectorizer_converter.py
@@ -14,7 +14,7 @@
class TestSklearnTfidfVectorizer(unittest.TestCase):

def get_options(self):
return {TfidfVectorizer: {"regex": None}}
return {TfidfVectorizer: {"tokenexp": None}}

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -38,7 +38,7 @@ def test_model_tfidf_vectorizer11(self):
model_onnx,
basename="SklearnTfidfVectorizer11-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -64,8 +64,8 @@ def test_model_tfidf_vectorizer11_empty_string_case1(self):
corpus[2:], vect, model_onnx,
basename="SklearnTfidfVectorizer11EmptyStringSepCase1-"
"OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__) <= "
"StrictVersion('0.3.0')")
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.4.0')")

@unittest.skipIf(
StrictVersion(onnx.__version__) < StrictVersion("1.4.1"),
@@ -90,7 +90,7 @@ def test_model_tfidf_vectorizer11_empty_string_case2(self):
model_onnx,
basename="SklearnTfidfVectorizer11EmptyString-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -121,7 +121,7 @@ def test_model_tfidf_vectorizer11_out_vocabulary(self):
model_onnx,
basename="SklearnTfidfVectorizer11OutVocab-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -146,7 +146,7 @@ def test_model_tfidf_vectorizer22(self):
model_onnx,
basename="SklearnTfidfVectorizer22-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -166,7 +166,7 @@ def test_model_tfidf_vectorizer21(self):
model_onnx,
basename="SklearnTfidfVectorizer22S-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -191,7 +191,7 @@ def test_model_tfidf_vectorizer12(self):
model_onnx,
basename="SklearnTfidfVectorizer22-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -215,7 +215,7 @@ def test_model_tfidf_vectorizer12_normL1(self):
model_onnx,
basename="SklearnTfidfVectorizer22L1-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -240,7 +240,7 @@ def test_model_tfidf_vectorizer12_normL2(self):
model_onnx,
basename="SklearnTfidfVectorizer22L2-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -265,7 +265,7 @@ def test_model_tfidf_vectorizer13(self):
model_onnx,
basename="SklearnTfidfVectorizer13-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.3.0')",
" <= StrictVersion('0.4.0')",
)

@unittest.skipIf(
@@ -282,7 +282,9 @@ def test_model_tfidf_vectorizer11parenthesis_class(self):
vect.fit(corpus.ravel())
extra = {
TfidfVectorizer: {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "\\.", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -333,7 +335,9 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):

extra = {
id(vect): {
"sep": [" ", ".", "?", ",", ";", ":", "!", "(", ")"]
"separators": [
" ", "[.]", "\\?", ",", ";", ":", "\\!", "\\(", "\\)"
]
}
}
model_onnx = convert_sklearn(
@@ -351,8 +355,7 @@ def test_model_tfidf_vectorizer11_idparenthesis_id(self):
model_onnx,
basename="SklearnTfidfVectorizer11ParenthesisId-OneOff-SklCol",
allow_failure="StrictVersion(onnxruntime.__version__)"
" <= StrictVersion('0.4.0')",
)
" <= StrictVersion('0.4.0')")


if __name__ == "__main__":
