From 5a7c258e0c8be723eea6ebe203c74b4744780eb6 Mon Sep 17 00:00:00 2001
From: Chris
Date: Wed, 27 Mar 2024 00:41:38 -0400
Subject: [PATCH 1/5] Added a new flag that, when set, will split sentences on
 newlines and carriage returns

---
 .../transformer_tagging_component.py          | 55 ++++++++++++-------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 65114269..84fa5334 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -38,6 +38,7 @@
 from pkg_resources import resource_filename
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 import pandas as pd
+import re
 
 logger = logging.getLogger('TransformerTaggingComponent')
 
@@ -143,28 +144,37 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
 
         # for each sentence in input
         for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
-            probe_sent = input_text[start:end]
+            probe_str = input_text[start:end]
+            probe_list: list = []
 
-            # get similarity scores for the input sentence with each corpus sentence
-            probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-            scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
-
-            probe_df = pd.DataFrame({
-                "input text": probe_sent,
-                "corpus text": corpus.json["text"],
-                "tag": corpus.json["tag"].str.lower(),
-                "score": scores,
-                "offset": str(start) + "-" + str(end - 1)
-            })
-
-            # sort by score then group by tag so each group will be sorted highest to lowest score,
-            # then take top row for each group
-            probe_df = probe_df.sort_values(by=['score'], ascending=False)
-            top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
-
-            # filter out results that are below threshold
-            top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
-            all_tag_results.append(top_per_tag_threshold)
+            # split input sentence further on newline or carriage return if flag is set
+            if (config.split_on_newline):
+                for new_sentence in probe_str.splitlines():
+                    probe_list.append(new_sentence.lstrip())
+            else:
+                probe_list.append(probe_str)
+
+            for probe_sent in probe_list:
+                # get similarity scores for the input sentence with each corpus sentence
+                probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
+
+                probe_df = pd.DataFrame({
+                    "input text": probe_sent,
+                    "corpus text": corpus.json["text"],
+                    "tag": corpus.json["tag"].str.lower(),
+                    "score": scores,
+                    "offset": str(start) + "-" + str(end - 1)
+                })
+
+                # sort by score then group by tag so each group will be sorted highest to lowest score,
+                # then take top row for each group
+                probe_df = probe_df.sort_values(by=['score'], ascending=False)
+                top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
+
+                # filter out results that are below threshold
+                top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
+                all_tag_results.append(top_per_tag_threshold)
 
         # if no tags found in text return
         if not all_tag_results:
@@ -242,6 +252,9 @@ def __init__(self, props: Mapping[str, str]):
         # if debug is true will return which corpus sentences triggered the match
         self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False)
 
+        # if split on newline is true will split input on newline and carriage returns
+        self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False)
+
         self.corpus_file = \
             mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")

From 451ac6ea286f7944004c9ef08091b46ce561e2ac Mon Sep 17 00:00:00 2001
From: Chris7C
Date: Thu, 2 May 2024 16:04:37 -0400
Subject: [PATCH 2/5] Updates to fix offsets on newlines

---
 python/TransformerTagging/README.md           |  3 +++
 .../transformer_tagging_component.py          | 16 +++++++++++++---
 .../transformer_text_tags_corpus.json         |  8 ++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md
index fd44c515..7309423a 100644
--- a/python/TransformerTagging/README.md
+++ b/python/TransformerTagging/README.md
@@ -13,6 +13,9 @@ in phrasing, subject, and context. The sentences that generate scores above the
 are called "trigger sentences". These sentences are grouped by "tag" based on
 which entry in the corpus they matched against.
 
+When the `ENABLE_NEWLINE_SPLIT` property is set to true, sentences will be split on
+newline and carriage returns.
+
 This component can be used independently to perform transformer tagging on text
 files, or it can be used as a support component in a multi-stage pipeline to
 perform transformer tagging on feed-forward detections generated by some other
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 84fa5334..bad219a4 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -149,24 +149,34 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
 
             # split input sentence further on newline or carriage return if flag is set
             if (config.split_on_newline):
-                for new_sentence in probe_str.splitlines():
-                    probe_list.append(new_sentence.lstrip())
+                for new_sentence in probe_str.splitlines(keepends=True):
+                    probe_list.append(new_sentence)
             else:
                 probe_list.append(probe_str)
 
+            # an offset counter to track offset start if newline flag is set
+            offset_counter: int = start
+            offset_end: int
+
             for probe_sent in probe_list:
                 # get similarity scores for the input sentence with each corpus sentence
                 probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
                 scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
 
+                # determine offset ending of sentence
+                offset_end = offset_counter + (len(probe_sent) - 1)
+
                 probe_df = pd.DataFrame({
                     "input text": probe_sent,
                     "corpus text": corpus.json["text"],
                     "tag": corpus.json["tag"].str.lower(),
                     "score": scores,
-                    "offset": str(start) + "-" + str(end - 1)
+                    "offset": str(offset_counter) + "-" + str(offset_end)
                 })
 
+                # set and adjust offset counter so that next line has correct start offset
+                offset_counter = offset_end + 1
+
                 # sort by score then group by tag so each group will be sorted highest to lowest score,
                 # then take top row for each group
                 probe_df = probe_df.sort_values(by=['score'], ascending=False)
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
index a93be27a..db01949f 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
@@ -131,6 +131,10 @@
     "text": "This sentence is transaction.",
     "tag": "financial"
   },
+  {
+    "text": "This sentence is gift card.",
+    "tag": "financial"
+  },
   {
     "text": "This sentence is birth",
     "tag": "personal"
@@ -151,6 +155,10 @@
     "text": "This sentence is email.",
     "tag": "personal"
   },
+  {
+    "text": "This sentence is gmail.",
+    "tag": "personal"
+  },
   {
     "text": "This sentence is fax.",
     "tag": "personal"

From 1626e8a5d31995200ce234f68df8433ede7701c6 Mon Sep 17 00:00:00 2001
From: Chris7C
Date: Tue, 7 May 2024 00:51:52 -0400
Subject: [PATCH 3/5] test for newline splits and offset

---
 .../tests/test_transformer_tagging.py         | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py
index 0214f97f..4c595a13 100644
--- a/python/TransformerTagging/tests/test_transformer_tagging.py
+++ b/python/TransformerTagging/tests/test_transformer_tagging.py
@@ -338,5 +338,22 @@ def test_repeat_trigger_job(self):
 
         self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])
 
+    def test_newline(self):
+        NEWLINE_SAMPLE = (
+            'This first sentence is about driving to the beach\nAnother sentence about driving to the beach\n\n\n\nThis sentence is also about driving to the beach and ends in a period.\n'
+        )
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=NEWLINE_SAMPLE))
+        job = mpf.GenericJob('Test Generic', 'test.txt', \
+            dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
+        comp = TransformerTaggingComponent()
+        result = comp.get_detections_from_generic(job)
+
+        props = result[0].detection_properties
+
+        expectedPersonalSentences: str = 'This first sentence is about driving to the beach\n; Another sentence about driving to the beach\n; This sentence is also about driving to the beach and ends in a period.'
+        expectedPersonalOffsets: str = '0-49; 50-93; 97-166'
+        self.assertEqual(expectedPersonalSentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(expectedPersonalOffsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+
 if __name__ == '__main__':
     unittest.main()

From 7dbd353b8d87c536da8daefdb8e148ffdbe98424 Mon Sep 17 00:00:00 2001
From: jrobble
Date: Wed, 8 May 2024 13:11:34 -0400
Subject: [PATCH 4/5] Address PR comments.

---
 python/TransformerTagging/README.md           | 14 +++++++++++---
 .../plugin-files/descriptor/descriptor.json   |  6 ++++++
 .../transformer_tagging_component.py          | 13 +++++--------
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md
index 7309423a..6b84c532 100644
--- a/python/TransformerTagging/README.md
+++ b/python/TransformerTagging/README.md
@@ -13,9 +13,6 @@ in phrasing, subject, and context. The sentences that generate scores above the
 are called "trigger sentences". These sentences are grouped by "tag" based on
 which entry in the corpus they matched against.
 
-When the `ENABLE_NEWLINE_SPLIT` property is set to true, sentences will be split on
-newline and carriage returns.
-
 This component can be used independently to perform transformer tagging on text
 files, or it can be used as a support component in a multi-stage pipeline to
 perform transformer tagging on feed-forward detections generated by some other
@@ -33,6 +30,17 @@ component will generate detections with the same `TRANSLATION` output.
 If none of the input properties are present then the transformer tagging is not
 performed and the feed-forward detection is returned unmodified.
 
+Note that certain document types (e.g. PDF, Word), as well as text generated by OCR, may
+use newline and carriage return characters to perform line wrapping. That is, the
+characters don't necessarily indicate the end of a sentence, but rather that the text has
+reached the column or page width and the following text should appear in the next line
+down the page. To address this, when the `ENABLE_NEWLINE_SPLIT` property is set to false,
+the transformer tagger may parse out sentences from the input text that have newline or
+carriage return characters between words. If you know that your input text is generated
+from a source where newlines and carriage returns always indicate a new sentence (e.g.
+emails), then you may want to set the `ENABLE_NEWLINE_SPLIT` property to true. The
+transformer tagger will then treat those characters as sentence breaks.
+
 The reported detections that are returned by the transformer tagger are based on
 the corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property,
 as discussed below.
diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json
index 74810ebd..7d34c3ab 100644
--- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json
+++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json
@@ -38,6 +38,12 @@
       "type": "STRING",
       "defaultValue": "transformer_text_tags_corpus.json"
     },
+    {
+      "name": "ENABLE_NEWLINE_SPLIT",
+      "description": "If true, newline and carriage return characters will be treated as sentence breaks.",
+      "type": "BOOLEAN",
+      "defaultValue": "FALSE"
+    },
     {
       "name": "ENABLE_DEBUG",
       "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.",
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index bad219a4..01e9401b 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -38,14 +38,14 @@
 from pkg_resources import resource_filename
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 import pandas as pd
-import re
 
 logger = logging.getLogger('TransformerTaggingComponent')
 
 
 class TransformerTaggingComponent:
     def __init__(self):
-        self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
+        # self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
+        self._cached_model = SentenceTransformer('/home/mpf/git/openmpf-projects/openmpf-components/python/TransformerTagging/models/all-mpnet-base-v2') # DEBUG
 
         self._cached_corpuses: Dict[str, Corpus] = {}
 
@@ -145,18 +145,15 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
         # for each sentence in input
         for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
             probe_str = input_text[start:end]
-            probe_list: list = []
 
             # split input sentence further on newline or carriage return if flag is set
             if (config.split_on_newline):
-                for new_sentence in probe_str.splitlines(keepends=True):
-                    probe_list.append(new_sentence)
+                probe_list = probe_str.splitlines(keepends=True)
             else:
-                probe_list.append(probe_str)
+                probe_list = [probe_str]
 
-            # an offset counter to track offset start if newline flag is set
-            offset_counter: int = start
-            offset_end: int
+            # an offset counter to track offset start if newline flag is set
+            offset_counter = start
 
             for probe_sent in probe_list:
                 # get similarity scores for the input sentence with each corpus sentence

From 9b08cb565b64a732671f7d78f8000b8f51c8506a Mon Sep 17 00:00:00 2001
From: jrobble
Date: Wed, 8 May 2024 14:04:56 -0400
Subject: [PATCH 5/5] Strip probes.
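
Strip each probe of leading and trailing whitespace before encoding, report
character offsets relative to the stripped text, and skip probes that are
empty after stripping. Also restore the '/models/all-mpnet-base-v2' model
path that was temporarily pointed at a local debug checkout.

The offset arithmetic introduced here can be illustrated with a small
standalone example (illustrative only, not part of the diff; the probe and
starting offset are hypothetical values):

    probe = '   Beach sentence   \n'                     # probe with padding
    offset_counter = 168                                 # start offset of this probe
    stripped_probe = probe.strip()                       # 'Beach sentence'
    num_leading_chars = len(probe) - len(probe.lstrip()) # 3
    offset_start = offset_counter + num_leading_chars    # 171
    offset_end = offset_start + len(stripped_probe) - 1  # 184
    offset_counter += len(probe)                         # 189, start of next probe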
---
 .../tests/test_transformer_tagging.py         | 31 +++++++++++-----
 .../transformer_tagging_component.py          | 35 +++++++++++--------
 2 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py
index 4c595a13..9b9b9fb7 100644
--- a/python/TransformerTagging/tests/test_transformer_tagging.py
+++ b/python/TransformerTagging/tests/test_transformer_tagging.py
@@ -260,7 +260,7 @@ def test_missing_text_to_process(self):
     def test_maintain_tags_from_earlier_feedforward_task(self):
         ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE))
         job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track)
-        # add tags
+
         firstTag = "FIRST_TAG"
         job.feed_forward_track.detection_properties["TAGS"] = firstTag
         comp = TransformerTaggingComponent()
@@ -338,11 +338,26 @@ def test_repeat_trigger_job(self):
 
         self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])
 
-    def test_newline(self):
-        NEWLINE_SAMPLE = (
-            'This first sentence is about driving to the beach\nAnother sentence about driving to the beach\n\n\n\nThis sentence is also about driving to the beach and ends in a period.\n'
+    def test_newline_split(self):
+        sample = (
+            'This first sentence is about driving to the beach\n'
+            'Another sentence about driving to the beach\n\n\n\n'
+            'This sentence is also about driving to the beach and ends in a period.\n'
+            '   Beach sentence begins and ends with \t whitespace   \n'
+            'Final beach sentence!'
         )
+
+        trigger_sentences = (
+            'This first sentence is about driving to the beach; '
+            'Another sentence about driving to the beach; '
+            'This sentence is also about driving to the beach and ends in a period.; '
+            'Beach sentence begins and ends with \t whitespace; '
+            'Final beach sentence!'
+        )
-        ff_track = mpf.GenericTrack(-1, dict(TEXT=NEWLINE_SAMPLE))
+
+        offsets = '0-48; 50-92; 97-166; 171-218; 223-243'
+
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=sample))
         job = mpf.GenericJob('Test Generic', 'test.txt', \
             dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
         comp = TransformerTaggingComponent()
         result = comp.get_detections_from_generic(job)
 
         props = result[0].detection_properties
-        expectedPersonalSentences: str = 'This first sentence is about driving to the beach\n; Another sentence about driving to the beach\n; This sentence is also about driving to the beach and ends in a period.'
-        expectedPersonalOffsets: str = '0-49; 50-93; 97-166'
-        self.assertEqual(expectedPersonalSentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
-        self.assertEqual(expectedPersonalOffsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+        self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 01e9401b..37ae0574 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -44,8 +44,7 @@
 
 class TransformerTaggingComponent:
     def __init__(self):
-        # self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
-        self._cached_model = SentenceTransformer('/home/mpf/git/openmpf-projects/openmpf-components/python/TransformerTagging/models/all-mpnet-base-v2') # DEBUG
+        self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
 
         self._cached_corpuses: Dict[str, Corpus] = {}
 
@@ -152,28 +151,36 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
             else:
                 probe_list = [probe_str]
 
-            # an offset counter to track offset start if newline flag is set
+            # an offset counter to track character offset start
             offset_counter = start
 
-            for probe_sent in probe_list:
-                # get similarity scores for the input sentence with each corpus sentence
-                probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-                scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
+            for probe in probe_list:
+                # strip probe of leading and trailing whitespace
+                stripped_probe = probe.strip()
+
+                # determine probe character offsets
+                num_leading_chars = len(probe) - len(probe.lstrip())
+                offset_start = offset_counter + num_leading_chars
+                offset_end = offset_start + len(stripped_probe) - 1
+
+                # set character offset counter for next iteration
+                offset_counter += len(probe)
 
-                # determine offset ending of sentence
-                offset_end = offset_counter + (len(probe_sent) - 1)
+                if stripped_probe == "":
+                    continue
+
+                # get similarity scores for the input sentence with each corpus sentence
+                embed_probe = self._cached_model.encode(stripped_probe, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(embed_probe, corpus_entry)) for corpus_entry in corpus.embed]
 
                 probe_df = pd.DataFrame({
-                    "input text": probe_sent,
+                    "input text": stripped_probe,
                     "corpus text": corpus.json["text"],
                     "tag": corpus.json["tag"].str.lower(),
                     "score": scores,
-                    "offset": str(offset_counter) + "-" + str(offset_end)
+                    "offset": str(offset_start) + "-" + str(offset_end)
                 })
 
-                # set and adjust offset counter so that next line has correct start offset
-                offset_counter = offset_end + 1
-
                 # sort by score then group by tag so each group will be sorted highest to lowest score,
                 # then take top row for each group
                 probe_df = probe_df.sort_values(by=['score'], ascending=False)
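
For reference, the probe-splitting and offset logic as it stands after
PATCH 5/5 reduces to the self-contained sketch below. This is illustrative
only: the Punkt sentence span and the transformer scoring are stubbed out,
and the function name `split_probes` is invented for the example.

    def split_probes(input_text: str, start: int, end: int,
                     split_on_newline: bool):
        # mirror of _add_tags: split one sentence span into probes and offsets
        probe_str = input_text[start:end]
        if split_on_newline:
            probe_list = probe_str.splitlines(keepends=True)
        else:
            probe_list = [probe_str]

        offset_counter = start
        results = []
        for probe in probe_list:
            stripped_probe = probe.strip()
            num_leading_chars = len(probe) - len(probe.lstrip())
            offset_start = offset_counter + num_leading_chars
            offset_end = offset_start + len(stripped_probe) - 1
            offset_counter += len(probe)
            if stripped_probe:  # skip probes that are empty after stripping
                results.append((stripped_probe, f'{offset_start}-{offset_end}'))
        return results

    sample = ('This first sentence is about driving to the beach\n'
              'Another sentence about driving to the beach\n\n\n\n')
    print(split_probes(sample, 0, len(sample), True))
    # [('This first sentence is about driving to the beach', '0-48'),
    #  ('Another sentence about driving to the beach', '50-92')]

These are the same offsets asserted for the first two probes in
test_newline_split.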