diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md index fd44c515..6b84c532 100644 --- a/python/TransformerTagging/README.md +++ b/python/TransformerTagging/README.md @@ -30,6 +30,17 @@ component will generate detections with the same `TRANSLATION` output. If none o input properties are present then the transformer tagging is not performed then the feed-forward detection is returned unmodified. +Note that certain document types (e.g. PDF, Word), as well as text generated by OCR, may +use newline and carriage return characters to perform line wrapping. That is, the +characters don't necessarily indicate the end of a sentence, but rather that the text has +reached the column or page width and the following text should appear on the next line +down the page. To address this, when the `ENABLE_NEWLINE_SPLIT` property is set to false +(the default), the transformer tagger may parse out sentences from the input text that +contain newline or carriage return characters between words. If you know that your input +text is generated from a source where newlines and carriage returns always indicate a new +sentence (e.g. emails), then you may want to set the `ENABLE_NEWLINE_SPLIT` property to +true. The transformer tagger will then treat those characters as sentence breaks. + The reported detections that are returned by the transformer tagger are based on the corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property, as discussed below. 
diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index 74810ebd..7d34c3ab 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -38,6 +38,12 @@ "type": "STRING", "defaultValue": "transformer_text_tags_corpus.json" }, + { + "name": "ENABLE_NEWLINE_SPLIT", + "description": "If true, newline and carriage return characters will be treated as sentence breaks.", + "type": "BOOLEAN", + "defaultValue": "FALSE" + }, { "name": "ENABLE_DEBUG", "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.", diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index 0214f97f..9b9b9fb7 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -260,7 +260,7 @@ def test_missing_text_to_process(self): def test_maintain_tags_from_earlier_feedforward_task(self): ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) - # add tags + firstTag = "FIRST_TAG" job.feed_forward_track.detection_properties["TAGS"] = firstTag comp = TransformerTaggingComponent() @@ -338,5 +338,35 @@ def test_repeat_trigger_job(self): self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"]) + def test_newline_split(self): + sample = ( + 'This first sentence is about driving to the beach\n' + 'Another sentence about driving to the beach\n\n\n\n' + 'This sentence is also about driving to the beach and ends in a period.\n' + ' Beach sentence begins and ends with \t whitespace \n' + 'Final beach sentence!' 
+ ) + + trigger_sentences = ( + 'This first sentence is about driving to the beach; ' + 'Another sentence about driving to the beach; ' + 'This sentence is also about driving to the beach and ends in a period.; ' + 'Beach sentence begins and ends with \t whitespace; ' + 'Final beach sentence!' + ) + + offsets = '0-48; 50-92; 97-166; 171-218; 223-243' + + ff_track = mpf.GenericTrack(-1, dict(TEXT=sample)) + job = mpf.GenericJob('Test Generic', 'test.txt', \ + dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + props = result[0].detection_properties + + self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + if __name__ == '__main__': unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 65114269..37ae0574 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -143,28 +143,52 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): # for each sentence in input for start, end in PunktSentenceTokenizer().span_tokenize(input_text): - probe_sent = input_text[start:end] + probe_str = input_text[start:end] - # get similarity scores for the input sentence with each corpus sentence - probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) - scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed] - - probe_df = pd.DataFrame({ - "input text": probe_sent, - "corpus text": corpus.json["text"], - "tag": corpus.json["tag"].str.lower(), - "score": scores, - "offset": 
str(start) + "-" + str(end - 1) - }) - - # sort by score then group by tag so each group will be sorted highest to lowest score, - # then take top row for each group - probe_df = probe_df.sort_values(by=['score'], ascending=False) - top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) - - # filter out results that are below threshold - top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold] - all_tag_results.append(top_per_tag_threshold) + # split input sentence further on newline or carriage return if flag is set + if (config.split_on_newline): + probe_list = probe_str.splitlines(keepends=True) + else: + probe_list = [probe_str] + + # an offset counter to track character offset start + offset_counter = start + + for probe in probe_list: + # strip probe of leading and trailing whitespace + stripped_probe = probe.strip() + + # determine probe character offsets + num_leading_chars = len(probe) - len(probe.lstrip()) + offset_start = offset_counter + num_leading_chars + offset_end = offset_start + len(stripped_probe) - 1 + + # set character offset counter for next iteration + offset_counter += len(probe) + + if stripped_probe == "": + continue + + # get similarity scores for the input sentence with each corpus sentence + embed_probe = self._cached_model.encode(stripped_probe, convert_to_tensor=True, show_progress_bar=False) + scores = [float(util.cos_sim(embed_probe, corpus_entry)) for corpus_entry in corpus.embed] + + probe_df = pd.DataFrame({ + "input text": stripped_probe, + "corpus text": corpus.json["text"], + "tag": corpus.json["tag"].str.lower(), + "score": scores, + "offset": str(offset_start) + "-" + str(offset_end) + }) + + # sort by score then group by tag so each group will be sorted highest to lowest score, + # then take top row for each group + probe_df = probe_df.sort_values(by=['score'], ascending=False) + top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) + + # filter out results that are below threshold + 
top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold] + all_tag_results.append(top_per_tag_threshold) # if no tags found in text return if not all_tag_results: @@ -242,6 +266,9 @@ def __init__(self, props: Mapping[str, str]): # if debug is true will return which corpus sentences triggered the match self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False) + # if split on newline is true will split input on newline and carriage returns + self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False) + self.corpus_file = \ mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json index a93be27a..db01949f 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json +++ b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json @@ -131,6 +131,10 @@ "text": "This sentence is transaction.", "tag": "financial" }, + { + "text": "This sentence is gift card.", + "tag": "financial" + }, { "text": "This sentence is birth", "tag": "personal" @@ -151,6 +155,10 @@ "text": "This sentence is email.", "tag": "personal" }, + { + "text": "This sentence is gmail.", + "tag": "personal" + }, { "text": "This sentence is fax.", "tag": "personal"