11 changes: 11 additions & 0 deletions python/TransformerTagging/README.md
@@ -30,6 +30,17 @@ component will generate detections with the same `TRANSLATION` output. If none of the
input properties are present then the transformer tagging is not performed and the
feed-forward detection is returned unmodified.

+Note that certain document types (e.g. PDF, Word), as well as text generated by OCR, may
+use newline and carriage return characters to perform line wrapping. That is, the
+characters don't necessarily indicate the end of a sentence, but rather that the text has
+reached the column or page width and should continue on the next line. To address this,
+when the `ENABLE_NEWLINE_SPLIT` property is set to false, the transformer tagger may
+parse out sentences from the input text that have newline or carriage return characters
+between words. If you know that your input text comes from a source where newlines and
+carriage returns always indicate a new sentence (e.g. emails), then you may want to set
+the `ENABLE_NEWLINE_SPLIT` property to true. The transformer tagger will then treat those
+characters as sentence breaks.
+
The reported detections that are returned by the transformer tagger are based on the
corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property, as
discussed below.
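
To make the wrapped-text behavior concrete, here is a minimal sketch, not part of this PR, contrasting the two modes. The sample text is invented; it assumes `nltk` is installed, and `PunktSentenceTokenizer` is the same tokenizer the component uses.

```python
# Minimal sketch of the two splitting modes; sample text is invented.
from nltk.tokenize.punkt import PunktSentenceTokenizer

wrapped = 'This sentence was hard-\nwrapped by an OCR engine\nand has no final period'

# ENABLE_NEWLINE_SPLIT=false (default): Punkt finds no sentence boundary,
# so the whole wrapped passage comes back as a single span.
print(list(PunktSentenceTokenizer().span_tokenize(wrapped)))

# ENABLE_NEWLINE_SPLIT=true: each physical line becomes its own probe.
for line in wrapped.splitlines():
    print(repr(line.strip()))
```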
@@ -38,6 +38,12 @@
"type": "STRING",
"defaultValue": "transformer_text_tags_corpus.json"
},
+{
+  "name": "ENABLE_NEWLINE_SPLIT",
+  "description": "If true, newline and carriage return characters will be treated as sentence breaks.",
+  "type": "BOOLEAN",
+  "defaultValue": "FALSE"
+},
{
  "name": "ENABLE_DEBUG",
  "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.",
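
As a usage note, job properties arrive as strings and are read through `mpf_util.get_property`, the helper this component already uses; the sketch below assumes that helper's usual default-typed coercion, and the `job_properties` dict is illustrative.

```python
# Sketch of how the new BOOLEAN property reaches the component; the
# job_properties dict below is illustrative.
import mpf_component_util as mpf_util

job_properties = {'ENABLE_NEWLINE_SPLIT': 'TRUE'}  # OpenMPF passes values as strings

# The default (False) indicates the expected type, so 'TRUE' is read as a bool
# and a missing key falls back to the descriptor default.
split_on_newline = mpf_util.get_property(job_properties, 'ENABLE_NEWLINE_SPLIT', False)
print(split_on_newline)  # True
```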
32 changes: 31 additions & 1 deletion python/TransformerTagging/tests/test_transformer_tagging.py
@@ -260,7 +260,7 @@ def test_missing_text_to_process(self):
    def test_maintain_tags_from_earlier_feedforward_task(self):
        ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE))
        job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track)
-        # add tags
+
        firstTag = "FIRST_TAG"
        job.feed_forward_track.detection_properties["TAGS"] = firstTag
        comp = TransformerTaggingComponent()
@@ -338,5 +338,35 @@ def test_repeat_trigger_job(self):

        self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])

+    def test_newline_split(self):
+        sample = (
+            'This first sentence is about driving to the beach\n'
+            'Another sentence about driving to the beach\n\n\n\n'
+            'This sentence is also about driving to the beach and ends in a period.\n'
+            '   Beach sentence begins and ends with \t whitespace   \n'
+            'Final beach sentence!'
+        )
+
+        trigger_sentences = (
+            'This first sentence is about driving to the beach; '
+            'Another sentence about driving to the beach; '
+            'This sentence is also about driving to the beach and ends in a period.; '
+            'Beach sentence begins and ends with \t whitespace; '
+            'Final beach sentence!'
+        )
+
+        offsets = '0-48; 50-92; 97-166; 171-218; 223-243'
+
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=sample))
+        job = mpf.GenericJob('Test Generic', 'test.txt',
+                             dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
+        comp = TransformerTaggingComponent()
+        result = comp.get_detections_from_generic(job)
+
+        props = result[0].detection_properties
+
+        self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+
if __name__ == '__main__':
    unittest.main()
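
A quick illustrative check, separate from the test above, that the reported OFFSET ranges are inclusive character indices into the original, unstripped TEXT:

```python
# The second reported range above is '50-92'; slicing with an inclusive end
# recovers the second sentence exactly.
sample = (
    'This first sentence is about driving to the beach\n'
    'Another sentence about driving to the beach\n\n\n\n'
)
start, end = 50, 92
print(sample[start:end + 1])  # Another sentence about driving to the beach
```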
@@ -143,28 +143,52 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):

        # for each sentence in input
        for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
-            probe_sent = input_text[start:end]
+            probe_str = input_text[start:end]

-            # get similarity scores for the input sentence with each corpus sentence
-            probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-            scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
-
-            probe_df = pd.DataFrame({
-                "input text": probe_sent,
-                "corpus text": corpus.json["text"],
-                "tag": corpus.json["tag"].str.lower(),
-                "score": scores,
-                "offset": str(start) + "-" + str(end - 1)
-            })
-
-            # sort by score then group by tag so each group will be sorted highest to lowest score,
-            # then take top row for each group
-            probe_df = probe_df.sort_values(by=['score'], ascending=False)
-            top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
-
-            # filter out results that are below threshold
-            top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
-            all_tag_results.append(top_per_tag_threshold)
+            # split input sentence further on newline or carriage return if flag is set
+            if config.split_on_newline:
+                probe_list = probe_str.splitlines(keepends=True)
+            else:
+                probe_list = [probe_str]
+
+            # an offset counter to track character offset start
+            offset_counter = start
+
+            for probe in probe_list:
+                # strip probe of leading and trailing whitespace
+                stripped_probe = probe.strip()
+
+                # determine probe character offsets
+                num_leading_chars = len(probe) - len(probe.lstrip())
+                offset_start = offset_counter + num_leading_chars
+                offset_end = offset_start + len(stripped_probe) - 1
+
+                # set character offset counter for next iteration
+                offset_counter += len(probe)
+
+                if stripped_probe == "":
+                    continue
+
+                # get similarity scores for the input sentence with each corpus sentence
+                embed_probe = self._cached_model.encode(stripped_probe, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(embed_probe, corpus_entry)) for corpus_entry in corpus.embed]
+
+                probe_df = pd.DataFrame({
+                    "input text": stripped_probe,
+                    "corpus text": corpus.json["text"],
+                    "tag": corpus.json["tag"].str.lower(),
+                    "score": scores,
+                    "offset": str(offset_start) + "-" + str(offset_end)
+                })
+
+                # sort by score then group by tag so each group will be sorted highest to lowest score,
+                # then take top row for each group
+                probe_df = probe_df.sort_values(by=['score'], ascending=False)
+                top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
+
+                # filter out results that are below threshold
+                top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
+                all_tag_results.append(top_per_tag_threshold)

        # if no tags found in text return
        if not all_tag_results:
@@ -242,6 +266,9 @@ def __init__(self, props: Mapping[str, str]):
        # if debug is true will return which corpus sentences triggered the match
        self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False)

+        # if true, the input will be split on newline and carriage return characters
+        self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False)
+
        self.corpus_file = \
            mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")

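
To sanity-check the new offset bookkeeping in `_add_tags`: because `splitlines(keepends=True)` keeps each newline with its probe, adding `len(probe)` each iteration tracks absolute positions in the original text. An illustrative walk-through with made-up text:

```python
# Illustrative walk-through (not part of the PR) of the offset bookkeeping:
# keepends=True keeps each '\n' in its probe, so summing len(probe)
# reproduces absolute positions within the original text.
text = 'first line\n  second line  \nthird'
offset_counter = 0  # would be `start` from span_tokenize in the component
for probe in text.splitlines(keepends=True):
    stripped = probe.strip()
    num_leading = len(probe) - len(probe.lstrip())
    offset_start = offset_counter + num_leading
    offset_end = offset_start + len(stripped) - 1  # inclusive end
    offset_counter += len(probe)
    if stripped:
        print(stripped, offset_start, offset_end)
# first line 0 9
# second line 13 23
# third 27 31
```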
@@ -131,6 +131,10 @@
"text": "This sentence is transaction.",
"tag": "financial"
},
+{
+  "text": "This sentence is gift card.",
+  "tag": "financial"
+},
{
  "text": "This sentence is birth",
  "tag": "personal"
@@ -151,6 +155,10 @@
"text": "This sentence is email.",
"tag": "personal"
},
+{
+  "text": "This sentence is gmail.",
+  "tag": "personal"
+},
{
  "text": "This sentence is fax.",
  "tag": "personal"
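
Finally, a sketch of how a probe sentence would be scored against the new corpus entries. The model name `'all-MiniLM-L6-v2'` is a stand-in, not necessarily what the component loads, and the probe text is invented; `util.cos_sim` and the `encode` arguments match what the component calls.

```python
# Sketch of scoring a probe against the new corpus entries; model name and
# probe text are illustrative stand-ins.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

corpus = ['This sentence is gift card.', 'This sentence is gmail.']
probe = 'She bought a $50 gift card at the mall.'

corpus_embeds = [model.encode(s, convert_to_tensor=True, show_progress_bar=False) for s in corpus]
probe_embed = model.encode(probe, convert_to_tensor=True, show_progress_bar=False)

# Same cosine-similarity call the component uses; only scores at or above
# SCORE_THRESHOLD produce a tag.
scores = [float(util.cos_sim(probe_embed, c)) for c in corpus_embeds]
print(dict(zip(corpus, scores)))
```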