From 5a7c258e0c8be723eea6ebe203c74b4744780eb6 Mon Sep 17 00:00:00 2001
From: Chris
Date: Wed, 27 Mar 2024 00:41:38 -0400
Subject: [PATCH 1/5] Added a new flag that, when set, will split sentences on
 newlines and carriage returns

---
 .../transformer_tagging_component.py          | 55 ++++++++++++-------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 65114269..84fa5334 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -38,6 +38,7 @@
 from pkg_resources import resource_filename
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 import pandas as pd
+import re
 
 logger = logging.getLogger('TransformerTaggingComponent')
 
@@ -143,28 +144,37 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
 
         # for each sentence in input
         for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
-            probe_sent = input_text[start:end]
+            probe_str = input_text[start:end]
+            probe_list: list = []
 
-            # get similarity scores for the input sentence with each corpus sentence
-            probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-            scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
-
-            probe_df = pd.DataFrame({
-                "input text": probe_sent,
-                "corpus text": corpus.json["text"],
-                "tag": corpus.json["tag"].str.lower(),
-                "score": scores,
-                "offset": str(start) + "-" + str(end - 1)
-            })
-
-            # sort by score then group by tag so each group will be sorted highest to lowest score,
-            # then take top row for each group
-            probe_df = probe_df.sort_values(by=['score'], ascending=False)
-            top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
-
-            # filter out results that are below threshold
-            top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
-            all_tag_results.append(top_per_tag_threshold)
+            # split input sentence further on newline or carriage return if flag is set
+            if (config.split_on_newline):
+                for new_sentence in probe_str.splitlines():
+                    probe_list.append(new_sentence.lstrip())
+            else:
+                probe_list.append(probe_str)
+
+            for probe_sent in probe_list:
+                # get similarity scores for the input sentence with each corpus sentence
+                probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
+
+                probe_df = pd.DataFrame({
+                    "input text": probe_sent,
+                    "corpus text": corpus.json["text"],
+                    "tag": corpus.json["tag"].str.lower(),
+                    "score": scores,
+                    "offset": str(start) + "-" + str(end - 1)
+                })
+
+                # sort by score then group by tag so each group will be sorted highest to lowest score,
+                # then take top row for each group
+                probe_df = probe_df.sort_values(by=['score'], ascending=False)
+                top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
+
+                # filter out results that are below threshold
+                top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
+                all_tag_results.append(top_per_tag_threshold)
 
         # if no tags found in text return
         if not all_tag_results:
@@ -242,6 +252,9 @@ def __init__(self, props: Mapping[str, str]):
         # if debug is true will return which corpus sentences triggered the match
         self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False)
 
+        # if split on newline is true will split input on newline and carriage returns
+        self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False)
+
         self.corpus_file = \
             mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")

From 451ac6ea286f7944004c9ef08091b46ce561e2ac Mon Sep 17 00:00:00 2001
From: Chris7C
Date: Thu, 2 May 2024 16:04:37 -0400
Subject: [PATCH 2/5] Updates to fix offsets on newlines

---
 python/TransformerTagging/README.md           |  3 +++
 .../transformer_tagging_component.py          | 16 +++++++++++++---
 .../transformer_text_tags_corpus.json         |  8 ++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md
index fd44c515..7309423a 100644
--- a/python/TransformerTagging/README.md
+++ b/python/TransformerTagging/README.md
@@ -13,6 +13,9 @@ in phrasing, subject, and context. The sentences that generate scores above the
 are called "trigger sentences". These sentences are grouped by "tag" based on
 which entry in the corpus they matched against.
 
+When the `ENABLE_NEWLINE_SPLIT` property is set to true, sentences will be split on
+newline and carriage returns.
+
 This component can be used independently to perform transformer tagging on text
 files, or it can be used as a support component in a multi-stage pipeline to
 perform transformer tagging on feed-forward detections generated by some other
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 84fa5334..bad219a4 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -149,24 +149,34 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
 
             # split input sentence further on newline or carriage return if flag is set
             if (config.split_on_newline):
-                for new_sentence in probe_str.splitlines():
-                    probe_list.append(new_sentence.lstrip())
+                for new_sentence in probe_str.splitlines(keepends=True):
+                    probe_list.append(new_sentence)
             else:
                 probe_list.append(probe_str)
 
+            # an offset counter to track offset start if newline flag is set
+            offset_counter: int = start
+            offset_end: int
+
             for probe_sent in probe_list:
                 # get similarity scores for the input sentence with each corpus sentence
                 probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
                 scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
 
+                # determine offset ending of sentence
+                offset_end = offset_counter + (len(probe_sent) - 1)
+
                 probe_df = pd.DataFrame({
                     "input text": probe_sent,
                     "corpus text": corpus.json["text"],
                     "tag": corpus.json["tag"].str.lower(),
                     "score": scores,
-                    "offset": str(start) + "-" + str(end - 1)
+                    "offset": str(offset_counter) + "-" + str(offset_end)
                 })
 
+                # set and adjust offset counter so that next line has correct start offset
+                offset_counter = offset_end + 1
+
                 # sort by score then group by tag so each group will be sorted highest to lowest score,
                 # then take top row for each group
                 probe_df = probe_df.sort_values(by=['score'], ascending=False)
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
index a93be27a..db01949f 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json
@@ -131,6 +131,10 @@
     "text": "This sentence is transaction.",
     "tag": "financial"
   },
+  {
+    "text": "This sentence is gift card.",
+    "tag": "financial"
+  },
   {
     "text": "This sentence is birth",
     "tag": "personal"
@@ -151,6 +155,10 @@
     "text": "This sentence is email.",
     "tag": "personal"
   },
+  {
+    "text": "This sentence is gmail.",
+    "tag": "personal"
+  },
   {
     "text": "This sentence is fax.",
     "tag": "personal"

From 1626e8a5d31995200ce234f68df8433ede7701c6 Mon Sep 17 00:00:00 2001
From: Chris7C
Date: Tue, 7 May 2024 00:51:52 -0400
Subject: [PATCH 3/5] test for newline splits and offset

---
 .../tests/test_transformer_tagging.py         | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py
index 0214f97f..4c595a13 100644
--- a/python/TransformerTagging/tests/test_transformer_tagging.py
+++ b/python/TransformerTagging/tests/test_transformer_tagging.py
@@ -338,5 +338,22 @@ def test_repeat_trigger_job(self):
 
         self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])
 
+    def test_newline(self):
+        NEWLINE_SAMPLE = (
+            'This first sentence is about driving to the beach\nAnother sentence about driving to the beach\n\n\n\nThis sentence is also about driving to the beach and ends in a period.\n'
+        )
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=NEWLINE_SAMPLE))
+        job = mpf.GenericJob('Test Generic', 'test.txt', \
+            dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
+        comp = TransformerTaggingComponent()
+        result = comp.get_detections_from_generic(job)
+
+        props = result[0].detection_properties
+
+        expectedPersonalSentences: str = 'This first sentence is about driving to the beach\n; Another sentence about driving to the beach\n; This sentence is also about driving to the beach and ends in a period.'
+        expectedPersonalOffsets: str = '0-49; 50-93; 97-166'
+        self.assertEqual(expectedPersonalSentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(expectedPersonalOffsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+
 if __name__ == '__main__':
     unittest.main()

From 7dbd353b8d87c536da8daefdb8e148ffdbe98424 Mon Sep 17 00:00:00 2001
From: jrobble
Date: Wed, 8 May 2024 13:11:34 -0400
Subject: [PATCH 4/5] Address PR comments.

---
 python/TransformerTagging/README.md           | 14 +++++++++++---
 .../plugin-files/descriptor/descriptor.json   |  6 ++++++
 .../transformer_tagging_component.py          | 13 +++++--------
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md
index 7309423a..6b84c532 100644
--- a/python/TransformerTagging/README.md
+++ b/python/TransformerTagging/README.md
@@ -13,9 +13,6 @@ in phrasing, subject, and context. The sentences that generate scores above the
 are called "trigger sentences". These sentences are grouped by "tag" based on
 which entry in the corpus they matched against.
 
-When the `ENABLE_NEWLINE_SPLIT` property is set to true, sentences will be split on
-newline and carriage returns.
-
 This component can be used independently to perform transformer tagging on text
 files, or it can be used as a support component in a multi-stage pipeline to
 perform transformer tagging on feed-forward detections generated by some other
@@ -33,6 +30,17 @@ component will generate detections with the same `TRANSLATION` output.
 If none of the input properties are present then the transformer tagging is not
 performed and the feed-forward detection is returned unmodified.
 
+Note that certain document types (e.g. PDF, Word), as well as text generated by OCR, may
+use newline and carriage return characters to perform line wrapping. That is, the
+characters don't necessarily indicate the end of a sentence, but rather that the text has
+reached the column or page width and the following text should appear in the next line
+down the page. To address this, when the `ENABLE_NEWLINE_SPLIT` property is set to false,
+the transformer tagger may parse out sentences from the input text that have newline or
+carriage return characters between words. If you know that your input text is generated
+from a source where newlines and carriage returns always indicate a new sentence (e.g.
+emails), then you may want to set the `ENABLE_NEWLINE_SPLIT` property to true. The
+transformer tagger will then treat those characters as sentence breaks.
+
 The reported detections that are returned by the transformer tagger are based on
 the corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property,
 as discussed below.
diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json
index 74810ebd..7d34c3ab 100644
--- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json
+++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json
@@ -38,6 +38,12 @@
       "type": "STRING",
       "defaultValue": "transformer_text_tags_corpus.json"
     },
+    {
+      "name": "ENABLE_NEWLINE_SPLIT",
+      "description": "If true, newline and carriage return characters will be treated as sentence breaks.",
+      "type": "BOOLEAN",
+      "defaultValue": "FALSE"
+    },
     {
       "name": "ENABLE_DEBUG",
       "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.",
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index bad219a4..01e9401b 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -38,14 +38,14 @@
 from pkg_resources import resource_filename
 from nltk.tokenize.punkt import PunktSentenceTokenizer
 import pandas as pd
-import re
 
 logger = logging.getLogger('TransformerTaggingComponent')
 
 
 class TransformerTaggingComponent:
     def __init__(self):
-        self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
+        # self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
+        self._cached_model = SentenceTransformer('/home/mpf/git/openmpf-projects/openmpf-components/python/TransformerTagging/models/all-mpnet-base-v2') # DEBUG
 
         self._cached_corpuses: Dict[str, Corpus] = {}
 
@@ -145,18 +145,15 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
         # for each sentence in input
         for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
             probe_str = input_text[start:end]
-            probe_list: list = []
 
             # split input sentence further on newline or carriage return if flag is set
             if (config.split_on_newline):
-                for new_sentence in probe_str.splitlines(keepends=True):
-                    probe_list.append(new_sentence)
+                probe_list = probe_str.splitlines(keepends=True)
             else:
-                probe_list.append(probe_str)
+                probe_list = [probe_str]
 
-            # an offset counter to track offset start if newline flag is set
-            offset_counter: int = start
-            offset_end: int
+            # an offset counter to track offset start if newline flag is set
+            offset_counter = start
 
             for probe_sent in probe_list:
                 # get similarity scores for the input sentence with each corpus sentence

From 9b08cb565b64a732671f7d78f8000b8f51c8506a Mon Sep 17 00:00:00 2001
From: jrobble
Date: Wed, 8 May 2024 14:04:56 -0400
Subject: [PATCH 5/5] Strip probes.
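
Strip each probe of leading and trailing whitespace before encoding, report
character offsets relative to the stripped text, and skip probes that are
empty after stripping. Also restore the '/models/all-mpnet-base-v2' model
path that was temporarily pointed at a local debug checkout.

The offset arithmetic introduced here can be illustrated with a small
standalone example (illustrative only, not part of the diff; the probe and
starting offset are hypothetical values):

    probe = '   Beach sentence   \n'                     # probe with padding
    offset_counter = 168                                 # start offset of this probe
    stripped_probe = probe.strip()                       # 'Beach sentence'
    num_leading_chars = len(probe) - len(probe.lstrip()) # 3
    offset_start = offset_counter + num_leading_chars    # 171
    offset_end = offset_start + len(stripped_probe) - 1  # 184
    offset_counter += len(probe)                         # 189, start of next probe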
---
 .../tests/test_transformer_tagging.py         | 31 +++++++++++-----
 .../transformer_tagging_component.py          | 35 +++++++++++--------
 2 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py
index 4c595a13..9b9b9fb7 100644
--- a/python/TransformerTagging/tests/test_transformer_tagging.py
+++ b/python/TransformerTagging/tests/test_transformer_tagging.py
@@ -260,7 +260,7 @@ def test_missing_text_to_process(self):
     def test_maintain_tags_from_earlier_feedforward_task(self):
         ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE))
         job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track)
-        # add tags
+
         firstTag = "FIRST_TAG"
         job.feed_forward_track.detection_properties["TAGS"] = firstTag
         comp = TransformerTaggingComponent()
@@ -338,11 +338,26 @@ def test_repeat_trigger_job(self):
 
         self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])
 
-    def test_newline(self):
-        NEWLINE_SAMPLE = (
-            'This first sentence is about driving to the beach\nAnother sentence about driving to the beach\n\n\n\nThis sentence is also about driving to the beach and ends in a period.\n'
+    def test_newline_split(self):
+        sample = (
+            'This first sentence is about driving to the beach\n'
+            'Another sentence about driving to the beach\n\n\n\n'
+            'This sentence is also about driving to the beach and ends in a period.\n'
+            '   Beach sentence begins and ends with \t whitespace   \n'
+            'Final beach sentence!'
         )
+
+        trigger_sentences = (
+            'This first sentence is about driving to the beach; '
+            'Another sentence about driving to the beach; '
+            'This sentence is also about driving to the beach and ends in a period.; '
+            'Beach sentence begins and ends with \t whitespace; '
+            'Final beach sentence!'
+        )
-        ff_track = mpf.GenericTrack(-1, dict(TEXT=NEWLINE_SAMPLE))
+
+        offsets = '0-48; 50-92; 97-166; 171-218; 223-243'
+
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=sample))
         job = mpf.GenericJob('Test Generic', 'test.txt', \
             dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
         comp = TransformerTaggingComponent()
         result = comp.get_detections_from_generic(job)
 
         props = result[0].detection_properties
-        expectedPersonalSentences: str = 'This first sentence is about driving to the beach\n; Another sentence about driving to the beach\n; This sentence is also about driving to the beach and ends in a period.'
-        expectedPersonalOffsets: str = '0-49; 50-93; 97-166'
-        self.assertEqual(expectedPersonalSentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
-        self.assertEqual(expectedPersonalOffsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+        self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
index 01e9401b..37ae0574 100644
--- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
+++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py
@@ -44,8 +44,7 @@
 
 class TransformerTaggingComponent:
     def __init__(self):
-        # self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
-        self._cached_model = SentenceTransformer('/home/mpf/git/openmpf-projects/openmpf-components/python/TransformerTagging/models/all-mpnet-base-v2') # DEBUG
+        self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
 
         self._cached_corpuses: Dict[str, Corpus] = {}
 
@@ -152,28 +151,36 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
             else:
                 probe_list = [probe_str]
 
-            # an offset counter to track offset start if newline flag is set
+            # an offset counter to track character offset start
             offset_counter = start
 
-            for probe_sent in probe_list:
-                # get similarity scores for the input sentence with each corpus sentence
-                probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-                scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
+            for probe in probe_list:
+                # strip probe of leading and trailing whitespace
+                stripped_probe = probe.strip()
+
+                # determine probe character offsets
+                num_leading_chars = len(probe) - len(probe.lstrip())
+                offset_start = offset_counter + num_leading_chars
+                offset_end = offset_start + len(stripped_probe) - 1
+
+                # set character offset counter for next iteration
+                offset_counter += len(probe)
 
-                # determine offset ending of sentence
-                offset_end = offset_counter + (len(probe_sent) - 1)
+                if stripped_probe == "":
+                    continue
+
+                # get similarity scores for the input sentence with each corpus sentence
+                embed_probe = self._cached_model.encode(stripped_probe, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(embed_probe, corpus_entry)) for corpus_entry in corpus.embed]
 
                 probe_df = pd.DataFrame({
-                    "input text": probe_sent,
+                    "input text": stripped_probe,
                     "corpus text": corpus.json["text"],
                     "tag": corpus.json["tag"].str.lower(),
                     "score": scores,
-                    "offset": str(offset_counter) + "-" + str(offset_end)
+                    "offset": str(offset_start) + "-" + str(offset_end)
                 })
 
-                # set and adjust offset counter so that next line has correct start offset
-                offset_counter = offset_end + 1
-
                 # sort by score then group by tag so each group will be sorted highest to lowest score,
                 # then take top row for each group
                 probe_df = probe_df.sort_values(by=['score'], ascending=False)
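
For reference, the probe-splitting and offset logic as it stands after
PATCH 5/5 reduces to the self-contained sketch below. This is illustrative
only: the Punkt sentence span and the transformer scoring are stubbed out,
and the function name `split_probes` is invented for the example.

    def split_probes(input_text: str, start: int, end: int,
                     split_on_newline: bool):
        # mirror of _add_tags: split one sentence span into probes and offsets
        probe_str = input_text[start:end]
        if split_on_newline:
            probe_list = probe_str.splitlines(keepends=True)
        else:
            probe_list = [probe_str]

        offset_counter = start
        results = []
        for probe in probe_list:
            stripped_probe = probe.strip()
            num_leading_chars = len(probe) - len(probe.lstrip())
            offset_start = offset_counter + num_leading_chars
            offset_end = offset_start + len(stripped_probe) - 1
            offset_counter += len(probe)
            if stripped_probe:  # skip probes that are empty after stripping
                results.append((stripped_probe, f'{offset_start}-{offset_end}'))
        return results

    sample = ('This first sentence is about driving to the beach\n'
              'Another sentence about driving to the beach\n\n\n\n')
    print(split_probes(sample, 0, len(sample), True))
    # [('This first sentence is about driving to the beach', '0-48'),
    #  ('Another sentence about driving to the beach', '50-92')]

These are the same offsets asserted for the first two probes in
test_newline_split.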