11 changes: 11 additions & 0 deletions python/TransformerTagging/README.md
@@ -30,6 +30,17 @@ component will generate detections with the same `TRANSLATION` output. If none of the
input properties are present then the transformer tagging is not performed and the
feed-forward detection is returned unmodified.

+Note that certain document types (e.g. PDF, Word), as well as text generated by OCR, may
+use newline and carriage return characters to perform line wrapping. That is, the
+characters don't necessarily indicate the end of a sentence, but rather that the text has
+reached the column or page width and should continue on the next line. To address this,
+when the `ENABLE_NEWLINE_SPLIT` property is set to false, the transformer tagger may
+parse out sentences from the input text that have newline or carriage return characters
+between words. If you know that your input text comes from a source where newlines and
+carriage returns always indicate a new sentence (e.g. emails), then you may want to set
+the `ENABLE_NEWLINE_SPLIT` property to true. The transformer tagger will then treat those
+characters as sentence breaks.
+
The reported detections that are returned by the transformer tagger are based on the
corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property, as
discussed below.
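
To make the wrapped-text behavior concrete, here is a minimal sketch, not part of this PR, contrasting the two modes. The sample text is invented; it assumes `nltk` is installed, and `PunktSentenceTokenizer` is the same tokenizer the component uses.

```python
# Minimal sketch of the two splitting modes; sample text is invented.
from nltk.tokenize.punkt import PunktSentenceTokenizer

wrapped = 'This sentence was hard-\nwrapped by an OCR engine\nand has no final period'

# ENABLE_NEWLINE_SPLIT=false (default): Punkt finds no sentence boundary,
# so the whole wrapped passage comes back as a single span.
print(list(PunktSentenceTokenizer().span_tokenize(wrapped)))

# ENABLE_NEWLINE_SPLIT=true: each physical line becomes its own probe.
for line in wrapped.splitlines():
    print(repr(line.strip()))
```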
@@ -38,6 +38,12 @@
"type": "STRING",
"defaultValue": "transformer_text_tags_corpus.json"
},
+{
+  "name": "ENABLE_NEWLINE_SPLIT",
+  "description": "If true, newline and carriage return characters will be treated as sentence breaks.",
+  "type": "BOOLEAN",
+  "defaultValue": "FALSE"
+},
{
  "name": "ENABLE_DEBUG",
  "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.",
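
As a usage note, job properties arrive as strings and are read through `mpf_util.get_property`, the helper this component already uses; the sketch below assumes that helper's usual default-typed coercion, and the `job_properties` dict is illustrative.

```python
# Sketch of how the new BOOLEAN property reaches the component; the
# job_properties dict below is illustrative.
import mpf_component_util as mpf_util

job_properties = {'ENABLE_NEWLINE_SPLIT': 'TRUE'}  # OpenMPF passes values as strings

# The default (False) indicates the expected type, so 'TRUE' is read as a bool
# and a missing key falls back to the descriptor default.
split_on_newline = mpf_util.get_property(job_properties, 'ENABLE_NEWLINE_SPLIT', False)
print(split_on_newline)  # True
```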
32 changes: 31 additions & 1 deletion python/TransformerTagging/tests/test_transformer_tagging.py
@@ -260,7 +260,7 @@ def test_missing_text_to_process(self):
    def test_maintain_tags_from_earlier_feedforward_task(self):
        ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE))
        job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track)
-        # add tags
+
        firstTag = "FIRST_TAG"
        job.feed_forward_track.detection_properties["TAGS"] = firstTag
        comp = TransformerTaggingComponent()
@@ -338,5 +338,35 @@ def test_repeat_trigger_job(self):

        self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"])

+    def test_newline_split(self):
+        sample = (
+            'This first sentence is about driving to the beach\n'
+            'Another sentence about driving to the beach\n\n\n\n'
+            'This sentence is also about driving to the beach and ends in a period.\n'
+            '   Beach sentence begins and ends with \t whitespace   \n'
+            'Final beach sentence!'
+        )
+
+        trigger_sentences = (
+            'This first sentence is about driving to the beach; '
+            'Another sentence about driving to the beach; '
+            'This sentence is also about driving to the beach and ends in a period.; '
+            'Beach sentence begins and ends with \t whitespace; '
+            'Final beach sentence!'
+        )
+
+        offsets = '0-48; 50-92; 97-166; 171-218; 223-243'
+
+        ff_track = mpf.GenericTrack(-1, dict(TEXT=sample))
+        job = mpf.GenericJob('Test Generic', 'test.txt',
+                             dict(ENABLE_DEBUG='true', ENABLE_NEWLINE_SPLIT='true'), {}, ff_track)
+        comp = TransformerTaggingComponent()
+        result = comp.get_detections_from_generic(job)
+
+        props = result[0].detection_properties
+
+        self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"])
+        self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"])
+
if __name__ == '__main__':
    unittest.main()
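
A quick illustrative check, separate from the test above, that the reported OFFSET ranges are inclusive character indices into the original, unstripped TEXT:

```python
# The second reported range above is '50-92'; slicing with an inclusive end
# recovers the second sentence exactly.
sample = (
    'This first sentence is about driving to the beach\n'
    'Another sentence about driving to the beach\n\n\n\n'
)
start, end = 50, 92
print(sample[start:end + 1])  # Another sentence about driving to the beach
```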
@@ -143,28 +143,52 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]):

        # for each sentence in input
        for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
-            probe_sent = input_text[start:end]
+            probe_str = input_text[start:end]

-            # get similarity scores for the input sentence with each corpus sentence
-            probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False)
-            scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed]
-
-            probe_df = pd.DataFrame({
-                "input text": probe_sent,
-                "corpus text": corpus.json["text"],
-                "tag": corpus.json["tag"].str.lower(),
-                "score": scores,
-                "offset": str(start) + "-" + str(end - 1)
-            })
-
-            # sort by score then group by tag so each group will be sorted highest to lowest score,
-            # then take top row for each group
-            probe_df = probe_df.sort_values(by=['score'], ascending=False)
-            top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
-
-            # filter out results that are below threshold
-            top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
-            all_tag_results.append(top_per_tag_threshold)
+            # split input sentence further on newline or carriage return if flag is set
+            if config.split_on_newline:
+                probe_list = probe_str.splitlines(keepends=True)
+            else:
+                probe_list = [probe_str]
+
+            # an offset counter to track character offset start
+            offset_counter = start
+
+            for probe in probe_list:
+                # strip probe of leading and trailing whitespace
+                stripped_probe = probe.strip()
+
+                # determine probe character offsets
+                num_leading_chars = len(probe) - len(probe.lstrip())
+                offset_start = offset_counter + num_leading_chars
+                offset_end = offset_start + len(stripped_probe) - 1
+
+                # set character offset counter for next iteration
+                offset_counter += len(probe)
+
+                if stripped_probe == "":
+                    continue
+
+                # get similarity scores for the input sentence with each corpus sentence
+                embed_probe = self._cached_model.encode(stripped_probe, convert_to_tensor=True, show_progress_bar=False)
+                scores = [float(util.cos_sim(embed_probe, corpus_entry)) for corpus_entry in corpus.embed]
+
+                probe_df = pd.DataFrame({
+                    "input text": stripped_probe,
+                    "corpus text": corpus.json["text"],
+                    "tag": corpus.json["tag"].str.lower(),
+                    "score": scores,
+                    "offset": str(offset_start) + "-" + str(offset_end)
+                })
+
+                # sort by score then group by tag so each group will be sorted highest to lowest score,
+                # then take top row for each group
+                probe_df = probe_df.sort_values(by=['score'], ascending=False)
+                top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)
+
+                # filter out results that are below threshold
+                top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
+                all_tag_results.append(top_per_tag_threshold)

        # if no tags found in text return
        if not all_tag_results:
@@ -242,6 +266,9 @@ def __init__(self, props: Mapping[str, str]):
        # if debug is true will return which corpus sentences triggered the match
        self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False)

+        # if true, the input will be split on newline and carriage return characters
+        self.split_on_newline = mpf_util.get_property(props, 'ENABLE_NEWLINE_SPLIT', False)
+
        self.corpus_file = \
            mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")

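
To sanity-check the new offset bookkeeping in `_add_tags`: because `splitlines(keepends=True)` keeps each newline with its probe, adding `len(probe)` each iteration tracks absolute positions in the original text. An illustrative walk-through with made-up text:

```python
# Illustrative walk-through (not part of the PR) of the offset bookkeeping:
# keepends=True keeps each '\n' in its probe, so summing len(probe)
# reproduces absolute positions within the original text.
text = 'first line\n  second line  \nthird'
offset_counter = 0  # would be `start` from span_tokenize in the component
for probe in text.splitlines(keepends=True):
    stripped = probe.strip()
    num_leading = len(probe) - len(probe.lstrip())
    offset_start = offset_counter + num_leading
    offset_end = offset_start + len(stripped) - 1  # inclusive end
    offset_counter += len(probe)
    if stripped:
        print(stripped, offset_start, offset_end)
# first line 0 9
# second line 13 23
# third 27 31
```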
@@ -131,6 +131,10 @@
"text": "This sentence is transaction.",
"tag": "financial"
},
+{
+  "text": "This sentence is gift card.",
+  "tag": "financial"
+},
{
  "text": "This sentence is birth",
  "tag": "personal"
@@ -151,6 +155,10 @@
"text": "This sentence is email.",
"tag": "personal"
},
+{
+  "text": "This sentence is gmail.",
+  "tag": "personal"
+},
{
  "text": "This sentence is fax.",
  "tag": "personal"
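
Finally, a sketch of how a probe sentence would be scored against the new corpus entries. The model name `'all-MiniLM-L6-v2'` is a stand-in, not necessarily what the component loads, and the probe text is invented; `util.cos_sim` and the `encode` arguments match what the component calls.

```python
# Sketch of scoring a probe against the new corpus entries; model name and
# probe text are illustrative stand-ins.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

corpus = ['This sentence is gift card.', 'This sentence is gmail.']
probe = 'She bought a $50 gift card at the mall.'

corpus_embeds = [model.encode(s, convert_to_tensor=True, show_progress_bar=False) for s in corpus]
probe_embed = model.encode(probe, convert_to_tensor=True, show_progress_bar=False)

# Same cosine-similarity call the component uses; only scores at or above
# SCORE_THRESHOLD produce a tag.
scores = [float(util.cos_sim(probe_embed, c)) for c in corpus_embeds]
print(dict(zip(corpus, scores)))
```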