From 784fd041115d2bb7b02bc5e25dc32b880f5de791 Mon Sep 17 00:00:00 2001 From: mcrenshaw Date: Thu, 14 Sep 2023 16:23:22 -0400 Subject: [PATCH 01/21] creating transformer sentence tagging component --- .../plugin-files/descriptor/descriptor.json | 98 ++++++ python/TransformerTagging/pyproject.toml | 29 ++ .../sample_transformer_tagger.py | 45 +++ python/TransformerTagging/setup.cfg | 42 +++ .../tests/config/custom_corpus.json | 278 ++++++++++++++++++ .../config/transformer_text_tags_corpus.json | 278 ++++++++++++++++++ .../tests/data/multiple_tags.txt | 1 + .../tests/data/simple_input.txt | 3 + .../tests/test_transformer_tagging.py | 245 +++++++++++++++ .../transformer_tagging_component/__init__.py | 27 ++ .../transformer_tagging_component.py | 216 ++++++++++++++ .../transformer_text_tags_corpus.json | 278 ++++++++++++++++++ 12 files changed, 1540 insertions(+) create mode 100644 python/TransformerTagging/plugin-files/descriptor/descriptor.json create mode 100644 python/TransformerTagging/pyproject.toml create mode 100644 python/TransformerTagging/sample_transformer_tagger.py create mode 100644 python/TransformerTagging/setup.cfg create mode 100644 python/TransformerTagging/tests/config/custom_corpus.json create mode 100644 python/TransformerTagging/tests/config/transformer_text_tags_corpus.json create mode 100644 python/TransformerTagging/tests/data/multiple_tags.txt create mode 100644 python/TransformerTagging/tests/data/simple_input.txt create mode 100644 python/TransformerTagging/tests/test_transformer_tagging.py create mode 100644 python/TransformerTagging/transformer_tagging_component/__init__.py create mode 100644 python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py create mode 100644 python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json new file mode 100644 index 00000000..cd667b30 --- /dev/null +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -0,0 +1,98 @@ +{ + "componentName": "TransformerTagging", + "componentVersion": "7.2", + "middlewareVersion": "7.2", + "sourceLanguage": "python", + "batchLibrary": "ArgosTranslation", + "environmentVariables": [], + "algorithm": { + "name": "TRANSFORMERTAGGING", + "description": "Uses SentenceTransformers to tag sentences.", + "actionType": "DETECTION", + "requiresCollection": { + "states": [] + }, + "providesCollection": { + "states": [ + "DETECTION", + "DETECTION_TAGGING", + "DETECTION_TAGGING_TRANSFORMER" + ], + "properties": [ + { + "name": "FEED_FORWARD_PROP_TO_PROCESS", + "description": "Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated.", + "type": "STRING", + "defaultValue": "TEXT,TRANSCRIPT" + }, + { + "name": "SCORE_THRESHOLD", + "description": "The minimum score which must be met or exceeded.
Tags below this threshold are silently discarded.", + "type": "DOUBLE", + "defaultValue": "0.3" + }, + { + "name": "TRANSFORMER_TAGGING_CORPUS", + "description": "", + "type": "STRING", + "defaultValue": "transformer_text_tags_corpus.json" + }, + { + "name": "ENABLE_DEBUG", + "description": "", + "type": "BOOLEAN", + "defaultValue": "FALSE" + } + ] + } + }, + "actions": [ + { + "name": "ARGOS TRANSLATION (WITH FF REGION) ACTION", + "description": "Uses Argos Translation to perform translation on feed-forward tracks and detections.", + "algorithm": "ARGOSTRANSLATION", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "OUTPUT_MERGE_WITH_PREVIOUS_TASK", + "value": "TRUE" + } + ] + }, + { + "name": "ARGOS TRANSLATION TEXT FILE ACTION", + "description": "Uses Argos Translation to perform translation on a plain text file.", + "algorithm": "ARGOSTRANSLATION", + "properties": [ + ] + } + ], + "tasks": [ + { + "name": "ARGOS TRANSLATION (WITH FF REGION) TASK", + "description": "Uses Argos Translate to perform translation on feed-forward tracks and detections.", + "actions": [ + "ARGOS TRANSLATION (WITH FF REGION) ACTION" + ] + }, + { + "name": "ARGOS TRANSLATION TEXT FILE TASK", + "description": "Uses Argos Translate to perform translation on a plain text file.", + "actions": [ + "ARGOS TRANSLATION TEXT FILE ACTION" + ] + } + ], + "pipelines": [ + { + "name": "ARGOS TRANSLATION TEXT FILE PIPELINE", + "description": "Uses Argos Translate to perform translation on a plain text file.", + "tasks": [ + "ARGOS TRANSLATION TEXT FILE TASK" + ] + } + ] +} \ No newline at end of file diff --git a/python/TransformerTagging/pyproject.toml b/python/TransformerTagging/pyproject.toml new file mode 100644 index 00000000..49566867 --- /dev/null +++ b/python/TransformerTagging/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/python/TransformerTagging/sample_transformer_tagger.py b/python/TransformerTagging/sample_transformer_tagger.py new file mode 100644 index 00000000..8eacd620 --- /dev/null +++ b/python/TransformerTagging/sample_transformer_tagger.py @@ -0,0 +1,45 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import sys + +from transformer_tagging_component import TransformerTaggingComponent, TransformerWrapper +import mpf_component_api as mpf + + +def main(): + wrapper = TransformerWrapper({}) + detection_props = dict(TEXT="I also have a knife. I have a gun. I took a plane to Florida. I bought some cocaine. " + "It did not go well.") + print(detection_props["TEXT"]) + wrapper.add_tags(detection_props) + + for prop in detection_props: + print(prop, ": ", detection_props[prop]) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/python/TransformerTagging/setup.cfg b/python/TransformerTagging/setup.cfg new file mode 100644 index 00000000..5c12a95f --- /dev/null +++ b/python/TransformerTagging/setup.cfg @@ -0,0 +1,42 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +[metadata] +name = TransformerTagging +version = 7.2 + +[options] +packages = transformer_tagging_component +install_requires = + mpf_component_api>=7.2 + mpf_component_util>=7.2 + +[options.entry_points] +mpf.exported_component = + component = transformer_tagging_component.transformer_tagging_component:TransformerTaggingComponent + +[options.package_data] +transformer_tagging_component=transformer_text_tags_corpus.json \ No newline at end of file diff --git a/python/TransformerTagging/tests/config/custom_corpus.json b/python/TransformerTagging/tests/config/custom_corpus.json new file mode 100644 index 00000000..188bce09 --- /dev/null +++ b/python/TransformerTagging/tests/config/custom_corpus.json @@ -0,0 +1,278 @@ +[ + { + "text": "This sentence is auto.", + "tag": "vehicle" + }, + { + "text": "This sentence is bike.", + "tag": "vehicle" + }, + { + "text": "This sentence is bus.", + "tag": "vehicle" + }, + { + "text": "This sentence is car.", + "tag": "vehicle" + }, + { + "text": "This sentence is motor vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is motorcycle.", + "tag": "vehicle" + }, + { + "text": "This sentence is suv.", + "tag": "vehicle" + }, + { + "text": "This sentence is truck.", + "tag": "vehicle" + }, + { + "text": "This sentence is trolley.", + "tag": "vehicle" + }, + { + "text": "This sentence is tram.", + "tag": "vehicle" + }, + { + "text": "This sentence is van.", + "tag": "vehicle" + }, + { + "text": "This sentence is vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is vin.", + "tag": "vehicle" + }, + { + "text": "This sentence is financ.", + "tag": "financial" + }, + { + "text": "This sentence is bank.", + "tag": "financial" + }, + { + "text": "This sentence is ATM.", + "tag": "financial" + }, + { + "text": "This sentence is balance.", + "tag": "financial" + }, + { + "text": "This sentence is bill.", + "tag": "financial" + }, + { + "text": "This sentence is cash.", + "tag": "financial" + }, + { + "text": "This sentence is credit.", + "tag": "financial" + }, + { + "text": "This sentence is debit.", + "tag": "financial" + }, + { + "text": "This sentence is deposit.", + "tag": "financial" + }, + { + "text": "This sentence is dollar.", + "tag": "financial" + }, + { + "text": "This sentence is dollars.", + "tag": "financial" + }, + { + "text": "This sentence is loan.", + "tag": "financial" + }, + { + "text": "This sentence is money.", + "tag": "financial" + }, + { + "text": "This sentence is mortgage.", + "tag": "financial" + }, + { + "text": "This sentence is payment.", + "tag": "financial" + }, + { + "text": "This sentence is purchase.", + "tag": "financial" + }, + { + "text": "This sentence is salary.", + "tag": "financial" + }, + { + "text": "This sentence is savings.", + "tag": "financial" + }, + { + "text": "This sentence is transaction.", + "tag": "financial" + }, + { + "text": "This sentence is birth", + "tag": "personal" + }, + { + "text": "This sentence is 3G", + "tag": "personal" + }, + { + "text": "This sentence is 4G", + "tag": "personal" + }, + { + "text": "This sentence is cellular.", + "tag": "personal" + }, + { + "text": "This sentence is email.", + "tag": "personal" + }, + { + "text": "This sentence is fax.", + "tag": "personal" + }, + { + "text": "This sentence is password.", + "tag": "personal" + }, + { + "text": "This sentence is text.", + "tag": "personal" + }, + { + "text": "This sentence is telephone.", + "tag": "personal" + }, + { + 
"text": "This sentence is username.", + "tag": "personal" + }, + { + "text": "This sentence is firearm.", + "tag": "weapon" + }, + { + "text": "This sentence is grenade.", + "tag": "weapon" + }, + { + "text": "This sentence is gun.", + "tag": "weapon" + }, + { + "text": "This sentence is knife.", + "tag": "weapon" + }, + { + "text": "This sentence is rifle", + "tag": "weapon" + }, + { + "text": "This sentence is sword.", + "tag": "weapon" + }, + { + "text": "This sentence is passport.", + "tag": "identity document" + }, + { + "text": "This sentence is citizen.", + "tag": "identity document" + }, + { + "text": "This sentence is license.", + "tag": "identity document" + }, + { + "text": "This sentence is country.", + "tag": "identity document" + }, + { + "text": "This sentence is DOB.", + "tag": "identity document" + }, + { + "text": "This sentence is identity.", + "tag": "identity document" + }, + { + "text": "This sentence is surname.", + "tag": "identity document" + }, + { + "text": "This sentence is name.", + "tag": "identity document" + }, + { + "text": "This sentence is address.", + "tag": "identity document" + }, + { + "text": "This sentence is nationality.", + "tag": "identity document" + }, + { + "text": "This sentence is airline.", + "tag": "travel" + }, + { + "text": "This sentence is airport.", + "tag": "travel" + }, + { + "text": "This sentence is booking.", + "tag": "travel" + }, + { + "text": "This sentence is hotel.", + "tag": "travel" + }, + { + "text": "This sentence is itinerary.", + "tag": "travel" + }, + { + "text": "This sentence is motel.", + "tag": "travel" + }, + { + "text": "This sentence is passenger.", + "tag": "travel" + }, + { + "text": "This sentence is reservation.", + "tag": "travel" + }, + { + "text": "This sentence is roundtrip.", + "tag": "travel" + }, + { + "text": "This sentence is travel.", + "tag": "travel" + }, + { + "text": "This sentence is trip.", + "tag": "travel" + } +] \ No newline at end of file diff --git a/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json b/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json new file mode 100644 index 00000000..188bce09 --- /dev/null +++ b/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json @@ -0,0 +1,278 @@ +[ + { + "text": "This sentence is auto.", + "tag": "vehicle" + }, + { + "text": "This sentence is bike.", + "tag": "vehicle" + }, + { + "text": "This sentence is bus.", + "tag": "vehicle" + }, + { + "text": "This sentence is car.", + "tag": "vehicle" + }, + { + "text": "This sentence is motor vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is motorcycle.", + "tag": "vehicle" + }, + { + "text": "This sentence is suv.", + "tag": "vehicle" + }, + { + "text": "This sentence is truck.", + "tag": "vehicle" + }, + { + "text": "This sentence is trolley.", + "tag": "vehicle" + }, + { + "text": "This sentence is tram.", + "tag": "vehicle" + }, + { + "text": "This sentence is van.", + "tag": "vehicle" + }, + { + "text": "This sentence is vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is vin.", + "tag": "vehicle" + }, + { + "text": "This sentence is financ.", + "tag": "financial" + }, + { + "text": "This sentence is bank.", + "tag": "financial" + }, + { + "text": "This sentence is ATM.", + "tag": "financial" + }, + { + "text": "This sentence is balance.", + "tag": "financial" + }, + { + "text": "This sentence is bill.", + "tag": "financial" + }, + { + "text": "This sentence is cash.", + "tag": "financial" + }, + { + 
"text": "This sentence is credit.", + "tag": "financial" + }, + { + "text": "This sentence is debit.", + "tag": "financial" + }, + { + "text": "This sentence is deposit.", + "tag": "financial" + }, + { + "text": "This sentence is dollar.", + "tag": "financial" + }, + { + "text": "This sentence is dollars.", + "tag": "financial" + }, + { + "text": "This sentence is loan.", + "tag": "financial" + }, + { + "text": "This sentence is money.", + "tag": "financial" + }, + { + "text": "This sentence is mortgage.", + "tag": "financial" + }, + { + "text": "This sentence is payment.", + "tag": "financial" + }, + { + "text": "This sentence is purchase.", + "tag": "financial" + }, + { + "text": "This sentence is salary.", + "tag": "financial" + }, + { + "text": "This sentence is savings.", + "tag": "financial" + }, + { + "text": "This sentence is transaction.", + "tag": "financial" + }, + { + "text": "This sentence is birth", + "tag": "personal" + }, + { + "text": "This sentence is 3G", + "tag": "personal" + }, + { + "text": "This sentence is 4G", + "tag": "personal" + }, + { + "text": "This sentence is cellular.", + "tag": "personal" + }, + { + "text": "This sentence is email.", + "tag": "personal" + }, + { + "text": "This sentence is fax.", + "tag": "personal" + }, + { + "text": "This sentence is password.", + "tag": "personal" + }, + { + "text": "This sentence is text.", + "tag": "personal" + }, + { + "text": "This sentence is telephone.", + "tag": "personal" + }, + { + "text": "This sentence is username.", + "tag": "personal" + }, + { + "text": "This sentence is firearm.", + "tag": "weapon" + }, + { + "text": "This sentence is grenade.", + "tag": "weapon" + }, + { + "text": "This sentence is gun.", + "tag": "weapon" + }, + { + "text": "This sentence is knife.", + "tag": "weapon" + }, + { + "text": "This sentence is rifle", + "tag": "weapon" + }, + { + "text": "This sentence is sword.", + "tag": "weapon" + }, + { + "text": "This sentence is passport.", + "tag": "identity document" + }, + { + "text": "This sentence is citizen.", + "tag": "identity document" + }, + { + "text": "This sentence is license.", + "tag": "identity document" + }, + { + "text": "This sentence is country.", + "tag": "identity document" + }, + { + "text": "This sentence is DOB.", + "tag": "identity document" + }, + { + "text": "This sentence is identity.", + "tag": "identity document" + }, + { + "text": "This sentence is surname.", + "tag": "identity document" + }, + { + "text": "This sentence is name.", + "tag": "identity document" + }, + { + "text": "This sentence is address.", + "tag": "identity document" + }, + { + "text": "This sentence is nationality.", + "tag": "identity document" + }, + { + "text": "This sentence is airline.", + "tag": "travel" + }, + { + "text": "This sentence is airport.", + "tag": "travel" + }, + { + "text": "This sentence is booking.", + "tag": "travel" + }, + { + "text": "This sentence is hotel.", + "tag": "travel" + }, + { + "text": "This sentence is itinerary.", + "tag": "travel" + }, + { + "text": "This sentence is motel.", + "tag": "travel" + }, + { + "text": "This sentence is passenger.", + "tag": "travel" + }, + { + "text": "This sentence is reservation.", + "tag": "travel" + }, + { + "text": "This sentence is roundtrip.", + "tag": "travel" + }, + { + "text": "This sentence is travel.", + "tag": "travel" + }, + { + "text": "This sentence is trip.", + "tag": "travel" + } +] \ No newline at end of file diff --git a/python/TransformerTagging/tests/data/multiple_tags.txt 
b/python/TransformerTagging/tests/data/multiple_tags.txt new file mode 100644 index 00000000..651b65c3 --- /dev/null +++ b/python/TransformerTagging/tests/data/multiple_tags.txt @@ -0,0 +1 @@ +Vehicles include wagons, bicycles, motor vehicles (motorcycles, cars, trucks, buses, mobility scooters for disabled people), railed vehicles (trains, trams), watercraft (ships, boats, underwater vehicles), amphibious vehicles (screw-propelled vehicles, hovercraft), aircraft (airplanes, helicopters, aerostats) and spacecraft. "An automated teller machine (ATM) is an electronic telecommunications device that enables customers of financial institutions to perform financial transactions, such as cash withdrawals, deposits, funds transfers, balance inquiries or account information inquiries, at any time and without the need for direct interaction with bank staff. The advent of widespread text-messaging has resulted in the cell phone novel, the first literary genre to emerge from the cellular age, via text messaging to a website that collects the novels as a whole. The sword developed from the knife or dagger. A passport holder is normally entitled to enter the country that issued the passport, though some people entitled to a passport may not be full citizens with right of abode (e.g. American nationals or British nationals). "A number of hotels and motels have entered the public consciousness through popular culture. diff --git a/python/TransformerTagging/tests/data/simple_input.txt b/python/TransformerTagging/tests/data/simple_input.txt new file mode 100644 index 00000000..3b798406 --- /dev/null +++ b/python/TransformerTagging/tests/data/simple_input.txt @@ -0,0 +1,3 @@ +I drove to the beach today and will be staying overnight at a hotel. I texted my friend before I left so she could look +after my cats. She will drop by to check on them after stopping by the bank. I plan to spend all day at the beach +tomorrow. \ No newline at end of file diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py new file mode 100644 index 00000000..807659a9 --- /dev/null +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -0,0 +1,245 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +from pathlib import Path +import sys +import unittest + +import mpf_component_api as mpf + +from transformer_tagging_component import TransformerTaggingComponent + +LOCAL_PATH = Path(__file__).parent +sys.path.insert(0, str(LOCAL_PATH.parent)) +TEST_DATA = LOCAL_PATH / 'data' + +SHORT_SAMPLE = ( + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I texted my friend before I left so she could look after my cats. ' + 'She will drop by to check on them after stopping by the bank. ' + 'I plan to spend all day at the beach tomorrow.' +) + +SHORT_SAMPLE_TAGS = "TRAVEL" +SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." +SHORT_SAMPLE_OFFSET = "0-67" +SHORT_SAMPLE_SCORE = "0.4680028557777405" + + +class TestArgosTranslation(unittest.TestCase): + + def test_generic_job(self): + ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + def test_plaintext_job(self): + job = mpf.GenericJob('Test Plaintext', str(TEST_DATA / 'simple_input.txt'), {}, {}) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + def test_audio_job(self): + ff_track = mpf.AudioTrack(0, 1, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.AudioJob('Test Audio', 'test.wav', 0, 1, {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + def test_image_job(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + def test_video_job(self): + ff_track = mpf.VideoTrack( + 0, 1, -1, + { + 0: 
mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)), + 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=SHORT_SAMPLE)) + }, + dict(TEXT=SHORT_SAMPLE)) + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_video(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + frame_1_props = result[0].frame_locations[0].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, frame_1_props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + frame_2_props = result[0].frame_locations[1].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, frame_2_props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES SCORE"]) + + def test_no_feed_forward_location(self): + comp = TransformerTaggingComponent() + job = mpf.ImageJob('Test', 'test.jpg', {}, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_image(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + def test_no_feed_forward_track(self): + comp = TransformerTaggingComponent() + job = mpf.VideoJob('test', 'test.mp4', 0, 1, {}, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_video(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + job = mpf.AudioJob('Test Audio', 'test.wav', 0, 1, {}, {}) + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_audio(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + def test_custom_confidence_threshold(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', dict(SCORE_THRESHOLD=".2"), {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual("TRAVEL; FINANCIAL", props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + + custom_threshold_sentence = "She will drop by to check on them after stopping by the bank." 
+ custom_threshold_sentence_offset = "135-195" + custom_threshold_sentence_score = "0.2906474769115448" + + self.assertEqual(custom_threshold_sentence, props["TEXT FINANCIAL TRIGGER SENTENCES"]) + self.assertEqual(custom_threshold_sentence_offset, props["TEXT FINANCIAL TRIGGER SENTENCES OFFSET"]) + self.assertEqual(custom_threshold_sentence_score, props["TEXT FINANCIAL TRIGGER SENTENCES SCORE"]) + + def test_custom_tagging_file(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', + dict(TRANSFORMER_TAGGING_CORPUS="config/custom_corpus.json"), {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + def test_debugging_show_matches(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + self.assertTrue("TEXT TRAVEL TRIGGER SENTENCES MATCHES" not in props) + + job = mpf.ImageJob('Test Image', 'test.jpg', dict(ENABLE_DEBUG="TRUE"), {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + self.assertTrue("TEXT TRAVEL TRIGGER SENTENCES MATCHES" in props) + self.assertEqual("This sentence is hotel.", props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"]) + + def test_missing_property_to_process(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(INPUT="some input")) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + self.assertEqual(ff_loc.x_left_upper, result[0].x_left_upper) + self.assertEqual(ff_loc.y_left_upper, result[0].y_left_upper) + self.assertEqual(ff_loc.width, result[0].width) + self.assertEqual(ff_loc.height, result[0].height) + self.assertEqual(ff_loc.confidence, result[0].confidence) + self.assertEqual(ff_loc.detection_properties, result[0].detection_properties) + + def test_missing_text_to_process(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT="")) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/__init__.py b/python/TransformerTagging/transformer_tagging_component/__init__.py new file mode 100644 index 00000000..f8d30e7d --- /dev/null +++ b/python/TransformerTagging/transformer_tagging_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2022 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2022 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from .transformer_tagging_component import TransformerTaggingComponent, TransformerWrapper \ No newline at end of file diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py new file mode 100644 index 00000000..fd03e6c5 --- /dev/null +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -0,0 +1,216 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import logging + +import mpf_component_api as mpf +import mpf_component_util as mpf_util + +from sentence_transformers import SentenceTransformer, util + +from typing import Sequence, Dict +import pathlib +import os + +from pkg_resources import resource_filename +from nltk.tokenize import sent_tokenize +import pandas as pd + +logger = logging.getLogger('TransformerTaggingComponent') + + +class TransformerTaggingComponent: + detection_type = 'TEXT' + + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: + logger.info(f'Received video job.') + + return self.get_feed_forward_detections(job, job.feed_forward_track, video_job=True) + + def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: + logger.info(f'Received image job.') + + return self.get_feed_forward_detections(job, job.feed_forward_location) + + def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: + logger.info(f'Received audio job.') + + return self.get_feed_forward_detections(job, job.feed_forward_track) + + def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: + logger.info(f'Received generic job.') + + if job.feed_forward_track: + return self.get_feed_forward_detections(job, job.feed_forward_track) + else: + logger.info('Job did not contain a feed forward track. Assuming ' + 'media file is a plain text file containing the text to ' + 'be tagged.') + + text = pathlib.Path(job.data_uri).read_text().strip() + new_ff_props = dict(TEXT=text) + ff_track = mpf.GenericTrack(detection_properties=new_ff_props) + + new_job_props = { + **job.job_properties, + 'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT' + } + + tw = TransformerWrapper(new_job_props) + tw.add_tags(new_ff_props) + + return [ff_track] + + @staticmethod + def get_feed_forward_detections(job, job_feed_forward, video_job=False): + try: + if job_feed_forward is None: + raise mpf.DetectionError.UNSUPPORTED_DATA_TYPE.exception( + f'Component can only process feed forward ' + ' jobs, but no feed forward track provided. 
') + + tw = TransformerWrapper(job.job_properties) + tw.add_tags(job_feed_forward.detection_properties) + + if video_job: + for ff_location in job.feed_forward_track.frame_locations.values(): + tw.add_tags(ff_location.detection_properties) + + return [job_feed_forward] + + except Exception: + logger.exception( + f'Failed to complete job due to the following exception:') + raise + + +class TransformerWrapper: + def __init__(self, job_props): + self.model = SentenceTransformer('all-mpnet-base-v2') + + self._props_to_process = [ + prop.strip() for prop in + mpf_util.get_property( + properties=job_props, + key='FEED_FORWARD_PROP_TO_PROCESS', + default_value='TEXT,TRANSCRIPT', + prop_type=str + ).split(',') + ] + + self._corpus_file = \ + mpf_util.get_property(job_props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") + + self._corpus_path = "" + + if "$" not in self._corpus_file and "/" not in self._corpus_file: + self._corpus_path = os.path.realpath(resource_filename(__name__, self._corpus_file)) + else: + self._corpus_path = os.path.expandvars(self._corpus_file) + + if os.path.exists(self._corpus_path): + self.corpus = pd.read_json(self._corpus_path) + else: + print(self._corpus_path) + logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' + f'"{self._corpus_file}"') + raise mpf.DetectionException( + 'Invalid path provided for transformer tagging corpus: ' + f'"{self._corpus_file}"', + mpf.DetectionError.COULD_NOT_READ_DATAFILE) + + self.threshold = mpf_util.get_property(job_props, 'SCORE_THRESHOLD', .3) + self.debug = mpf_util.get_property(job_props, 'ENABLE_DEBUG', False) + + def add_tags(self, ff_props: Dict[str, str]): + for prop_to_tag in self._props_to_process: + input_text = ff_props.get(prop_to_tag, None) + if input_text: + break + elif input_text == "": + logger.warning(f'No {prop_to_tag.lower()} to tag found in track.') + break + else: + logger.warning("Feed forward element missing one of the following properties: " + + ", ".join(self._props_to_process)) + return + + input_sentences = sent_tokenize(input_text) + + all_tag_results = [] + + for probe_sent in input_sentences: + probe_sent_embed = self.model.encode([probe_sent] * len(self.corpus), convert_to_tensor=True) + corpus_embed = self.model.encode(self.corpus["text"], convert_to_tensor=True) + + cosine_scores = util.cos_sim(probe_sent_embed, corpus_embed) + scores = [] + + offset_beginning = input_text.find(probe_sent) + offset_end = offset_beginning + len(probe_sent) - 1 + offset_string = str(offset_beginning) + "-" + str(offset_end) + + for i in range(len(probe_sent_embed)): + scores.append(float(cosine_scores[i][i])) + + probe_df = pd.DataFrame({ + "input text": probe_sent, + "corpus text": self.corpus["text"], + "tag": self.corpus["tag"], + "score": scores, + "offset": offset_string + }) + + probe_df = probe_df.sort_values(by=['score'], ascending=False) + top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) + + top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= self.threshold] + all_tag_results.append(top_per_tag_threshold) + + if not all_tag_results: + return + + all_tag_results = pd.concat(all_tag_results) + + for tag in all_tag_results["tag"].unique(): + tag_df = all_tag_results[all_tag_results["tag"] == tag] + + if "TAGS" in ff_props and tag.upper() not in ff_props: + ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag.upper() + else: + ff_props["TAGS"] = tag.upper() + + prop_name_sent = prop_to_tag + " " + tag.upper() + " TRIGGER 
SENTENCES" + prop_name_offset = prop_name_sent + " OFFSET" + prop_name_score = prop_name_sent + " SCORE" + + ff_props[prop_name_sent] = "; ".join(tag_df["input text"]) + ff_props[prop_name_offset] = "; ".join(tag_df["offset"]) + ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) + + if self.debug: + prop_name_matches = prop_name_sent + " MATCHES" + ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"]) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json new file mode 100644 index 00000000..188bce09 --- /dev/null +++ b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json @@ -0,0 +1,278 @@ +[ + { + "text": "This sentence is auto.", + "tag": "vehicle" + }, + { + "text": "This sentence is bike.", + "tag": "vehicle" + }, + { + "text": "This sentence is bus.", + "tag": "vehicle" + }, + { + "text": "This sentence is car.", + "tag": "vehicle" + }, + { + "text": "This sentence is motor vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is motorcycle.", + "tag": "vehicle" + }, + { + "text": "This sentence is suv.", + "tag": "vehicle" + }, + { + "text": "This sentence is truck.", + "tag": "vehicle" + }, + { + "text": "This sentence is trolley.", + "tag": "vehicle" + }, + { + "text": "This sentence is tram.", + "tag": "vehicle" + }, + { + "text": "This sentence is van.", + "tag": "vehicle" + }, + { + "text": "This sentence is vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is vin.", + "tag": "vehicle" + }, + { + "text": "This sentence is financ.", + "tag": "financial" + }, + { + "text": "This sentence is bank.", + "tag": "financial" + }, + { + "text": "This sentence is ATM.", + "tag": "financial" + }, + { + "text": "This sentence is balance.", + "tag": "financial" + }, + { + "text": "This sentence is bill.", + "tag": "financial" + }, + { + "text": "This sentence is cash.", + "tag": "financial" + }, + { + "text": "This sentence is credit.", + "tag": "financial" + }, + { + "text": "This sentence is debit.", + "tag": "financial" + }, + { + "text": "This sentence is deposit.", + "tag": "financial" + }, + { + "text": "This sentence is dollar.", + "tag": "financial" + }, + { + "text": "This sentence is dollars.", + "tag": "financial" + }, + { + "text": "This sentence is loan.", + "tag": "financial" + }, + { + "text": "This sentence is money.", + "tag": "financial" + }, + { + "text": "This sentence is mortgage.", + "tag": "financial" + }, + { + "text": "This sentence is payment.", + "tag": "financial" + }, + { + "text": "This sentence is purchase.", + "tag": "financial" + }, + { + "text": "This sentence is salary.", + "tag": "financial" + }, + { + "text": "This sentence is savings.", + "tag": "financial" + }, + { + "text": "This sentence is transaction.", + "tag": "financial" + }, + { + "text": "This sentence is birth", + "tag": "personal" + }, + { + "text": "This sentence is 3G", + "tag": "personal" + }, + { + "text": "This sentence is 4G", + "tag": "personal" + }, + { + "text": "This sentence is cellular.", + "tag": "personal" + }, + { + "text": "This sentence is email.", + "tag": "personal" + }, + { + "text": "This sentence is fax.", + "tag": "personal" + }, + { + "text": "This sentence is password.", + "tag": "personal" + }, + { + "text": "This sentence is text.", + "tag": "personal" + }, + { + "text": "This sentence is telephone.", + "tag": "personal" + }, + { + "text": 
"This sentence is username.", + "tag": "personal" + }, + { + "text": "This sentence is firearm.", + "tag": "weapon" + }, + { + "text": "This sentence is grenade.", + "tag": "weapon" + }, + { + "text": "This sentence is gun.", + "tag": "weapon" + }, + { + "text": "This sentence is knife.", + "tag": "weapon" + }, + { + "text": "This sentence is rifle", + "tag": "weapon" + }, + { + "text": "This sentence is sword.", + "tag": "weapon" + }, + { + "text": "This sentence is passport.", + "tag": "identity document" + }, + { + "text": "This sentence is citizen.", + "tag": "identity document" + }, + { + "text": "This sentence is license.", + "tag": "identity document" + }, + { + "text": "This sentence is country.", + "tag": "identity document" + }, + { + "text": "This sentence is DOB.", + "tag": "identity document" + }, + { + "text": "This sentence is identity.", + "tag": "identity document" + }, + { + "text": "This sentence is surname.", + "tag": "identity document" + }, + { + "text": "This sentence is name.", + "tag": "identity document" + }, + { + "text": "This sentence is address.", + "tag": "identity document" + }, + { + "text": "This sentence is nationality.", + "tag": "identity document" + }, + { + "text": "This sentence is airline.", + "tag": "travel" + }, + { + "text": "This sentence is airport.", + "tag": "travel" + }, + { + "text": "This sentence is booking.", + "tag": "travel" + }, + { + "text": "This sentence is hotel.", + "tag": "travel" + }, + { + "text": "This sentence is itinerary.", + "tag": "travel" + }, + { + "text": "This sentence is motel.", + "tag": "travel" + }, + { + "text": "This sentence is passenger.", + "tag": "travel" + }, + { + "text": "This sentence is reservation.", + "tag": "travel" + }, + { + "text": "This sentence is roundtrip.", + "tag": "travel" + }, + { + "text": "This sentence is travel.", + "tag": "travel" + }, + { + "text": "This sentence is trip.", + "tag": "travel" + } +] \ No newline at end of file From 23474e3e01bf072d6dc356e10facb1677b234c7a Mon Sep 17 00:00:00 2001 From: jrobble Date: Fri, 15 Sep 2023 22:56:43 -0400 Subject: [PATCH 02/21] Get tests to work in local dev. env. --- .../sample_transformer_tagger.py | 45 --- python/TransformerTagging/setup.cfg | 3 + .../config/transformer_text_tags_corpus.json | 278 ------------------ .../tests/test_transformer_tagging.py | 36 +-- 4 files changed, 22 insertions(+), 340 deletions(-) delete mode 100644 python/TransformerTagging/sample_transformer_tagger.py delete mode 100644 python/TransformerTagging/tests/config/transformer_text_tags_corpus.json diff --git a/python/TransformerTagging/sample_transformer_tagger.py b/python/TransformerTagging/sample_transformer_tagger.py deleted file mode 100644 index 8eacd620..00000000 --- a/python/TransformerTagging/sample_transformer_tagger.py +++ /dev/null @@ -1,45 +0,0 @@ -############################################################################# -# NOTICE # -# # -# This software (or technical data) was produced for the U.S. Government # -# under contract, and is subject to the Rights in Data-General Clause # -# 52.227-14, Alt. IV (DEC 2007). # -# # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # -############################################################################# - -############################################################################# -# Copyright 2023 The MITRE Corporation # -# # -# Licensed under the Apache License, Version 2.0 (the "License"); # -# you may not use this file except in compliance with the License. 
# -# You may obtain a copy of the License at # -# # -# http://www.apache.org/licenses/LICENSE-2.0 # -# # -# Unless required by applicable law or agreed to in writing, software # -# distributed under the License is distributed on an "AS IS" BASIS, # -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # -# See the License for the specific language governing permissions and # -# limitations under the License. # -############################################################################# - -import sys - -from transformer_tagging_component import TransformerTaggingComponent, TransformerWrapper -import mpf_component_api as mpf - - -def main(): - wrapper = TransformerWrapper({}) - detection_props = dict(TEXT="I also have a knife. I have a gun. I took a plane to Florida. I bought some cocaine. " - "It did not go well.") - print(detection_props["TEXT"]) - wrapper.add_tags(detection_props) - - for prop in detection_props: - print(prop, ": ", detection_props[prop]) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/python/TransformerTagging/setup.cfg b/python/TransformerTagging/setup.cfg index 5c12a95f..8daf2285 100644 --- a/python/TransformerTagging/setup.cfg +++ b/python/TransformerTagging/setup.cfg @@ -33,6 +33,9 @@ packages = transformer_tagging_component install_requires = mpf_component_api>=7.2 mpf_component_util>=7.2 + nltk + sentence_transformers + pandas [options.entry_points] mpf.exported_component = diff --git a/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json b/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json deleted file mode 100644 index 188bce09..00000000 --- a/python/TransformerTagging/tests/config/transformer_text_tags_corpus.json +++ /dev/null @@ -1,278 +0,0 @@ -[ - { - "text": "This sentence is auto.", - "tag": "vehicle" - }, - { - "text": "This sentence is bike.", - "tag": "vehicle" - }, - { - "text": "This sentence is bus.", - "tag": "vehicle" - }, - { - "text": "This sentence is car.", - "tag": "vehicle" - }, - { - "text": "This sentence is motor vehicle.", - "tag": "vehicle" - }, - { - "text": "This sentence is motorcycle.", - "tag": "vehicle" - }, - { - "text": "This sentence is suv.", - "tag": "vehicle" - }, - { - "text": "This sentence is truck.", - "tag": "vehicle" - }, - { - "text": "This sentence is trolley.", - "tag": "vehicle" - }, - { - "text": "This sentence is tram.", - "tag": "vehicle" - }, - { - "text": "This sentence is van.", - "tag": "vehicle" - }, - { - "text": "This sentence is vehicle.", - "tag": "vehicle" - }, - { - "text": "This sentence is vin.", - "tag": "vehicle" - }, - { - "text": "This sentence is financ.", - "tag": "financial" - }, - { - "text": "This sentence is bank.", - "tag": "financial" - }, - { - "text": "This sentence is ATM.", - "tag": "financial" - }, - { - "text": "This sentence is balance.", - "tag": "financial" - }, - { - "text": "This sentence is bill.", - "tag": "financial" - }, - { - "text": "This sentence is cash.", - "tag": "financial" - }, - { - "text": "This sentence is credit.", - "tag": "financial" - }, - { - "text": "This sentence is debit.", - "tag": "financial" - }, - { - "text": "This sentence is deposit.", - "tag": "financial" - }, - { - "text": "This sentence is dollar.", - "tag": "financial" - }, - { - "text": "This sentence is dollars.", - "tag": "financial" - }, - { - "text": "This sentence is loan.", - "tag": "financial" - }, - { - "text": "This sentence is money.", - "tag": "financial" - }, - { - "text": "This 
sentence is mortgage.", - "tag": "financial" - }, - { - "text": "This sentence is payment.", - "tag": "financial" - }, - { - "text": "This sentence is purchase.", - "tag": "financial" - }, - { - "text": "This sentence is salary.", - "tag": "financial" - }, - { - "text": "This sentence is savings.", - "tag": "financial" - }, - { - "text": "This sentence is transaction.", - "tag": "financial" - }, - { - "text": "This sentence is birth", - "tag": "personal" - }, - { - "text": "This sentence is 3G", - "tag": "personal" - }, - { - "text": "This sentence is 4G", - "tag": "personal" - }, - { - "text": "This sentence is cellular.", - "tag": "personal" - }, - { - "text": "This sentence is email.", - "tag": "personal" - }, - { - "text": "This sentence is fax.", - "tag": "personal" - }, - { - "text": "This sentence is password.", - "tag": "personal" - }, - { - "text": "This sentence is text.", - "tag": "personal" - }, - { - "text": "This sentence is telephone.", - "tag": "personal" - }, - { - "text": "This sentence is username.", - "tag": "personal" - }, - { - "text": "This sentence is firearm.", - "tag": "weapon" - }, - { - "text": "This sentence is grenade.", - "tag": "weapon" - }, - { - "text": "This sentence is gun.", - "tag": "weapon" - }, - { - "text": "This sentence is knife.", - "tag": "weapon" - }, - { - "text": "This sentence is rifle", - "tag": "weapon" - }, - { - "text": "This sentence is sword.", - "tag": "weapon" - }, - { - "text": "This sentence is passport.", - "tag": "identity document" - }, - { - "text": "This sentence is citizen.", - "tag": "identity document" - }, - { - "text": "This sentence is license.", - "tag": "identity document" - }, - { - "text": "This sentence is country.", - "tag": "identity document" - }, - { - "text": "This sentence is DOB.", - "tag": "identity document" - }, - { - "text": "This sentence is identity.", - "tag": "identity document" - }, - { - "text": "This sentence is surname.", - "tag": "identity document" - }, - { - "text": "This sentence is name.", - "tag": "identity document" - }, - { - "text": "This sentence is address.", - "tag": "identity document" - }, - { - "text": "This sentence is nationality.", - "tag": "identity document" - }, - { - "text": "This sentence is airline.", - "tag": "travel" - }, - { - "text": "This sentence is airport.", - "tag": "travel" - }, - { - "text": "This sentence is booking.", - "tag": "travel" - }, - { - "text": "This sentence is hotel.", - "tag": "travel" - }, - { - "text": "This sentence is itinerary.", - "tag": "travel" - }, - { - "text": "This sentence is motel.", - "tag": "travel" - }, - { - "text": "This sentence is passenger.", - "tag": "travel" - }, - { - "text": "This sentence is reservation.", - "tag": "travel" - }, - { - "text": "This sentence is roundtrip.", - "tag": "travel" - }, - { - "text": "This sentence is travel.", - "tag": "travel" - }, - { - "text": "This sentence is trip.", - "tag": "travel" - } -] \ No newline at end of file diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index 807659a9..2ec48d4b 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -25,16 +25,18 @@ ############################################################################# from pathlib import Path -import sys +import logging import unittest import mpf_component_api as mpf from transformer_tagging_component import TransformerTaggingComponent -LOCAL_PATH = 
Path(__file__).parent -sys.path.insert(0, str(LOCAL_PATH.parent)) -TEST_DATA = LOCAL_PATH / 'data' + +TEST_DATA = Path(__file__).parent / 'data' +TEST_CONFIG = Path(__file__).parent / 'config' + +logging.basicConfig(level=logging.DEBUG) SHORT_SAMPLE = ( 'I drove to the beach today and will be staying overnight at a hotel. ' @@ -46,10 +48,10 @@ SHORT_SAMPLE_TAGS = "TRAVEL" SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." SHORT_SAMPLE_OFFSET = "0-67" -SHORT_SAMPLE_SCORE = "0.4680028557777405" +SHORT_SAMPLE_SCORE = 0.4680028557777405 -class TestArgosTranslation(unittest.TestCase): +class TestTransformerTagging(unittest.TestCase): def test_generic_job(self): ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) @@ -64,7 +66,7 @@ def test_generic_job(self): self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) def test_plaintext_job(self): job = mpf.GenericJob('Test Plaintext', str(TEST_DATA / 'simple_input.txt'), {}, {}) @@ -78,7 +80,7 @@ def test_plaintext_job(self): self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) def test_audio_job(self): ff_track = mpf.AudioTrack(0, 1, -1, dict(TEXT=SHORT_SAMPLE)) @@ -93,7 +95,7 @@ def test_audio_job(self): self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) def test_image_job(self): ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) @@ -108,7 +110,7 @@ def test_image_job(self): self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) def test_video_job(self): ff_track = mpf.VideoTrack( @@ -128,19 +130,19 @@ def test_video_job(self): self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) frame_1_props = result[0].frame_locations[0].detection_properties self.assertEqual(SHORT_SAMPLE_TAGS, 
frame_1_props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) frame_2_props = result[0].frame_locations[1].detection_properties self.assertEqual(SHORT_SAMPLE_TAGS, frame_2_props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) def test_no_feed_forward_location(self): comp = TransformerTaggingComponent() @@ -177,20 +179,20 @@ def test_custom_confidence_threshold(self): self.assertEqual("TRAVEL; FINANCIAL", props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(SHORT_SAMPLE_SCORE, props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) custom_threshold_sentence = "She will drop by to check on them after stopping by the bank." custom_threshold_sentence_offset = "135-195" - custom_threshold_sentence_score = "0.2906474769115448" + custom_threshold_sentence_score = 0.2906474769115448 self.assertEqual(custom_threshold_sentence, props["TEXT FINANCIAL TRIGGER SENTENCES"]) self.assertEqual(custom_threshold_sentence_offset, props["TEXT FINANCIAL TRIGGER SENTENCES OFFSET"]) - self.assertEqual(custom_threshold_sentence_score, props["TEXT FINANCIAL TRIGGER SENTENCES SCORE"]) + self.assertAlmostEqual(custom_threshold_sentence_score, float(props["TEXT FINANCIAL TRIGGER SENTENCES SCORE"]), places=3) def test_custom_tagging_file(self): ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) job = mpf.ImageJob('Test Image', 'test.jpg', - dict(TRANSFORMER_TAGGING_CORPUS="config/custom_corpus.json"), {}, ff_loc) + dict(TRANSFORMER_TAGGING_CORPUS=str(TEST_CONFIG / "custom_corpus.json")), {}, ff_loc) comp = TransformerTaggingComponent() result = comp.get_detections_from_image(job) From bd834fb0f7ced74e04700a087c565942f8b966e3 Mon Sep 17 00:00:00 2001 From: mcrenshaw Date: Fri, 22 Sep 2023 22:09:48 -0400 Subject: [PATCH 03/21] added translation to props to process, changed custom corpus test --- .../plugin-files/descriptor/descriptor.json | 2 +- .../tests/config/custom_corpus.json | 276 +----------------- .../tests/data/multiple_tags.txt | 1 - .../tests/test_transformer_tagging.py | 15 + .../transformer_tagging_component.py | 15 +- .../transformer_text_tags_corpus.json | 6 +- 6 files changed, 37 insertions(+), 278 deletions(-) delete mode 100644 python/TransformerTagging/tests/data/multiple_tags.txt diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index cd667b30..e0175f34 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -23,7 +23,7 
@@ "name": "FEED_FORWARD_PROP_TO_PROCESS", "description": "Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated.", "type": "STRING", - "defaultValue": "TEXT,TRANSCRIPT" + "defaultValue": "TEXT,TRANSCRIPT,TRANSLATION" }, { "name": "SCORE_THRESHOLD", diff --git a/python/TransformerTagging/tests/config/custom_corpus.json b/python/TransformerTagging/tests/config/custom_corpus.json index 188bce09..a550b6ed 100644 --- a/python/TransformerTagging/tests/config/custom_corpus.json +++ b/python/TransformerTagging/tests/config/custom_corpus.json @@ -1,278 +1,6 @@ [ { - "text": "This sentence is auto.", - "tag": "vehicle" - }, - { - "text": "This sentence is bike.", - "tag": "vehicle" - }, - { - "text": "This sentence is bus.", - "tag": "vehicle" - }, - { - "text": "This sentence is car.", - "tag": "vehicle" - }, - { - "text": "This sentence is motor vehicle.", - "tag": "vehicle" - }, - { - "text": "This sentence is motorcycle.", - "tag": "vehicle" - }, - { - "text": "This sentence is suv.", - "tag": "vehicle" - }, - { - "text": "This sentence is truck.", - "tag": "vehicle" - }, - { - "text": "This sentence is trolley.", - "tag": "vehicle" - }, - { - "text": "This sentence is tram.", - "tag": "vehicle" - }, - { - "text": "This sentence is van.", - "tag": "vehicle" - }, - { - "text": "This sentence is vehicle.", - "tag": "vehicle" - }, - { - "text": "This sentence is vin.", - "tag": "vehicle" - }, - { - "text": "This sentence is financ.", - "tag": "financial" - }, - { - "text": "This sentence is bank.", - "tag": "financial" - }, - { - "text": "This sentence is ATM.", - "tag": "financial" - }, - { - "text": "This sentence is balance.", - "tag": "financial" - }, - { - "text": "This sentence is bill.", - "tag": "financial" - }, - { - "text": "This sentence is cash.", - "tag": "financial" - }, - { - "text": "This sentence is credit.", - "tag": "financial" - }, - { - "text": "This sentence is debit.", - "tag": "financial" - }, - { - "text": "This sentence is deposit.", - "tag": "financial" - }, - { - "text": "This sentence is dollar.", - "tag": "financial" - }, - { - "text": "This sentence is dollars.", - "tag": "financial" - }, - { - "text": "This sentence is loan.", - "tag": "financial" - }, - { - "text": "This sentence is money.", - "tag": "financial" - }, - { - "text": "This sentence is mortgage.", - "tag": "financial" - }, - { - "text": "This sentence is payment.", - "tag": "financial" - }, - { - "text": "This sentence is purchase.", - "tag": "financial" - }, - { - "text": "This sentence is salary.", - "tag": "financial" - }, - { - "text": "This sentence is savings.", - "tag": "financial" - }, - { - "text": "This sentence is transaction.", - "tag": "financial" - }, - { - "text": "This sentence is birth", - "tag": "personal" - }, - { - "text": "This sentence is 3G", - "tag": "personal" - }, - { - "text": "This sentence is 4G", - "tag": "personal" - }, - { - "text": "This sentence is cellular.", - "tag": "personal" - }, - { - "text": "This sentence is email.", - "tag": "personal" - }, - { - "text": "This sentence is fax.", - "tag": "personal" - }, - { - "text": "This sentence is password.", - "tag": "personal" - }, - { - "text": "This sentence is text.", - "tag": "personal" - }, - { - "text": "This sentence is telephone.", - "tag": "personal" - }, 
- { - "text": "This sentence is username.", - "tag": "personal" - }, - { - "text": "This sentence is firearm.", - "tag": "weapon" - }, - { - "text": "This sentence is grenade.", - "tag": "weapon" - }, - { - "text": "This sentence is gun.", - "tag": "weapon" - }, - { - "text": "This sentence is knife.", - "tag": "weapon" - }, - { - "text": "This sentence is rifle", - "tag": "weapon" - }, - { - "text": "This sentence is sword.", - "tag": "weapon" - }, - { - "text": "This sentence is passport.", - "tag": "identity document" - }, - { - "text": "This sentence is citizen.", - "tag": "identity document" - }, - { - "text": "This sentence is license.", - "tag": "identity document" - }, - { - "text": "This sentence is country.", - "tag": "identity document" - }, - { - "text": "This sentence is DOB.", - "tag": "identity document" - }, - { - "text": "This sentence is identity.", - "tag": "identity document" - }, - { - "text": "This sentence is surname.", - "tag": "identity document" - }, - { - "text": "This sentence is name.", - "tag": "identity document" - }, - { - "text": "This sentence is address.", - "tag": "identity document" - }, - { - "text": "This sentence is nationality.", - "tag": "identity document" - }, - { - "text": "This sentence is airline.", - "tag": "travel" - }, - { - "text": "This sentence is airport.", - "tag": "travel" - }, - { - "text": "This sentence is booking.", - "tag": "travel" - }, - { - "text": "This sentence is hotel.", - "tag": "travel" - }, - { - "text": "This sentence is itinerary.", - "tag": "travel" - }, - { - "text": "This sentence is motel.", - "tag": "travel" - }, - { - "text": "This sentence is passenger.", - "tag": "travel" - }, - { - "text": "This sentence is reservation.", - "tag": "travel" - }, - { - "text": "This sentence is roundtrip.", - "tag": "travel" - }, - { - "text": "This sentence is travel.", - "tag": "travel" - }, - { - "text": "This sentence is trip.", - "tag": "travel" + "text": "This sentence is beach.", + "tag": "beach" } ] \ No newline at end of file diff --git a/python/TransformerTagging/tests/data/multiple_tags.txt b/python/TransformerTagging/tests/data/multiple_tags.txt deleted file mode 100644 index 651b65c3..00000000 --- a/python/TransformerTagging/tests/data/multiple_tags.txt +++ /dev/null @@ -1 +0,0 @@ -Vehicles include wagons, bicycles, motor vehicles (motorcycles, cars, trucks, buses, mobility scooters for disabled people), railed vehicles (trains, trams), watercraft (ships, boats, underwater vehicles), amphibious vehicles (screw-propelled vehicles, hovercraft), aircraft (airplanes, helicopters, aerostats) and spacecraft. "An automated teller machine (ATM) is an electronic telecommunications device that enables customers of financial institutions to perform financial transactions, such as cash withdrawals, deposits, funds transfers, balance inquiries or account information inquiries, at any time and without the need for direct interaction with bank staff. The advent of widespread text-messaging has resulted in the cell phone novel, the first literary genre to emerge from the cellular age, via text messaging to a website that collects the novels as a whole. The sword developed from the knife or dagger. A passport holder is normally entitled to enter the country that issued the passport, though some people entitled to a passport may not be full citizens with right of abode (e.g. American nationals or British nationals). "A number of hotels and motels have entered the public consciousness through popular culture. 
diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index 2ec48d4b..d7291008 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -199,6 +199,21 @@ def test_custom_tagging_file(self): self.assertEqual(1, len(result)) + props = result[0].detection_properties + + beach_sentences = 'I drove to the beach today and will be staying overnight at a hotel.; ' \ + 'I plan to spend all day at the beach tomorrow.' + + beach_score_1 = 0.4417020082473755 + beach_score_2 = 0.4624265432357788 + beach_score_result_1, beach_score_result_2 = props["TEXT BEACH TRIGGER SENTENCES SCORE"].split(";") + + self.assertEqual("BEACH", props["TAGS"]) + self.assertEqual(beach_sentences, props["TEXT BEACH TRIGGER SENTENCES"]) + self.assertEqual('0-67; 197-242', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(beach_score_1, float(beach_score_result_1), places=3) + self.assertAlmostEqual(beach_score_2, float(beach_score_result_2), places=3) + def test_debugging_show_matches(self): ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index fd03e6c5..21524b18 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -116,7 +116,7 @@ def __init__(self, job_props): mpf_util.get_property( properties=job_props, key='FEED_FORWARD_PROP_TO_PROCESS', - default_value='TEXT,TRANSCRIPT', + default_value='TEXT,TRANSCRIPT,TRANSLATION', prop_type=str ).split(',') ] @@ -133,6 +133,7 @@ def __init__(self, job_props): if os.path.exists(self._corpus_path): self.corpus = pd.read_json(self._corpus_path) + logger.info("Successfully read corpus json.") else: print(self._corpus_path) logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' @@ -143,6 +144,8 @@ def __init__(self, job_props): mpf.DetectionError.COULD_NOT_READ_DATAFILE) self.threshold = mpf_util.get_property(job_props, 'SCORE_THRESHOLD', .3) + + # if debug is true will return which corpus sentences triggered the match self.debug = mpf_util.get_property(job_props, 'ENABLE_DEBUG', False) def add_tags(self, ff_props: Dict[str, str]): @@ -162,13 +165,16 @@ def add_tags(self, ff_props: Dict[str, str]): all_tag_results = [] + # for each sentence in input for probe_sent in input_sentences: + # get similarity scores for the input sentence with each corpus sentence probe_sent_embed = self.model.encode([probe_sent] * len(self.corpus), convert_to_tensor=True) corpus_embed = self.model.encode(self.corpus["text"], convert_to_tensor=True) cosine_scores = util.cos_sim(probe_sent_embed, corpus_embed) scores = [] + # get offset of the input sentence in the input text offset_beginning = input_text.find(probe_sent) offset_end = offset_beginning + len(probe_sent) - 1 offset_string = str(offset_beginning) + "-" + str(offset_end) @@ -184,17 +190,23 @@ def add_tags(self, ff_props: Dict[str, str]): "offset": offset_string }) + # sort by score then group by tag so each group will be sorted highest to lowest score, + # then take top row for each group probe_df = 
probe_df.sort_values(by=['score'], ascending=False) top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) + # filter out results that are below threshold top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= self.threshold] all_tag_results.append(top_per_tag_threshold) + # if no tags found in text return if not all_tag_results: return all_tag_results = pd.concat(all_tag_results) + # create detection properties for each tag found in the text + # detection properties formatted as TRIGGER SENTENCES... for tag in all_tag_results["tag"].unique(): tag_df = all_tag_results[all_tag_results["tag"] == tag] @@ -212,5 +224,6 @@ def add_tags(self, ff_props: Dict[str, str]): ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) if self.debug: + logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"]) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json index 188bce09..a93be27a 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json +++ b/python/TransformerTagging/transformer_tagging_component/transformer_text_tags_corpus.json @@ -52,7 +52,11 @@ "tag": "vehicle" }, { - "text": "This sentence is financ.", + "text": "This sentence is finance.", + "tag": "financial" + }, + { + "text": "This sentence is financial.", "tag": "financial" }, { From 4883351e3df03f231df9db8433785318284a2fcd Mon Sep 17 00:00:00 2001 From: jrobble Date: Mon, 25 Sep 2023 13:56:01 -0400 Subject: [PATCH 04/21] Improve speed. --- .../plugin-files/descriptor/descriptor.json | 4 +- .../tests/config/custom_corpus.json | 4 + .../transformer_tagging_component.py | 104 ++++++++++-------- 3 files changed, 65 insertions(+), 47 deletions(-) diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index e0175f34..76ad364f 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -33,13 +33,13 @@ }, { "name": "TRANSFORMER_TAGGING_CORPUS", - "description": "", + "description": "Name of a JSON file that describes a tag hierarchy to be used for matching sentences. Will default to the plugin's config folder unless an alternate path to corpus file is specified (i.e. `$MPF_HOME/.../transformer_text_tags_corpus.json`).", "type": "STRING", "defaultValue": "transformer_text_tags_corpus.json" }, { "name": "ENABLE_DEBUG", - "description": "", + "description": "If true, each detection will include a `TEXT [TAG] TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. 
The value will be the sentence in the corpus which generated the highest match score for that tag.", "type": "BOOLEAN", "defaultValue": "FALSE" } diff --git a/python/TransformerTagging/tests/config/custom_corpus.json b/python/TransformerTagging/tests/config/custom_corpus.json index a550b6ed..b37da89e 100644 --- a/python/TransformerTagging/tests/config/custom_corpus.json +++ b/python/TransformerTagging/tests/config/custom_corpus.json @@ -2,5 +2,9 @@ { "text": "This sentence is beach.", "tag": "beach" + }, + { + "text": "This sentence is forest.", + "tag": "forest" } ] \ No newline at end of file diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 21524b18..d99e9c25 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -34,6 +34,7 @@ from typing import Sequence, Dict import pathlib import os +import time from pkg_resources import resource_filename from nltk.tokenize import sent_tokenize @@ -41,6 +42,8 @@ logger = logging.getLogger('TransformerTaggingComponent') +corpus_wrappers = {} + class TransformerTaggingComponent: detection_type = 'TEXT' @@ -48,23 +51,26 @@ class TransformerTaggingComponent: def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: logger.info(f'Received video job.') - return self.get_feed_forward_detections(job, job.feed_forward_track, video_job=True) + return self._get_feed_forward_detections(job, job.feed_forward_track, video_job=True) + def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: logger.info(f'Received image job.') - return self.get_feed_forward_detections(job, job.feed_forward_location) + return self._get_feed_forward_detections(job, job.feed_forward_location) + def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: logger.info(f'Received audio job.') - return self.get_feed_forward_detections(job, job.feed_forward_track) + return self._get_feed_forward_detections(job, job.feed_forward_track) + def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: logger.info(f'Received generic job.') if job.feed_forward_track: - return self.get_feed_forward_detections(job, job.feed_forward_track) + return self._get_feed_forward_detections(job, job.feed_forward_track) else: logger.info('Job did not contain a feed forward track. Assuming ' 'media file is a plain text file containing the text to ' @@ -79,20 +85,20 @@ def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.Gener 'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT' } - tw = TransformerWrapper(new_job_props) + tw = self._get_wrapper(new_job_props) tw.add_tags(new_ff_props) return [ff_track] - @staticmethod - def get_feed_forward_detections(job, job_feed_forward, video_job=False): + + def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False): try: if job_feed_forward is None: raise mpf.DetectionError.UNSUPPORTED_DATA_TYPE.exception( f'Component can only process feed forward ' ' jobs, but no feed forward track provided. 
') - tw = TransformerWrapper(job.job_properties) + tw = self._get_wrapper(job.job_properties) tw.add_tags(job_feed_forward.detection_properties) if video_job: @@ -107,9 +113,37 @@ def get_feed_forward_detections(job, job_feed_forward, video_job=False): raise + @staticmethod + def _get_wrapper(job_props): + corpus_file = \ + mpf_util.get_property(job_props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") + + corpus_path = "" + if "$" not in corpus_file and "/" not in corpus_file: + corpus_path = os.path.realpath(resource_filename(__name__, corpus_file)) + else: + corpus_path = os.path.expandvars(corpus_file) + + if not os.path.exists(corpus_path): + logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' + f'"{corpus_file}"') + raise mpf.DetectionException( + 'Invalid path provided for transformer tagging corpus: ' + f'"{corpus_file}"', + mpf.DetectionError.COULD_NOT_READ_DATAFILE) + + if not corpus_path in corpus_wrappers: + corpus_wrappers[corpus_path] = TransformerWrapper(job_props, corpus_path) + + return corpus_wrappers[corpus_path] + + class TransformerWrapper: - def __init__(self, job_props): - self.model = SentenceTransformer('all-mpnet-base-v2') + + def __init__(self, job_props, corpus_path): + self._model = SentenceTransformer('all-mpnet-base-v2') + + self._corpus_path = corpus_path self._props_to_process = [ prop.strip() for prop in @@ -121,32 +155,18 @@ def __init__(self, job_props): ).split(',') ] - self._corpus_file = \ - mpf_util.get_property(job_props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") - - self._corpus_path = "" + self._threshold = mpf_util.get_property(job_props, 'SCORE_THRESHOLD', .3) - if "$" not in self._corpus_file and "/" not in self._corpus_file: - self._corpus_path = os.path.realpath(resource_filename(__name__, self._corpus_file)) - else: - self._corpus_path = os.path.expandvars(self._corpus_file) - - if os.path.exists(self._corpus_path): - self.corpus = pd.read_json(self._corpus_path) - logger.info("Successfully read corpus json.") - else: - print(self._corpus_path) - logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' - f'"{self._corpus_file}"') - raise mpf.DetectionException( - 'Invalid path provided for transformer tagging corpus: ' - f'"{self._corpus_file}"', - mpf.DetectionError.COULD_NOT_READ_DATAFILE) + # if debug is true will return which corpus sentences triggered the match + self._debug = mpf_util.get_property(job_props, 'ENABLE_DEBUG', False) - self.threshold = mpf_util.get_property(job_props, 'SCORE_THRESHOLD', .3) + self._corpus = pd.read_json(self._corpus_path) - # if debug is true will return which corpus sentences triggered the match - self.debug = mpf_util.get_property(job_props, 'ENABLE_DEBUG', False) + start = time.time() + self._corpus_embed = self._model.encode(self._corpus["text"], convert_to_tensor=True, show_progress_bar=False) + elapsed = time.time() - start + logger.info(f"Successfully encoded corpus in {elapsed} seconds.") + def add_tags(self, ff_props: Dict[str, str]): for prop_to_tag in self._props_to_process: @@ -168,24 +188,18 @@ def add_tags(self, ff_props: Dict[str, str]): # for each sentence in input for probe_sent in input_sentences: # get similarity scores for the input sentence with each corpus sentence - probe_sent_embed = self.model.encode([probe_sent] * len(self.corpus), convert_to_tensor=True) - corpus_embed = self.model.encode(self.corpus["text"], convert_to_tensor=True) - - 
cosine_scores = util.cos_sim(probe_sent_embed, corpus_embed) - scores = [] + probe_sent_embed = self._model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) + scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in self._corpus_embed] # get offset of the input sentence in the input text offset_beginning = input_text.find(probe_sent) offset_end = offset_beginning + len(probe_sent) - 1 offset_string = str(offset_beginning) + "-" + str(offset_end) - for i in range(len(probe_sent_embed)): - scores.append(float(cosine_scores[i][i])) - probe_df = pd.DataFrame({ "input text": probe_sent, - "corpus text": self.corpus["text"], - "tag": self.corpus["tag"], + "corpus text": self._corpus["text"], + "tag": self._corpus["tag"], "score": scores, "offset": offset_string }) @@ -196,7 +210,7 @@ def add_tags(self, ff_props: Dict[str, str]): top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) # filter out results that are below threshold - top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= self.threshold] + top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= self._threshold] all_tag_results.append(top_per_tag_threshold) # if no tags found in text return @@ -223,7 +237,7 @@ def add_tags(self, ff_props: Dict[str, str]): ff_props[prop_name_offset] = "; ".join(tag_df["offset"]) ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) - if self.debug: + if self._debug: logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"]) From cc9ce01176f28f62ebfea05079769d9fe476f03c Mon Sep 17 00:00:00 2001 From: jrobble Date: Mon, 25 Sep 2023 15:41:02 -0400 Subject: [PATCH 05/21] Refactor. --- .../transformer_tagging_component/__init__.py | 2 +- .../transformer_tagging_component.py | 142 +++++++++--------- 2 files changed, 74 insertions(+), 70 deletions(-) diff --git a/python/TransformerTagging/transformer_tagging_component/__init__.py b/python/TransformerTagging/transformer_tagging_component/__init__.py index f8d30e7d..34a90aea 100644 --- a/python/TransformerTagging/transformer_tagging_component/__init__.py +++ b/python/TransformerTagging/transformer_tagging_component/__init__.py @@ -24,4 +24,4 @@ # limitations under the License. 
# ############################################################################# -from .transformer_tagging_component import TransformerTaggingComponent, TransformerWrapper \ No newline at end of file +from .transformer_tagging_component import TransformerTaggingComponent \ No newline at end of file diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index d99e9c25..5265512a 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -31,7 +31,7 @@ from sentence_transformers import SentenceTransformer, util -from typing import Sequence, Dict +from typing import Sequence, Dict, Mapping import pathlib import os import time @@ -42,12 +42,15 @@ logger = logging.getLogger('TransformerTaggingComponent') -corpus_wrappers = {} - class TransformerTaggingComponent: detection_type = 'TEXT' + def __init__(self): + self._cached_model = SentenceTransformer('all-mpnet-base-v2') + self._cached_corpuses: Dict[str, Corpus] = {} + + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: logger.info(f'Received video job.') @@ -85,8 +88,9 @@ def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.Gener 'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT' } - tw = self._get_wrapper(new_job_props) - tw.add_tags(new_ff_props) + config = JobConfig(new_job_props) + corpus = self._get_corpus(config.corpus_path) + self._add_tags(config, corpus, new_ff_props) return [ff_track] @@ -98,12 +102,14 @@ def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False): f'Component can only process feed forward ' ' jobs, but no feed forward track provided. 
') - tw = self._get_wrapper(job.job_properties) - tw.add_tags(job_feed_forward.detection_properties) + config = JobConfig(job.job_properties) + corpus = self._get_corpus(config.corpus_path) + + self._add_tags(config, corpus, job_feed_forward.detection_properties) if video_job: for ff_location in job.feed_forward_track.frame_locations.values(): - tw.add_tags(ff_location.detection_properties) + self._add_tags(config, corpus, ff_location.detection_properties) return [job_feed_forward] @@ -113,63 +119,15 @@ def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False): raise - @staticmethod - def _get_wrapper(job_props): - corpus_file = \ - mpf_util.get_property(job_props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") - - corpus_path = "" - if "$" not in corpus_file and "/" not in corpus_file: - corpus_path = os.path.realpath(resource_filename(__name__, corpus_file)) - else: - corpus_path = os.path.expandvars(corpus_file) + def _get_corpus(self, corpus_path): + if not corpus_path in self._cached_corpuses: + self._cached_corpuses[corpus_path] = Corpus(corpus_path, self._cached_model) - if not os.path.exists(corpus_path): - logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' - f'"{corpus_file}"') - raise mpf.DetectionException( - 'Invalid path provided for transformer tagging corpus: ' - f'"{corpus_file}"', - mpf.DetectionError.COULD_NOT_READ_DATAFILE) - - if not corpus_path in corpus_wrappers: - corpus_wrappers[corpus_path] = TransformerWrapper(job_props, corpus_path) - - return corpus_wrappers[corpus_path] + return self._cached_corpuses[corpus_path] -class TransformerWrapper: - - def __init__(self, job_props, corpus_path): - self._model = SentenceTransformer('all-mpnet-base-v2') - - self._corpus_path = corpus_path - - self._props_to_process = [ - prop.strip() for prop in - mpf_util.get_property( - properties=job_props, - key='FEED_FORWARD_PROP_TO_PROCESS', - default_value='TEXT,TRANSCRIPT,TRANSLATION', - prop_type=str - ).split(',') - ] - - self._threshold = mpf_util.get_property(job_props, 'SCORE_THRESHOLD', .3) - - # if debug is true will return which corpus sentences triggered the match - self._debug = mpf_util.get_property(job_props, 'ENABLE_DEBUG', False) - - self._corpus = pd.read_json(self._corpus_path) - - start = time.time() - self._corpus_embed = self._model.encode(self._corpus["text"], convert_to_tensor=True, show_progress_bar=False) - elapsed = time.time() - start - logger.info(f"Successfully encoded corpus in {elapsed} seconds.") - - - def add_tags(self, ff_props: Dict[str, str]): - for prop_to_tag in self._props_to_process: + def _add_tags(self, config, corpus, ff_props: Dict[str, str]): + for prop_to_tag in config.props_to_process: input_text = ff_props.get(prop_to_tag, None) if input_text: break @@ -178,7 +136,7 @@ def add_tags(self, ff_props: Dict[str, str]): break else: logger.warning("Feed forward element missing one of the following properties: " - + ", ".join(self._props_to_process)) + + ", ".join(config.props_to_process)) return input_sentences = sent_tokenize(input_text) @@ -188,8 +146,8 @@ def add_tags(self, ff_props: Dict[str, str]): # for each sentence in input for probe_sent in input_sentences: # get similarity scores for the input sentence with each corpus sentence - probe_sent_embed = self._model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) - scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in self._corpus_embed] + 
probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) + scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed] # get offset of the input sentence in the input text offset_beginning = input_text.find(probe_sent) @@ -198,8 +156,8 @@ def add_tags(self, ff_props: Dict[str, str]): probe_df = pd.DataFrame({ "input text": probe_sent, - "corpus text": self._corpus["text"], - "tag": self._corpus["tag"], + "corpus text": corpus.json["text"], + "tag": corpus.json["tag"], "score": scores, "offset": offset_string }) @@ -210,7 +168,7 @@ def add_tags(self, ff_props: Dict[str, str]): top_per_tag = probe_df.groupby(['tag'], sort=False).head(1) # filter out results that are below threshold - top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= self._threshold] + top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold] all_tag_results.append(top_per_tag_threshold) # if no tags found in text return @@ -237,7 +195,53 @@ def add_tags(self, ff_props: Dict[str, str]): ff_props[prop_name_offset] = "; ".join(tag_df["offset"]) ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) - if self._debug: + if config.debug: logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"]) + + +class Corpus: + def __init__(self, corpus_path, model): + self.json = pd.read_json(corpus_path) + + start = time.time() + self.embed= model.encode(self.json["text"], convert_to_tensor=True, show_progress_bar=False) + elapsed = time.time() - start + logger.info(f"Successfully encoded corpus in {elapsed} seconds.") + + +class JobConfig: + def __init__(self, props: Mapping[str, str]): + + self.props_to_process = [ + prop.strip() for prop in + mpf_util.get_property( + properties=props, + key='FEED_FORWARD_PROP_TO_PROCESS', + default_value='TEXT,TRANSCRIPT,TRANSLATION', + prop_type=str + ).split(',') + ] + + self.threshold = mpf_util.get_property(props, 'SCORE_THRESHOLD', .3) + + # if debug is true will return which corpus sentences triggered the match + self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False) + + self.corpus_file = \ + mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") + + self.corpus_path = "" + if "$" not in self.corpus_file and "/" not in self.corpus_file: + self.corpus_path = os.path.realpath(resource_filename(__name__, self.corpus_file)) + else: + self.corpus_path = os.path.expandvars(self.corpus_file) + + if not os.path.exists(self.corpus_path): + logger.exception('Failed to complete job due incorrect file path for the transformer tagging corpus: ' + f'"{self.corpus_file}"') + raise mpf.DetectionException( + 'Invalid path provided for transformer tagging corpus: ' + f'"{self.corpus_file}"', + mpf.DetectionError.COULD_NOT_READ_DATAFILE) From 9aac89899478866e1048beb6ed1c0ccc406f0c3f Mon Sep 17 00:00:00 2001 From: Chris7C Date: Tue, 19 Dec 2023 14:33:21 -0500 Subject: [PATCH 06/21] wip --- cpp/KeywordTagging/Dockerfile | 23 +++ cpp/KeywordTagging/KeywordTagging.cpp | 65 ++++++--- cpp/KeywordTagging/KeywordTagging.h | 2 +- cpp/KeywordTagging/README.md | 21 +-- .../test/test_keyword_tagging.cpp | 136 ++++++++++++------ .../plugin-files/descriptor/descriptor.json | 39 +++-- python/TransformerTagging/setup.cfg | 2 +- .../tests/test_transformer_tagging.py | 15 ++ .../transformer_tagging_component.py | 
1 - 9 files changed, 210 insertions(+), 94 deletions(-) diff --git a/cpp/KeywordTagging/Dockerfile b/cpp/KeywordTagging/Dockerfile index 9410160e..2cca752f 100644 --- a/cpp/KeywordTagging/Dockerfile +++ b/cpp/KeywordTagging/Dockerfile @@ -33,6 +33,29 @@ FROM ${BUILD_REGISTRY}openmpf_cpp_component_build:${BUILD_TAG} as build_componen ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 +#mitre cert... +RUN mkdir -p /usr/local/share/ca-certificates +COPY mitre.crt /usr/local/share/ca-certificates/mitre.crt +RUN update-ca-certificates ||: + +#install openssl... +#ADD https://www.openssl.org/source/openssl-3.2.0.tar.gz /tmp +#RUN tar -zxvf /tmp/openssl-3.2.0.tar.gz -C /tmp +#RUN cd /tmp/openssl-3.2.0; ./config --prefix=/usr/local/ssl --openssldir=/usr/local/ssl shared zlib; make; make test; make install; break!; + +#install curl... +#ADD https://curl.se/download/curl-8.4.0.tar.gz /tmp +#RUN tar -zxvf /tmp/curl-8.4.0.tar.gz -C /tmp +#RUN /tmp/curl-8.4.0/configure --with-openssl +#RUN cd /tmp/curl-8.4.0; openssl version -d; ./configure --without-ssl; make; make install;ls -al; curl -V; + +#ADD https://gitlab.mitre.org/mitre-scripts/mitre-pki/raw/master/os_scripts/install_certs.sh /tmp/install_certs.sh +#RUN chmod a+x /tmp/install_certs.sh +#RUN /tmp/install_certs.sh +#RUN rm /tmp/install_certs.sh +#RUN apt install curl +#RUN curl -ksSL https://gitlab.mitre.org/mitre-scripts/mitre-pki/raw/master/tool_scripts/install_certs.sh | JAVA_HOME=/path/to/java MODE=java sh + RUN --mount=type=tmpfs,target=/var/cache/apt \ --mount=type=tmpfs,target=/var/lib/apt/lists \ --mount=type=tmpfs,target=/tmp \ diff --git a/cpp/KeywordTagging/KeywordTagging.cpp b/cpp/KeywordTagging/KeywordTagging.cpp index 7f466574..c6e267db 100644 --- a/cpp/KeywordTagging/KeywordTagging.cpp +++ b/cpp/KeywordTagging/KeywordTagging.cpp @@ -278,7 +278,7 @@ bool KeywordTagging::comp_regex(const MPFJob &job, const wstring &full_text, set KeywordTagging::search_regex(const MPFJob &job, const wstring &full_text, const map>> &json_kvs_regex, - map> &trigger_words_offset, + map>> &trigger_tags_words_offset, bool full_regex) { wstring found_tags_regex = L""; set found_keys_regex; @@ -290,17 +290,19 @@ set KeywordTagging::search_regex(const MPFJob &job, const wstring &full for (const auto &kv : json_kvs_regex) { auto key = kv.first; auto values = kv.second; + map> trigger_words_offset; for (const pair &value : values) { wstring regex_pattern = value.first; bool case_sens = value.second; if (comp_regex(job, full_text, regex_pattern, trigger_words_offset, full_regex, case_sens)) { found_keys_regex.insert(key); + trigger_tags_words_offset[key] = trigger_words_offset; // Discontinue searching unless full regex search is enabled. 
if (!full_regex) { break; } - } + } } } @@ -546,33 +548,56 @@ void KeywordTagging::process_text_tagging(Properties &detection_properties, cons bool full_regex = DetectionComponentUtils::GetProperty(job.job_properties, "FULL_REGEX_SEARCH", true); - set trigger_words; - map> trigger_words_offset; - set found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_words_offset, full_regex); + map>> trigger_tags_words_offset; + set found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_tags_words_offset, full_regex); all_found_tags.insert(found_tags_regex.begin(), found_tags_regex.end()); wstring tag_string = boost::algorithm::join(found_tags_regex, L"; "); - vector offsets_list; - vector triggers_list; - - wstring tag_trigger = boost::algorithm::join(trigger_words, L"; "); - - for (auto const& word_offset : trigger_words_offset) { - triggers_list.push_back(word_offset.first); - offsets_list.push_back(boost::algorithm::join(word_offset.second, ", ")); - } + map>>::iterator trigger_tags_words_offset_iterator = trigger_tags_words_offset.begin(); + while(trigger_tags_words_offset_iterator != trigger_tags_words_offset.end()) + { + vector offsets_list; + vector triggers_list; - string tag_offset = boost::algorithm::join(offsets_list, "; "); - tag_trigger = tag_trigger + boost::algorithm::join(triggers_list, L"; "); + wstring tag = trigger_tags_words_offset_iterator->first; + boost::to_upper(tag); + map> trigger_words_offset = trigger_tags_words_offset_iterator->second; - detection_properties[boost::locale::conv::utf_to_utf(prop) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf(tag_trigger); - detection_properties[boost::locale::conv::utf_to_utf(prop)+ " TRIGGER WORDS OFFSET"] = tag_offset; + for (auto const& word_offset : trigger_words_offset) { + triggers_list.push_back(word_offset.first); + offsets_list.push_back(boost::algorithm::join(word_offset.second, ", ")); + } + + string tag_offset = boost::algorithm::join(offsets_list, "; "); + wstring tag_trigger = boost::algorithm::join(triggers_list, L"; "); + + detection_properties[boost::locale::conv::utf_to_utf(prop) + " " + boost::locale::conv::utf_to_utf(tag) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf(tag_trigger); + detection_properties[boost::locale::conv::utf_to_utf(prop) + " " + boost::locale::conv::utf_to_utf(tag) + " TRIGGER WORDS OFFSET"] = tag_offset; + trigger_tags_words_offset_iterator++; + } } if (has_text) { - wstring tag_string = boost::algorithm::join(all_found_tags, L"; "); - detection_properties["TAGS"] = boost::locale::conv::utf_to_utf(tag_string); + set all_tags; + // store off earlier tags + boost::regex delimiter{"( *; *)"}; + boost::sregex_token_iterator iter(detection_properties["TAGS"].begin(), + detection_properties["TAGS"].end(), delimiter, -1); + boost::sregex_token_iterator end; + while(iter != end) + { + all_tags.insert(*iter++); + } + + // add in new tags + set::iterator new_tags_it; + for(new_tags_it = all_found_tags.begin(); new_tags_it != all_found_tags.end(); new_tags_it++) + { + all_tags.insert(boost::locale::conv::utf_to_utf(*new_tags_it)); + } + string tag_string = boost::algorithm::join(all_tags, L"; "); + detection_properties["TAGS"] = tag_string; } } diff --git a/cpp/KeywordTagging/KeywordTagging.h b/cpp/KeywordTagging/KeywordTagging.h index 4aa07f60..f382fca8 100644 --- a/cpp/KeywordTagging/KeywordTagging.h +++ b/cpp/KeywordTagging/KeywordTagging.h @@ -59,7 +59,7 @@ class KeywordTagging : public MPFDetectionComponent { std::set search_regex(const MPFJob &job, const 
std::wstring &full_text, const std::map>> &json_kvs_regex, - std::map> &trigger_words_offset, + std::map>> &trigger_tags_words_offset, bool full_regex); void process_regex_match(const boost::wsmatch &match, const std::wstring &full_text, diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index 6eef8aa6..efd84842 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -125,24 +125,25 @@ instead. # Outputs Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's -present, and not just whitespace, will result in a `TRIGGER WORDS` and -`TRIGGER WORDS OFFSET` output property. For example, if -`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION` -properties are both present, then the following output properties will be produced: - -- `TEXT TRIGGER WORDS` -- `TEXT TRIGGER WORDS OFFSET` +present, and not just whitespace, will result in a `[TAG] TRIGGER WORDS` and +`[TAG] TRIGGER WORDS OFFSET` output property. The '[TAG]' will be the tag property +that matched in the input text. `FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, +and the `TEXT` and `TRANSLATION` properties are both present, then the following +output properties will be produced: + +- `TEXT [TAG] TRIGGER WORDS` +- `TEXT [TAG] TRIGGER WORDS OFFSET` - `TRANSLATION TRIGGER WORDS` - `TRANSLATION TRIGGER WORDS OFFSET` Let's assume that we need process the `TEXT` property. The substring(s) that -triggered each tag will be stored in `TEXT TRIGGER WORDS` in alphabetical order. +triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical order. For each trigger word the substring index range relative to the `TEXT` output -will be stored in `TEXT TRIGGER WORDS OFFSET`. Because the same trigger word +will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same trigger word can be encountered multiple times in the `TEXT` output, the results are organized as follows: -* `TEXT TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon +* `TEXT [TAGS] TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2` * Because semicolons can be part of the trigger word itself, those semicolons will be encapsulated in brackets. For example, diff --git a/cpp/KeywordTagging/test/test_keyword_tagging.cpp b/cpp/KeywordTagging/test/test_keyword_tagging.cpp index 837366d4..854d3039 100644 --- a/cpp/KeywordTagging/test/test_keyword_tagging.cpp +++ b/cpp/KeywordTagging/test/test_keyword_tagging.cpp @@ -129,8 +129,10 @@ TEST(KEYWORDTAGGING, TaggingTest) { // Test escaped backslash text tagging. 
ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/test-backslash.txt", tagger, results, custom_properties)); assertInText("data/test-backslash.txt", "backslash; personal", results, "TAGS"); - assertInText("data/test-backslash.txt", "TEXT; \\", results, "TEXT TRIGGER WORDS"); - assertInText("data/test-backslash.txt", "7-10; 0, 12, 15, 16, 18, 19", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/test-backslash.txt", "\\", results, "TEXT BACKSLASH TRIGGER WORDS"); + assertInText("data/test-backslash.txt", "0, 12, 15, 16, 18, 19, 20, 21", results, "TEXT BACKSLASH TRIGGER WORDS OFFSET"); + assertInText("data/test-backslash.txt", "TEXT", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/test-backslash.txt", "7-10", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -148,22 +150,34 @@ TEST(KEYWORDTAGGING, MulitpleTagsTest) { ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keyword.txt", tagger, results, custom_properties)); assertInText("data/tags-keyword.txt", "Passenger Passport", results, "TEXT"); assertInText("data/tags-keyword.txt", "identity document; travel", results, "TAGS"); - assertInText("data/tags-keyword.txt", "Passenger; Passport", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keyword.txt", "0-8; 10-17", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keyword.txt", "Passport", results, "TEXT IDENTITY DOCUMENT TRIGGER WORDS"); + assertInText("data/tags-keyword.txt", "10-17", results, "TEXT IDENTITY DOCUMENT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keyword.txt", "Passenger", results, "TEXT TRAVEL TRIGGER WORDS"); + assertInText("data/tags-keyword.txt", "0-8", results, "TEXT TRAVEL TRIGGER WORDS OFFSET"); results.clear(); ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-regex.txt", tagger, results, custom_properties)); assertInText("data/tags-regex.txt", "case-insensitive-tag; financial; personal", results, "TAGS"); - assertInText("data/tags-regex.txt", "122-123-1234; financ", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-regex.txt", "17-28; 0-5", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "0-5", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "0-5", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "122-123-1234", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "17-28", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); results.clear(); // Test multiple text tagging w/ delimiter tag. 
ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-regex-delimiter.txt", tagger, results, custom_properties)); assertInText("data/tags-regex-delimiter.txt", "case-insensitive-tag; delimiter-test; financial; personal", results, "TAGS"); - assertInText("data/tags-regex-delimiter.txt", "122-123-1234; a[[;] ]b; financ", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-regex-delimiter.txt", "22-33; 15-20; 0-5", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "0-5", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "a[[;] ]b", results, "TEXT DELIMITER-TEST TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "15-20", results, "TEXT DELIMITER-TEST TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "0-5", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "122-123-1234", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "22-33", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -181,16 +195,32 @@ TEST(KEYWORDTAGGING, FullSearch) { ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keywordregex.txt", tagger, results, custom_properties)); assertInText("data/tags-keywordregex.txt", "case-insensitive-tag; case-sensitive-tag; financial; personal; vehicle", results, "TAGS"); - assertInText("data/tags-keywordregex.txt", "01/01/20; Financ; Text; Vehicle", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keywordregex.txt", "20-27; 37-42; 10-13, 15-18; 29-35", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "01/01/20; Text", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "20-27; 10-13, 15-18", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Vehicle", results, "TEXT VEHICLE TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "29-35", results, "TEXT VEHICLE TRIGGER WORDS OFFSET"); results.clear(); // With full regex search disabled, number of reported triggers and offsets will decrease. 
ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keywordregex.txt", tagger, results, custom_properties_disabled)); assertInText("data/tags-keywordregex.txt", "case-insensitive-tag; case-sensitive-tag; financial; personal; vehicle", results, "TAGS"); - assertInText("data/tags-keywordregex.txt", "01/01/20; Financ; Vehicle", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keywordregex.txt", "20-27; 37-42; 29-35", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "01/01/20", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "20-27", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Vehicle", results, "TEXT VEHICLE TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "29-35", results, "TEXT VEHICLE TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -309,8 +339,8 @@ TEST(KEYWORDTAGGING, ProcessAllProperties) { ASSERT_EQ("cash", props["TRANSLATION"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("vehicle", props["TAGS"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); } { @@ -332,10 +362,10 @@ TEST(KEYWORDTAGGING, ProcessAllProperties) { ASSERT_EQ("cash", props["TRANSLATION"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("cash", props["TRANSLATION TRIGGER WORDS"]); - ASSERT_EQ("0-3", props["TRANSLATION TRIGGER WORDS OFFSET"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("cash", props["TRANSLATION FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3", props["TRANSLATION FINANCIAL TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); } { @@ -355,10 +385,10 @@ TEST(KEYWORDTAGGING, ProcessAllProperties) { ASSERT_EQ("cash", props["BAR"]); ASSERT_EQ("car", props["FOO"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("car", props["FOO TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["FOO TRIGGER WORDS OFFSET"]); - ASSERT_EQ("cash", props["BAR TRIGGER WORDS"]); - ASSERT_EQ("0-3", props["BAR TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["FOO VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["FOO VEHICLE TRIGGER WORDS OFFSET"]); + ASSERT_EQ("cash", props["BAR FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3", props["BAR FINANCIAL TRIGGER WORDS OFFSET"]); } ASSERT_TRUE(tagger.Close()); @@ -400,8 +430,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ("airport", props["TEXT"]); ASSERT_EQ("SOME_VAL_3", props["SOME_PROP_3"]); ASSERT_EQ("travel", props["TAGS"]); - 
ASSERT_EQ("airport", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-6", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("airport", props["TEXT TRAVEL TRIGGER WORDS"]); + ASSERT_EQ("0-6", props["TEXT TRAVEL TRIGGER WORDS OFFSET"]); MPFImageLocation location = results.at(0).frame_locations.at(10); ASSERT_EQ(location1.x_left_upper, location.x_left_upper); @@ -415,8 +445,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ("SOME_VAL_1", props["SOME_PROP_1"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("vehicle", props["TAGS"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); location = results.at(0).frame_locations.at(12); ASSERT_EQ(location2.x_left_upper, location.x_left_upper); @@ -430,8 +460,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ("SOME_VAL_2", props["SOME_PROP_2"]); ASSERT_EQ("username", props["TEXT"]); ASSERT_EQ("personal", props["TAGS"]); - ASSERT_EQ("username", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-7", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("username", props["TEXT PERSONAL TRIGGER WORDS"]); + ASSERT_EQ("0-7", props["TEXT PERSONAL TRIGGER WORDS OFFSET"]); } { @@ -478,8 +508,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ(4, props.size()); ASSERT_EQ("username", props["TRANSCRIPT"]); ASSERT_EQ("personal", props["TAGS"]); - ASSERT_EQ("username", props["TRANSCRIPT TRIGGER WORDS"]); - ASSERT_EQ("0-7", props["TRANSCRIPT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("username", props["TRANSCRIPT PERSONAL TRIGGER WORDS"]); + ASSERT_EQ("0-7", props["TRANSCRIPT PERSONAL TRIGGER WORDS OFFSET"]); } ASSERT_TRUE(tagger.Close()); @@ -492,8 +522,8 @@ TEST(KEYWORDTAGGING, ProcessRepeatTags) { ASSERT_TRUE(tagger.Init()); MPFImageLocation location(1, 2, 3, 4, 5, - {{"TEXT", "cash-car"}, - {"OTHER TEXT", "car-cash"}, + {{"TEXT", "cash-car-suv"}, + {"OTHER TEXT", "car-cash-suv"}, {"MORE TEXT", "cash cash"}, {"BLANK TEXT", " "}}); MPFImageJob job("JOB NAME", "/some/path", location, @@ -508,26 +538,50 @@ TEST(KEYWORDTAGGING, ProcessRepeatTags) { ASSERT_EQ(location.confidence, results.at(0).confidence); Properties props = results.at(0).detection_properties; - ASSERT_EQ(11, props.size()); + ASSERT_EQ(15, props.size()); - ASSERT_EQ("cash-car", props["TEXT"]); - ASSERT_EQ("car-cash", props["OTHER TEXT"]); + ASSERT_EQ("cash-car-suv", props["TEXT"]); + ASSERT_EQ("car-cash-suv", props["OTHER TEXT"]); ASSERT_EQ("cash cash", props["MORE TEXT"]); ASSERT_EQ(" ", props["BLANK TEXT"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("car; cash", props["TEXT TRIGGER WORDS"]); // words added in alphabetical order - ASSERT_EQ("5-7; 0-3", props["TEXT TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("cash", props["TEXT FINANCIAL TRIGGER WORDS"]); // words added in alphabetical order + ASSERT_EQ("0-3", props["TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("car; suv", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("5-7; 9-11", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); - ASSERT_EQ("car; cash", props["OTHER TEXT TRIGGER WORDS"]); // words added in alphabetical order - ASSERT_EQ("0-2; 4-7", props["OTHER TEXT TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("cash", props["OTHER TEXT FINANCIAL TRIGGER WORDS"]); // words added in alphabetical order + 
ASSERT_EQ("4-7", props["OTHER TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("car; suv", props["OTHER TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2; 9-11", props["OTHER TEXT VEHICLE TRIGGER WORDS OFFSET"]); - ASSERT_EQ("cash", props["MORE TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-3, 5-8", props["MORE TEXT TRIGGER WORDS OFFSET"]); // offsets are in ascending order + ASSERT_EQ("cash", props["MORE TEXT FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3, 5-8", props["MORE TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets are in ascending order // "BLANK TEXT TRIGGER WORDS" and "BLANK TEXT TRIGGER WORDS OFFSET" are omitted since "BLANK TEXT" // is only whitespace. ASSERT_TRUE(tagger.Close()); } + +TEST(KEYWORDTAGGING, FeedForwardTags) { + KeywordTagging tagger; + tagger.SetRunDirectory("../plugin"); + ASSERT_TRUE(tagger.Init()); + + MPFGenericTrack track(0.9, + {{"TAGS", "FeedForwardTag"}, + {"BAR", "cash"}}); + MPFGenericJob job("JOB NAME", "/some/path", track, + { { "FEED_FORWARD_PROP_TO_PROCESS", "FOO,BAR" } }, {}); + + std::vector results = tagger.GetDetections(job); + ASSERT_EQ(1, results.size()); + ASSERT_EQ(track.confidence, results.at(0).confidence); + + Properties props = results.at(0).detection_properties; + //ASSERT_EQ(7, props.size()); + ASSERT_EQ("FeedForwardTag; financial", props["TAGS"]); +} diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index 76ad364f..8e1b158f 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -3,7 +3,7 @@ "componentVersion": "7.2", "middlewareVersion": "7.2", "sourceLanguage": "python", - "batchLibrary": "ArgosTranslation", + "batchLibrary": "TransformerTagging", "environmentVariables": [], "algorithm": { "name": "TRANSFORMERTAGGING", @@ -48,9 +48,15 @@ }, "actions": [ { - "name": "ARGOS TRANSLATION (WITH FF REGION) ACTION", - "description": "Uses Argos Translation to perform translation on feed-forward tracks and detections.", - "algorithm": "ARGOSTRANSLATION", + "name": "TRANSFORMER TAGGING TEXT FILE ACTION", + "description": "Performs transformer tagging on a plain text file.", + "algorithm": "TRANSFORMERTAGGING", + "properties": [] + }, + { + "name": "TRANSFORMER TAGGING (WITH FF REGION) ACTION", + "description": "Performs transformer tagging on feed-forward tracks and detections.", + "algorithm": "TRANSFORMERTAGGING", "properties": [ { "name": "FEED_FORWARD_TYPE", @@ -61,37 +67,30 @@ "value": "TRUE" } ] - }, - { - "name": "ARGOS TRANSLATION TEXT FILE ACTION", - "description": "Uses Argos Translation to perform translation on a plain text file.", - "algorithm": "ARGOSTRANSLATION", - "properties": [ - ] } ], "tasks": [ { - "name": "ARGOS TRANSLATION (WITH FF REGION) TASK", - "description": "Uses Argos Translate to perform translation on feed-forward tracks and detections.", + "name": "TRANSFORMER TAGGING TEXT FILE TASK", + "description": "Performs transformer tagging on a plain text file.", "actions": [ - "ARGOS TRANSLATION (WITH FF REGION) ACTION" + "TRANSFORMER TAGGING TEXT FILE ACTION" ] }, { - "name": "ARGOS TRANSLATION TEXT FILE TASK", - "description": "Uses Argos Translate to perform translation on a plain text file.", + "name": "TRANSFORMER TAGGING (WITH FF REGION) TASK", + "description": "Performs transformer tagging on feed-forward tracks and detections.", "actions": [ - "ARGOS TRANSLATION TEXT FILE ACTION" + "TRANSFORMER 
TAGGING (WITH FF REGION) ACTION" ] } ], "pipelines": [ { - "name": "ARGOS TRANSLATION TEXT FILE PIPELINE", - "description": "Uses Argos Translate to perform translation on a plain text file.", + "name": "TRANSFORMER TAGGING TEXT FILE PIPELINE", + "description": "Performs transformer tagging on a plain text file.", "tasks": [ - "ARGOS TRANSLATION TEXT FILE TASK" + "TRANSFORMER TAGGING TEXT FILE TASK" ] } ] diff --git a/python/TransformerTagging/setup.cfg b/python/TransformerTagging/setup.cfg index 8daf2285..94a2873c 100644 --- a/python/TransformerTagging/setup.cfg +++ b/python/TransformerTagging/setup.cfg @@ -42,4 +42,4 @@ mpf.exported_component = component = transformer_tagging_component.transformer_tagging_component:TransformerTaggingComponent [options.package_data] -transformer_tagging_component=transformer_text_tags_corpus.json \ No newline at end of file +transformer_tagging_component=transformer_text_tags_corpus.json diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index d7291008..45b4cbd7 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -257,6 +257,21 @@ def test_missing_text_to_process(self): self.assertEqual(1, len(result)) + def test_maintain_tags_from_earlier_feedforward_task(self): + ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + # add tags + firstTag = "FIRST_TAG" + job.feed_forward_track.detection_properties["TAGS"] = firstTag + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + expectedTags = firstTag + "; " + SHORT_SAMPLE_TAGS + + self.assertEqual(expectedTags, props["TAGS"]) if __name__ == '__main__': unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 5265512a..3ab04e27 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -42,7 +42,6 @@ logger = logging.getLogger('TransformerTaggingComponent') - class TransformerTaggingComponent: detection_type = 'TEXT' From 0c42ed22713dff3c18caa859f393292b298ab386 Mon Sep 17 00:00:00 2001 From: Chris7C Date: Wed, 3 Jan 2024 00:37:29 -0500 Subject: [PATCH 07/21] made tags added by feedforward transformer tagger case insensitive --- .../TransformerTagging/tests/test_transformer_tagging.py | 6 +++--- .../transformer_tagging_component.py | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index 45b4cbd7..708d5c69 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -45,7 +45,7 @@ 'I plan to spend all day at the beach tomorrow.' ) -SHORT_SAMPLE_TAGS = "TRAVEL" +SHORT_SAMPLE_TAGS = "travel" SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." 
SHORT_SAMPLE_OFFSET = "0-67" SHORT_SAMPLE_SCORE = 0.4680028557777405 @@ -176,7 +176,7 @@ def test_custom_confidence_threshold(self): props = result[0].detection_properties - self.assertEqual("TRAVEL; FINANCIAL", props["TAGS"]) + self.assertEqual("TRAVEL; FINANCIAL".casefold(), props["TAGS"]) self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) @@ -208,7 +208,7 @@ def test_custom_tagging_file(self): beach_score_2 = 0.4624265432357788 beach_score_result_1, beach_score_result_2 = props["TEXT BEACH TRIGGER SENTENCES SCORE"].split(";") - self.assertEqual("BEACH", props["TAGS"]) + self.assertEqual("beach", props["TAGS"]) self.assertEqual(beach_sentences, props["TEXT BEACH TRIGGER SENTENCES"]) self.assertEqual('0-67; 197-242', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) self.assertAlmostEqual(beach_score_1, float(beach_score_result_1), places=3) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 3ab04e27..b406577d 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -181,10 +181,12 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): for tag in all_tag_results["tag"].unique(): tag_df = all_tag_results[all_tag_results["tag"] == tag] - if "TAGS" in ff_props and tag.upper() not in ff_props: - ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag.upper() + if "TAGS" in ff_props: + # only add tag if it is not already in ff_props["TAGS"], else do nothing + if tag.casefold() not in ff_props["TAGS"].casefold(): + ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag.lower() else: - ff_props["TAGS"] = tag.upper() + ff_props["TAGS"] = tag prop_name_sent = prop_to_tag + " " + tag.upper() + " TRIGGER SENTENCES" prop_name_offset = prop_name_sent + " OFFSET" From 4b0100920bfaabf2dde10236d3b152d4fd2e8a39 Mon Sep 17 00:00:00 2001 From: Chris7C Date: Fri, 12 Jan 2024 00:03:52 -0500 Subject: [PATCH 08/21] Updates to transformer and keyword tagger to include brackets around semicolons, updated all upstream tags to be lowercase, and updates to readme" --- cpp/KeywordTagging/KeywordTagging.cpp | 18 +-- cpp/KeywordTagging/README.md | 38 +++--- .../test/test_keyword_tagging.cpp | 4 +- python/TransformerTagging/Dockerfile | 65 +++++++++ python/TransformerTagging/README.md | 123 ++++++++++++++++++ .../plugin-files/descriptor/descriptor.json | 2 +- .../tests/test_transformer_tagging.py | 15 +++ .../transformer_tagging_component.py | 8 +- 8 files changed, 238 insertions(+), 35 deletions(-) create mode 100644 python/TransformerTagging/Dockerfile create mode 100644 python/TransformerTagging/README.md diff --git a/cpp/KeywordTagging/KeywordTagging.cpp b/cpp/KeywordTagging/KeywordTagging.cpp index c6e267db..4090cbef 100644 --- a/cpp/KeywordTagging/KeywordTagging.cpp +++ b/cpp/KeywordTagging/KeywordTagging.cpp @@ -26,6 +26,7 @@ #include "KeywordTagging.h" #include +#include #include #include #include @@ -288,7 +289,7 @@ set KeywordTagging::search_regex(const MPFJob &job, const wstring &full } for (const auto &kv : json_kvs_regex) { - auto key = kv.first; + auto key = boost::locale::to_lower(kv.first); auto 
values = kv.second; map> trigger_words_offset; for (const pair &value : values) { @@ -579,25 +580,20 @@ void KeywordTagging::process_text_tagging(Properties &detection_properties, cons } if (has_text) { - set all_tags; // store off earlier tags boost::regex delimiter{"( *; *)"}; boost::sregex_token_iterator iter(detection_properties["TAGS"].begin(), detection_properties["TAGS"].end(), delimiter, -1); boost::sregex_token_iterator end; + while(iter != end) { - all_tags.insert(*iter++); + std::wstring_convert> convert_s_to_ws; + all_found_tags.insert(boost::to_lower_copy(convert_s_to_ws.from_bytes(*iter++))); } - // add in new tags - set::iterator new_tags_it; - for(new_tags_it = all_found_tags.begin(); new_tags_it != all_found_tags.end(); new_tags_it++) - { - all_tags.insert(boost::locale::conv::utf_to_utf(*new_tags_it)); - } - string tag_string = boost::algorithm::join(all_tags, L"; "); - detection_properties["TAGS"] = tag_string; + wstring tag_string = boost::algorithm::join(all_found_tags, L"; "); + detection_properties["TAGS"] = boost::locale::conv::utf_to_utf(tag_string); } } diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index efd84842..db53de4e 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -14,20 +14,14 @@ component. # Inputs -When performing keyword tagging on a text file, the contents of the file will be -stored in a `TEXT` output property. When performing keyword tagging on -feed-forward detections generated from some other component in a multi-stage -pipeline, the output properties from that component will be preserved. This -means that if those detections have a `TEXT` output property, then this -component will generate detections with the same `TEXT` output. Similarly, if -those detections have a `TRANSCRIPT` output property, then this component will -generate detections with the same `TRANSCRIPT` output. - -Keyword tagging will be performed on all of the input properties listed in -`FEED_FORWARD_PROP_TO_PROCESS`, if present. If none of the input properties are -present then keyword tagging is not performed and the feed-forward detection -is returned unmodified. For the sake of discussion, let's assume we need to -perform keyword tagging on the `TEXT` property. +When acting as a downstream stage of a feed forward pipeline, this component will +accept feed forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job +property will be used to determine which properties in the feed forward track +should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to +`TEXT,TRANSCRIPT,TRANSLATION` this component will look for tags in the `TEXT`, +`TRANSCRIPT`, and `TRANSLATION` property in the feed forward track. The trigger +words for each of these properties will be represented as seperate outputs. Refer +to the Outputs section below. # JSON Tagging File @@ -122,8 +116,18 @@ pattern becomes `(\\b)end(\\W+)of(\\W+)a(\\W+)sentence\\.`. Note that the `.` symbol is typically used in regex to match any character, which is why we use `\\.` instead. + # Outputs +When performing keyword tagging on a text file, the contents of the file will be +stored in a `TEXT` output property. When performing keyword tagging on +feed-forward detections generated from some other component in a multi-stage +pipeline, the output properties from that component will be preserved.This +means that if those detections have a `TEXT` output property, then this +component will generate detections with the same `TEXT` output. 
Similarly, if +those detections have a `TRANSLATION` output property, then this component will +generate detections with the same `TRANSLATION` output. + Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and not just whitespace, will result in a `[TAG] TRIGGER WORDS` and `[TAG] TRIGGER WORDS OFFSET` output property. The '[TAG]' will be the tag property @@ -133,8 +137,8 @@ output properties will be produced: - `TEXT [TAG] TRIGGER WORDS` - `TEXT [TAG] TRIGGER WORDS OFFSET` -- `TRANSLATION TRIGGER WORDS` -- `TRANSLATION TRIGGER WORDS OFFSET` +- `TRANSLATION [TAG] TRIGGER WORDS` +- `TRANSLATION [TAG] TRIGGER WORDS OFFSET` Let's assume that we need process the `TEXT` property. The substring(s) that triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical order. @@ -143,7 +147,7 @@ will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same trigger wo can be encountered multiple times in the `TEXT` output, the results are organized as follows: -* `TEXT [TAGS] TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon +* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2` * Because semicolons can be part of the trigger word itself, those semicolons will be encapsulated in brackets. For example, diff --git a/cpp/KeywordTagging/test/test_keyword_tagging.cpp b/cpp/KeywordTagging/test/test_keyword_tagging.cpp index 854d3039..b76d35b2 100644 --- a/cpp/KeywordTagging/test/test_keyword_tagging.cpp +++ b/cpp/KeywordTagging/test/test_keyword_tagging.cpp @@ -582,6 +582,6 @@ TEST(KEYWORDTAGGING, FeedForwardTags) { ASSERT_EQ(track.confidence, results.at(0).confidence); Properties props = results.at(0).detection_properties; - //ASSERT_EQ(7, props.size()); - ASSERT_EQ("FeedForwardTag; financial", props["TAGS"]); + ASSERT_EQ(4, props.size()); + ASSERT_EQ("feedforwardtag; financial", props["TAGS"]); } diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile new file mode 100644 index 00000000..881a1093 --- /dev/null +++ b/python/TransformerTagging/Dockerfile @@ -0,0 +1,65 @@ +# syntax=docker/dockerfile:1.2 + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +ARG RUN_TESTS=false + +# needed to build sentence transformer wheel file +RUN pip install --no-cache-dir \ + 'wheel' + +RUN pip install --no-cache-dir \ + 'nltk' \ + 'sentence_transformers' \ + 'pandas' + +# copy over punkt files +RUN python -c \ + "import nltk; nltk.download('punkt')" + +# download sentence transformer model +RUN python -c \ + "import os; \ + os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ + from sentence_transformers import SentenceTransformer, util; \ + SentenceTransformer('all-mpnet-base-v2')" + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_transformer_tagging.py; fi + + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF Transformer Tagging" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md new file mode 100644 index 00000000..2ed453c6 --- /dev/null +++ b/python/TransformerTagging/README.md @@ -0,0 +1,123 @@ +# Overview + +This repository contains source code for the OpenMPF Transformer Tagging component. + +This component uses a user-specified corpus-json file to match known phrases against +each sentence in the input text data. This is done by generating an embedding for each +phrase in the corpus and comparing that against the embedding for each sentence of the +input text. The comparison generates a score based on how similar the content is. +This is based on how the underlying +[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) +was trained on a variety of text data in order to understand the common phrasing, +subject, and context. The sentences that generate scores above the threshold are +called "trigger sentences". These sentences are grouped by "tag" based on which entry +in the corpus they matched against. + +This component can be used independently to perform transformer tagging on text +files, or it can be used as a support component in a multi-stage pipeline to +perform transformer tagging on feed-forward detections generated by some other +component. + +# Inputs + +The transformer tagger will run on all input properties listed in the +`FEED_FORWARD_PROP_TO_PROCESS`. If there are feed-forward detections generated from +an upstream component in a multi-stage pipeline, the output properties from that +component are preserved. This means that if those detections have a `TEXT` output +property, this component will generate detections with the same `TEXT` output. +Similarly, if those detections have a `TRANSLATION` output property, then this +component will generate detections with the same `TRANSLATION` output. If none of the +input properties are present then the transformer tagging is not performed then the +feed-forward detection is returned unmodified. + +The reported detections that are returned by the transformer tagger are based on the +corpus used, and the minimum score defined in the`SCORE_THRESHOLD` property. These +values are discussed below. + +# Corpus File + +Transformer patterns are specified in a JSON corpus file. 
By default this is the +`transformer_text_tags_corpus.json` file. Alternativley, the path to a corpus file can +be changed by setting the `TRANSFORMER_TAGGING_CORPUS` property. + +In the corpus file, users can specify sentence patterns to compare against using the +following syntax: + +``` + [ + { + "text": "This sentence is dog.", + "tag": "dog" + } + ] +``` + +Where the `text` field specifies a sentence to compare against, and the `tag` field +is used to report in the output results if the input sentence scores meet the +`SCORE_THRESHOLD`. + +Multiple patterns can be specified with a comma separated list: + +``` + [ + { + "text": "This sentence is dog.", + "tag": "dog" + }, + { + "text": "My favorite animal is a corgi.", + "tag": "dog" + }, + { + "text": "This sentence is cat.", + "tag": "cat" + }, + ... + ] +``` + +# Outputs + +When performing transformer tagging on a text file, the contents of the file will be +stored in a `TEXT` output property. Text input that is not just whitespace, which has +sentences that scored high enough against entries in the corpus file, will result in +the following output properties: + +- `TEXT [TAG] TRIGGER SENTENCES` +- `TEXT [TAG] TRIGGER SENTENCES OFFSET` +- `TEXT [TAG] TRIGGER SENTENCES SCORE` +- `TRANSLATION [TAG] TRIGGER SENTENCES` +- `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` +- `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` +Note: The '[TAG]' value in each of the output properties above will be the tag +property from the corpus file that the trigger sentence scored against. + +The tags associated with the trigger words will be stored in a `TAGS` output +property, separated by semicolons. Note that there is only one `TAGS` output +property. This is unlike `TEXT [TAG] TRIGGER SENTENCES` and `TEXT [TAG] TRIGGER +SENTENCES OFFSET`, which are prefixed by the input property that produced those +trigger words. Each tag will only appear once in `TAGS` no matter how many trigger +words activate that tag. It doesn't matter if the trigger words are found in only one +or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. + +When `ENABLE_DEBUG` is set to true, the output properties will include a +`TRIGGER SENTENCES MATCHES` property containing a semicolon separated list of the +`text` sentences in the corpus that were triggered for that tag. + +- `TEXT [TAG] TRIGGER SENTENCES` +- `TEXT [TAG] TRIGGER SENTENCES MATCHES` +- `TEXT [TAG] TRIGGER SENTENCES OFFSET` +- `TEXT [TAG] TRIGGER SENTENCES SCORE` +- `TRANSLATION [TAG] TRIGGER SENTENCES` +- `TRANSLATION [TAG] TRIGGER SENTENCES MATCHES` +- `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` +- `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` + +Let's assume that we need process the `TEXT` property. The sentence(s) that +triggered each tag will be stored in `TEXT [TAG] TRIGGER SENTENCES`. While the +sentence that was matched against in the corpus will be stored in the +`TEXT [TAG] TRIGGER SENTENCES MATCHES` property. For each trigger sentence the +substring index range relative to the `TEXT` output will be stored in +`TEXT [TAG] TRIGGER SENTENCES OFFSET`. 
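To tie the sections above together, the following is a rough, self-contained sketch of the corpus-vs-sentence scoring described in the Overview. It uses the same `sentence_transformers` calls the component relies on (`SentenceTransformer`, `encode`, `util.cos_sim`); the corpus entries, input sentence, and printed values are illustrative examples only, not shipped defaults, apart from `0.3`, which is the descriptor's default `SCORE_THRESHOLD`.

```python
# Minimal sketch of the embedding comparison; assumes the sentence_transformers
# package and the all-mpnet-base-v2 model are available. Example data is made up.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-mpnet-base-v2')

# (text, tag) pairs as they would appear in the corpus JSON file.
corpus = [("This sentence is dog.", "dog"),
          ("This sentence is cat.", "cat")]
corpus_embeddings = [model.encode(text, convert_to_tensor=True) for text, _ in corpus]

sentence = "My corgi chased the ball all afternoon."
sentence_embedding = model.encode(sentence, convert_to_tensor=True)

for (text, tag), embedding in zip(corpus, corpus_embeddings):
    score = float(util.cos_sim(sentence_embedding, embedding))
    if score >= 0.3:  # default SCORE_THRESHOLD from the descriptor
        print(f"{tag}: {score:.3f}")
```

Sentences whose score meets `SCORE_THRESHOLD` become trigger sentences for the corresponding tag, and are reported in the `TAGS` and `[TAG] TRIGGER SENTENCES` output properties described above.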
+ + diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index 8e1b158f..28b26719 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -39,7 +39,7 @@ }, { "name": "ENABLE_DEBUG", - "description": "If true, each detection will include a `TEXT [TAG] TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentence in the corpus which generated the highest match score for that tag.", + "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. The value will be the sentences in the corpus which met the score threshold for that tag.", "type": "BOOLEAN", "defaultValue": "FALSE" } diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index 708d5c69..d85eed2c 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -273,5 +273,20 @@ def test_maintain_tags_from_earlier_feedforward_task(self): self.assertEqual(expectedTags, props["TAGS"]) + def test_matches_with_semicolons(self): + SEMICOLON_SAMPLE = ( + 'I drove to the beach today; it was a long drive. ' + ) + ff_track = mpf.GenericTrack(-1, dict(TEXT=SEMICOLON_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + + expected_output = "I drove to the beach today[;] it was a long drive." 
+ self.assertEqual(expected_output, props["TEXT TRAVEL TRIGGER SENTENCES"]) + if __name__ == '__main__': unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index b406577d..f749d9e4 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -123,7 +123,7 @@ def _get_corpus(self, corpus_path): self._cached_corpuses[corpus_path] = Corpus(corpus_path, self._cached_model) return self._cached_corpuses[corpus_path] - + def _add_tags(self, config, corpus, ff_props: Dict[str, str]): for prop_to_tag in config.props_to_process: @@ -156,7 +156,7 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): probe_df = pd.DataFrame({ "input text": probe_sent, "corpus text": corpus.json["text"], - "tag": corpus.json["tag"], + "tag": corpus.json["tag"].str.lower(), "score": scores, "offset": offset_string }) @@ -192,14 +192,14 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): prop_name_offset = prop_name_sent + " OFFSET" prop_name_score = prop_name_sent + " SCORE" - ff_props[prop_name_sent] = "; ".join(tag_df["input text"]) + ff_props[prop_name_sent] = "; ".join(tag_df["input text"].str.replace(';', '[;]')) ff_props[prop_name_offset] = "; ".join(tag_df["offset"]) ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) if config.debug: logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" - ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"]) + ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"].str.replace(';', '[;]')) class Corpus: From 90350a3719a5f510964bd34ba21315bccc014a9f Mon Sep 17 00:00:00 2001 From: Chris7C Date: Fri, 12 Jan 2024 12:49:46 -0500 Subject: [PATCH 09/21] Updates to dockerfiles --- cpp/KeywordTagging/Dockerfile | 23 ----------------------- python/TransformerTagging/Dockerfile | 1 - 2 files changed, 24 deletions(-) diff --git a/cpp/KeywordTagging/Dockerfile b/cpp/KeywordTagging/Dockerfile index 2cca752f..9410160e 100644 --- a/cpp/KeywordTagging/Dockerfile +++ b/cpp/KeywordTagging/Dockerfile @@ -33,29 +33,6 @@ FROM ${BUILD_REGISTRY}openmpf_cpp_component_build:${BUILD_TAG} as build_componen ENV LANG C.UTF-8 ENV LC_ALL C.UTF-8 -#mitre cert... -RUN mkdir -p /usr/local/share/ca-certificates -COPY mitre.crt /usr/local/share/ca-certificates/mitre.crt -RUN update-ca-certificates ||: - -#install openssl... -#ADD https://www.openssl.org/source/openssl-3.2.0.tar.gz /tmp -#RUN tar -zxvf /tmp/openssl-3.2.0.tar.gz -C /tmp -#RUN cd /tmp/openssl-3.2.0; ./config --prefix=/usr/local/ssl --openssldir=/usr/local/ssl shared zlib; make; make test; make install; break!; - -#install curl... 
-#ADD https://curl.se/download/curl-8.4.0.tar.gz /tmp -#RUN tar -zxvf /tmp/curl-8.4.0.tar.gz -C /tmp -#RUN /tmp/curl-8.4.0/configure --with-openssl -#RUN cd /tmp/curl-8.4.0; openssl version -d; ./configure --without-ssl; make; make install;ls -al; curl -V; - -#ADD https://gitlab.mitre.org/mitre-scripts/mitre-pki/raw/master/os_scripts/install_certs.sh /tmp/install_certs.sh -#RUN chmod a+x /tmp/install_certs.sh -#RUN /tmp/install_certs.sh -#RUN rm /tmp/install_certs.sh -#RUN apt install curl -#RUN curl -ksSL https://gitlab.mitre.org/mitre-scripts/mitre-pki/raw/master/tool_scripts/install_certs.sh | JAVA_HOME=/path/to/java MODE=java sh - RUN --mount=type=tmpfs,target=/var/cache/apt \ --mount=type=tmpfs,target=/var/lib/apt/lists \ --mount=type=tmpfs,target=/tmp \ diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile index 881a1093..51620afe 100644 --- a/python/TransformerTagging/Dockerfile +++ b/python/TransformerTagging/Dockerfile @@ -48,7 +48,6 @@ RUN python -c \ # download sentence transformer model RUN python -c \ "import os; \ - os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ from sentence_transformers import SentenceTransformer, util; \ SentenceTransformer('all-mpnet-base-v2')" From a792e351427bda871439ee0e74384e80bf09d03a Mon Sep 17 00:00:00 2001 From: Chris7C Date: Fri, 12 Jan 2024 15:02:58 -0500 Subject: [PATCH 10/21] undoing bad commit --- python/TransformerTagging/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile index 51620afe..881a1093 100644 --- a/python/TransformerTagging/Dockerfile +++ b/python/TransformerTagging/Dockerfile @@ -48,6 +48,7 @@ RUN python -c \ # download sentence transformer model RUN python -c \ "import os; \ + os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ from sentence_transformers import SentenceTransformer, util; \ SentenceTransformer('all-mpnet-base-v2')" From efc63496aa6030ba28df5e2f3874597b62b40f8c Mon Sep 17 00:00:00 2001 From: Chris7C Date: Fri, 12 Jan 2024 16:06:36 -0500 Subject: [PATCH 11/21] added trackType --- .../TransformerTagging/plugin-files/descriptor/descriptor.json | 1 + .../transformer_tagging_component.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index 28b26719..df61c6a9 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -9,6 +9,7 @@ "name": "TRANSFORMERTAGGING", "description": "Uses SentenceTransformers to tag sentences.", "actionType": "DETECTION", + "trackType": "TEXT", "requiresCollection": { "states": [] }, diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index f749d9e4..7f28f95d 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -43,7 +43,6 @@ logger = logging.getLogger('TransformerTaggingComponent') class TransformerTaggingComponent: - detection_type = 'TEXT' def __init__(self): self._cached_model = SentenceTransformer('all-mpnet-base-v2') From 5c42c1ee9c919c2f0e6025e128a1c920cdadc61b Mon Sep 17 00:00:00 2001 From: 
Chris7C Date: Tue, 16 Jan 2024 16:44:11 -0500 Subject: [PATCH 12/21] updates based on review --- cpp/KeywordTagging/README.md | 2 +- python/TransformerTagging/README.md | 12 ++++++++---- .../transformer_tagging_component.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index db53de4e..676f61e6 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -18,7 +18,7 @@ When acting as a downstream stage of a feed forward pipeline, this component wil accept feed forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job property will be used to determine which properties in the feed forward track should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to -`TEXT,TRANSCRIPT,TRANSLATION` this component will look for tags in the `TEXT`, +`TEXT,TRANSLATION` this component will look for tags in the `TEXT`, `TRANSCRIPT`, and `TRANSLATION` property in the feed forward track. The trigger words for each of these properties will be represented as seperate outputs. Refer to the Outputs section below. diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md index 2ed453c6..38b3066f 100644 --- a/python/TransformerTagging/README.md +++ b/python/TransformerTagging/README.md @@ -113,11 +113,15 @@ When `ENABLE_DEBUG` is set to true, the output properties will include a - `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` - `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` -Let's assume that we need process the `TEXT` property. The sentence(s) that +Let's assume that we need to process the `TEXT` property. The sentence(s) that triggered each tag will be stored in `TEXT [TAG] TRIGGER SENTENCES`. While the sentence that was matched against in the corpus will be stored in the -`TEXT [TAG] TRIGGER SENTENCES MATCHES` property. For each trigger sentence the -substring index range relative to the `TEXT` output will be stored in -`TEXT [TAG] TRIGGER SENTENCES OFFSET`. +`TEXT [TAG] TRIGGER SENTENCES MATCHES` property. Note, that because semicolons +can be part of the trigger sentence itself, those semicolons will be encapsulated +in brackets. For example, `This sentence has has a semicolon;` in the input `TEXT` +is reported as: +`TEXT [TAG] TRIGGER WORDS=This sentence has has a semicolon[;]; other triggers`. +For each trigger sentence the substring index range relative to the `TEXT` +output will be stored in `TEXT [TAG] TRIGGER SENTENCES OFFSET`. diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 7f28f95d..a4f8e2ea 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -183,7 +183,7 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): if "TAGS" in ff_props: # only add tag if it is not already in ff_props["TAGS"], else do nothing if tag.casefold() not in ff_props["TAGS"].casefold(): - ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag.lower() + ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag else: ff_props["TAGS"] = tag From aea5cc0f0c694070e59b418d21a741b3b2e3370e Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 31 Jan 2024 12:42:13 -0500 Subject: [PATCH 13/21] Update KeywordTagging README. 
--- cpp/KeywordTagging/README.md | 80 ++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index 676f61e6..fb02abe9 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -17,11 +17,11 @@ component. When acting as a downstream stage of a feed forward pipeline, this component will accept feed forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job property will be used to determine which properties in the feed forward track -should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to -`TEXT,TRANSLATION` this component will look for tags in the `TEXT`, -`TRANSCRIPT`, and `TRANSLATION` property in the feed forward track. The trigger -words for each of these properties will be represented as seperate outputs. Refer -to the Outputs section below. +should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to +`TEXT,TRANSLATION` then this component will look for tags in both the `TEXT` and +`TRANSLATION` properties in the feed forward track. The trigger words for each of +these properties will be represented as seperate outputs. Refer to the Outputs +section below. # JSON Tagging File @@ -128,12 +128,13 @@ component will generate detections with the same `TEXT` output. Similarly, if those detections have a `TRANSLATION` output property, then this component will generate detections with the same `TRANSLATION` output. -Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's -present, and not just whitespace, will result in a `[TAG] TRIGGER WORDS` and -`[TAG] TRIGGER WORDS OFFSET` output property. The '[TAG]' will be the tag property -that matched in the input text. `FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, -and the `TEXT` and `TRANSLATION` properties are both present, then the following -output properties will be produced: +Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and +not just whitespace, will result in a `[TAG] TRIGGER WORDS` and +`[TAG] TRIGGER WORDS OFFSET` output property. The `[TAG]` will be the tag property +that matched in the input text. For example, in +`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, the `TEXT` and `TRANSLATION` +properties are both present, so the following output properties will be produced +if trigger words are found: - `TEXT [TAG] TRIGGER WORDS` - `TEXT [TAG] TRIGGER WORDS OFFSET` @@ -141,34 +142,35 @@ output properties will be produced: - `TRANSLATION [TAG] TRIGGER WORDS OFFSET` Let's assume that we need process the `TEXT` property. The substring(s) that -triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical order. -For each trigger word the substring index range relative to the `TEXT` output -will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same trigger word -can be encountered multiple times in the `TEXT` output, the results are organized -as follows: - -* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon -followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2` - * Because semicolons can be part of the trigger word itself, those - semicolons will be encapsulated in brackets. For example, - `detected trigger with a ;` in the input `TEXT` is reported as - `TEXT TRIGGER WORDS=detected trigger with a [;]; some other trigger`. 
-* `TEXT TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same trigger -word reported in sequence, is separated by a semicolon followed by a space. -Indexes within a single group are separated by commas. - * Example `TEXT TRIGGER WORDS=trigger1; trigger2`, - `TEXT TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` occurs twice - in the text at the index ranges 0-5 and 6-10, and `trigger2` occurs at index - range 12-15. - -Note that all `TEXT TRIGGER WORDS` results are trimmed of leading and trailing -whitespace, regardless of the regex pattern used. The respective -`TEXT TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings. +triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical +order. For each trigger word the substring index range relative to the `TEXT` +output will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same +trigger word can be encountered multiple times in the `TEXT` output, the results +are organized as follows: + +* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a +semicolon followed by a space. For example: +`TEXT [TAG] TRIGGER WORDS=trigger1; trigger2` + * Because semicolons can be part of the trigger word itself, those semicolons + will be encapsulated in brackets. For example, `detected trigger with a ;` in + the input `TEXT` is reported as + `TEXT [TAG] TRIGGER WORDS=detected trigger with a [;]; some other trigger`. +* `TEXT [TAG] TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same +trigger word reported in sequence, is separated by a semicolon followed by a +space. Indexes within a single group are separated by commas. + * Example `TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`, + `TEXT [TAG] TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` + occurs twice in the text at the index ranges 0-5 and 6-10, and `trigger2` + occurs at index range 12-15. + +Note that all `TEXT [TAG] TRIGGER WORDS` results are trimmed of leading and +trailing whitespace, regardless of the regex pattern used. The respective +`TEXT [TAG] TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings. The tags associated with the trigger words will be stored in a `TAGS` output property in alphabetical order, separated by semicolons. Note that there is only -one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS OFFSET`, -which are prefixed by the input property that produced those trigger words. -Each tag will only appear once in `TAGS` no matter how many trigger words -activate that tag. It doesn't matter if the trigger words are found in only one -or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. +one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS +OFFSET`, which are prefixed by the input property that produced those trigger +words. Each tag will only appear once in `TAGS` no matter how many trigger words +activate that tag. It doesn't matter if the trigger words are found in only one or +multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. From 803a41a504f699dbcc2385bb98bc9802ec8c5af5 Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 31 Jan 2024 15:31:31 -0500 Subject: [PATCH 14/21] Fix TransformerTagging char offset. 
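The tokenizer's spans give each sentence's start and end positions directly, instead of re-locating the sentence with `str.find`, which always returns the first occurrence. A rough sketch of the difference follows; the example text and spans are illustrative only and are not taken from the component's tests.

```python
# Illustrative only: character spans from the tokenizer vs. find().
from nltk.tokenize.punkt import PunktSentenceTokenizer

text = "I went home. I went home."
print(text.find("I went home."))                           # 0 -- find() reports only the first occurrence
print(list(PunktSentenceTokenizer().span_tokenize(text)))  # e.g. [(0, 12), (13, 25)] -- one span per sentence
```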
--- .../transformer_tagging_component.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index a4f8e2ea..dc2153cc 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -37,7 +37,7 @@ import time from pkg_resources import resource_filename -from nltk.tokenize import sent_tokenize +from nltk.tokenize.punkt import PunktSentenceTokenizer import pandas as pd logger = logging.getLogger('TransformerTaggingComponent') @@ -45,7 +45,7 @@ class TransformerTaggingComponent: def __init__(self): - self._cached_model = SentenceTransformer('all-mpnet-base-v2') + self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2') self._cached_corpuses: Dict[str, Corpus] = {} @@ -137,27 +137,24 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): + ", ".join(config.props_to_process)) return - input_sentences = sent_tokenize(input_text) - all_tag_results = [] # for each sentence in input - for probe_sent in input_sentences: + for start, end in PunktSentenceTokenizer().span_tokenize(input_text): + probe_sent = input_text[start:end] + logger.info(f"INPUT_TEXT: {input_text}") ## DEBUG + logger.info(f"PROBE: {str(start)}-{str(end)}: {probe_sent}") ## DEBUG + # get similarity scores for the input sentence with each corpus sentence probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed)) for corpus_sent_embed in corpus.embed] - # get offset of the input sentence in the input text - offset_beginning = input_text.find(probe_sent) - offset_end = offset_beginning + len(probe_sent) - 1 - offset_string = str(offset_beginning) + "-" + str(offset_end) - probe_df = pd.DataFrame({ "input text": probe_sent, "corpus text": corpus.json["text"], "tag": corpus.json["tag"].str.lower(), "score": scores, - "offset": offset_string + "offset": str(start) + "-" + str(end) }) # sort by score then group by tag so each group will be sorted highest to lowest score, @@ -231,7 +228,7 @@ def __init__(self, props: Mapping[str, str]): self.corpus_file = \ mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json") - + self.corpus_path = "" if "$" not in self.corpus_file and "/" not in self.corpus_file: self.corpus_path = os.path.realpath(resource_filename(__name__, self.corpus_file)) From 54097db57ab135c9fb15095a86a8212f268491e0 Mon Sep 17 00:00:00 2001 From: Chris7C Date: Wed, 31 Jan 2024 15:50:28 -0500 Subject: [PATCH 15/21] updated transformer tagger with update version and new changes from all-mpnet-base-v2 --- python/TransformerTagging/Dockerfile | 7 +++++-- .../plugin-files/descriptor/descriptor.json | 4 ++-- python/TransformerTagging/setup.cfg | 6 +++--- .../transformer_tagging_component.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile index 881a1093..81b5552a 100644 --- a/python/TransformerTagging/Dockerfile +++ b/python/TransformerTagging/Dockerfile @@ -46,11 +46,14 @@ RUN python -c \ "import nltk; nltk.download('punkt')" # download sentence transformer model -RUN python -c \ +RUN 
--mount=type=tmpfs,target=/tmp/models \ + mkdir -p "/models/all-mpnet-base-v2"; \ + python -c \ "import os; \ os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ from sentence_transformers import SentenceTransformer, util; \ - SentenceTransformer('all-mpnet-base-v2')" + model = SentenceTransformer('all-mpnet-base-v2'); \ + model.save('/models/all-mpnet-base-v2')" RUN --mount=target=.,readwrite \ install-component.sh; \ diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json index df61c6a9..74810ebd 100644 --- a/python/TransformerTagging/plugin-files/descriptor/descriptor.json +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -1,7 +1,7 @@ { "componentName": "TransformerTagging", - "componentVersion": "7.2", - "middlewareVersion": "7.2", + "componentVersion": "8.0", + "middlewareVersion": "8.0", "sourceLanguage": "python", "batchLibrary": "TransformerTagging", "environmentVariables": [], diff --git a/python/TransformerTagging/setup.cfg b/python/TransformerTagging/setup.cfg index 94a2873c..2871a22f 100644 --- a/python/TransformerTagging/setup.cfg +++ b/python/TransformerTagging/setup.cfg @@ -26,13 +26,13 @@ [metadata] name = TransformerTagging -version = 7.2 +version = 8.0 [options] packages = transformer_tagging_component install_requires = - mpf_component_api>=7.2 - mpf_component_util>=7.2 + mpf_component_api>=8.0 + mpf_component_util>=8.0 nltk sentence_transformers pandas diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index a4f8e2ea..d154daff 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -45,7 +45,7 @@ class TransformerTaggingComponent: def __init__(self): - self._cached_model = SentenceTransformer('all-mpnet-base-v2') + self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2') self._cached_corpuses: Dict[str, Corpus] = {} From f5b575badf80906b12fb82350c94805ee63bc16c Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 31 Jan 2024 17:21:21 -0500 Subject: [PATCH 16/21] Improve TransformerTagging character offset logic to handle repeats. --- .../tests/test_transformer_tagging.py | 56 ++++++++++++++++++- .../transformer_tagging_component.py | 25 +++++++-- 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index d85eed2c..a47dd545 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -47,7 +47,7 @@ SHORT_SAMPLE_TAGS = "travel" SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." -SHORT_SAMPLE_OFFSET = "0-67" +SHORT_SAMPLE_OFFSET = "0-68" SHORT_SAMPLE_SCORE = 0.4680028557777405 @@ -182,7 +182,7 @@ def test_custom_confidence_threshold(self): self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) custom_threshold_sentence = "She will drop by to check on them after stopping by the bank." 
- custom_threshold_sentence_offset = "135-195" + custom_threshold_sentence_offset = "135-196" custom_threshold_sentence_score = 0.2906474769115448 self.assertEqual(custom_threshold_sentence, props["TEXT FINANCIAL TRIGGER SENTENCES"]) @@ -210,7 +210,7 @@ def test_custom_tagging_file(self): self.assertEqual("beach", props["TAGS"]) self.assertEqual(beach_sentences, props["TEXT BEACH TRIGGER SENTENCES"]) - self.assertEqual('0-67; 197-242', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) + self.assertEqual('0-68; 197-243', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) self.assertAlmostEqual(beach_score_1, float(beach_score_result_1), places=3) self.assertAlmostEqual(beach_score_2, float(beach_score_result_2), places=3) @@ -288,5 +288,55 @@ def test_matches_with_semicolons(self): expected_output = "I drove to the beach today[;] it was a long drive." self.assertEqual(expected_output, props["TEXT TRAVEL TRIGGER SENTENCES"]) + def test_repeat_trigger_job(self): + sample = ( + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I texted my friend before I left so she could look after my cats. ' + 'I am going to the airport tomorrow. ' + 'I plan to spend all day at the beach tomorrow. ' + 'This airline serves peanuts. ' + 'I am going to the airport tomorrow. ' + ) + + trigger_sentences = ( + 'I drove to the beach today and will be staying overnight at a hotel.; ' + 'I am going to the airport tomorrow.; ' + 'This airline serves peanuts.' + ) + + offsets = "0-68, 69-137; 204-239, 316-351; 287-315" + + score_1 = 0.4680027663707733 + score_2 = 0.5079247951507568 + score_3 = 0.5265363454818726 + + matches = ( + 'This sentence is hotel.; ' + 'This sentence is airport.; ' + 'This sentence is airline.' 
+ ) + + ff_track = mpf.GenericTrack(-1, dict(TEXT=sample)) + job = mpf.GenericJob('Test Repeat', 'test.pdf', \ + dict(ENABLE_DEBUG='true', SCORE_THRESHOLD='0.4'), {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual("travel", props["TAGS"]) + self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + + score_result_1, score_result_2, score_result_3 = props["TEXT TRAVEL TRIGGER SENTENCES SCORE"].split(";") + self.assertAlmostEqual(score_1, float(score_result_1), places=3) + self.assertAlmostEqual(score_2, float(score_result_2), places=3) + self.assertAlmostEqual(score_3, float(score_result_3), places=3) + + self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"]) + if __name__ == '__main__': unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index dc2153cc..9deb55f4 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -142,8 +142,6 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): # for each sentence in input for start, end in PunktSentenceTokenizer().span_tokenize(input_text): probe_sent = input_text[start:end] - logger.info(f"INPUT_TEXT: {input_text}") ## DEBUG - logger.info(f"PROBE: {str(start)}-{str(end)}: {probe_sent}") ## DEBUG # get similarity scores for the input sentence with each corpus sentence probe_sent_embed = self._cached_model.encode(probe_sent, convert_to_tensor=True, show_progress_bar=False) @@ -184,18 +182,33 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): else: ff_props["TAGS"] = tag + sents = [] + offsets = [] + scores = [] + matches = [] + + for input_text in tag_df["input text"].unique(): + input_text_df = tag_df[tag_df["input text"] == input_text] + + sents.append(input_text.replace(';', '[;]')) + offsets.append(", ".join(input_text_df["offset"])) + scores.append(input_text_df["score"].values[0].astype(str)) ## should all have the same score + + if config.debug: + matches.append(input_text_df["corpus text"].values[0].replace(';', '[;]')) ## should all have the same match + prop_name_sent = prop_to_tag + " " + tag.upper() + " TRIGGER SENTENCES" prop_name_offset = prop_name_sent + " OFFSET" prop_name_score = prop_name_sent + " SCORE" - ff_props[prop_name_sent] = "; ".join(tag_df["input text"].str.replace(';', '[;]')) - ff_props[prop_name_offset] = "; ".join(tag_df["offset"]) - ff_props[prop_name_score] = "; ".join(tag_df["score"].astype(str)) + ff_props[prop_name_sent] = "; ".join(sents) + ff_props[prop_name_offset] = "; ".join(offsets) + ff_props[prop_name_score] = "; ".join(scores) if config.debug: logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" - ff_props[prop_name_matches] = "; ".join(tag_df["corpus text"].str.replace(';', '[;]')) + ff_props[prop_name_matches] = "; ".join(matches) class Corpus: From 0cc3a8a8b2fd5708a86a60d1656b8baf7f3ab1c2 Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 31 Jan 2024 17:52:53 -0500 Subject: [PATCH 17/21] Improve tagging READMEs. 
Fix TransformerTagging character offset issue. --- cpp/KeywordTagging/README.md | 24 +-- python/TransformerTagging/README.md | 146 ++++++++++-------- .../transformer_tagging_component.py | 2 +- 3 files changed, 97 insertions(+), 75 deletions(-) diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index fb02abe9..2669e0cd 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -14,12 +14,12 @@ component. # Inputs -When acting as a downstream stage of a feed forward pipeline, this component will -accept feed forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job -property will be used to determine which properties in the feed forward track +When acting as a downstream stage of a feed-forward pipeline, this component will +accept feed-forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job +property will be used to determine which properties in the feed-forward track should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to `TEXT,TRANSLATION` then this component will look for tags in both the `TEXT` and -`TRANSLATION` properties in the feed forward track. The trigger words for each of +`TRANSLATION` properties in the feed-forward track. The trigger words for each of these properties will be represented as seperate outputs. Refer to the Outputs section below. @@ -159,9 +159,9 @@ semicolon followed by a space. For example: trigger word reported in sequence, is separated by a semicolon followed by a space. Indexes within a single group are separated by commas. * Example `TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`, - `TEXT [TAG] TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` - occurs twice in the text at the index ranges 0-5 and 6-10, and `trigger2` - occurs at index range 12-15. + `TEXT [TAG] TRIGGER WORDS OFFSET=0-7, 20-27; 55-62`, means that `trigger1` + occurs twice in the text at the index ranges 0-7 and 20-17, and `trigger2` + occurs once at index range 55-62. Note that all `TEXT [TAG] TRIGGER WORDS` results are trimmed of leading and trailing whitespace, regardless of the regex pattern used. The respective @@ -169,8 +169,8 @@ trailing whitespace, regardless of the regex pattern used. The respective The tags associated with the trigger words will be stored in a `TAGS` output property in alphabetical order, separated by semicolons. Note that there is only -one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS -OFFSET`, which are prefixed by the input property that produced those trigger -words. Each tag will only appear once in `TAGS` no matter how many trigger words -activate that tag. It doesn't matter if the trigger words are found in only one or -multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. +one `TAGS` output property. This is unlike `TRIGGER WORDS` and +`TRIGGER WORDS OFFSET`, which are prefixed by the input property that produced those +trigger words. Each tag will only appear once in `TAGS` no matter how many trigger +words activate that tag. It doesn't matter if the trigger words are found in only +one or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md index 38b3066f..fd44c515 100644 --- a/python/TransformerTagging/README.md +++ b/python/TransformerTagging/README.md @@ -2,15 +2,15 @@ This repository contains source code for the OpenMPF Transformer Tagging component. 
-This component uses a user-specified corpus-json file to match known phrases against
+This component uses a user-specified corpus JSON file to match known phrases against
 each sentence in the input text data. This is done by generating an embedding for each
 phrase in the corpus and comparing that against the embedding for each sentence of the
 input text. The comparison generates a score based on how similar the content is. This
 is based on how the underlying
 [all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
-was trained on a variety of text data in order to understand the common phrasing,
-subject, and context. The sentences that generate scores above the threshold are
-called "trigger sentences". These sentences are grouped by "tag" based on which entry
+model was trained on a variety of text data in order to understand the commonalities
+in phrasing, subject, and context. The sentences that generate scores above the threshold
+are called "trigger sentences". These sentences are grouped by "tag" based on which entry
 in the corpus they matched against.
 
 This component can be used independently to perform transformer tagging on text
@@ -31,57 +31,65 @@ input properties are present then the transformer tagging is not performed then
 feed-forward detection is returned unmodified.
 
 The reported detections that are returned by the transformer tagger are based on the
-corpus used, and the minimum score defined in the`SCORE_THRESHOLD` property. These
-values are discussed below.
+corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property, as
+discussed below.
 
 # Corpus File
 
-Transformer patterns are specified in a JSON corpus file. By default this is the
-`transformer_text_tags_corpus.json` file. Alternativley, the path to a corpus file can
+Transformer patterns are specified in a JSON corpus file. By default this is
+`transformer_text_tags_corpus.json`. Alternatively, the path to the corpus file can
 be changed by setting the `TRANSFORMER_TAGGING_CORPUS` property.
 
 In the corpus file, users can specify sentence patterns to compare against using the
 following syntax:
 
+```json
+[
+  {
+    "text": "This sentence is dog.",
+    "tag": "dog"
+  }
+]
 ```
-  [
-    {
-      "text": "This sentence is dog.",
-      "tag": "dog"
-    }
-  ]
-```
-
-Where the `text` field specifies a sentence to compare against, and the `tag` field
-is used to report in the output results if the input sentence scores meet the
-`SCORE_THRESHOLD`.
-
-Multiple patterns can be specified with a comma separated list:
-```
-  [
-    {
-      "text": "This sentence is dog.",
-      "tag": "dog"
-    },
-    {
-      "text": "My favorite animal is a corgi.",
-      "tag": "dog"
-    },
-    {
-      "text": "This sentence is cat.",
-      "tag": "cat"
-    },
-    ...
-  ]
+
+The `text` field specifies a sentence to compare each input sentence against. If
+the match score meets the `SCORE_THRESHOLD` property, then the value of the `tag` field
+will be added to the list in the `TAGS` output property.
+
+Multiple patterns can be specified with a comma-separated list:
+
+```json
+[
+  {
+    "text": "This sentence is dog.",
+    "tag": "dog"
+  },
+  {
+    "text": "My favorite animal is a corgi.",
+    "tag": "dog"
+  },
+  {
+    "text": "This sentence is cat.",
+    "tag": "cat"
+  },
+  ...
+]
 ```
 
 # Outputs
 
 When performing transformer tagging on a text file, the contents of the file will be
-stored in a `TEXT` output property. Text input that is not just whitespace, which has
-sentences that scored high enough against entries in the corpus file, will result in
-the following output properties:
+stored in a `TEXT` output property. When performing transformer tagging on
+feed-forward detections generated from some other component in a multi-stage
+pipeline, the output properties from that component will be preserved. This
+means that if those detections have a `TEXT` output property, then this
+component will generate detections with the same `TEXT` output. Similarly, if
+those detections have a `TRANSLATION` output property, then this component will
+generate detections with the same `TRANSLATION` output.
+
+Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that is present, is not
+just whitespace, and has sentences that scored high enough against entries in
+the corpus file will result in the following output properties:
 
 - `TEXT [TAG] TRIGGER SENTENCES`
 - `TEXT [TAG] TRIGGER SENTENCES OFFSET`
@@ -89,20 +97,40 @@ the following output properties:
 - `TRANSLATION [TAG] TRIGGER SENTENCES`
 - `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET`
 - `TRANSLATION [TAG] TRIGGER SENTENCES SCORE`
-Note: The '[TAG]' value in each of the output properties above will be the tag
-property from the corpus file that the trigger sentence scored against.
-The tags associated with the trigger words will be stored in a `TAGS` output
+
+The `[TAG]` value in each of the output properties above will be the `tag`
+value from the corpus file that the trigger sentence scored against.
+
+The tags associated with the trigger sentences will be stored in a `TAGS` output
 property, separated by semicolons. Note that there is only one `TAGS` output
-property. This is unlike `TEXT [TAG] TRIGGER SENTENCES` and `TEXT [TAG] TRIGGER
-SENTENCES OFFSET`, which are prefixed by the input property that produced those
-trigger words. Each tag will only appear once in `TAGS` no matter how many trigger
-words activate that tag. It doesn't matter if the trigger words are found in only one
-or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`.
+property. This is unlike `TRIGGER SENTENCES` and `TRIGGER SENTENCES OFFSET`, which are
+prefixed by the input property that produced those trigger sentences. Each tag will only
+appear once in `TAGS` no matter how many trigger sentences activate that tag. It doesn't
+matter if the trigger sentences are found in only one or multiple input properties defined
+in `FEED_FORWARD_PROP_TO_PROCESS`.
+
+When the `TEXT` property is processed, the input sentence(s) that triggered each tag will
+be stored in `TEXT [TAG] TRIGGER SENTENCES`. Note that because semicolons can be part of
+the trigger sentence itself, those semicolons will be encapsulated in brackets. For
+example, `This sentence has a semicolon;` in the input `TEXT` is reported as:
+`TEXT [TAG] TRIGGER SENTENCES=This sentence has a semicolon[;]; other trigger sentence`.
+
+For each trigger sentence in `TEXT`, the substring index range will be stored in
+`TEXT [TAG] TRIGGER SENTENCES OFFSET`. Each group of indexes, referring to the same
+trigger sentence reported in sequence, is separated by a semicolon followed by a space.
+Indexes within a single group are separated by commas. For example:
 
-When `ENABLE_DEBUG` is set to true, the output properties will include a
-`TRIGGER SENTENCES MATCHES` property containing a semicolon separated list of the
-`text` sentences in the corpus that were triggered for that tag.
+``` +TEXT [TAG] TRIGGER SENTENCES=trigger sentence 1; trigger sentence 2 +TEXT [TAG] TRIGGER SENTENCES OFFSET=0-17, 40-57; 112-129 +``` + +This means that `trigger sentence 1` occurs twice in the text at the index ranges +0-17 and 40-57, and `trigger sentence 2` occurs once at index range 112-129. + +When `ENABLE_DEBUG` is set to true, the output properties will also include a +`TRIGGER SENTENCES MATCHES` property containing a semicolon-separated list of the +`text` sentences in the corpus that were triggered for that tag: - `TEXT [TAG] TRIGGER SENTENCES` - `TEXT [TAG] TRIGGER SENTENCES MATCHES` @@ -113,15 +141,9 @@ When `ENABLE_DEBUG` is set to true, the output properties will include a - `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` - `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` -Let's assume that we need to process the `TEXT` property. The sentence(s) that -triggered each tag will be stored in `TEXT [TAG] TRIGGER SENTENCES`. While the -sentence that was matched against in the corpus will be stored in the -`TEXT [TAG] TRIGGER SENTENCES MATCHES` property. Note, that because semicolons -can be part of the trigger sentence itself, those semicolons will be encapsulated -in brackets. For example, `This sentence has has a semicolon;` in the input `TEXT` -is reported as: -`TEXT [TAG] TRIGGER WORDS=This sentence has has a semicolon[;]; other triggers`. -For each trigger sentence the substring index range relative to the `TEXT` -output will be stored in `TEXT [TAG] TRIGGER SENTENCES OFFSET`. - +For example: +``` +TEXT [TAG] TRIGGER SENTENCES=trigger sentence 1; trigger sentence 2 +TEXT [TAG] TRIGGER SENTENCES MATCHES=Corpus sentence matching trigger sentence 1; Corpus sentence matching trigger sentence 2 +``` \ No newline at end of file diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index a4f8e2ea..dd3a740b 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -149,7 +149,7 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): # get offset of the input sentence in the input text offset_beginning = input_text.find(probe_sent) - offset_end = offset_beginning + len(probe_sent) - 1 + offset_end = offset_beginning + len(probe_sent) offset_string = str(offset_beginning) + "-" + str(offset_end) probe_df = pd.DataFrame({ From ef6e96f8196c0f28dc580b9d23cc0ebe32593096 Mon Sep 17 00:00:00 2001 From: jrobble Date: Wed, 31 Jan 2024 20:22:47 -0500 Subject: [PATCH 18/21] Add cache_folder. 
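
Passing `cache_folder='/tmp/models'` points the sentence-transformers download cache
at the tmpfs mount used by this RUN step, so the intermediate HuggingFace files are
discarded when the step finishes and only the copy written by
`model.save('/models/all-mpnet-base-v2')` persists in the image. A minimal sketch of
loading the saved model by path at runtime (illustrative only; that the component loads
the model from this exact path is an assumption, not shown in this patch):

    from sentence_transformers import SentenceTransformer

    # Loading from a local directory reuses the files saved at build time,
    # so no network download happens inside the running container.
    model = SentenceTransformer('/models/all-mpnet-base-v2')
    embedding = model.encode('smoke test sentence', show_progress_bar=False)
    print(embedding.shape)  # all-mpnet-base-v2 produces 768-dimensional embeddings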
--- python/TransformerTagging/Dockerfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile index 81b5552a..8eaeb18c 100644 --- a/python/TransformerTagging/Dockerfile +++ b/python/TransformerTagging/Dockerfile @@ -51,9 +51,9 @@ RUN --mount=type=tmpfs,target=/tmp/models \ python -c \ "import os; \ os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ - from sentence_transformers import SentenceTransformer, util; \ - model = SentenceTransformer('all-mpnet-base-v2'); \ - model.save('/models/all-mpnet-base-v2')" + from sentence_transformers import SentenceTransformer; \ + model = SentenceTransformer('all-mpnet-base-v2', cache_folder='/tmp/models'); \ + model.save('/models/all-mpnet-base-v2')" RUN --mount=target=.,readwrite \ install-component.sh; \ From c0296642df0fee5d6cc1dbb9646d2bfa479fdd9e Mon Sep 17 00:00:00 2001 From: jrobble Date: Thu, 1 Feb 2024 09:37:29 -0500 Subject: [PATCH 19/21] Remove debug line. --- .../transformer_tagging_component.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 9deb55f4..53e6bb29 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -206,7 +206,6 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): ff_props[prop_name_score] = "; ".join(scores) if config.debug: - logger.info("Debug set to true, including corpus sentences that triggered the match.") prop_name_matches = prop_name_sent + " MATCHES" ff_props[prop_name_matches] = "; ".join(matches) From ab219fb2a203ca06e27954b3aa4610cd42249c11 Mon Sep 17 00:00:00 2001 From: jrobble Date: Thu, 1 Feb 2024 11:11:30 -0500 Subject: [PATCH 20/21] Preserve newline characters. 
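
Previously the input file was read with `pathlib.Path(job.data_uri).read_text().strip()`,
which applies universal-newline translation (turning '\r\n' into '\n') and drops
surrounding whitespace, so the text the tagger sees can disagree character-for-character
with the original file. Opening the file with `newline=''` returns the content exactly
as stored. A standalone sketch of the difference (not part of the component):

    import os
    import pathlib
    import tempfile

    raw = "First sentence.\r\nSecond sentence.\r\n"
    with tempfile.NamedTemporaryFile('w', suffix='.txt', newline='', delete=False) as f:
        f.write(raw)
        path = f.name

    # Default text mode translates '\r\n' to '\n' while reading.
    assert pathlib.Path(path).read_text() == "First sentence.\nSecond sentence.\n"

    # newline='' disables the translation, preserving the original characters.
    with open(path, 'r', newline='') as f:
        assert f.read() == raw

    os.remove(path)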
--- .../transformer_tagging_component.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index 53e6bb29..a3362ef1 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -32,7 +32,6 @@ from sentence_transformers import SentenceTransformer, util from typing import Sequence, Dict, Mapping -import pathlib import os import time @@ -77,7 +76,10 @@ def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.Gener 'media file is a plain text file containing the text to ' 'be tagged.') - text = pathlib.Path(job.data_uri).read_text().strip() + # preserve line endings in the original text, such as '\r\n' + with open(job.data_uri, 'r', newline='') as f: + text = f.read() + new_ff_props = dict(TEXT=text) ff_track = mpf.GenericTrack(detection_properties=new_ff_props) @@ -192,10 +194,12 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): sents.append(input_text.replace(';', '[;]')) offsets.append(", ".join(input_text_df["offset"])) - scores.append(input_text_df["score"].values[0].astype(str)) ## should all have the same score + # all entries should have the same score, so just use the first + scores.append(input_text_df["score"].values[0].astype(str)) if config.debug: - matches.append(input_text_df["corpus text"].values[0].replace(';', '[;]')) ## should all have the same match + # all entries should have the same match, so just use the first + matches.append(input_text_df["corpus text"].values[0].replace(';', '[;]')) prop_name_sent = prop_to_tag + " " + tag.upper() + " TRIGGER SENTENCES" prop_name_offset = prop_name_sent + " OFFSET" From 8954ac02b897dd89a0f2ca68070561aa3e7153a1 Mon Sep 17 00:00:00 2001 From: jrobble Date: Thu, 1 Feb 2024 12:01:05 -0500 Subject: [PATCH 21/21] Fix offset. --- .../TransformerTagging/tests/test_transformer_tagging.py | 8 ++++---- .../transformer_tagging_component.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py index a47dd545..0214f97f 100644 --- a/python/TransformerTagging/tests/test_transformer_tagging.py +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -47,7 +47,7 @@ SHORT_SAMPLE_TAGS = "travel" SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." -SHORT_SAMPLE_OFFSET = "0-68" +SHORT_SAMPLE_OFFSET = "0-67" SHORT_SAMPLE_SCORE = 0.4680028557777405 @@ -182,7 +182,7 @@ def test_custom_confidence_threshold(self): self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) custom_threshold_sentence = "She will drop by to check on them after stopping by the bank." 
- custom_threshold_sentence_offset = "135-196" + custom_threshold_sentence_offset = "135-195" custom_threshold_sentence_score = 0.2906474769115448 self.assertEqual(custom_threshold_sentence, props["TEXT FINANCIAL TRIGGER SENTENCES"]) @@ -210,7 +210,7 @@ def test_custom_tagging_file(self): self.assertEqual("beach", props["TAGS"]) self.assertEqual(beach_sentences, props["TEXT BEACH TRIGGER SENTENCES"]) - self.assertEqual('0-68; 197-243', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) + self.assertEqual('0-67; 197-242', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) self.assertAlmostEqual(beach_score_1, float(beach_score_result_1), places=3) self.assertAlmostEqual(beach_score_2, float(beach_score_result_2), places=3) @@ -305,7 +305,7 @@ def test_repeat_trigger_job(self): 'This airline serves peanuts.' ) - offsets = "0-68, 69-137; 204-239, 316-351; 287-315" + offsets = "0-67, 69-136; 204-238, 316-350; 287-314" score_1 = 0.4680027663707733 score_2 = 0.5079247951507568 diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py index a3362ef1..65114269 100644 --- a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -154,7 +154,7 @@ def _add_tags(self, config, corpus, ff_props: Dict[str, str]): "corpus text": corpus.json["text"], "tag": corpus.json["tag"].str.lower(), "score": scores, - "offset": str(start) + "-" + str(end) + "offset": str(start) + "-" + str(end - 1) }) # sort by score then group by tag so each group will be sorted highest to lowest score,