diff --git a/cpp/KeywordTagging/KeywordTagging.cpp b/cpp/KeywordTagging/KeywordTagging.cpp index d715eee7..1d2cb79d 100644 --- a/cpp/KeywordTagging/KeywordTagging.cpp +++ b/cpp/KeywordTagging/KeywordTagging.cpp @@ -26,6 +26,7 @@ #include "KeywordTagging.h" #include +#include #include #include #include @@ -278,7 +279,7 @@ bool KeywordTagging::comp_regex(const MPFJob &job, const wstring &full_text, set KeywordTagging::search_regex(const MPFJob &job, const wstring &full_text, const map>> &json_kvs_regex, - map> &trigger_words_offset, + map>> &trigger_tags_words_offset, bool full_regex) { wstring found_tags_regex = L""; set found_keys_regex; @@ -288,19 +289,21 @@ set KeywordTagging::search_regex(const MPFJob &job, const wstring &full } for (const auto &kv : json_kvs_regex) { - auto key = kv.first; + auto key = boost::locale::to_lower(kv.first); auto values = kv.second; + map> trigger_words_offset; for (const pair &value : values) { wstring regex_pattern = value.first; bool case_sens = value.second; if (comp_regex(job, full_text, regex_pattern, trigger_words_offset, full_regex, case_sens)) { found_keys_regex.insert(key); + trigger_tags_words_offset[key] = trigger_words_offset; // Discontinue searching unless full regex search is enabled. 
if (!full_regex) { break; } - } + } } } @@ -542,31 +545,49 @@ void KeywordTagging::process_text_tagging(Properties &detection_properties, cons bool full_regex = DetectionComponentUtils::GetProperty(job.job_properties, "FULL_REGEX_SEARCH", true); - set trigger_words; - map> trigger_words_offset; - set found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_words_offset, full_regex); + map>> trigger_tags_words_offset; + set found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_tags_words_offset, full_regex); all_found_tags.insert(found_tags_regex.begin(), found_tags_regex.end()); wstring tag_string = boost::algorithm::join(found_tags_regex, L"; "); - vector offsets_list; - vector triggers_list; - - wstring tag_trigger = boost::algorithm::join(trigger_words, L"; "); - - for (auto const& word_offset : trigger_words_offset) { - triggers_list.push_back(word_offset.first); - offsets_list.push_back(boost::algorithm::join(word_offset.second, ", ")); - } + map>>::iterator trigger_tags_words_offset_iterator = trigger_tags_words_offset.begin(); + while(trigger_tags_words_offset_iterator != trigger_tags_words_offset.end()) + { + vector offsets_list; + vector triggers_list; - string tag_offset = boost::algorithm::join(offsets_list, "; "); - tag_trigger = tag_trigger + boost::algorithm::join(triggers_list, L"; "); + wstring tag = trigger_tags_words_offset_iterator->first; + boost::to_upper(tag); + map> trigger_words_offset = trigger_tags_words_offset_iterator->second; - detection_properties[boost::locale::conv::utf_to_utf(prop) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf(tag_trigger); - detection_properties[boost::locale::conv::utf_to_utf(prop)+ " TRIGGER WORDS OFFSET"] = tag_offset; + for (auto const& word_offset : trigger_words_offset) { + triggers_list.push_back(word_offset.first); + offsets_list.push_back(boost::algorithm::join(word_offset.second, ", ")); + } + + string tag_offset = boost::algorithm::join(offsets_list, "; "); + wstring 
tag_trigger = boost::algorithm::join(triggers_list, L"; "); + + detection_properties[boost::locale::conv::utf_to_utf(prop) + " " + boost::locale::conv::utf_to_utf(tag) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf(tag_trigger); + detection_properties[boost::locale::conv::utf_to_utf(prop) + " " + boost::locale::conv::utf_to_utf(tag) + " TRIGGER WORDS OFFSET"] = tag_offset; + trigger_tags_words_offset_iterator++; + } } if (has_text) { + // store off earlier tags + boost::regex delimiter{"( *; *)"}; + boost::sregex_token_iterator iter(detection_properties["TAGS"].begin(), + detection_properties["TAGS"].end(), delimiter, -1); + boost::sregex_token_iterator end; + + while(iter != end) + { + std::wstring_convert> convert_s_to_ws; + all_found_tags.insert(boost::to_lower_copy(convert_s_to_ws.from_bytes(*iter++))); + } + wstring tag_string = boost::algorithm::join(all_found_tags, L"; "); detection_properties["TAGS"] = boost::locale::conv::utf_to_utf(tag_string); } diff --git a/cpp/KeywordTagging/KeywordTagging.h b/cpp/KeywordTagging/KeywordTagging.h index a5a932d3..dd0b6594 100644 --- a/cpp/KeywordTagging/KeywordTagging.h +++ b/cpp/KeywordTagging/KeywordTagging.h @@ -57,7 +57,7 @@ class KeywordTagging : public MPFDetectionComponent { std::set search_regex(const MPFJob &job, const std::wstring &full_text, const std::map>> &json_kvs_regex, - std::map> &trigger_words_offset, + std::map>> &trigger_tags_words_offset, bool full_regex); void process_regex_match(const boost::wsmatch &match, const std::wstring &full_text, diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md index 6eef8aa6..2669e0cd 100644 --- a/cpp/KeywordTagging/README.md +++ b/cpp/KeywordTagging/README.md @@ -14,20 +14,14 @@ component. # Inputs -When performing keyword tagging on a text file, the contents of the file will be -stored in a `TEXT` output property. 
When performing keyword tagging on -feed-forward detections generated from some other component in a multi-stage -pipeline, the output properties from that component will be preserved. This -means that if those detections have a `TEXT` output property, then this -component will generate detections with the same `TEXT` output. Similarly, if -those detections have a `TRANSCRIPT` output property, then this component will -generate detections with the same `TRANSCRIPT` output. - -Keyword tagging will be performed on all of the input properties listed in -`FEED_FORWARD_PROP_TO_PROCESS`, if present. If none of the input properties are -present then keyword tagging is not performed and the feed-forward detection -is returned unmodified. For the sake of discussion, let's assume we need to -perform keyword tagging on the `TEXT` property. +When acting as a downstream stage of a feed-forward pipeline, this component will +accept feed-forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job +property will be used to determine which properties in the feed-forward track +should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to +`TEXT,TRANSLATION` then this component will look for tags in both the `TEXT` and +`TRANSLATION` properties in the feed-forward track. The trigger words for each of +these properties will be represented as separate outputs. Refer to the Outputs +section below. # JSON Tagging File @@ -122,48 +116,61 @@ pattern becomes `(\\b)end(\\W+)of(\\W+)a(\\W+)sentence\\.`. Note that the `.` symbol is typically used in regex to match any character, which is why we use `\\.` instead. -# Outputs -Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's -present, and not just whitespace, will result in a `TRIGGER WORDS` and -`TRIGGER WORDS OFFSET` output property. 
For example, if -`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION` -properties are both present, then the following output properties will be produced: +# Outputs -- `TEXT TRIGGER WORDS` -- `TEXT TRIGGER WORDS OFFSET` -- `TRANSLATION TRIGGER WORDS` -- `TRANSLATION TRIGGER WORDS OFFSET` +When performing keyword tagging on a text file, the contents of the file will be +stored in a `TEXT` output property. When performing keyword tagging on +feed-forward detections generated from some other component in a multi-stage +pipeline, the output properties from that component will be preserved. This +means that if those detections have a `TEXT` output property, then this +component will generate detections with the same `TEXT` output. Similarly, if +those detections have a `TRANSLATION` output property, then this component will +generate detections with the same `TRANSLATION` output. + +Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and +not just whitespace, will result in a `[TAG] TRIGGER WORDS` and +`[TAG] TRIGGER WORDS OFFSET` output property. The `[TAG]` will be the tag property +that matched in the input text. For example, in +`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, the `TEXT` and `TRANSLATION` +properties are both present, so the following output properties will be produced +if trigger words are found: + +- `TEXT [TAG] TRIGGER WORDS` +- `TEXT [TAG] TRIGGER WORDS OFFSET` +- `TRANSLATION [TAG] TRIGGER WORDS` +- `TRANSLATION [TAG] TRIGGER WORDS OFFSET` Let's assume that we need to process the `TEXT` property. The substring(s) that -triggered each tag will be stored in `TEXT TRIGGER WORDS` in alphabetical order. -For each trigger word the substring index range relative to the `TEXT` output -will be stored in `TEXT TRIGGER WORDS OFFSET`. 
Because the same trigger word -can be encountered multiple times in the `TEXT` output, the results are organized -as follows: - -* `TEXT TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon -followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2` - * Because semicolons can be part of the trigger word itself, those - semicolons will be encapsulated in brackets. For example, - `detected trigger with a ;` in the input `TEXT` is reported as - `TEXT TRIGGER WORDS=detected trigger with a [;]; some other trigger`. -* `TEXT TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same trigger -word reported in sequence, is separated by a semicolon followed by a space. -Indexes within a single group are separated by commas. - * Example `TEXT TRIGGER WORDS=trigger1; trigger2`, - `TEXT TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` occurs twice - in the text at the index ranges 0-5 and 6-10, and `trigger2` occurs at index - range 12-15. - -Note that all `TEXT TRIGGER WORDS` results are trimmed of leading and trailing -whitespace, regardless of the regex pattern used. The respective -`TEXT TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings. +triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical +order. For each trigger word the substring index range relative to the `TEXT` +output will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same +trigger word can be encountered multiple times in the `TEXT` output, the results +are organized as follows: + +* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a +semicolon followed by a space. For example: +`TEXT [TAG] TRIGGER WORDS=trigger1; trigger2` + * Because semicolons can be part of the trigger word itself, those semicolons + will be encapsulated in brackets. 
For example, `detected trigger with a ;` in + the input `TEXT` is reported as + `TEXT [TAG] TRIGGER WORDS=detected trigger with a [;]; some other trigger`. +* `TEXT [TAG] TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same +trigger word reported in sequence, is separated by a semicolon followed by a +space. Indexes within a single group are separated by commas. + * Example `TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`, + `TEXT [TAG] TRIGGER WORDS OFFSET=0-7, 20-27; 55-62`, means that `trigger1` + occurs twice in the text at the index ranges 0-7 and 20-27, and `trigger2` + occurs once at index range 55-62. + +Note that all `TEXT [TAG] TRIGGER WORDS` results are trimmed of leading and +trailing whitespace, regardless of the regex pattern used. The respective +`TEXT [TAG] TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings. The tags associated with the trigger words will be stored in a `TAGS` output property in alphabetical order, separated by semicolons. Note that there is only -one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS OFFSET`, -which are prefixed by the input property that produced those trigger words. -Each tag will only appear once in `TAGS` no matter how many trigger words -activate that tag. It doesn't matter if the trigger words are found in only one -or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. +one `TAGS` output property. This is unlike `TRIGGER WORDS` and +`TRIGGER WORDS OFFSET`, which are prefixed by the input property that produced those +trigger words. Each tag will only appear once in `TAGS` no matter how many trigger +words activate that tag. It doesn't matter if the trigger words are found in only +one or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`. 
diff --git a/cpp/KeywordTagging/test/test_keyword_tagging.cpp b/cpp/KeywordTagging/test/test_keyword_tagging.cpp index 837366d4..b76d35b2 100644 --- a/cpp/KeywordTagging/test/test_keyword_tagging.cpp +++ b/cpp/KeywordTagging/test/test_keyword_tagging.cpp @@ -129,8 +129,10 @@ TEST(KEYWORDTAGGING, TaggingTest) { // Test escaped backslash text tagging. ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/test-backslash.txt", tagger, results, custom_properties)); assertInText("data/test-backslash.txt", "backslash; personal", results, "TAGS"); - assertInText("data/test-backslash.txt", "TEXT; \\", results, "TEXT TRIGGER WORDS"); - assertInText("data/test-backslash.txt", "7-10; 0, 12, 15, 16, 18, 19", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/test-backslash.txt", "\\", results, "TEXT BACKSLASH TRIGGER WORDS"); + assertInText("data/test-backslash.txt", "0, 12, 15, 16, 18, 19, 20, 21", results, "TEXT BACKSLASH TRIGGER WORDS OFFSET"); + assertInText("data/test-backslash.txt", "TEXT", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/test-backslash.txt", "7-10", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -148,22 +150,34 @@ TEST(KEYWORDTAGGING, MulitpleTagsTest) { ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keyword.txt", tagger, results, custom_properties)); assertInText("data/tags-keyword.txt", "Passenger Passport", results, "TEXT"); assertInText("data/tags-keyword.txt", "identity document; travel", results, "TAGS"); - assertInText("data/tags-keyword.txt", "Passenger; Passport", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keyword.txt", "0-8; 10-17", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keyword.txt", "Passport", results, "TEXT IDENTITY DOCUMENT TRIGGER WORDS"); + assertInText("data/tags-keyword.txt", "10-17", results, "TEXT IDENTITY DOCUMENT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keyword.txt", "Passenger", results, "TEXT TRAVEL TRIGGER WORDS"); + 
assertInText("data/tags-keyword.txt", "0-8", results, "TEXT TRAVEL TRIGGER WORDS OFFSET"); results.clear(); ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-regex.txt", tagger, results, custom_properties)); assertInText("data/tags-regex.txt", "case-insensitive-tag; financial; personal", results, "TAGS"); - assertInText("data/tags-regex.txt", "122-123-1234; financ", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-regex.txt", "17-28; 0-5", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "0-5", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "0-5", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex.txt", "122-123-1234", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-regex.txt", "17-28", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); results.clear(); // Test multiple text tagging w/ delimiter tag. 
ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-regex-delimiter.txt", tagger, results, custom_properties)); assertInText("data/tags-regex-delimiter.txt", "case-insensitive-tag; delimiter-test; financial; personal", results, "TAGS"); - assertInText("data/tags-regex-delimiter.txt", "122-123-1234; a[[;] ]b; financ", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-regex-delimiter.txt", "22-33; 15-20; 0-5", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "0-5", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "a[[;] ]b", results, "TEXT DELIMITER-TEST TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "15-20", results, "TEXT DELIMITER-TEST TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "0-5", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-regex-delimiter.txt", "122-123-1234", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-regex-delimiter.txt", "22-33", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -181,16 +195,32 @@ TEST(KEYWORDTAGGING, FullSearch) { ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keywordregex.txt", tagger, results, custom_properties)); assertInText("data/tags-keywordregex.txt", "case-insensitive-tag; case-sensitive-tag; financial; personal; vehicle", results, "TAGS"); - assertInText("data/tags-keywordregex.txt", "01/01/20; Financ; Text; Vehicle", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keywordregex.txt", "20-27; 37-42; 10-13, 15-18; 29-35", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + 
assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "01/01/20; Text", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "20-27; 10-13, 15-18", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Vehicle", results, "TEXT VEHICLE TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "29-35", results, "TEXT VEHICLE TRIGGER WORDS OFFSET"); results.clear(); // With full regex search disabled, number of reported triggers and offsets will decrease. 
ASSERT_NO_FATAL_FAILURE(runKeywordTagging("data/tags-keywordregex.txt", tagger, results, custom_properties_disabled)); assertInText("data/tags-keywordregex.txt", "case-insensitive-tag; case-sensitive-tag; financial; personal; vehicle", results, "TAGS"); - assertInText("data/tags-keywordregex.txt", "01/01/20; Financ; Vehicle", results, "TEXT TRIGGER WORDS"); - assertInText("data/tags-keywordregex.txt", "20-27; 37-42; 29-35", results, "TEXT TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-INSENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT CASE-SENSITIVE-TAG TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Financ", results, "TEXT FINANCIAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "37-42", results, "TEXT FINANCIAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "01/01/20", results, "TEXT PERSONAL TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "20-27", results, "TEXT PERSONAL TRIGGER WORDS OFFSET"); + assertInText("data/tags-keywordregex.txt", "Vehicle", results, "TEXT VEHICLE TRIGGER WORDS"); + assertInText("data/tags-keywordregex.txt", "29-35", results, "TEXT VEHICLE TRIGGER WORDS OFFSET"); ASSERT_TRUE(tagger.Close()); } @@ -309,8 +339,8 @@ TEST(KEYWORDTAGGING, ProcessAllProperties) { ASSERT_EQ("cash", props["TRANSLATION"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("vehicle", props["TAGS"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); } { @@ -332,10 +362,10 @@ TEST(KEYWORDTAGGING, 
ProcessAllProperties) { ASSERT_EQ("cash", props["TRANSLATION"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("cash", props["TRANSLATION TRIGGER WORDS"]); - ASSERT_EQ("0-3", props["TRANSLATION TRIGGER WORDS OFFSET"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("cash", props["TRANSLATION FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3", props["TRANSLATION FINANCIAL TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); } { @@ -355,10 +385,10 @@ TEST(KEYWORDTAGGING, ProcessAllProperties) { ASSERT_EQ("cash", props["BAR"]); ASSERT_EQ("car", props["FOO"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("car", props["FOO TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["FOO TRIGGER WORDS OFFSET"]); - ASSERT_EQ("cash", props["BAR TRIGGER WORDS"]); - ASSERT_EQ("0-3", props["BAR TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["FOO VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["FOO VEHICLE TRIGGER WORDS OFFSET"]); + ASSERT_EQ("cash", props["BAR FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3", props["BAR FINANCIAL TRIGGER WORDS OFFSET"]); } ASSERT_TRUE(tagger.Close()); @@ -400,8 +430,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ("airport", props["TEXT"]); ASSERT_EQ("SOME_VAL_3", props["SOME_PROP_3"]); ASSERT_EQ("travel", props["TAGS"]); - ASSERT_EQ("airport", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-6", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("airport", props["TEXT TRAVEL TRIGGER WORDS"]); + ASSERT_EQ("0-6", props["TEXT TRAVEL TRIGGER WORDS OFFSET"]); MPFImageLocation location = results.at(0).frame_locations.at(10); ASSERT_EQ(location1.x_left_upper, location.x_left_upper); @@ -415,8 +445,8 @@ TEST(KEYWORDTAGGING, 
ProcessTrackAndDetectionProperties) { ASSERT_EQ("SOME_VAL_1", props["SOME_PROP_1"]); ASSERT_EQ("car", props["TEXT"]); ASSERT_EQ("vehicle", props["TAGS"]); - ASSERT_EQ("car", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-2", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("car", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); location = results.at(0).frame_locations.at(12); ASSERT_EQ(location2.x_left_upper, location.x_left_upper); @@ -430,8 +460,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ("SOME_VAL_2", props["SOME_PROP_2"]); ASSERT_EQ("username", props["TEXT"]); ASSERT_EQ("personal", props["TAGS"]); - ASSERT_EQ("username", props["TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-7", props["TEXT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("username", props["TEXT PERSONAL TRIGGER WORDS"]); + ASSERT_EQ("0-7", props["TEXT PERSONAL TRIGGER WORDS OFFSET"]); } { @@ -478,8 +508,8 @@ TEST(KEYWORDTAGGING, ProcessTrackAndDetectionProperties) { ASSERT_EQ(4, props.size()); ASSERT_EQ("username", props["TRANSCRIPT"]); ASSERT_EQ("personal", props["TAGS"]); - ASSERT_EQ("username", props["TRANSCRIPT TRIGGER WORDS"]); - ASSERT_EQ("0-7", props["TRANSCRIPT TRIGGER WORDS OFFSET"]); + ASSERT_EQ("username", props["TRANSCRIPT PERSONAL TRIGGER WORDS"]); + ASSERT_EQ("0-7", props["TRANSCRIPT PERSONAL TRIGGER WORDS OFFSET"]); } ASSERT_TRUE(tagger.Close()); @@ -492,8 +522,8 @@ TEST(KEYWORDTAGGING, ProcessRepeatTags) { ASSERT_TRUE(tagger.Init()); MPFImageLocation location(1, 2, 3, 4, 5, - {{"TEXT", "cash-car"}, - {"OTHER TEXT", "car-cash"}, + {{"TEXT", "cash-car-suv"}, + {"OTHER TEXT", "car-cash-suv"}, {"MORE TEXT", "cash cash"}, {"BLANK TEXT", " "}}); MPFImageJob job("JOB NAME", "/some/path", location, @@ -508,26 +538,50 @@ TEST(KEYWORDTAGGING, ProcessRepeatTags) { ASSERT_EQ(location.confidence, results.at(0).confidence); Properties props = results.at(0).detection_properties; - ASSERT_EQ(11, props.size()); + ASSERT_EQ(15, 
props.size()); - ASSERT_EQ("cash-car", props["TEXT"]); - ASSERT_EQ("car-cash", props["OTHER TEXT"]); + ASSERT_EQ("cash-car-suv", props["TEXT"]); + ASSERT_EQ("car-cash-suv", props["OTHER TEXT"]); ASSERT_EQ("cash cash", props["MORE TEXT"]); ASSERT_EQ(" ", props["BLANK TEXT"]); ASSERT_EQ("financial; vehicle", props["TAGS"]); // tags added in alphabetical order - ASSERT_EQ("car; cash", props["TEXT TRIGGER WORDS"]); // words added in alphabetical order - ASSERT_EQ("5-7; 0-3", props["TEXT TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("cash", props["TEXT FINANCIAL TRIGGER WORDS"]); // words added in alphabetical order + ASSERT_EQ("0-3", props["TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("car; suv", props["TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("5-7; 9-11", props["TEXT VEHICLE TRIGGER WORDS OFFSET"]); - ASSERT_EQ("car; cash", props["OTHER TEXT TRIGGER WORDS"]); // words added in alphabetical order - ASSERT_EQ("0-2; 4-7", props["OTHER TEXT TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("cash", props["OTHER TEXT FINANCIAL TRIGGER WORDS"]); // words added in alphabetical order + ASSERT_EQ("4-7", props["OTHER TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets line up with words + ASSERT_EQ("car; suv", props["OTHER TEXT VEHICLE TRIGGER WORDS"]); + ASSERT_EQ("0-2; 9-11", props["OTHER TEXT VEHICLE TRIGGER WORDS OFFSET"]); - ASSERT_EQ("cash", props["MORE TEXT TRIGGER WORDS"]); - ASSERT_EQ("0-3, 5-8", props["MORE TEXT TRIGGER WORDS OFFSET"]); // offsets are in ascending order + ASSERT_EQ("cash", props["MORE TEXT FINANCIAL TRIGGER WORDS"]); + ASSERT_EQ("0-3, 5-8", props["MORE TEXT FINANCIAL TRIGGER WORDS OFFSET"]); // offsets are in ascending order // "BLANK TEXT TRIGGER WORDS" and "BLANK TEXT TRIGGER WORDS OFFSET" are omitted since "BLANK TEXT" // is only whitespace. 
ASSERT_TRUE(tagger.Close()); } + +TEST(KEYWORDTAGGING, FeedForwardTags) { + KeywordTagging tagger; + tagger.SetRunDirectory("../plugin"); + ASSERT_TRUE(tagger.Init()); + + MPFGenericTrack track(0.9, + {{"TAGS", "FeedForwardTag"}, + {"BAR", "cash"}}); + MPFGenericJob job("JOB NAME", "/some/path", track, + { { "FEED_FORWARD_PROP_TO_PROCESS", "FOO,BAR" } }, {}); + + std::vector results = tagger.GetDetections(job); + ASSERT_EQ(1, results.size()); + ASSERT_EQ(track.confidence, results.at(0).confidence); + + Properties props = results.at(0).detection_properties; + ASSERT_EQ(4, props.size()); + ASSERT_EQ("feedforwardtag; financial", props["TAGS"]); +} diff --git a/python/TransformerTagging/Dockerfile b/python/TransformerTagging/Dockerfile new file mode 100644 index 00000000..8eaeb18c --- /dev/null +++ b/python/TransformerTagging/Dockerfile @@ -0,0 +1,68 @@ +# syntax=docker/dockerfile:1.2 + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +ARG RUN_TESTS=false + +# needed to build sentence transformer wheel file +RUN pip install --no-cache-dir \ + 'wheel' + +RUN pip install --no-cache-dir \ + 'nltk' \ + 'sentence_transformers' \ + 'pandas' + +# copy over punkt files +RUN python -c \ + "import nltk; nltk.download('punkt')" + +# download sentence transformer model +RUN --mount=type=tmpfs,target=/tmp/models \ + mkdir -p "/models/all-mpnet-base-v2"; \ + python -c \ + "import os; \ + os.environ['REQUESTS_CA_BUNDLE']='/etc/ssl/certs/ca-certificates.crt'; \ + from sentence_transformers import SentenceTransformer; \ + model = SentenceTransformer('all-mpnet-base-v2', cache_folder='/tmp/models'); \ + model.save('/models/all-mpnet-base-v2')" + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_transformer_tagging.py; fi + + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF Transformer Tagging" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" diff --git a/python/TransformerTagging/README.md b/python/TransformerTagging/README.md new file mode 100644 index 00000000..fd44c515 --- /dev/null +++ b/python/TransformerTagging/README.md @@ -0,0 +1,149 @@ +# Overview + +This repository contains source code for the OpenMPF Transformer Tagging component. + +This component uses a user-specified corpus JSON file to match known phrases against +each sentence in the input text data. 
This is done by generating an embedding for each +phrase in the corpus and comparing that against the embedding for each sentence of the +input text. The comparison generates a score based on how similar the content is. +This is based on how the underlying +[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) +model was trained on a variety of text data in order to understand the commonalities +in phrasing, subject, and context. The sentences that generate scores above the threshold +are called "trigger sentences". These sentences are grouped by "tag" based on which entry +in the corpus they matched against. + +This component can be used independently to perform transformer tagging on text +files, or it can be used as a support component in a multi-stage pipeline to +perform transformer tagging on feed-forward detections generated by some other +component. + +# Inputs + +The transformer tagger will run on all input properties listed in the +`FEED_FORWARD_PROP_TO_PROCESS`. If there are feed-forward detections generated from +an upstream component in a multi-stage pipeline, the output properties from that +component are preserved. This means that if those detections have a `TEXT` output +property, this component will generate detections with the same `TEXT` output. +Similarly, if those detections have a `TRANSLATION` output property, then this +component will generate detections with the same `TRANSLATION` output. If none of the +input properties are present then the transformer tagging is not performed and the +feed-forward detection is returned unmodified. + +The reported detections that are returned by the transformer tagger are based on the +corpus used, and the minimum score defined in the `SCORE_THRESHOLD` property, as +discussed below. + +# Corpus File + +Transformer patterns are specified in a JSON corpus file. By default this is +`transformer_text_tags_corpus.json`. 
Alternatively, the path to the corpus file can +be changed by setting the `TRANSFORMER_TAGGING_CORPUS` property. + +In the corpus file, users can specify sentence patterns to compare against using the +following syntax: + +```json +[ + { + "text": "This sentence is dog.", + "tag": "dog" + } +] +``` + +Where the `text` field specifies a sentence to compare each input sentence against. If +the match score meets the `SCORE_THRESHOLD` property, then the value of the `tag` field +will be added to the list in the `TAGS` output property. + +Multiple patterns can be specified with a comma-separated list: + +```json +[ + { + "text": "This sentence is dog.", + "tag": "dog" + }, + { + "text": "My favorite animal is a corgi.", + "tag": "dog" + }, + { + "text": "This sentence is cat.", + "tag": "cat" + }, + ... +] +``` + +# Outputs + +When performing transformer tagging on a text file, the contents of the file will be +stored in a `TEXT` output property. When performing transformer tagging on +feed-forward detections generated from some other component in a multi-stage +pipeline, the output properties from that component will be preserved. This +means that if those detections have a `TEXT` output property, then this +component will generate detections with the same `TEXT` output. Similarly, if +those detections have a `TRANSLATION` output property, then this component will +generate detections with the same `TRANSLATION` output. 
+ +Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and +not just whitespace, which has sentences that scored high enough against entries in +the corpus file, will result in the following output properties: + +- `TEXT [TAG] TRIGGER SENTENCES` +- `TEXT [TAG] TRIGGER SENTENCES OFFSET` +- `TEXT [TAG] TRIGGER SENTENCES SCORE` +- `TRANSLATION [TAG] TRIGGER SENTENCES` +- `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` +- `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` + +The `[TAG]` value in each of the output properties above will be the `tag` +value from the corpus file that the trigger sentence scored against. + +The tags associated with the trigger sentences will be stored in a `TAGS` output +property, separated by semicolons. Note that there is only one `TAGS` output +property. This is unlike `TRIGGER SENTENCES` and `TRIGGER SENTENCES OFFSET`, which are +prefixed by the input property that produced those trigger sentences. Each tag will only +appear once in `TAGS` no matter how many trigger sentences activate that tag. It doesn't +matter if the trigger sentences are found in only one or multiple input properties defined +in `FEED_FORWARD_PROP_TO_PROCESS`. + +When the `TEXT` property is processed, the input sentence(s) that triggered each tag will +be stored in `TEXT [TAG] TRIGGER SENTENCES`. Note that because semicolons can be part of +the trigger sentence itself, those semicolons will be encapsulated in brackets. For +example, `This sentence has a semicolon;` in the input `TEXT` is reported as: +`TEXT [TAG] TRIGGER SENTENCES=This sentence has a semicolon[;]; other trigger sentence`. + +For each trigger sentence in `TEXT`, the substring index range will be stored in +`TEXT [TAG] TRIGGER SENTENCES OFFSET`. Each group of indexes, referring to the same +trigger sentence reported in sequence, is separated by a semicolon followed by a space. +Indexes within a single group are separated by commas. 
For example: + +``` +TEXT [TAG] TRIGGER SENTENCES=trigger sentence 1; trigger sentence 2 +TEXT [TAG] TRIGGER SENTENCES OFFSET=0-17, 40-57; 112-129 +``` + +This means that `trigger sentence 1` occurs twice in the text at the index ranges +0-17 and 40-57, and `trigger sentence 2` occurs once at index range 112-129. + +When `ENABLE_DEBUG` is set to true, the output properties will also include a +`TRIGGER SENTENCES MATCHES` property containing a semicolon-separated list of the +`text` sentences in the corpus that were triggered for that tag: + +- `TEXT [TAG] TRIGGER SENTENCES` +- `TEXT [TAG] TRIGGER SENTENCES MATCHES` +- `TEXT [TAG] TRIGGER SENTENCES OFFSET` +- `TEXT [TAG] TRIGGER SENTENCES SCORE` +- `TRANSLATION [TAG] TRIGGER SENTENCES` +- `TRANSLATION [TAG] TRIGGER SENTENCES MATCHES` +- `TRANSLATION [TAG] TRIGGER SENTENCES OFFSET` +- `TRANSLATION [TAG] TRIGGER SENTENCES SCORE` + +For example: + +``` +TEXT [TAG] TRIGGER SENTENCES=trigger sentence 1; trigger sentence 2 +TEXT [TAG] TRIGGER SENTENCES MATCHES=Corpus sentence matching trigger sentence 1; Corpus sentence matching trigger sentence 2 +``` \ No newline at end of file diff --git a/python/TransformerTagging/plugin-files/descriptor/descriptor.json b/python/TransformerTagging/plugin-files/descriptor/descriptor.json new file mode 100644 index 00000000..74810ebd --- /dev/null +++ b/python/TransformerTagging/plugin-files/descriptor/descriptor.json @@ -0,0 +1,98 @@ +{ + "componentName": "TransformerTagging", + "componentVersion": "8.0", + "middlewareVersion": "8.0", + "sourceLanguage": "python", + "batchLibrary": "TransformerTagging", + "environmentVariables": [], + "algorithm": { + "name": "TRANSFORMERTAGGING", + "description": "Uses SentenceTransformers to tag sentences.", + "actionType": "DETECTION", + "trackType": "TEXT", + "requiresCollection": { + "states": [] + }, + "providesCollection": { + "states": [ + "DETECTION", + "DETECTION_TAGGING", + "DETECTION_TAGGING_TRANSFORMER" + ], + "properties": [ + { + "name": 
"FEED_FORWARD_PROP_TO_PROCESS", + "description": "Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated.", + "type": "STRING", + "defaultValue": "TEXT,TRANSCRIPT,TRANSLATION" + }, + { + "name": "SCORE_THRESHOLD", + "description": "The minimum score score which must be met or exceeded. Tags below this threshold are silently discarded.", + "type": "DOUBLE", + "defaultValue": "0.3" + }, + { + "name": "TRANSFORMER_TAGGING_CORPUS", + "description": "Name of a JSON file that describes a tag hierarchy to be used for matching sentences. Will default to the plugin's config folder unless an alternate path to corpus file is specified (i.e. `$MPF_HOME/.../transformer_text_tags_corpus.json`).", + "type": "STRING", + "defaultValue": "transformer_text_tags_corpus.json" + }, + { + "name": "ENABLE_DEBUG", + "description": "If true, each detection will include a `TRIGGER SENTENCES MATCHES` property for each entry in `TAGS`. 
The value will be the sentences in the corpus which met the score threshold for that tag.", + "type": "BOOLEAN", + "defaultValue": "FALSE" + } + ] + } + }, + "actions": [ + { + "name": "TRANSFORMER TAGGING TEXT FILE ACTION", + "description": "Performs transformer tagging on a plain text file.", + "algorithm": "TRANSFORMERTAGGING", + "properties": [] + }, + { + "name": "TRANSFORMER TAGGING (WITH FF REGION) ACTION", + "description": "Performs transformer tagging on feed-forward tracks and detections.", + "algorithm": "TRANSFORMERTAGGING", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "OUTPUT_MERGE_WITH_PREVIOUS_TASK", + "value": "TRUE" + } + ] + } + ], + "tasks": [ + { + "name": "TRANSFORMER TAGGING TEXT FILE TASK", + "description": "Performs transformer tagging on a plain text file.", + "actions": [ + "TRANSFORMER TAGGING TEXT FILE ACTION" + ] + }, + { + "name": "TRANSFORMER TAGGING (WITH FF REGION) TASK", + "description": "Performs transformer tagging on feed-forward tracks and detections.", + "actions": [ + "TRANSFORMER TAGGING (WITH FF REGION) ACTION" + ] + } + ], + "pipelines": [ + { + "name": "TRANSFORMER TAGGING TEXT FILE PIPELINE", + "description": "Performs transformer tagging on a plain text file.", + "tasks": [ + "TRANSFORMER TAGGING TEXT FILE TASK" + ] + } + ] +} \ No newline at end of file diff --git a/python/TransformerTagging/pyproject.toml b/python/TransformerTagging/pyproject.toml new file mode 100644 index 00000000..49566867 --- /dev/null +++ b/python/TransformerTagging/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. 
# +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/python/TransformerTagging/setup.cfg b/python/TransformerTagging/setup.cfg new file mode 100644 index 00000000..2871a22f --- /dev/null +++ b/python/TransformerTagging/setup.cfg @@ -0,0 +1,45 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[metadata] +name = TransformerTagging +version = 8.0 + +[options] +packages = transformer_tagging_component +install_requires = + mpf_component_api>=8.0 + mpf_component_util>=8.0 + nltk + sentence_transformers + pandas + +[options.entry_points] +mpf.exported_component = + component = transformer_tagging_component.transformer_tagging_component:TransformerTaggingComponent + +[options.package_data] +transformer_tagging_component=transformer_text_tags_corpus.json diff --git a/python/TransformerTagging/tests/config/custom_corpus.json b/python/TransformerTagging/tests/config/custom_corpus.json new file mode 100644 index 00000000..b37da89e --- /dev/null +++ b/python/TransformerTagging/tests/config/custom_corpus.json @@ -0,0 +1,10 @@ +[ + { + "text": "This sentence is beach.", + "tag": "beach" + }, + { + "text": "This sentence is forest.", + "tag": "forest" + } +] \ No newline at end of file diff --git a/python/TransformerTagging/tests/data/simple_input.txt b/python/TransformerTagging/tests/data/simple_input.txt new file mode 100644 index 00000000..3b798406 --- /dev/null +++ b/python/TransformerTagging/tests/data/simple_input.txt @@ -0,0 +1,3 @@ +I drove to the beach today and will be staying overnight at a hotel. I texted my friend before I left so she could look +after my cats. She will drop by to check on them after stopping by the bank. I plan to spend all day at the beach +tomorrow. 
\ No newline at end of file diff --git a/python/TransformerTagging/tests/test_transformer_tagging.py b/python/TransformerTagging/tests/test_transformer_tagging.py new file mode 100644 index 00000000..0214f97f --- /dev/null +++ b/python/TransformerTagging/tests/test_transformer_tagging.py @@ -0,0 +1,342 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2023 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from pathlib import Path +import logging +import unittest + +import mpf_component_api as mpf + +from transformer_tagging_component import TransformerTaggingComponent + + +TEST_DATA = Path(__file__).parent / 'data' +TEST_CONFIG = Path(__file__).parent / 'config' + +logging.basicConfig(level=logging.DEBUG) + +SHORT_SAMPLE = ( + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I texted my friend before I left so she could look after my cats. 
' + 'She will drop by to check on them after stopping by the bank. ' + 'I plan to spend all day at the beach tomorrow.' +) + +SHORT_SAMPLE_TAGS = "travel" +SHORT_SAMPLE_TRIGGER_SENTENCES = "I drove to the beach today and will be staying overnight at a hotel." +SHORT_SAMPLE_OFFSET = "0-67" +SHORT_SAMPLE_SCORE = 0.4680028557777405 + + +class TestTransformerTagging(unittest.TestCase): + + def test_generic_job(self): + ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + def test_plaintext_job(self): + job = mpf.GenericJob('Test Plaintext', str(TEST_DATA / 'simple_input.txt'), {}, {}) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + def test_audio_job(self): + ff_track = mpf.AudioTrack(0, 1, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.AudioJob('Test Audio', 'test.wav', 0, 1, {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_audio(job) + + self.assertEqual(1, len(result)) + + props = 
result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + def test_image_job(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + def test_video_job(self): + ff_track = mpf.VideoTrack( + 0, 1, -1, + { + 0: mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)), + 1: mpf.ImageLocation(0, 10, 10, 10, -1, dict(TRANSCRIPT=SHORT_SAMPLE)) + }, + dict(TEXT=SHORT_SAMPLE)) + job = mpf.VideoJob('Test Video', 'test.mp4', 0, 1, {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_video(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + frame_1_props = result[0].frame_locations[0].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, frame_1_props["TAGS"]) + 
self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, frame_1_props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + frame_2_props = result[0].frame_locations[1].detection_properties + self.assertEqual(SHORT_SAMPLE_TAGS, frame_2_props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(frame_2_props["TRANSCRIPT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + def test_no_feed_forward_location(self): + comp = TransformerTaggingComponent() + job = mpf.ImageJob('Test', 'test.jpg', {}, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_image(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + def test_no_feed_forward_track(self): + comp = TransformerTaggingComponent() + job = mpf.VideoJob('test', 'test.mp4', 0, 1, {}, {}) + + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_video(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + job = mpf.AudioJob('Test Audio', 'test.wav', 0, 1, {}, {}) + with self.assertRaises(mpf.DetectionException) as cm: + list(comp.get_detections_from_audio(job)) + self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) + + def test_custom_confidence_threshold(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', dict(SCORE_THRESHOLD=".2"), {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + props = 
result[0].detection_properties + + self.assertEqual("TRAVEL; FINANCIAL".casefold(), props["TAGS"]) + self.assertEqual(SHORT_SAMPLE_TRIGGER_SENTENCES, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(SHORT_SAMPLE_OFFSET, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(SHORT_SAMPLE_SCORE, float(props["TEXT TRAVEL TRIGGER SENTENCES SCORE"]), places=3) + + custom_threshold_sentence = "She will drop by to check on them after stopping by the bank." + custom_threshold_sentence_offset = "135-195" + custom_threshold_sentence_score = 0.2906474769115448 + + self.assertEqual(custom_threshold_sentence, props["TEXT FINANCIAL TRIGGER SENTENCES"]) + self.assertEqual(custom_threshold_sentence_offset, props["TEXT FINANCIAL TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(custom_threshold_sentence_score, float(props["TEXT FINANCIAL TRIGGER SENTENCES SCORE"]), places=3) + + def test_custom_tagging_file(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', + dict(TRANSFORMER_TAGGING_CORPUS=str(TEST_CONFIG / "custom_corpus.json")), {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + beach_sentences = 'I drove to the beach today and will be staying overnight at a hotel.; ' \ + 'I plan to spend all day at the beach tomorrow.' 
+ + beach_score_1 = 0.4417020082473755 + beach_score_2 = 0.4624265432357788 + beach_score_result_1, beach_score_result_2 = props["TEXT BEACH TRIGGER SENTENCES SCORE"].split(";") + + self.assertEqual("beach", props["TAGS"]) + self.assertEqual(beach_sentences, props["TEXT BEACH TRIGGER SENTENCES"]) + self.assertEqual('0-67; 197-242', props["TEXT BEACH TRIGGER SENTENCES OFFSET"]) + self.assertAlmostEqual(beach_score_1, float(beach_score_result_1), places=3) + self.assertAlmostEqual(beach_score_2, float(beach_score_result_2), places=3) + + def test_debugging_show_matches(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + self.assertTrue("TEXT TRAVEL TRIGGER SENTENCES MATCHES" not in props) + + job = mpf.ImageJob('Test Image', 'test.jpg', dict(ENABLE_DEBUG="TRUE"), {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + self.assertTrue("TEXT TRAVEL TRIGGER SENTENCES MATCHES" in props) + self.assertEqual("This sentence is hotel.", props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"]) + + def test_missing_property_to_process(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(INPUT="some input")) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + self.assertEqual(ff_loc.x_left_upper, result[0].x_left_upper) + self.assertEqual(ff_loc.y_left_upper, result[0].y_left_upper) + self.assertEqual(ff_loc.width, result[0].width) + self.assertEqual(ff_loc.height, result[0].height) + self.assertEqual(ff_loc.confidence, result[0].confidence) + 
self.assertEqual(ff_loc.detection_properties, result[0].detection_properties) + + def test_missing_text_to_process(self): + ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT="")) + job = mpf.ImageJob('Test Image', 'test.jpg', {}, {}, ff_loc) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_image(job) + + self.assertEqual(1, len(result)) + + def test_maintain_tags_from_earlier_feedforward_task(self): + ff_track = mpf.GenericTrack(-1, dict(TEXT=SHORT_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + # add tags + firstTag = "FIRST_TAG" + job.feed_forward_track.detection_properties["TAGS"] = firstTag + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + expectedTags = firstTag + "; " + SHORT_SAMPLE_TAGS + + self.assertEqual(expectedTags, props["TAGS"]) + + def test_matches_with_semicolons(self): + SEMICOLON_SAMPLE = ( + 'I drove to the beach today; it was a long drive. ' + ) + ff_track = mpf.GenericTrack(-1, dict(TEXT=SEMICOLON_SAMPLE)) + job = mpf.GenericJob('Test Generic', 'test.pdf', {}, {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + props = result[0].detection_properties + + expected_output = "I drove to the beach today[;] it was a long drive." + self.assertEqual(expected_output, props["TEXT TRAVEL TRIGGER SENTENCES"]) + + def test_repeat_trigger_job(self): + sample = ( + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I drove to the beach today and will be staying overnight at a hotel. ' + 'I texted my friend before I left so she could look after my cats. ' + 'I am going to the airport tomorrow. ' + 'I plan to spend all day at the beach tomorrow. ' + 'This airline serves peanuts. ' + 'I am going to the airport tomorrow. 
' + ) + + trigger_sentences = ( + 'I drove to the beach today and will be staying overnight at a hotel.; ' + 'I am going to the airport tomorrow.; ' + 'This airline serves peanuts.' + ) + + offsets = "0-67, 69-136; 204-238, 316-350; 287-314" + + score_1 = 0.4680027663707733 + score_2 = 0.5079247951507568 + score_3 = 0.5265363454818726 + + matches = ( + 'This sentence is hotel.; ' + 'This sentence is airport.; ' + 'This sentence is airline.' + ) + + ff_track = mpf.GenericTrack(-1, dict(TEXT=sample)) + job = mpf.GenericJob('Test Repeat', 'test.pdf', \ + dict(ENABLE_DEBUG='true', SCORE_THRESHOLD='0.4'), {}, ff_track) + comp = TransformerTaggingComponent() + result = comp.get_detections_from_generic(job) + + self.assertEqual(1, len(result)) + + props = result[0].detection_properties + + self.assertEqual("travel", props["TAGS"]) + self.assertEqual(trigger_sentences, props["TEXT TRAVEL TRIGGER SENTENCES"]) + self.assertEqual(offsets, props["TEXT TRAVEL TRIGGER SENTENCES OFFSET"]) + + score_result_1, score_result_2, score_result_3 = props["TEXT TRAVEL TRIGGER SENTENCES SCORE"].split(";") + self.assertAlmostEqual(score_1, float(score_result_1), places=3) + self.assertAlmostEqual(score_2, float(score_result_2), places=3) + self.assertAlmostEqual(score_3, float(score_result_3), places=3) + + self.assertAlmostEqual(matches, props["TEXT TRAVEL TRIGGER SENTENCES MATCHES"]) + +if __name__ == '__main__': + unittest.main() diff --git a/python/TransformerTagging/transformer_tagging_component/__init__.py b/python/TransformerTagging/transformer_tagging_component/__init__.py new file mode 100644 index 00000000..34a90aea --- /dev/null +++ b/python/TransformerTagging/transformer_tagging_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. 
IV (DEC 2007). # +# # +# Copyright 2022 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2022 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from .transformer_tagging_component import TransformerTaggingComponent \ No newline at end of file diff --git a/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py new file mode 100644 index 00000000..65114269 --- /dev/null +++ b/python/TransformerTagging/transformer_tagging_component/transformer_tagging_component.py @@ -0,0 +1,260 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2023 The MITRE Corporation. All Rights Reserved. 
#############################################################################

#############################################################################
# Copyright 2023 The MITRE Corporation                                      #
#                                                                           #
# Licensed under the Apache License, Version 2.0 (the "License");           #
# you may not use this file except in compliance with the License.          #
# You may obtain a copy of the License at                                   #
#                                                                           #
#    http://www.apache.org/licenses/LICENSE-2.0                             #
#                                                                           #
# Unless required by applicable law or agreed to in writing, software       #
# distributed under the License is distributed on an "AS IS" BASIS,         #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
# See the License for the specific language governing permissions and       #
# limitations under the License.                                            #
#############################################################################

import logging
import os
import time
from typing import Sequence, Dict, Mapping

import pandas as pd
from nltk.tokenize.punkt import PunktSentenceTokenizer
from pkg_resources import resource_filename
from sentence_transformers import SentenceTransformer, util

import mpf_component_api as mpf
import mpf_component_util as mpf_util

logger = logging.getLogger('TransformerTaggingComponent')


class TransformerTaggingComponent:
    """MPF component that tags feed-forward text with corpus-defined semantic
    categories (e.g. "travel", "financial") using sentence-transformer
    cosine similarity between input sentences and corpus sentences."""

    def __init__(self):
        # The model is loaded once and reused for every job; encoded corpuses
        # are cached per corpus file path so each corpus is embedded only once.
        self._cached_model = SentenceTransformer('/models/all-mpnet-base-v2')
        self._cached_corpuses: Dict[str, Corpus] = {}


    def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]:
        logger.info('Received video job.')

        return self._get_feed_forward_detections(job, job.feed_forward_track, video_job=True)


    def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]:
        logger.info('Received image job.')

        return self._get_feed_forward_detections(job, job.feed_forward_location)


    def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]:
        logger.info('Received audio job.')

        return self._get_feed_forward_detections(job, job.feed_forward_track)


    def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]:
        """Handle generic jobs. Without a feed-forward track, the media file
        itself is read as plain text and tagged."""
        logger.info('Received generic job.')

        if job.feed_forward_track:
            return self._get_feed_forward_detections(job, job.feed_forward_track)

        logger.info('Job did not contain a feed forward track. Assuming '
                    'media file is a plain text file containing the text to '
                    'be tagged.')

        # preserve line endings in the original text, such as '\r\n'
        with open(job.data_uri, 'r', newline='') as f:
            text = f.read()

        new_ff_props = dict(TEXT=text)
        ff_track = mpf.GenericTrack(detection_properties=new_ff_props)

        # Force processing of the TEXT property we just created.
        new_job_props = {
            **job.job_properties,
            'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT'
        }

        config = JobConfig(new_job_props)
        corpus = self._get_corpus(config.corpus_path)
        self._add_tags(config, corpus, new_ff_props)

        return [ff_track]


    def _get_feed_forward_detections(self, job, job_feed_forward, video_job=False):
        """Tag the feed-forward element's detection properties in place and
        return it. For video jobs, each frame location is tagged as well.

        :raises mpf.DetectionException: if no feed-forward element was provided.
        """
        try:
            if job_feed_forward is None:
                # Fixed message: the original concatenated two literals into
                # "feed forward  jobs" with a double space and trailing blank.
                raise mpf.DetectionError.UNSUPPORTED_DATA_TYPE.exception(
                    'Component can only process feed forward jobs, '
                    'but no feed forward track provided.')

            config = JobConfig(job.job_properties)
            corpus = self._get_corpus(config.corpus_path)

            self._add_tags(config, corpus, job_feed_forward.detection_properties)

            if video_job:
                for ff_location in job.feed_forward_track.frame_locations.values():
                    self._add_tags(config, corpus, ff_location.detection_properties)

            return [job_feed_forward]

        except Exception:
            logger.exception(
                'Failed to complete job due to the following exception:')
            raise


    def _get_corpus(self, corpus_path):
        # Lazily encode and cache the corpus for this path.
        if corpus_path not in self._cached_corpuses:
            self._cached_corpuses[corpus_path] = Corpus(corpus_path, self._cached_model)

        return self._cached_corpuses[corpus_path]


    def _add_tags(self, config, corpus, ff_props: Dict[str, str]):
        """Find corpus tags for the first available text property in ff_props
        and write TAGS plus per-tag TRIGGER SENTENCES/OFFSET/SCORE properties
        back into ff_props."""
        # Use the first property from props_to_process that is present.
        # A present-but-empty property still stops the search (matching the
        # original behavior) but produces no tags.
        for prop_to_tag in config.props_to_process:
            input_text = ff_props.get(prop_to_tag, None)
            if input_text:
                break
            elif input_text == "":
                logger.warning(f'No {prop_to_tag.lower()} to tag found in track.')
                break
        else:
            logger.warning("Feed forward element missing one of the following properties: "
                           + ", ".join(config.props_to_process))
            return

        all_tag_results = []

        # Score each input sentence against every corpus sentence.
        for start, end in PunktSentenceTokenizer().span_tokenize(input_text):
            probe_sent = input_text[start:end]

            probe_sent_embed = self._cached_model.encode(
                probe_sent, convert_to_tensor=True, show_progress_bar=False)
            scores = [float(util.cos_sim(probe_sent_embed, corpus_sent_embed))
                      for corpus_sent_embed in corpus.embed]

            probe_df = pd.DataFrame({
                "input text": probe_sent,
                "corpus text": corpus.json["text"],
                "tag": corpus.json["tag"].str.lower(),
                "score": scores,
                "offset": str(start) + "-" + str(end - 1)
            })

            # sort by score then group by tag so each group will be sorted highest
            # to lowest score, then take top row for each group
            probe_df = probe_df.sort_values(by=['score'], ascending=False)
            top_per_tag = probe_df.groupby(['tag'], sort=False).head(1)

            # filter out results that are below threshold; skip empty frames so
            # the no-tags check below is meaningful
            top_per_tag_threshold = top_per_tag[top_per_tag["score"] >= config.threshold]
            if not top_per_tag_threshold.empty:
                all_tag_results.append(top_per_tag_threshold)

        # if no tags found in text return
        if not all_tag_results:
            return

        all_tag_results = pd.concat(all_tag_results)

        # create detection properties for each tag found in the text
        for tag in all_tag_results["tag"].unique():
            tag_df = all_tag_results[all_tag_results["tag"] == tag]
            self._add_tag_detection_props(config, ff_props, prop_to_tag, tag, tag_df)


    def _add_tag_detection_props(self, config, ff_props, prop_to_tag, tag, tag_df):
        """Write the TAGS entry and the <PROP> <TAG> TRIGGER SENTENCES /
        OFFSET / SCORE (and debug MATCHES) properties for a single tag."""
        if "TAGS" in ff_props:
            # only add tag if it is not already in ff_props["TAGS"], else do nothing
            if tag.casefold() not in ff_props["TAGS"].casefold():
                ff_props["TAGS"] = ff_props["TAGS"] + "; " + tag
        else:
            ff_props["TAGS"] = tag

        sents = []
        offsets = []
        scores = []
        matches = []

        for trigger_sent in tag_df["input text"].unique():
            trigger_sent_df = tag_df[tag_df["input text"] == trigger_sent]

            # ';' is the property list separator, so escape it in sentence text
            sents.append(trigger_sent.replace(';', '[;]'))
            offsets.append(", ".join(trigger_sent_df["offset"]))
            # all entries should have the same score, so just use the first
            scores.append(trigger_sent_df["score"].values[0].astype(str))

            if config.debug:
                # all entries should have the same match, so just use the first
                matches.append(trigger_sent_df["corpus text"].values[0].replace(';', '[;]'))

        prop_name_sent = prop_to_tag + " " + tag.upper() + " TRIGGER SENTENCES"
        prop_name_offset = prop_name_sent + " OFFSET"
        prop_name_score = prop_name_sent + " SCORE"

        ff_props[prop_name_sent] = "; ".join(sents)
        ff_props[prop_name_offset] = "; ".join(offsets)
        ff_props[prop_name_score] = "; ".join(scores)

        if config.debug:
            prop_name_matches = prop_name_sent + " MATCHES"
            ff_props[prop_name_matches] = "; ".join(matches)


class Corpus:
    """A tagging corpus: the raw text/tag table plus its sentence embeddings."""

    def __init__(self, corpus_path, model):
        # Expected columns: "text" (corpus sentence) and "tag" (category).
        self.json = pd.read_json(corpus_path)

        start = time.time()
        self.embed = model.encode(self.json["text"], convert_to_tensor=True,
                                  show_progress_bar=False)
        elapsed = time.time() - start
        logger.info(f"Successfully encoded corpus in {elapsed} seconds.")


class JobConfig:
    """Parsed job properties controlling transformer tagging.

    :raises mpf.DetectionException: if the corpus file path does not exist.
    """

    def __init__(self, props: Mapping[str, str]):
        self.props_to_process = [
            prop.strip() for prop in
            mpf_util.get_property(
                properties=props,
                key='FEED_FORWARD_PROP_TO_PROCESS',
                default_value='TEXT,TRANSCRIPT,TRANSLATION',
                prop_type=str
            ).split(',')
        ]

        # Minimum cosine similarity for a corpus match to count as a tag.
        self.threshold = mpf_util.get_property(props, 'SCORE_THRESHOLD', .3)

        # if debug is true will return which corpus sentences triggered the match
        self.debug = mpf_util.get_property(props, 'ENABLE_DEBUG', False)

        self.corpus_file = \
            mpf_util.get_property(props, 'TRANSFORMER_TAGGING_CORPUS', "transformer_text_tags_corpus.json")

        # A bare file name refers to the packaged default corpus; anything with
        # "$" or "/" is treated as a (possibly env-var) filesystem path.
        if "$" not in self.corpus_file and "/" not in self.corpus_file:
            self.corpus_path = os.path.realpath(resource_filename(__name__, self.corpus_file))
        else:
            self.corpus_path = os.path.expandvars(self.corpus_file)

        if not os.path.exists(self.corpus_path):
            # logger.error, not logger.exception: there is no active exception
            # here, so logger.exception would log a bogus "NoneType" traceback.
            logger.error('Failed to complete job due to incorrect file path for the transformer tagging corpus: '
                         f'"{self.corpus_file}"')
            raise mpf.DetectionException(
                'Invalid path provided for transformer tagging corpus: '
                f'"{self.corpus_file}"',
                mpf.DetectionError.COULD_NOT_READ_DATAFILE)
"text": "This sentence is motor vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is motorcycle.", + "tag": "vehicle" + }, + { + "text": "This sentence is suv.", + "tag": "vehicle" + }, + { + "text": "This sentence is truck.", + "tag": "vehicle" + }, + { + "text": "This sentence is trolley.", + "tag": "vehicle" + }, + { + "text": "This sentence is tram.", + "tag": "vehicle" + }, + { + "text": "This sentence is van.", + "tag": "vehicle" + }, + { + "text": "This sentence is vehicle.", + "tag": "vehicle" + }, + { + "text": "This sentence is vin.", + "tag": "vehicle" + }, + { + "text": "This sentence is finance.", + "tag": "financial" + }, + { + "text": "This sentence is financial.", + "tag": "financial" + }, + { + "text": "This sentence is bank.", + "tag": "financial" + }, + { + "text": "This sentence is ATM.", + "tag": "financial" + }, + { + "text": "This sentence is balance.", + "tag": "financial" + }, + { + "text": "This sentence is bill.", + "tag": "financial" + }, + { + "text": "This sentence is cash.", + "tag": "financial" + }, + { + "text": "This sentence is credit.", + "tag": "financial" + }, + { + "text": "This sentence is debit.", + "tag": "financial" + }, + { + "text": "This sentence is deposit.", + "tag": "financial" + }, + { + "text": "This sentence is dollar.", + "tag": "financial" + }, + { + "text": "This sentence is dollars.", + "tag": "financial" + }, + { + "text": "This sentence is loan.", + "tag": "financial" + }, + { + "text": "This sentence is money.", + "tag": "financial" + }, + { + "text": "This sentence is mortgage.", + "tag": "financial" + }, + { + "text": "This sentence is payment.", + "tag": "financial" + }, + { + "text": "This sentence is purchase.", + "tag": "financial" + }, + { + "text": "This sentence is salary.", + "tag": "financial" + }, + { + "text": "This sentence is savings.", + "tag": "financial" + }, + { + "text": "This sentence is transaction.", + "tag": "financial" + }, + { + "text": "This sentence is birth", + 
"tag": "personal" + }, + { + "text": "This sentence is 3G", + "tag": "personal" + }, + { + "text": "This sentence is 4G", + "tag": "personal" + }, + { + "text": "This sentence is cellular.", + "tag": "personal" + }, + { + "text": "This sentence is email.", + "tag": "personal" + }, + { + "text": "This sentence is fax.", + "tag": "personal" + }, + { + "text": "This sentence is password.", + "tag": "personal" + }, + { + "text": "This sentence is text.", + "tag": "personal" + }, + { + "text": "This sentence is telephone.", + "tag": "personal" + }, + { + "text": "This sentence is username.", + "tag": "personal" + }, + { + "text": "This sentence is firearm.", + "tag": "weapon" + }, + { + "text": "This sentence is grenade.", + "tag": "weapon" + }, + { + "text": "This sentence is gun.", + "tag": "weapon" + }, + { + "text": "This sentence is knife.", + "tag": "weapon" + }, + { + "text": "This sentence is rifle", + "tag": "weapon" + }, + { + "text": "This sentence is sword.", + "tag": "weapon" + }, + { + "text": "This sentence is passport.", + "tag": "identity document" + }, + { + "text": "This sentence is citizen.", + "tag": "identity document" + }, + { + "text": "This sentence is license.", + "tag": "identity document" + }, + { + "text": "This sentence is country.", + "tag": "identity document" + }, + { + "text": "This sentence is DOB.", + "tag": "identity document" + }, + { + "text": "This sentence is identity.", + "tag": "identity document" + }, + { + "text": "This sentence is surname.", + "tag": "identity document" + }, + { + "text": "This sentence is name.", + "tag": "identity document" + }, + { + "text": "This sentence is address.", + "tag": "identity document" + }, + { + "text": "This sentence is nationality.", + "tag": "identity document" + }, + { + "text": "This sentence is airline.", + "tag": "travel" + }, + { + "text": "This sentence is airport.", + "tag": "travel" + }, + { + "text": "This sentence is booking.", + "tag": "travel" + }, + { + "text": "This sentence 
is hotel.", + "tag": "travel" + }, + { + "text": "This sentence is itinerary.", + "tag": "travel" + }, + { + "text": "This sentence is motel.", + "tag": "travel" + }, + { + "text": "This sentence is passenger.", + "tag": "travel" + }, + { + "text": "This sentence is reservation.", + "tag": "travel" + }, + { + "text": "This sentence is roundtrip.", + "tag": "travel" + }, + { + "text": "This sentence is travel.", + "tag": "travel" + }, + { + "text": "This sentence is trip.", + "tag": "travel" + } +] \ No newline at end of file