openmpf · Chris7C · Feb 1, 2024 · Sep 14, 2023 · Sep 16, 2023 · Sep 23, 2023
diff --git a/cpp/KeywordTagging/KeywordTagging.cpp b/cpp/KeywordTagging/KeywordTagging.cpp
@@ -26,6 +26,7 @@
 
 #include "KeywordTagging.h"
 #include <string>
+#include <codecvt>
 #include <vector>
 #include <iostream>
 #include <fstream>
@@ -278,7 +279,7 @@ bool KeywordTagging::comp_regex(const MPFJob &job, const wstring &full_text,
 
 set<wstring> KeywordTagging::search_regex(const MPFJob &job, const wstring &full_text,
                                           const map<wstring, vector<pair<wstring, bool>>> &json_kvs_regex,
-                                          map<wstring, vector<string>>  &trigger_words_offset,
+                                          map<wstring, map<wstring, vector<string>>> &trigger_tags_words_offset,
                                           bool full_regex) {
     wstring found_tags_regex = L"";
     set<wstring> found_keys_regex;
@@ -288,19 +289,21 @@ set<wstring> KeywordTagging::search_regex(const MPFJob &job, const wstring &full
     }
 
     for (const auto &kv : json_kvs_regex) {
-        auto key = kv.first;
+        auto key = boost::locale::to_lower(kv.first);
         auto values = kv.second;
+        map<wstring, vector<string>>  trigger_words_offset;
         for (const pair<wstring, bool> &value : values) {
             wstring regex_pattern = value.first;
             bool case_sens = value.second;
 
             if (comp_regex(job, full_text, regex_pattern, trigger_words_offset, full_regex, case_sens)) {
                 found_keys_regex.insert(key);
+                trigger_tags_words_offset[key] = trigger_words_offset;
                 // Discontinue searching unless full regex search is enabled.
                 if (!full_regex) {
                     break;
                 }
-            }
+            }      
         }
     }
 
@@ -542,31 +545,49 @@ void KeywordTagging::process_text_tagging(Properties &detection_properties, cons
 
         bool full_regex = DetectionComponentUtils::GetProperty(job.job_properties, "FULL_REGEX_SEARCH", true);
 
-        set<wstring> trigger_words;
-        map<wstring, vector<string>> trigger_words_offset;
-        set<wstring> found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_words_offset, full_regex);
+        map<wstring, map<wstring, vector<string>>> trigger_tags_words_offset;
+        set<wstring> found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_tags_words_offset, full_regex);
         all_found_tags.insert(found_tags_regex.begin(), found_tags_regex.end());
 
         wstring tag_string = boost::algorithm::join(found_tags_regex, L"; ");
 
-        vector<string> offsets_list;
-        vector<wstring> triggers_list;
-
-        wstring tag_trigger = boost::algorithm::join(trigger_words, L"; ");
-
-        for (auto const& word_offset : trigger_words_offset) {
-            triggers_list.push_back(word_offset.first);
-            offsets_list.push_back(boost::algorithm::join(word_offset.second, ", "));
-        }
+        map<wstring, map<wstring, vector<string>>>::iterator trigger_tags_words_offset_iterator = trigger_tags_words_offset.begin();
+        while(trigger_tags_words_offset_iterator != trigger_tags_words_offset.end())
+        {
+            vector<string> offsets_list;
+            vector<wstring> triggers_list;
 
-        string tag_offset = boost::algorithm::join(offsets_list, "; ");
-        tag_trigger = tag_trigger + boost::algorithm::join(triggers_list, L"; ");
+            wstring tag = trigger_tags_words_offset_iterator->first;
+            boost::to_upper(tag);
+            map<wstring, vector<string>> trigger_words_offset = trigger_tags_words_offset_iterator->second;
 
-        detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf<char>(tag_trigger);
-        detection_properties[boost::locale::conv::utf_to_utf<char>(prop)+ " TRIGGER WORDS OFFSET"] = tag_offset;
+            for (auto const& word_offset : trigger_words_offset) {
+                triggers_list.push_back(word_offset.first);
+                offsets_list.push_back(boost::algorithm::join(word_offset.second, ", "));
+            }
+
+            string tag_offset = boost::algorithm::join(offsets_list, "; ");
+            wstring tag_trigger = boost::algorithm::join(triggers_list, L"; ");
+
+            detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " " + boost::locale::conv::utf_to_utf<char>(tag) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf<char>(tag_trigger);
+            detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " " + boost::locale::conv::utf_to_utf<char>(tag) + " TRIGGER WORDS OFFSET"] = tag_offset;
+            trigger_tags_words_offset_iterator++;
+        }   
     }
 
     if (has_text) {
+        // store off earlier tags
+        boost::regex delimiter{"( *; *)"};
+        boost::sregex_token_iterator iter(detection_properties["TAGS"].begin(), 
+            detection_properties["TAGS"].end(), delimiter, -1);
+        boost::sregex_token_iterator end;
+
+        while(iter != end)
+        {
+            std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> convert_s_to_ws;
+            all_found_tags.insert(boost::to_lower_copy(convert_s_to_ws.from_bytes(*iter++)));
+        }
+
         wstring tag_string = boost::algorithm::join(all_found_tags, L"; ");
         detection_properties["TAGS"] = boost::locale::conv::utf_to_utf<char>(tag_string);
     }

diff --git a/cpp/KeywordTagging/KeywordTagging.h b/cpp/KeywordTagging/KeywordTagging.h
@@ -57,7 +57,7 @@ class KeywordTagging : public MPFDetectionComponent {
 
     std::set<std::wstring> search_regex(const MPFJob &job, const std::wstring &full_text,
                                         const std::map<std::wstring, std::vector<std::pair<std::wstring, bool>>> &json_kvs_regex,
-                                        std::map<std::wstring, std::vector<std::string>> &trigger_words_offset,
+                                        std::map<std::wstring, std::map<std::wstring, std::vector<std::string>>> &trigger_tags_words_offset,
                                         bool full_regex);
 
     void process_regex_match(const boost::wsmatch &match, const std::wstring &full_text,

diff --git a/cpp/KeywordTagging/README.md b/cpp/KeywordTagging/README.md
@@ -14,20 +14,14 @@ component.
 
 # Inputs
 
-When performing keyword tagging on a text file, the contents of the file will be
-stored in a `TEXT` output property. When performing keyword tagging on
-feed-forward detections generated from some other component in a multi-stage
-pipeline, the output properties from that component will be preserved. This
-means that if those detections have a `TEXT` output property, then this
-component will generate detections with the same `TEXT` output. Similarly, if
-those detections have a `TRANSCRIPT` output property, then this component will
-generate detections with the same `TRANSCRIPT` output.
-
-Keyword tagging will be performed on all of the input properties listed in
-`FEED_FORWARD_PROP_TO_PROCESS`, if present. If none of the input properties are
-present then keyword tagging is not performed and the feed-forward detection
-is returned unmodified. For the sake of discussion, let's assume we need to
-perform keyword tagging on the `TEXT` property.
+When acting as a downstream stage of a feed-forward pipeline, this component will
+accept feed-forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job
+property will be used to determine which properties in the feed-forward track
+should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to
+`TEXT,TRANSLATION` then this component will look for tags in both the `TEXT` and
+`TRANSLATION` properties in the feed-forward track. The trigger words for each of
+these properties will be represented as separate outputs. Refer to the Outputs
+section below.
 
 # JSON Tagging File
 
@@ -122,48 +116,61 @@ pattern becomes `(\\b)end(\\W+)of(\\W+)a(\\W+)sentence\\.`. Note that the `.`
 symbol is typically used in regex to match any character, which is why we use `\\.`
 instead.
 
-# Outputs
 
-Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's
-present, and not just whitespace, will result in a `TRIGGER WORDS` and
-`TRIGGER WORDS OFFSET` output property. For example, if 
-`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION`
-properties are both present, then the following output properties will be produced:
+# Outputs
 
-- `TEXT TRIGGER WORDS`
-- `TEXT TRIGGER WORDS OFFSET`
-- `TRANSLATION TRIGGER WORDS`
-- `TRANSLATION TRIGGER WORDS OFFSET`
+When performing keyword tagging on a text file, the contents of the file will be
+stored in a `TEXT` output property. When performing keyword tagging on
+feed-forward detections generated from some other component in a multi-stage
+pipeline, the output properties from that component will be preserved. This
+means that if those detections have a `TEXT` output property, then this
+component will generate detections with the same `TEXT` output. Similarly, if
+those detections have a `TRANSLATION` output property, then this component will
+generate detections with the same `TRANSLATION` output.
+
+Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and
+not just whitespace, will result in a `[TAG] TRIGGER WORDS` and 
+`[TAG] TRIGGER WORDS OFFSET` output property. The `[TAG]` will be the tag property
+that matched in the input text. For example, if
+`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION`
+properties are both present, then the following output properties will be produced
+if trigger words are found:
+
+- `TEXT [TAG] TRIGGER WORDS`
+- `TEXT [TAG] TRIGGER WORDS OFFSET`
+- `TRANSLATION [TAG] TRIGGER WORDS`
+- `TRANSLATION [TAG] TRIGGER WORDS OFFSET`
 
 Let's assume that we need process the `TEXT` property. The substring(s) that
-triggered each tag will be stored in `TEXT TRIGGER WORDS` in alphabetical order.
-For each trigger word the substring index range relative to the `TEXT` output
-will be stored in `TEXT TRIGGER WORDS OFFSET`. Because the same trigger word
-can be encountered multiple times in the `TEXT` output, the results are organized
-as follows:
-
-* `TEXT TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon
-followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2`
-    * Because semicolons can be part of the trigger word itself, those
-    semicolons will be encapsulated in brackets. For example,
-    `detected trigger with a ;` in the input `TEXT` is reported as
-    `TEXT TRIGGER WORDS=detected trigger with a [;]; some other trigger`.
-* `TEXT TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same trigger
-word reported in sequence, is separated by a semicolon followed by a space.
-Indexes within a single group are separated by commas.
-    * Example `TEXT TRIGGER WORDS=trigger1; trigger2`,
-    `TEXT TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` occurs twice
-    in the text at the index ranges 0-5 and 6-10, and `trigger2` occurs at index
-    range 12-15.
-
-Note that all `TEXT TRIGGER WORDS` results are trimmed of leading and trailing
-whitespace, regardless of the regex pattern used. The respective
-`TEXT TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings.
+triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical
+order. For each trigger word the substring index range relative to the `TEXT`
+output will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same
+trigger word can be encountered multiple times in the `TEXT` output, the results
+are organized as follows:
+
+* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a
+semicolon followed by a space. For example: 
+`TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`
+    * Because semicolons can be part of the trigger word itself, those semicolons
+    will be encapsulated in brackets. For example, `detected trigger with a ;` in
+    the input `TEXT` is reported as
+    `TEXT [TAG] TRIGGER WORDS=detected trigger with a [;]; some other trigger`.
+* `TEXT [TAG] TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same
+trigger word reported in sequence, is separated by a semicolon followed by a
+space. Indexes within a single group are separated by commas.
+    * Example `TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`,
+    `TEXT [TAG] TRIGGER WORDS OFFSET=0-7, 20-27; 55-62`, means that `trigger1`
+    occurs twice in the text at the index ranges 0-7 and 20-27, and `trigger2`
+    occurs once at index range 55-62.
+
+Note that all `TEXT [TAG] TRIGGER WORDS` results are trimmed of leading and
+trailing whitespace, regardless of the regex pattern used. The respective
+`TEXT [TAG] TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings.
 
 The tags associated with the trigger words will be stored in a `TAGS` output
 property in alphabetical order, separated by semicolons. Note that there is only
-one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS OFFSET`,
-which are prefixed by the input property that produced those trigger words.
-Each tag will only appear once in `TAGS` no matter how many trigger words
-activate that tag. It doesn't matter if the trigger words are found in only one
-or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`.
+one `TAGS` output property. This is unlike `[TAG] TRIGGER WORDS` and
+`[TAG] TRIGGER WORDS OFFSET`, which are prefixed by the input property that produced those
+trigger words. Each tag will only appear once in `TAGS` no matter how many trigger
+words activate that tag. It doesn't matter if the trigger words are found in only
+one or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`.