Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
784fd04
creating transformer sentence tagging component
mcrensh Sep 14, 2023
23474e3
Get tests to work in local dev. env.
jrobble Sep 16, 2023
bd834fb
added translation to props to process, changed custom corpus test
mcrensh Sep 23, 2023
4883351
Improve speed.
jrobble Sep 25, 2023
cc9ce01
Refactor.
jrobble Sep 25, 2023
9aac898
wip
Chris7C Dec 19, 2023
0c42ed2
made tags added by feedforward transformer tagger case insensitive
Chris7C Jan 3, 2024
4b01009
Updates to transformer and keyword tagger to include brackets around …
Chris7C Jan 12, 2024
90350a3
Updates to dockerfiles
Chris7C Jan 12, 2024
28a4df9
Merge branch 'develop' of https://github.com/openmpf/openmpf-componen…
Chris7C Jan 12, 2024
a792e35
undoing bad commit
Chris7C Jan 12, 2024
efc6349
added trackType
Chris7C Jan 12, 2024
5c42c1e
updates based on review
Chris7C Jan 16, 2024
b5d88ba
Merge branch 'develop' of https://github.com/openmpf/openmpf-componen…
Chris7C Jan 19, 2024
a723cc8
Merge branch 'develop' of https://github.com/openmpf/openmpf-componen…
Chris7C Jan 22, 2024
aea5cc0
Update KeywordTagging README.
jrobble Jan 31, 2024
803a41a
Fix TransformerTagging char offset.
jrobble Jan 31, 2024
54097db
updated transformer tagger with update version and new changes from a…
Chris7C Jan 31, 2024
f5b575b
Improve TransformerTagging character offset logic to handle repeats.
jrobble Jan 31, 2024
0cc3a8a
Improve tagging READMEs.
jrobble Jan 31, 2024
7cfc5a7
Merge branch 'jrobble/transformer-tagging-jenkins' into jrobble/trans…
jrobble Feb 1, 2024
af44aac
Merge branch 'feature/transformer-tagging' into jrobble/transformer-t…
jrobble Feb 1, 2024
ef6e96f
Add cache_folder.
jrobble Feb 1, 2024
c029664
Remove debug line.
jrobble Feb 1, 2024
ab219fb
Preserve newline characters.
jrobble Feb 1, 2024
8954ac0
Fix offset.
jrobble Feb 1, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 40 additions & 19 deletions cpp/KeywordTagging/KeywordTagging.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

#include "KeywordTagging.h"
#include <string>
#include <codecvt>
#include <vector>
#include <iostream>
#include <fstream>
Expand Down Expand Up @@ -278,7 +279,7 @@ bool KeywordTagging::comp_regex(const MPFJob &job, const wstring &full_text,

set<wstring> KeywordTagging::search_regex(const MPFJob &job, const wstring &full_text,
const map<wstring, vector<pair<wstring, bool>>> &json_kvs_regex,
map<wstring, vector<string>> &trigger_words_offset,
map<wstring, map<wstring, vector<string>>> &trigger_tags_words_offset,
bool full_regex) {
wstring found_tags_regex = L"";
set<wstring> found_keys_regex;
Expand All @@ -288,19 +289,21 @@ set<wstring> KeywordTagging::search_regex(const MPFJob &job, const wstring &full
}

for (const auto &kv : json_kvs_regex) {
auto key = kv.first;
auto key = boost::locale::to_lower(kv.first);
auto values = kv.second;
map<wstring, vector<string>> trigger_words_offset;
for (const pair<wstring, bool> &value : values) {
wstring regex_pattern = value.first;
bool case_sens = value.second;

if (comp_regex(job, full_text, regex_pattern, trigger_words_offset, full_regex, case_sens)) {
found_keys_regex.insert(key);
trigger_tags_words_offset[key] = trigger_words_offset;
// Discontinue searching unless full regex search is enabled.
if (!full_regex) {
break;
}
}
}
}
}

Expand Down Expand Up @@ -542,31 +545,49 @@ void KeywordTagging::process_text_tagging(Properties &detection_properties, cons

bool full_regex = DetectionComponentUtils::GetProperty(job.job_properties, "FULL_REGEX_SEARCH", true);

set<wstring> trigger_words;
map<wstring, vector<string>> trigger_words_offset;
set<wstring> found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_words_offset, full_regex);
map<wstring, map<wstring, vector<string>>> trigger_tags_words_offset;
set<wstring> found_tags_regex = search_regex(job, text, json_kvs_regex, trigger_tags_words_offset, full_regex);
all_found_tags.insert(found_tags_regex.begin(), found_tags_regex.end());

wstring tag_string = boost::algorithm::join(found_tags_regex, L"; ");

vector<string> offsets_list;
vector<wstring> triggers_list;

wstring tag_trigger = boost::algorithm::join(trigger_words, L"; ");

for (auto const& word_offset : trigger_words_offset) {
triggers_list.push_back(word_offset.first);
offsets_list.push_back(boost::algorithm::join(word_offset.second, ", "));
}
map<wstring, map<wstring, vector<string>>>::iterator trigger_tags_words_offset_iterator = trigger_tags_words_offset.begin();
while(trigger_tags_words_offset_iterator != trigger_tags_words_offset.end())
{
vector<string> offsets_list;
vector<wstring> triggers_list;

string tag_offset = boost::algorithm::join(offsets_list, "; ");
tag_trigger = tag_trigger + boost::algorithm::join(triggers_list, L"; ");
wstring tag = trigger_tags_words_offset_iterator->first;
boost::to_upper(tag);
map<wstring, vector<string>> trigger_words_offset = trigger_tags_words_offset_iterator->second;

detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf<char>(tag_trigger);
detection_properties[boost::locale::conv::utf_to_utf<char>(prop)+ " TRIGGER WORDS OFFSET"] = tag_offset;
for (auto const& word_offset : trigger_words_offset) {
triggers_list.push_back(word_offset.first);
offsets_list.push_back(boost::algorithm::join(word_offset.second, ", "));
}

string tag_offset = boost::algorithm::join(offsets_list, "; ");
wstring tag_trigger = boost::algorithm::join(triggers_list, L"; ");

detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " " + boost::locale::conv::utf_to_utf<char>(tag) + " TRIGGER WORDS"] = boost::locale::conv::utf_to_utf<char>(tag_trigger);
detection_properties[boost::locale::conv::utf_to_utf<char>(prop) + " " + boost::locale::conv::utf_to_utf<char>(tag) + " TRIGGER WORDS OFFSET"] = tag_offset;
trigger_tags_words_offset_iterator++;
}
}

if (has_text) {
// store off earlier tags
boost::regex delimiter{"( *; *)"};
boost::sregex_token_iterator iter(detection_properties["TAGS"].begin(),
detection_properties["TAGS"].end(), delimiter, -1);
boost::sregex_token_iterator end;

while(iter != end)
{
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> convert_s_to_ws;
all_found_tags.insert(boost::to_lower_copy(convert_s_to_ws.from_bytes(*iter++)));
}

wstring tag_string = boost::algorithm::join(all_found_tags, L"; ");
detection_properties["TAGS"] = boost::locale::conv::utf_to_utf<char>(tag_string);
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/KeywordTagging/KeywordTagging.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class KeywordTagging : public MPFDetectionComponent {

std::set<std::wstring> search_regex(const MPFJob &job, const std::wstring &full_text,
const std::map<std::wstring, std::vector<std::pair<std::wstring, bool>>> &json_kvs_regex,
std::map<std::wstring, std::vector<std::string>> &trigger_words_offset,
std::map<std::wstring, std::map<std::wstring, std::vector<std::string>>> &trigger_tags_words_offset,
bool full_regex);

void process_regex_match(const boost::wsmatch &match, const std::wstring &full_text,
Expand Down
111 changes: 59 additions & 52 deletions cpp/KeywordTagging/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,20 +14,14 @@ component.

# Inputs

When performing keyword tagging on a text file, the contents of the file will be
stored in a `TEXT` output property. When performing keyword tagging on
feed-forward detections generated from some other component in a multi-stage
pipeline, the output properties from that component will be preserved. This
means that if those detections have a `TEXT` output property, then this
component will generate detections with the same `TEXT` output. Similarly, if
those detections have a `TRANSCRIPT` output property, then this component will
generate detections with the same `TRANSCRIPT` output.

Keyword tagging will be performed on all of the input properties listed in
`FEED_FORWARD_PROP_TO_PROCESS`, if present. If none of the input properties are
present then keyword tagging is not performed and the feed-forward detection
is returned unmodified. For the sake of discussion, let's assume we need to
perform keyword tagging on the `TEXT` property.
When acting as a downstream stage of a feed-forward pipeline, this component will
accept feed-forward tracks as input. The `FEED_FORWARD_PROP_TO_PROCESS` job
property will be used to determine which properties in the feed-forward track
should be processed. For example, if `FEED_FORWARD_PROP_TO_PROCESS` is set to
`TEXT,TRANSLATION` then this component will look for tags in both the `TEXT` and
`TRANSLATION` properties in the feed-forward track. The trigger words for each of
these properties will be represented as separate outputs. Refer to the Outputs
section below.

# JSON Tagging File

Expand Down Expand Up @@ -122,48 +116,61 @@ pattern becomes `(\\b)end(\\W+)of(\\W+)a(\\W+)sentence\\.`. Note that the `.`
symbol is typically used in regex to match any character, which is why we use `\\.`
instead.

# Outputs

Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's
present, and not just whitespace, will result in a `TRIGGER WORDS` and
`TRIGGER WORDS OFFSET` output property. For example, if
`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION`
properties are both present, then the following output properties will be produced:
# Outputs

- `TEXT TRIGGER WORDS`
- `TEXT TRIGGER WORDS OFFSET`
- `TRANSLATION TRIGGER WORDS`
- `TRANSLATION TRIGGER WORDS OFFSET`
When performing keyword tagging on a text file, the contents of the file will be
stored in a `TEXT` output property. When performing keyword tagging on
feed-forward detections generated from some other component in a multi-stage
pipeline, the output properties from that component will be preserved. This
means that if those detections have a `TEXT` output property, then this
component will generate detections with the same `TEXT` output. Similarly, if
those detections have a `TRANSLATION` output property, then this component will
generate detections with the same `TRANSLATION` output.

Each input property listed in `FEED_FORWARD_PROP_TO_PROCESS` that's present, and
not just whitespace, will result in a `[TAG] TRIGGER WORDS` and
`[TAG] TRIGGER WORDS OFFSET` output property. The `[TAG]` will be the tag property
that matched in the input text. For example, if
`FEED_FORWARD_PROP_TO_PROCESS=TEXT,TRANSLATION`, and the `TEXT` and `TRANSLATION`
properties are both present, then the following output properties will be produced
if trigger words are found:

- `TEXT [TAG] TRIGGER WORDS`
- `TEXT [TAG] TRIGGER WORDS OFFSET`
- `TRANSLATION [TAG] TRIGGER WORDS`
- `TRANSLATION [TAG] TRIGGER WORDS OFFSET`

Let's assume that we need to process the `TEXT` property. The substring(s) that
triggered each tag will be stored in `TEXT TRIGGER WORDS` in alphabetical order.
For each trigger word the substring index range relative to the `TEXT` output
will be stored in `TEXT TRIGGER WORDS OFFSET`. Because the same trigger word
can be encountered multiple times in the `TEXT` output, the results are organized
as follows:

* `TEXT TRIGGER WORDS`: Each distinct trigger word is separated by a semicolon
followed by a space. For example: `TEXT TRIGGER WORDS=trigger1; trigger2`
* Because semicolons can be part of the trigger word itself, those
semicolons will be encapsulated in brackets. For example,
`detected trigger with a ;` in the input `TEXT` is reported as
`TEXT TRIGGER WORDS=detected trigger with a [;]; some other trigger`.
* `TEXT TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same trigger
word reported in sequence, is separated by a semicolon followed by a space.
Indexes within a single group are separated by commas.
* Example `TEXT TRIGGER WORDS=trigger1; trigger2`,
`TEXT TRIGGER WORDS OFFSET=0-5, 6-10; 12-15`, means that `trigger1` occurs twice
in the text at the index ranges 0-5 and 6-10, and `trigger2` occurs at index
range 12-15.

Note that all `TEXT TRIGGER WORDS` results are trimmed of leading and trailing
whitespace, regardless of the regex pattern used. The respective
`TEXT TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings.
triggered each tag will be stored in `TEXT [TAG] TRIGGER WORDS` in alphabetical
order. For each trigger word the substring index range relative to the `TEXT`
output will be stored in `TEXT [TAG] TRIGGER WORDS OFFSET`. Because the same
trigger word can be encountered multiple times in the `TEXT` output, the results
are organized as follows:

* `TEXT [TAG] TRIGGER WORDS`: Each distinct trigger word is separated by a
semicolon followed by a space. For example:
`TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`
* Because semicolons can be part of the trigger word itself, those semicolons
will be encapsulated in brackets. For example, `detected trigger with a ;` in
the input `TEXT` is reported as
`TEXT [TAG] TRIGGER WORDS=detected trigger with a [;]; some other trigger`.
* `TEXT [TAG] TRIGGER WORDS OFFSET`: Each group of indexes, referring to the same
trigger word reported in sequence, is separated by a semicolon followed by a
space. Indexes within a single group are separated by commas.
* Example `TEXT [TAG] TRIGGER WORDS=trigger1; trigger2`,
`TEXT [TAG] TRIGGER WORDS OFFSET=0-7, 20-27; 55-62`, means that `trigger1`
occurs twice in the text at the index ranges 0-7 and 20-27, and `trigger2`
occurs once at index range 55-62.

Note that all `TEXT [TAG] TRIGGER WORDS` results are trimmed of leading and
trailing whitespace, regardless of the regex pattern used. The respective
`TEXT [TAG] TRIGGER WORDS OFFSET` indexes refer to the trimmed substrings.

The tags associated with the trigger words will be stored in a `TAGS` output
property in alphabetical order, separated by semicolons. Note that there is only
one `TAGS` output property. This is unlike `TRIGGER WORDS` and `TRIGGER WORDS OFFSET`,
which are prefixed by the input property that produced those trigger words.
Each tag will only appear once in `TAGS` no matter how many trigger words
activate that tag. It doesn't matter if the trigger words are found in only one
or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`.
one `TAGS` output property. This is unlike `[TAG] TRIGGER WORDS` and
`[TAG] TRIGGER WORDS OFFSET`, which are prefixed by the input property that produced
those trigger words. Each tag will only appear once in `TAGS` no matter how many trigger
words activate that tag. It doesn't matter if the trigger words are found in only
one or multiple input properties defined in `FEED_FORWARD_PROP_TO_PROCESS`.
Loading