From c502b2f5adb58036573dad69e3558ca6b7384a5b Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 2 Apr 2024 21:32:03 -0400 Subject: [PATCH 01/22] Update Azure Translation character limits. Add NLP TextSplitting Models. Update TextSplitter for input langs. --- python/AzureTranslation/Dockerfile | 16 + python/AzureTranslation/README.md | 86 +++-- .../acs_translation_component.py | 247 ++++---------- .../nlp_text_splitter/__init__.py | 27 ++ .../nlp_text_splitter/text_splitter.py | 213 ++++++++++++ .../nlp_text_splitter/wtp_lang_settings.py | 259 +++++++++++++++ .../plugin-files/descriptor/descriptor.json | 25 +- .../AzureTranslation/sample_acs_translator.py | 8 +- python/AzureTranslation/setup.cfg | 5 +- .../art-of-war-translation-1.json | 4 +- .../art-of-war-translation-2.json | 4 +- .../art-of-war-translation-3.json | 4 +- .../art-of-war-translation-4.json | 10 + .../art-of-war-break-sentence-1.json | 9 - .../art-of-war-break-sentence-2.json | 19 -- .../art-of-war-break-sentence-3.json | 12 - .../art-of-war-translation-1.json | 10 - .../art-of-war-translation-2.json | 10 - .../art-of-war-translation-3.json | 10 - .../art-of-war-translation-4.json | 10 - .../art-of-war-translation-5.json | 10 - .../art-of-war-translation-6.json | 10 - .../data/invalid-lang-detect-result.json | 8 + .../tests/test_acs_translation.py | 302 +++++++++++------- 24 files changed, 887 insertions(+), 431 deletions(-) create mode 100644 python/AzureTranslation/nlp_text_splitter/__init__.py create mode 100644 python/AzureTranslation/nlp_text_splitter/text_splitter.py create mode 100644 python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py create mode 100644 python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json delete mode 100644 python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json create mode 100644 python/AzureTranslation/tests/data/invalid-lang-detect-result.json diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index 1e0308ea..ac6999ae 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -34,6 +34,22 @@ ARG RUN_TESTS=false RUN pip install --no-cache-dir langcodes +RUN apt-get update && \ + apt-get install -y git git-lfs && \ + git lfs install && \ + rm -rf /var/lib/apt/lists/* + +# Install WtP and spaCy +RUN pip install --upgrade pip && \ + pip install spacy>=3.7.4 && \ + pip install wtpsplit>=1.3.0 && \ + pip install torch --index-url https://download.pytorch.org/whl/cpu + +# Modify to add downloads for other models of interest. 
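+# For example, a hypothetical larger WtP checkpoint or a second spaCy pipeline
+# could be cached at build time with extra clone/download steps such as:
+#   git clone https://huggingface.co/benjamin/<other-wtp-model> && \
+#   python3 -m spacy download <other_spacy_pipeline>
+# (placeholder names shown; substitute a real model from the WtP or spaCy
+# lists referenced in the README).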
+RUN mkdir /wtp_models && cd /wtp_models && \ + git clone https://huggingface.co/benjamin/wtp-bert-mini && \ + python3 -m spacy download xx_sent_ud_sm + RUN --mount=target=.,readwrite \ install-component.sh; \ if [ "${RUN_TESTS,,}" == true ]; then python tests/test_acs_translation.py; fi diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 2740de41..e01a6b0a 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -1,74 +1,106 @@ # Overview This repository contains source code for the OpenMPF Azure Cognitive Services -Translation Component. This component utilizes the [Azure Cognitive Services +Translation Component. This component utilizes the [Azure Cognitive Services Translator REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate) to translate the content of detection properties. It has only been tested against v3.0 of the API. This component translates the content of existing detection properties, -so it only makes sense to use it with -[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and +so it only makes sense to use it with +[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and when it isn't the first element of a pipeline. - -When a detection property is translated, the translation is put in to a new -detection property named `TRANSLATION`. The original detection property is not -modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 + +When a detection property is translated, the translation is put in to a new +detection property named `TRANSLATION`. The original detection property is not +modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 language code of the translated text will also be added. If the language of the input text is detected to be the same as the `TO_LANGUAGE` job property, -then no translation will occur. When translation is skipped because of -matching languages, the `TRANSLATION` detection property will be omitted and +then no translation will occur. When translation is skipped because of +matching languages, the `TRANSLATION` detection property will be omitted and `SKIPPED TRANSLATION=TRUE` will be added to the detection properties. When the source text is multiple languages, the translation endpoint will only -translate one of the languages. For example, translating -"你叫什么名字? ¿Cómo te llamas?" to English results in +translate one of the languages. For example, translating +"你叫什么名字? ¿Cómo te llamas?" to English results in "What is your name? The Cómo te llamas?". # Required Job Properties In order for the component to process any jobs, the job properties listed below -must be provided. Neither has a default value. +must be provided. Neither has a default value. -- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. - e.g. `https://api.cognitive.microsofttranslator.com` or +- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. + e.g. `https://api.cognitive.microsofttranslator.com` or `https:///translator/text/v3.0`. The URL should not end with `/translate` because two separate endpoints are used. `ACS_URL + '/translate'` is used for translation. `ACS_URL + '/breaksentence'` is used to break up text when it is too long for a single translation request. This property can also be configured using an environment variable named `MPF_PROP_ACS_URL`. 
- + - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services - subscription key. To get one you will need to create an + subscription key. To get one you will need to create an Azure Cognitive Services account. This property can also be configured using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`. - - + + # Important Job Properties: -- `TO_LANGUAGE`: The BCP-47 language code for language that the properties +- `TO_LANGUAGE`: The BCP-47 language code for language that the properties should be translated to. -- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating - which properties in the feed-forward track or detection to consider +- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating + which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. - `FROM_LANGUAGE`: In most cases, this property should not be used. It should - only be used when automatic language detection is detecting the wrong - language. Providing this property prevents the translation endpoint from - doing automatic language detection. If `FROM_LANGUAGE` is provided, and the - text is actually another language, the translation endpoint will return the + only be used when automatic language detection is detecting the wrong + language. Providing this property prevents the translation endpoint from + doing automatic language detection. If `FROM_LANGUAGE` is provided, and the + text is actually another language, the translation endpoint will return the input text unchanged. - + # Listing Supported Languages -To list the supported languages replace `${ACS_URL}` and +To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the following command and run it: ```shell script curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" ``` +# Secondary Job Properties - Text Splitter: + The following settings control the behavior of dividing input + text into acceptable chunks for processing. + + Through preliminary investigation, we identified the [WtP library ("Where's the Point")](https://github.com/bminixhofer/wtpsplit) + and spaCy's multilingual sentence detection model for identifying sentence breaks + in a large section of text. + + WtP models are trained to break up multilingual text without the need + of an input language tag. The disadvantage is that the most accurate + WtP models will need ~3.5 GB of GPU memory. On the other hand, spaCy has a single + multilingual sentence detection that appears to work better for breaking up English + text in certain cases, unfortunately this model lacks support handling for Chinese punctuation. + + - `SENTENCE_MODEL` - Specifies the desired WtP or spaCy sentence detection model. + For CPU and runtime considerations, the author of WtP recommends using `wtp-bert-mini`; + more advanced WtP models that use GPU resources (up to ~8 GB) are also available [(see list of WtP model names here)](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). + The only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. 
+ + [Review list of languages supported by WtP here.](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages) + [Review models and languages supported by spaCy here.](https://spacy.io/models) + + - `SENTENCE_SPLITTER_CHAR_COUNT` - Specifies maximum number of characters to process + through sentence/text splitter. + Default to 500 characters as we only need to process a subsection of text to determine an appropriate + split [(see discussion of potential char lengths - Mozilla Common Voice)](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters). + + - `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG` - Specifies whether to pass input language to + sentence splitter algorithm. Currently, only WtP supports model threshold adjustments + by input language. + + # Sample Program `sample_acs_translator.py` can be used to quickly test with the Azure endpoint. It translates strings provided via command line arguments. diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index c1d3f679..0e2e816e 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -36,21 +36,26 @@ import urllib.parse import urllib.request import uuid -from typing import Callable, Dict, Iterator, List, Literal, Mapping, Match, NamedTuple, \ +from typing import Callable, Dict, List, Literal, Mapping, Match, NamedTuple, \ Optional, Sequence, TypedDict, TypeVar, Union import mpf_component_api as mpf import mpf_component_util as mpf_util +from nlp_text_splitter.text_splitter import TextSplitter, TextSplitterModel + from . import convert_language_code log = logging.getLogger('AcsTranslationComponent') + class AcsTranslationComponent: - @staticmethod - def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: + def __init__(self) -> None: + self._cached_sent_model = TextSplitterModel("wtp-bert-mini", "cpu") + + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: try: log.info(f'Received video job: {job}') ff_track = job.feed_forward_track @@ -59,7 +64,7 @@ def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: 'Component can only process feed forward jobs, ' 'but no feed forward track provided. 
') - tc = TranslationClient(job.job_properties) + tc = TranslationClient(job.job_properties, self._cached_sent_model) tc.add_translations(ff_track.detection_properties) for ff_location in ff_track.frame_locations.values(): tc.add_translations(ff_location.detection_properties) @@ -71,18 +76,24 @@ def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: log.exception('Failed to complete job due to the following exception:') raise - @staticmethod - def get_detections_from_image(job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: - return get_detections_from_non_composite(job, job.feed_forward_location) - @staticmethod - def get_detections_from_audio(job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: - return get_detections_from_non_composite(job, job.feed_forward_track) + def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_location) - @staticmethod - def get_detections_from_generic(job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: + + def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_track) + + + def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: if job.feed_forward_track: - return get_detections_from_non_composite(job, job.feed_forward_track) + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_track) else: log.info('Job did not contain a feed forward track. Assuming media ' 'file is a plain text file containing the text to be translated.') @@ -90,13 +101,16 @@ def get_detections_from_generic(job: mpf.GenericJob) -> Sequence[mpf.GenericTrac track = mpf.GenericTrack(detection_properties=dict(TEXT=text)) modified_job_props = {**job.job_properties, 'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT'} modified_job = job._replace(job_properties=modified_job_props) - return get_detections_from_non_composite(modified_job, track) + return get_detections_from_non_composite(modified_job, + self._cached_sent_model, + track) T_FF_OBJ = TypeVar('T_FF_OBJ', mpf.AudioTrack, mpf.GenericTrack, mpf.ImageLocation) def get_detections_from_non_composite( job: Union[mpf.AudioJob, mpf.GenericJob, mpf.ImageJob], + sentence_model: TextSplitterModel, ff_track: Optional[T_FF_OBJ]) -> Sequence[T_FF_OBJ]: try: log.info(f'Received job: {job}') @@ -105,7 +119,7 @@ def get_detections_from_non_composite( 'Component can only process feed forward jobs, ' 'but no feed forward track provided.') - tc = TranslationClient(job.job_properties) + tc = TranslationClient(job.job_properties, sentence_model) tc.add_translations(ff_track.detection_properties) log.info(f'Processing complete. 
Translated {tc.translation_count} properties.') return (ff_track,) @@ -146,7 +160,7 @@ class UnsupportedSourceLanguage(Exception): class TranslationClient: DETECT_MAX_CHARS = 50_000 - def __init__(self, job_properties: Mapping[str, str]): + def __init__(self, job_properties: Mapping[str, str], sentence_model: TextSplitterModel): self._subscription_key = get_required_property('ACS_SUBSCRIPTION_KEY', job_properties) self._http_retry = mpf_util.HttpRetry.from_properties(job_properties, log.warning) @@ -170,8 +184,7 @@ def __init__(self, job_properties: Mapping[str, str]): acs_url = get_required_property('ACS_URL', job_properties) self._detect_url = create_url(acs_url, 'detect', {}) - self._break_sentence_client = BreakSentenceClient(job_properties, self._subscription_key, - self._http_retry) + self._break_sentence_client = BreakSentenceClient(job_properties, sentence_model) prop_names = job_properties.get('FEED_FORWARD_PROP_TO_PROCESS', 'TEXT,TRANSCRIPT') self._props_to_translate = [p.strip() for p in prop_names.split(',')] @@ -427,25 +440,28 @@ def _send_detect_request(self, text) -> 'AcsResponses.Detect': class BreakSentenceClient: """ - Class to interact with Azure's "/breaksentence" endpoint. It is only used when the text to - translate exceeds the translation endpoint's character limit. + Class to break up large sections of text using WtP and spaCy. + It is only used when the text to translate exceeds + the translation endpoint's character limit. """ # ACS limits the number of characters that can be translated in a single /translate call. - # Taken from https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate - TRANSLATION_MAX_CHARS = 10_000 - - # ACS limits the number of characters that can be processed in a single /breaksentence call. 
- # Taken from https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-break-sentence - BREAK_SENTENCE_MAX_CHARS = 50_000 - - - def __init__(self, job_properties: Mapping[str, str], subscription_key: str, - http_retry: mpf_util.HttpRetry): - self._acs_url = get_required_property('ACS_URL', job_properties) - self._subscription_key = subscription_key - self._http_retry = http_retry - + # Taken from + # https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate + TRANSLATION_MAX_CHARS = 50_000 + + def __init__(self, job_properties: Mapping[str, str], + sentence_model:TextSplitterModel): + self._sentence_model = sentence_model + self._num_boundary_chars = mpf_util.get_property(job_properties, + "SENTENCE_SPLITTER_CHAR_COUNT", + 500) + nlp_model_name = mpf_util.get_property(job_properties, "SENTENCE_MODEL", "wtp-bert-mini") + self._incl_input_lang = mpf_util.get_property(job_properties, + "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", + True) + nlp_model_setting = "cpu" + self._sentence_model.update_model(nlp_model_name, nlp_model_setting) def split_input_text(self, text: str, from_lang: Optional[str], from_lang_confidence: Optional[float]) -> SplitTextResult: @@ -461,81 +477,28 @@ def split_input_text(self, text: str, from_lang: Optional[str], f'{self.TRANSLATION_MAX_CHARS} Azure characters, but the text contained ' f'{azure_char_count} Azure characters.') - if azure_char_count > self.BREAK_SENTENCE_MAX_CHARS: - log.warning('Guessing sentence breaks because the break sentence endpoint allows a ' - f'maximum of {self.BREAK_SENTENCE_MAX_CHARS} Azure characters, but the' - f'text contained {azure_char_count} Azure characters.') - chunks = list(SentenceBreakGuesser.guess_breaks(text)) - log.warning(f'Broke text up in to {len(chunks)} chunks. 
Each chunk will be sent to ' - 'the break sentence endpoint.') - else: - chunks = (text,) - - if from_lang: - break_sentence_url = create_url(self._acs_url, 'breaksentence', - dict(language=from_lang)) + if self._incl_input_lang: + divided_text_list = TextSplitter.split( + text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + self._num_boundary_chars, + get_azure_char_count, + self._sentence_model, + from_lang) else: - break_sentence_url = create_url(self._acs_url, 'breaksentence', {}) + divided_text_list = TextSplitter.split( + text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + self._num_boundary_chars, + get_azure_char_count, + self._sentence_model) - chunk_iter = iter(chunks) - chunk = next(chunk_iter) - response_body = self._send_break_sentence_request(break_sentence_url, chunk) - if not from_lang: - detected_lang_info = response_body[0].get('detectedLanguage') - if detected_lang_info: - from_lang = detected_lang_info['language'] - from_lang_confidence = detected_lang_info['score'] - grouped_sentences = list(self._process_break_sentence_response(chunk, response_body)) - - for chunk in chunk_iter: - response_body = self._send_break_sentence_request(break_sentence_url, chunk) - grouped_sentences.extend(self._process_break_sentence_response(chunk, response_body)) - - log.info('Grouped sentences into %s chunks.', len(grouped_sentences)) - return SplitTextResult(grouped_sentences, from_lang, from_lang_confidence) - - - def _send_break_sentence_request( - self, break_sentence_url: str, text: str) -> 'AcsResponses.BreakSentence': - request_body = [ - {'Text': text} - ] - encoded_body = json.dumps(request_body).encode('utf-8') - request = urllib.request.Request(break_sentence_url, encoded_body, - get_acs_headers(self._subscription_key)) - log.info(f'Sending POST {break_sentence_url}') - log_json(request_body) - with self._http_retry.urlopen(request) as response: - response_body: AcsResponses.BreakSentence = json.load(response) - log.info('Received break sentence response with %s sentences.', - len(response_body[0]['sentLen'])) - log_json(response_body) - return response_body - - - @classmethod - def _process_break_sentence_response( - cls, text: str, response_body: AcsResponses.BreakSentence) -> Iterator[str]: - current_chunk_length = 0 - current_chunk_begin = 0 - current_chunk_azure_char_count = 0 - for length in response_body[0]['sentLen']: - sentence_begin = current_chunk_begin + current_chunk_length - sentence = text[sentence_begin: sentence_begin + length] - sentence_azure_char_count = get_azure_char_count(sentence) - # The /breaksentence endpoint will return sentences <= 1000 characters, so the - # following condition will be true at least once. - if sentence_azure_char_count + current_chunk_azure_char_count <= cls.TRANSLATION_MAX_CHARS: - current_chunk_length += len(sentence) - current_chunk_azure_char_count += sentence_azure_char_count - else: - current_chunk_end = current_chunk_begin + current_chunk_length - yield text[current_chunk_begin:current_chunk_end] - current_chunk_begin = current_chunk_end - current_chunk_length = len(sentence) - current_chunk_azure_char_count = sentence_azure_char_count - yield text[current_chunk_begin:] + chunks = list(divided_text_list) + log.warning(f'Broke text up in to {len(chunks)} chunks. 
Each chunk will be sent to ' + 'the translation endpoint.') + log.info('Grouped sentences into %s chunks.', len(chunks)) + return SplitTextResult(chunks, from_lang, from_lang_confidence) def get_n_azure_chars(input_str: str, begin: int, count: int) -> str: substr = input_str[begin: begin + count] @@ -583,79 +546,7 @@ def set_query_params(url: str, query_params: Mapping[str, str]) -> str: return urllib.parse.urlunparse(replaced_parts) -class SentenceBreakGuesser: - @classmethod - def guess_breaks(cls, text: str) -> Iterator[str]: - """ - Splits text up in to substrings that are all at most - BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS in length. It is preferable to use the - /breaksentence endpoint because splitting a sentence in the middle will cause incorrect - translations. When the input text is too long for /breaksentence, our only option is to - use some heuristics to guess a good location to split the input text. - We attempt to do the minimal number of splits with this method. The substrings produced - by this method will be further split up using the much more accurate /breaksentence - endpoint. - - :param text: Text to split up - :return: Generator producing substrings of input text - """ - current_pos = 0 - max_chars = BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS - while True: - chunk = get_n_azure_chars(text, current_pos, max_chars) - is_last_chunk = len(text) <= current_pos + len(chunk) - if is_last_chunk: - yield chunk - return - else: - break_pos = cls._get_break_pos(chunk) - yield chunk[:break_pos] - current_pos += break_pos - - # Characters we know indicate the end of a sentence. The list is not exhaustive and may need to - # be updated if we come across others. - SENTENCE_END_PUNCTUATION = { - '.', '!', '?', # Latin scripts - '。', '!', '?'} # Chinese (full width) versions - - @classmethod - def _get_break_pos(cls, text: str) -> int: - # Two newlines in a row result in a blank line. Blank lines are commonly used to delimit - # paragraphs. - double_newline_pos = text.rfind('\n\n') - if double_newline_pos > 0: - return double_newline_pos + 2 - - # Look for the last sentence breaking punctuation character in the text. - last_punctuation_pos = next( - (i for i in reversed(range(len(text))) if text[i] in cls.SENTENCE_END_PUNCTUATION), - -1) - if last_punctuation_pos > 0: - return last_punctuation_pos + 1 - - single_newline_pos = text.rfind('\n') - if single_newline_pos > 0: - return single_newline_pos + 1 - - # Look for last punctuation character in the text. - # This will catch non-sentence breaking punctuation, but we already made our best effort - # to use sentence breaking punctuation above. - last_punctuation_pos = next( - (i for i in reversed(range(len(text))) if cls._is_punctuation(text[i])), - -1) - if last_punctuation_pos > 0: - return last_punctuation_pos + 1 - - if (last_space_pos := text.rfind(' ')) > 0: - return last_space_pos + 1 - - # No suitable break found. Use entire input. - return len(text) - - @staticmethod - def _is_punctuation(char): - return unicodedata.category(char) == 'Po' def get_acs_headers(subscription_key: str) -> Dict[str, str]: diff --git a/python/AzureTranslation/nlp_text_splitter/__init__.py b/python/AzureTranslation/nlp_text_splitter/__init__.py new file mode 100644 index 00000000..09805b64 --- /dev/null +++ b/python/AzureTranslation/nlp_text_splitter/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. 
Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from .text_splitter import TextSplitter, TextSplitterModel \ No newline at end of file diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py new file mode 100644 index 00000000..52dc650c --- /dev/null +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -0,0 +1,213 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
# +############################################################################# + +import logging +import os +import pkg_resources + +import spacy +from wtpsplit import WtP +from typing import Callable, List, Optional, Tuple + +from .wtp_lang_settings import WtpLanguageSettings + + +DEFAULT_WTP_MODELS = "/wtp_models" + +# If we want to package model installation with this utility in the future: +WTP_MODELS_PATH = pkg_resources.resource_filename( + __name__, "models" +) + +log = logging.getLogger(__name__) + +class TextSplitterModel: + # To hold spaCy, WtP, and other potential sentence detection models in cache + + def __init__(self, model_name: str, model_setting: str) -> None: + self._model_name = "" + self.split = lambda t, **param: [t] + self.update_model(model_name, model_setting) + + def update_model(self, model_name: str, model_setting: str = ""): + if model_name: + if "wtp" in model_name: + self._update_wtp_model(model_name, model_setting) + self.split = self._split_wtp + log.info(f"Setup up WtP model: {model_name}") + else: + self._update_spacy_model(model_name) + self.split = self._split_spacy + log.info(f"Setup up spaCy model: {model_name}") + + def _update_wtp_model(self, wtp_model_name: str, + model_setting: str = "cpu") -> None: + + if self._model_name != wtp_model_name: + self._model_name = wtp_model_name + # Check if model has been downloaded + if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)): + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name) + + elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name)): + + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name) + + else: + log.warning(f"Model {wtp_model_name} not found, " + "downloading from hugging face.") + + self.wtp_model = WtP(wtp_model_name) + + if model_setting != "cpu" and model_setting != "cuda": + log.warning(f"Invalid setting for WtP runtime {model_setting}. " + "Defaulting to CPU mode.") + model_setting = "cpu" + self.wtp_model.to(model_setting) + + def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: + if lang: + iso_lang = WtpLanguageSettings.convert_to_iso(lang) + if iso_lang: + return self.wtp_model.split(text, lang_code=iso_lang) # type: ignore + return self.wtp_model.split(text) # type: ignore + + def _update_spacy_model(self, spacy_model_name: str): + self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) + self.spacy_model.enable_pipe("senter") + + def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: We may add an auto model selection for spaCy in the future. + # However, the drawback is we will also need to + # download a large number of spaCy models beforehand. 
+ processed_text = self.spacy_model(text) + return [sent.text_with_ws for sent in processed_text.sents] + +class TextSplitter: + # Authors: Brian Rosenberg, Howard Huang + + def __init__( + self, text: str, limit: int, num_boundary_chars: int, + get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None) -> None: + self._sentence_model = sentence_model + self._limit = limit + self._num_boundary_chars = num_boundary_chars + self._get_text_size = get_text_size + self._text = "" + self._text_full_size = 0 + self._num_overhead_bytes = 0 + self._soft_limit = self._limit + self._in_lang = in_lang + + if text: + self.set_text(text) + + def set_text(self, text: str): + self._text = text + self._text_full_size = self._get_text_size(text) + chars_per_size = len(text) / self._text_full_size + self._num_overhead_bytes = self._get_text_size('') + + self._soft_limit = int(self._limit * chars_per_size) - self._num_overhead_bytes + + if self._soft_limit <= 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate overhead bytes with chars_per_size weighting. + self._soft_limit = max(1, + int((self._limit - self._num_overhead_bytes) * chars_per_size)) + + def _isolate_largest_section(self, text:str) -> str: + # Using cached word splitting model, isolate largest section of text + string_length = len(text) + + if self._num_boundary_chars <= 0: + num_chars_to_process = string_length + else: + num_chars_to_process = self._num_boundary_chars + + start_indx = max(0, string_length - num_chars_to_process) + substring = text[start_indx: string_length] + substring_list = self._sentence_model.split(substring, lang = self._in_lang) + div_index = string_length - len(substring_list[-1]) + + if div_index==start_indx: + return text + + return text[0:div_index] + + @classmethod + def split(cls, + text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None + ): + return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split() + + + def _split(self): + if self._text_full_size <= self._limit: + yield self._text + else: + yield from self._split_internal(self._text) + + def _split_internal(self, text): + right = text + while True: + left, right = self._divide(right) + yield left + if not right: + return + + def _divide(self, text) -> Tuple[str, str]: + limit = self._soft_limit + while True: + left = text[:limit] + left_size = self._get_text_size(left) + + if left_size <= self._limit: + if left != text: + # If dividing into two parts + # Determine soft boundary for left segment + left = self._isolate_largest_section(left) + return left, text[len(left):] + + char_per_size = len(left) / left_size + + + limit = int(self._limit * char_per_size) - self._num_overhead_bytes + + if limit < 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate overhead bytes with chars_per_size weighting. 
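+                # Illustrative numbers (not from the codebase): with _limit=100,
+                # _num_overhead_bytes=20 and char_per_size=0.5 (about two size
+                # units per character), the retry limit becomes
+                # max(1, int((100 - 20) * 0.5)) = 40 characters.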
+ limit = max(1, int((self._limit - self._num_overhead_bytes) * char_per_size)) \ No newline at end of file diff --git a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py new file mode 100644 index 00000000..c682fd3f --- /dev/null +++ b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py @@ -0,0 +1,259 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from typing import Optional + +class WtpLanguageSettings: + # Supported languages and ISO 639-1, 639-2 codes for WtP models. + # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages + # https://www.loc.gov/standards/iso639-2/php/code_list.php + _wtp_lang_map = { + 'afrikaans': 'af', + 'afr': 'af', + 'amharic': 'am', + 'amh': 'am', + 'arabic': 'ar', + 'ara': 'ar', + 'azerbaijani': 'az', + 'aze': 'az', + 'belarusian': 'be', + 'bel': 'be', + 'bulgarian': 'bg', + 'bul': 'bg', + 'bengali': 'bn', + 'ben': 'bn', + 'catalan': 'ca', + 'valencian': 'ca', + 'cat': 'ca', + 'cebuano': 'ceb', # In some cases, ISO-639-1 is not available, use ISO-639-2 + 'ceb': 'ceb', + 'czech': 'cs', + 'cze': 'cs', + 'ces': 'cs', + 'welsh': 'cy', + 'wel': 'cy', + 'cym': 'cy', + 'danish': 'da', + 'dan': 'da', + 'german': 'de', + 'ger': 'de', + 'deu': 'de', + 'greek': 'el', + 'gre': 'el', + 'ell': 'el', + 'english': 'en', + 'eng': 'en', + 'esperanto': 'eo', + 'epo': 'eo', + 'spanish': 'es', + 'castilian': 'es', + 'spa': 'es', + 'estonian': 'et', + 'est': 'et', + 'basque': 'eu', + 'baq': 'eu', + 'eus': 'eu', + 'persian': 'fa', + 'per': 'fa', + 'fas': 'fa', + 'finnish': 'fi', + 'fin': 'fi', + 'french': 'fr', + 'fre': 'fr', + 'fra': 'fr', + 'western frisian': 'fy', + 'fry': 'fy', + 'irish': 'ga', + 'gle': 'ga', + 'gaelic': 'gd', + 'scottish gaelic': 'gd', + 'gla': 'gd', + 'galician': 'gl', + 'glg': 'gl', + 'gujarati': 'gu', + 'guj': 'gu', + 'hausa': 'ha', + 'hau': 'ha', + 'hebrew': 'he', + 'heb': 'he', + 'hindi': 'hi', + 'hin': 'hi', + 'hungarian': 'hu', + 'hun': 'hu', + 'armenian': 'hy', + 'arm': 'hy', + 'hye': 'hy', + 'indonesian': 'id', + 'ind': 'id', + 'igbo': 'ig', + 'ibo': 'ig', + 'icelandic': 'is', + 'ice': 'is', + 'isl': 'is', + 'italian': 'it', + 'ita': 'it', + 'japanese': 'ja', + 'jpn': 'ja', + 'javanese': 'jv', + 'jav': 'jv', + 'georgian': 'ka', + 'geo': 'ka', + 'kat': 'ka', + 'kazakh': 'kk', + 'kaz': 'kk', + 
'central khmer': 'km', + 'khm': 'km', + 'kannada': 'kn', + 'kan': 'kn', + 'korean': 'ko', + 'kor': 'ko', + 'kurdish': 'ku', + 'kur': 'ku', + 'kirghiz': 'ky', + 'kyrgyz': 'ky', + 'kir': 'ky', + 'latin': 'la', + 'lat': 'la', + 'lithuanian': 'lt', + 'lit': 'lt', + 'latvian': 'lv', + 'lav': 'lv', + 'malagasy': 'mg', + 'mlg': 'mg', + 'macedonian': 'mk', + 'mac': 'mk', + 'mkd': 'mk', + 'malayalam': 'ml', + 'mal': 'ml', + 'mongolian': 'mn', + 'mon': 'mn', + 'marathi': 'mr', + 'mar': 'mr', + 'malay': 'ms', + 'may': 'ms', + 'msa': 'ms', + 'maltese': 'mt', + 'mlt': 'mt', + 'burmese': 'my', + 'bur': 'my', + 'mya': 'my', + 'nepali': 'ne', + 'nep': 'ne', + 'dutch': 'nl', + 'flemish': 'nl', + 'dut': 'nl', + 'nld': 'nl', + 'norwegian': 'no', + 'nor': 'no', + 'panjabi': 'pa', + 'punjabi': 'pa', + 'pan': 'pa', + 'polish': 'pl', + 'pol': 'pl', + 'pushto': 'ps', + 'pashto': 'ps', + 'pus': 'ps', + 'portuguese': 'pt', + 'por': 'pt', + 'romanian': 'ro', + 'moldavian': 'ro', + 'moldovan': 'ro', + 'rum': 'ro', + 'ron': 'ro', + 'russian': 'ru', + 'rus': 'ru', + 'sinhala': 'si', + 'sinhalese': 'si', + 'sin': 'si', + 'slovak': 'sk', + 'slo': 'sk', + 'slk': 'sk', + 'slovenian': 'sl', + 'slv': 'sl', + 'albanian': 'sq', + 'alb': 'sq', + 'sqi': 'sq', + 'serbian': 'sr', + 'srp': 'sr', + 'swedish': 'sv', + 'swe': 'sv', + 'tamil': 'ta', + 'tam': 'ta', + 'telugu': 'te', + 'tel': 'te', + 'tajik': 'tg', + 'tgk': 'tg', + 'thai': 'th', + 'tha': 'th', + 'turkish': 'tr', + 'tur': 'tr', + 'ukrainian': 'uk', + 'ukr': 'uk', + 'urdu': 'ur', + 'urd': 'ur', + 'uzbek': 'uz', + 'uzb': 'uz', + 'vietnamese': 'vi', + 'vie': 'vi', + 'xhosa': 'xh', + 'xho': 'xh', + 'yiddish': 'yi', + 'yid': 'yi', + 'yoruba': 'yo', + 'yor': 'yo', + 'chinese': 'zh', + 'chi': 'zh', + 'zho': 'zh', + 'zulu': 'zu', + 'zul': 'zu', + 'hans':'zh', # Also check for chinese scripts + 'hant': 'zh', + 'cmn':'zh' # In some cases we use 'cmn' = 'Mandarin' + } + + _wtp_iso_set = set(_wtp_lang_map.values()) + + @classmethod + def convert_to_iso(cls, lang: str) -> Optional[str]: + # ISO 639-2 (language) is sometimes paired with ISO 15924 (script). + # Extract the language portion and check if supported in WtP. + if not lang: + return None + + if '-' in lang: + lang = lang.split('-')[0] + if '_' in lang: + lang = lang.split('_')[0] + + lang = lang.strip().lower() + + if lang in cls._wtp_iso_set: + return lang + + if lang in cls._wtp_lang_map: + return cls._wtp_lang_map[lang] + + return None diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 2533d996..d2839218 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -10,7 +10,7 @@ "description": "Uses Azure Cognitive Services to perform translation.", "actionType": "DETECTION", "trackType": "TRANSLATION", - "outputChangedCounter" : 1, + "outputChangedCounter": 1, "requiresCollection": { "states": [] }, @@ -80,6 +80,24 @@ "description": "Comma-separated list of property names indicating which properties in the feed-forward track or detection determine the language from which to translate. If the first property listed is present, then that property will be used. If it's not, then the next property in the list is considered. 
If none are present, fall back to FROM_LANGUAGE.", "type": "STRING", "defaultValue": "ISO_LANGUAGE,DECODED_LANGUAGE,LANGUAGE" + }, + { + "name": "SENTENCE_SPLITTER_CHAR_COUNT", + "description": "Integer value specifying maximum number of characters to process through sentence splitter. Defaults to 500 characters.", + "type": "INT", + "defaultValue": "500" + }, + { + "name": "SENTENCE_MODEL", + "description": "Name of sentence segmentation model. Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.", + "type": "STRING", + "defaultValue": "wtp-bert-mini" + }, + { + "name": "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", + "description": "Specifies whether to pass input language to sentence splitter algorithm. Currently, only WtP supports model adjustments by input language.", + "type": "BOOLEAN", + "defaultValue": "TRUE" } ] } @@ -104,8 +122,7 @@ "name": "AZURE TRANSLATION TEXT FILE ACTION", "description": "Uses Azure Cognitive Services to perform translation on a plain text file.", "algorithm": "AZURETRANSLATION", - "properties": [ - ] + "properties": [] } ], "tasks": [ @@ -133,4 +150,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/python/AzureTranslation/sample_acs_translator.py b/python/AzureTranslation/sample_acs_translator.py index 0a31a121..ac144a0d 100644 --- a/python/AzureTranslation/sample_acs_translator.py +++ b/python/AzureTranslation/sample_acs_translator.py @@ -29,6 +29,7 @@ import sys from acs_translation_component import TranslationClient +from nlp_text_splitter import TextSplitterModel def main(): @@ -40,10 +41,13 @@ def main(): detection_props = dict(TEXT=text) job_props = dict(TO_LANGUAGE=to_lang, ACS_URL=acs_url, ACS_SUBSCRIPTION_KEY=acs_subscription_key) - TranslationClient(job_props).add_translations(detection_props) + + wtp_model = TextSplitterModel("wtp-bert-mini", "cpu") + TranslationClient(job_props, wtp_model).add_translations(detection_props) print('TRANSLATION SOURCE LANGUAGE:', detection_props['TRANSLATION SOURCE LANGUAGE']) - print('TRANSLATION SOURCE LANGUAGE CONFIDENCE:', detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']) + print('TRANSLATION SOURCE LANGUAGE CONFIDENCE:', + detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']) print('TRANSLATION:') print(detection_props['TRANSLATION']) diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg index 36a5e35d..cb650a2d 100644 --- a/python/AzureTranslation/setup.cfg +++ b/python/AzureTranslation/setup.cfg @@ -29,11 +29,14 @@ name = AzureTranslation version = 8.0 [options] -packages = acs_translation_component +packages = find: install_requires = mpf_component_api>=8.0 mpf_component_util>=8.0 langcodes + spacy>=3.7.4 + wtpsplit>=1.3.0 + torch>=2.2.0 [options.entry_points] mpf.exported_component = diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json index e91c2f42..8d7d5f2e 100644 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json +++ b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json @@ -2,9 +2,9 @@ { "translations": [ { - "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law. 
Taoists, so that the people agree, can die with it, can live with it, not dangerous also; heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also.", + "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law. Taoists, so that the people agree, can die with it, can live with it, not dangerous also;", "to": "en" } ] } -] +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json index 94ee875c..5a7b0316 100644 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json +++ b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json @@ -2,9 +2,9 @@ { "translations": [ { - "text": "Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers? A soldier's practice? Reward and punishment? I know the winner or loser in this way. Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. The powerful, for profit and power also. Soldiers, trickery too.", + "text": "heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also. Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers?", "to": "en" } ] } -] +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json index e39940c0..a1216f22 100644 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json +++ b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json @@ -2,9 +2,9 @@ { "translations": [ { - "text": "Therefore, can show can not, use and show not use, near and far, far and near. To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. The victory of this soldier cannot be passed on first. The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! I see it this way, win or lose.", + "text": "A soldier's practice? Reward and punishment? I know the winner or loser in this way. Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. The powerful, for profit and power also. Soldiers, trickery too. 
Therefore, can show can not, use and show not use, near and far, far and near.", "to": "en" } ] } -] +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json new file mode 100644 index 00000000..8fe82471 --- /dev/null +++ b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json @@ -0,0 +1,10 @@ +[ + { + "translations": [ + { + "text": "To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. The victory of this soldier cannot be passed on first. The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! I see it this way, win or lose.", + "to": "en" + } + ] + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json deleted file mode 100644 index 2cc23f6f..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - { - "sentLen": [ - 24, - 37, - 81 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json deleted file mode 100644 index b937f291..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "sentLen": [ - 22, - 18, - 5, - 5, - 5, - 5, - 5, - 5, - 8, - 27, - 15, - 10, - 7 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json deleted file mode 100644 index 6c796889..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "sentLen": [ - 27, - 50, - 12, - 28, - 13, - 11 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json deleted file mode 100644 index f143b3a7..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. 
Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json deleted file mode 100644 index 5ab5c472..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Taoists, so that the people agree, can die with it, can live with it, not dangerous also; heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json deleted file mode 100644 index 95ebeb51..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers? A soldier's practice? Reward and punishment? I know the winner or loser in this way.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json deleted file mode 100644 index 4cdf8adb..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. The powerful, for profit and power also. Soldiers, trickery too.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json deleted file mode 100644 index ab6da980..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Therefore, can show can not, use and show not use, near and far, far and near. To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. 
The victory of this soldier cannot be passed on first.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json deleted file mode 100644 index 6b8ddc60..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! I see it this way, win or lose.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/invalid-lang-detect-result.json b/python/AzureTranslation/tests/data/invalid-lang-detect-result.json new file mode 100644 index 00000000..7fcfc64b --- /dev/null +++ b/python/AzureTranslation/tests/data/invalid-lang-detect-result.json @@ -0,0 +1,8 @@ +[ + { + "language": "fake-lang", + "score": 1.0, + "isTranslationSupported": true, + "isTransliterationSupported": true + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index d9651d1b..fa81c1e4 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -42,10 +42,10 @@ import mpf_component_api as mpf sys.path.insert(0, str(pathlib.Path(__file__).parent.parent)) +from nlp_text_splitter import TextSplitterModel, TextSplitter from acs_translation_component.acs_translation_component import (AcsTranslationComponent, get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints, - AcsTranslateUrlBuilder, BreakSentenceClient, SentenceBreakGuesser, get_n_azure_chars) - + AcsTranslateUrlBuilder, BreakSentenceClient, get_n_azure_chars) SEEN_TRACE_IDS = set() @@ -63,10 +63,15 @@ class TestAcsTranslation(unittest.TestCase): mock_server: ClassVar['MockServer'] + wtp_model: ClassVar['TextSplitterModel'] + spacy_model: ClassVar['TextSplitterModel'] @classmethod def setUpClass(cls): cls.mock_server = MockServer() + cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu") + cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu") + @classmethod def tearDownClass(cls): @@ -100,9 +105,11 @@ def validate_results(results): result.detection_properties['TRANSLATION']) self.assertEqual('EN', result.detection_properties['TRANSLATION TO LANGUAGE']) - self.assertEqual('zh-Hans', result.detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('zh-Hans', + result.detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( - 1.0, float(result.detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + 1.0, + float(result.detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) self.assertNotIn('SKIPPED TRANSLATION', result.detection_properties) detect_request_body = self.get_request_body() @@ -182,29 +189,31 @@ def test_video_job(self): self.assertEqual(CHINESE_SAMPLE_TEXT, - result.frame_locations[0].detection_properties['TEXT']) + result.frame_locations[0].detection_properties['TEXT']) self.assertEqual(CHINESE_SAMPLE_TEXT_ENG_TRANSLATE, - result.frame_locations[0].detection_properties['TRANSLATION']) + result.frame_locations[0].detection_properties['TRANSLATION']) self.assertEqual('EN', - result.frame_locations[0].detection_properties['TRANSLATION TO LANGUAGE']) + 
result.frame_locations[0].detection_properties['TRANSLATION TO LANGUAGE']) self.assertEqual('zh-Hans', - result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( 1.0, - float(result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + float(result.frame_locations[0]\ + .detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) self.assertEqual(SPANISH_SAMPLE_TEXT, - result.frame_locations[1].detection_properties['TEXT']) + result.frame_locations[1].detection_properties['TEXT']) self.assertEqual(SPANISH_SAMPLE_TEXT_ENG_TRANSLATE, - result.frame_locations[1].detection_properties['TRANSLATION']) + result.frame_locations[1].detection_properties['TRANSLATION']) self.assertEqual('EN', - result.frame_locations[1].detection_properties['TRANSLATION TO LANGUAGE']) + result.frame_locations[1].detection_properties['TRANSLATION TO LANGUAGE']) self.assertEqual('es', - result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) + result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( 1.0, - float(result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + float(result.frame_locations[1]\ + .detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) request_body1 = self.get_request_body() self.assertEqual(1, len(request_body1)) @@ -260,14 +269,14 @@ def test_detect_lang_disabled(self): def test_no_feed_forward_location(self): job = mpf.ImageJob('Test', 'test.jpg', get_test_properties(), {}) with self.assertRaises(mpf.DetectionException) as cm: - list(AcsTranslationComponent.get_detections_from_image(job)) + list(AcsTranslationComponent().get_detections_from_image(job)) self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) def test_no_feed_forward_track(self): job = mpf.VideoJob('test', 'test.jpg', 0, 1, get_test_properties(), {}) with self.assertRaises(mpf.DetectionException) as cm: - list(AcsTranslationComponent.get_detections_from_video(job)) + list(AcsTranslationComponent().get_detections_from_video(job)) self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) @@ -283,7 +292,7 @@ def test_reports_error_when_server_error(self, _): ff_track) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_video(job) + AcsTranslationComponent().get_detections_from_video(job) self.assertEqual(mpf.DetectionError.NETWORK_ERROR, cm.exception.error_code) @@ -294,14 +303,14 @@ def test_reports_error_when_missing_acs_props(self): del test_props['ACS_URL'] job = mpf.ImageJob('Test', 'test.jpg', test_props, {}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) test_props = get_test_properties() del test_props['ACS_SUBSCRIPTION_KEY'] job = mpf.ImageJob('Test', 'test.jpg', test_props, {}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) @@ -314,7 +323,7 @@ def test_missing_required_properties(self): job = mpf.ImageJob('Test', 'test.jpg', test_props, 
{}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) @@ -363,7 +372,7 @@ def test_translation_cache(self): job = mpf.VideoJob('test', 'test.jpg', 0, 1, get_test_properties(), {}, ff_track) - results = list(AcsTranslationComponent.get_detections_from_video(job)) + results = list(AcsTranslationComponent().get_detections_from_video(job)) self.assertEqual(1, len(results)) result = results[0] @@ -504,20 +513,17 @@ def assert_expected_url(job_properties, expected_to, expected_from, expected_que dict(ACS_URL='http://example.com/test?suggestedFrom=ru&category=whatever'), 'en', None, {'suggestedFrom': 'ru', 'category': 'whatever'}) - - - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 150) def test_split_text(self, _): self.set_results_file('traditional-chinese-detect-result.json') - self.set_results_file('break-sentence/break-sentence-art-of-war-results.json') self.set_results_file('break-sentence/art-of-war-translation-1.json') self.set_results_file('break-sentence/art-of-war-translation-2.json') self.set_results_file('break-sentence/art-of-war-translation-3.json') + self.set_results_file('break-sentence/art-of-war-translation-4.json') text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) - TranslationClient(get_test_properties()).add_translations(detection_props) + TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props) self.assertEqual(5, len(detection_props)) self.assertEqual(text, detection_props['TEXT']) @@ -528,52 +534,71 @@ def test_split_text(self, _): self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE']) - self.assertAlmostEqual(1.0, float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + self.assertAlmostEqual(1.0, + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) detect_request_text = self.get_request_body()[0]['Text'] self.assertEqual(text, detect_request_text) + behavior = NewLineBehavior.get({}) + actual = list(TextSplitter.split(behavior(text, 'zh-Hant'), + BreakSentenceClient.TRANSLATION_MAX_CHARS, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + get_azure_char_count, + self.wtp_model, + 'zh-Hant')) - break_sentence_url, break_sentence_response = self.get_request() - self.assertIn('language=zh-Hant', break_sentence_url) - break_sentence_request_text = break_sentence_response[0]['Text'] - - self.assertNotIn('\n', break_sentence_request_text, 'Newlines were not properly removed') - self.assertNotIn(' ', break_sentence_request_text, - 'Spaces should not be added to Chinese text.') - - expected_chunk_lengths = [142, 137, 141] - self.assertEqual(sum(expected_chunk_lengths), len(break_sentence_request_text)) + self.assertEqual(4, len(actual)) translation_request1 = self.get_request_body()[0]['Text'] - self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertTrue(translation_request1.startswith('兵者,')) - self.assertTrue(translation_request1.endswith('主用也。')) + self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request1, + 'Spaces should not be added to Chinese text.') + translation_request2 = 
self.get_request_body()[0]['Text'] - self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) - self.assertTrue(translation_request2.startswith('凡此五')) - self.assertTrue(translation_request2.endswith('詭道也。')) + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request1, + 'Spaces should not be added to Chinese text.') + translation_request3 = self.get_request_body()[0]['Text'] - self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) - self.assertTrue(translation_request3.startswith('故能而')) - self.assertTrue(translation_request3.endswith('勝負見矣。')) + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('遠而示之近。')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request3, + 'Spaces should not be added to Chinese text.') + + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request4.startswith('利而誘之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request4, + 'Spaces should not be added to Chinese text.') - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 100) - @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 150) - @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 20) - def test_guess_break_with_break_sentence(self, _, __, ___): - self.set_results_file('traditional-chinese-detect-result.json') - for i in range(1, 4): - self.set_results_file( - f'break-sentence/with-guessing/art-of-war-break-sentence-{i}.json') - for i in range(1, 7): - self.set_results_file(f'break-sentence/with-guessing/art-of-war-translation-{i}.json') + + @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 150) + def test_split_text_check_wtp_unusual_lang(self, _): + # Check that the text splitter does not have an issue + # processing an unknown detected language. 
+ self.set_results_file('invalid-lang-detect-result.json') + self.set_results_file('break-sentence/art-of-war-translation-1.json') + self.set_results_file('break-sentence/art-of-war-translation-2.json') + self.set_results_file('break-sentence/art-of-war-translation-3.json') + self.set_results_file('break-sentence/art-of-war-translation-4.json') text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) - TranslationClient(get_test_properties()).add_translations(detection_props) + TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props) self.assertEqual(5, len(detection_props)) self.assertEqual(text, detection_props['TEXT']) @@ -583,32 +608,30 @@ def test_guess_break_with_break_sentence(self, _, __, ___): self.assertEqual(expected_translation, detection_props['TRANSLATION']) self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) - self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('fake-lang', detection_props['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual(1.0, - float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) detect_request_text = self.get_request_body()[0]['Text'] - self.assertEqual(text[:TranslationClient.DETECT_MAX_CHARS], detect_request_text) - - for i in range(3): - break_sentence_url, break_sentence_request = self.get_request() - self.assertIn('language=zh-Hant', break_sentence_url) - break_sentence_request_text = break_sentence_request[0]['Text'] - self.assertNotIn('\n', break_sentence_request_text, - 'Newlines were not properly removed') - self.assertNotIn(' ', break_sentence_request_text, - 'Spaces should not be added to Chinese text.') - self.assertEqual('。', break_sentence_request_text[-1]) - - for i in range(6): - translate_url, translate_request = self.get_request() - self.assertIn('from=zh-Hant', translate_url) - translate_request_text = translate_request[0]['Text'] - self.assertNotIn('\n', translate_request_text, - 'Newlines were not properly removed') - self.assertNotIn(' ', translate_request_text, - 'Spaces should not be added to Chinese text.') - self.assertEqual('。', translate_request_text[-1]) + self.assertEqual(text, detect_request_text) + + # WtP will split by newlines, so some of the chunks + # here are different from the previous test. 
+ translation_request1 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request1.startswith('兵者,')) + self.assertTrue(translation_request1.endswith('而不危也;')) + + translation_request2 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + + translation_request3 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('亂而取之, ')) + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request4.startswith('實而備之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。 ')) def test_newline_removal(self): @@ -618,7 +641,7 @@ def replace(text): self.set_results_file('results-chinese.json') props = get_test_properties(DETECT_BEFORE_TRANSLATE='FALSE') - TranslationClient(props).add_translations(dict(TEXT=text)) + TranslationClient(props, self.wtp_model).add_translations(dict(TEXT=text)) return self.get_request_body()[0]['Text'] with self.subTest('English'): @@ -768,7 +791,7 @@ def test_category_and_explicit_from_language(self): get_test_properties(FROM_LANGUAGE='zh-Hans', CATEGORY='My category'), {}, ff_loc) - results = list(AcsTranslationComponent.get_detections_from_image(job)) + results = list(AcsTranslationComponent().get_detections_from_image(job)) self.assertEqual(1, len(results)) result = results[0] @@ -803,7 +826,7 @@ def test_suggested_from(self): ff_loc = mpf.ImageLocation(0, 0, 10, 20, -1, dict(TEXT=input_text)) props = get_test_properties(SUGGESTED_FROM_LANGUAGE='ja', DETECT_BEFORE_TRANSLATE='false') job = mpf.ImageJob('Test', 'test.jpg', props, {}, ff_loc) - results = list(AcsTranslationComponent.get_detections_from_image(job)) + results = list(AcsTranslationComponent().get_detections_from_image(job)) self.assertEqual(1, len(results)) result = results[0] @@ -825,53 +848,112 @@ def test_suggested_from(self): self.assertEqual(['en'], query_dict['to']) - @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 5) + @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 5) def test_guess_breaks_all_types(self, _): input_text = 'a.bc,d.efg,hij kl\n\nmnopqrs.tu' - actual = list(SentenceBreakGuesser.guess_breaks(input_text)) + + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + get_azure_char_count, + self.wtp_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(6, len(actual)) + + # a.bc, + self.assertEqual('a.bc,', actual[0]) + # bc,d. + self.assertEqual('d.efg', actual[1]) + # efg,h + self.assertEqual(',hij ', actual[2]) + # hij k + self.assertEqual('kl\n\n', actual[3]) + # kl\n\nm + self.assertEqual('mnopq', actual[4]) + # mnopq + self.assertEqual('rs.tu', actual[5]) + + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + get_azure_char_count, + self.spacy_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(7, len(actual)) + self.assertEqual(6, len(actual)) # a.bc, - self.assertEqual('a.', actual[0]) + self.assertEqual('a.bc', actual[0]) # bc,d. 
- self.assertEqual('bc,d.', actual[1]) + self.assertEqual(',d.ef', actual[1]) # efg,h - self.assertEqual('efg,', actual[2]) + self.assertEqual('g,hij', actual[2]) # hij k - self.assertEqual('hij ', actual[3]) + self.assertEqual(' kl\n\n', actual[3]) # kl\n\nm - self.assertEqual('kl\n\n', actual[4]) + self.assertEqual('mnopq', actual[4]) # mnopq - self.assertEqual('mnopq', actual[5]) - # rs.tu - self.assertEqual('rs.tu', actual[6], 'Should not divide final segment of text.') + self.assertEqual('rs.tu', actual[5]) - @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 20) + @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 28) def test_guess_breaks_actual_sentence(self, _): input_text = 'Hello, what is your name? My name is John.' - actual = list(SentenceBreakGuesser.guess_breaks(input_text)) + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + get_azure_char_count, + self.wtp_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(3, len(actual)) + self.assertEqual(2, len(actual)) + + # "Hello, what is your name?" + self.assertEqual('Hello, what is your name? ', actual[0]) + # " My name is John." + self.assertEqual('My name is John.', actual[1]) + + input_text = 'Hello, what is your name? My name is John.' + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + get_azure_char_count, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) - # "Hello, what is your " - self.assertEqual('Hello,', actual[0]) - # " what is your name? " - self.assertEqual(' what is your name?', actual[1]) + # "Hello, what is your name?" + self.assertEqual('Hello, what is your name? ', actual[0]) # " My name is John." - self.assertEqual(' My name is John.', actual[2]) + self.assertEqual('My name is John.', actual[1]) - @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 20) + + @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 20) def test_sentence_end_punctuation(self, _): input_text = 'Hello. How are you? asdfasdf' - actual = list(SentenceBreakGuesser.guess_breaks(input_text)) + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + 10, + get_azure_char_count, + self.wtp_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(2, len(actual)) + + self.assertEqual('Hello. How are you? ', actual[0]) + self.assertEqual('asdfasdf', actual[1]) + + actual = list(TextSplitter.split(input_text, + BreakSentenceClient.TRANSLATION_MAX_CHARS, + 10, + get_azure_char_count, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) self.assertEqual(2, len(actual)) - self.assertEqual('Hello. How are you?', actual[0]) - self.assertEqual(' asdfasdf', actual[1]) + self.assertEqual('Hello. How are you? 
', actual[0]) + self.assertEqual('asdfasdf', actual[1]) def test_no_translate_no_detect_when_language_ff_prop_matches(self): @@ -1098,19 +1180,13 @@ def do_POST(self): is_detect = url_parts.path == '/translator/detect' is_translate = url_parts.path == '/translator/translate' - is_break_sentence = url_parts.path == '/translator/breaksentence' - if not is_detect and not is_translate and not is_break_sentence: + if not is_detect and not is_translate: self._send_error(404, 000, 'NOT FOUND') return self._validate_headers() self._validate_query_string(url_parts.query, is_translate) - if is_detect: - max_chars = TranslationClient.DETECT_MAX_CHARS - elif is_translate: - max_chars = BreakSentenceClient.TRANSLATION_MAX_CHARS - else: - max_chars = BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS + max_chars = TranslationClient.DETECT_MAX_CHARS self._validate_body(max_chars) self.send_response(200) From 434f1cad172900e75071736defba9b9514262dd9 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 3 Apr 2024 23:39:25 -0400 Subject: [PATCH 02/22] Code refactor and tooltip update. --- python/AzureTranslation/LICENSE | 74 +++++++++ python/AzureTranslation/README.md | 7 +- .../acs_translation_component.py | 44 +++--- .../nlp_text_splitter/text_splitter.py | 27 ++-- .../plugin-files/descriptor/descriptor.json | 2 +- python/AzureTranslation/tests/data/NOTICE | 2 +- .../art-of-war-translation-1.json | 0 .../art-of-war-translation-2.json | 0 .../art-of-war-translation-3.json | 0 .../art-of-war-translation-4.json | 0 .../art-of-war.txt | 0 .../art-war-translation.txt | 0 .../break-sentence-art-of-war-results.json | 0 .../tests/test_acs_translation.py | 141 ++++++++---------- 14 files changed, 176 insertions(+), 121 deletions(-) create mode 100644 python/AzureTranslation/LICENSE rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-of-war-translation-1.json (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-of-war-translation-2.json (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-of-war-translation-3.json (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-of-war-translation-4.json (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-of-war.txt (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/art-war-translation.txt (100%) rename python/AzureTranslation/tests/data/{break-sentence => split-sentence}/break-sentence-art-of-war-results.json (100%) diff --git a/python/AzureTranslation/LICENSE b/python/AzureTranslation/LICENSE new file mode 100644 index 00000000..42dfb8b2 --- /dev/null +++ b/python/AzureTranslation/LICENSE @@ -0,0 +1,74 @@ +/***************************************************************************** +* Copyright 2024 The MITRE Corporation * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +******************************************************************************/ + +This project contains content developed by The MITRE Corporation. If this code +is used in a deployment or embedded within another project, it is requested +that you send an email to opensource@mitre.org in order to let us know where +this software is being used. + +***************************************************************************** + +The WtP, "Where the Point", sentence segmentation library falls under the MIT License: + +https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE + +MIT License + +Copyright (c) 2023 Benjamin Minixhofer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +***************************************************************************** + +The spaCy Natural Language Processing library falls under the MIT License: + +The MIT License (MIT) + +Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. \ No newline at end of file diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index e01a6b0a..18e9d3b0 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -35,8 +35,7 @@ must be provided. Neither has a default value. `https:///translator/text/v3.0`. The URL should not end with `/translate` because two separate endpoints are used. `ACS_URL + '/translate'` is used for translation. 
- `ACS_URL + '/breaksentence'` is used to break up text when it is too long - for a single translation request. This property can also be configured + This property can also be configured using an environment variable named `MPF_PROP_ACS_URL`. - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services @@ -77,10 +76,10 @@ curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL} and spaCy's multilingual sentence detection model for identifying sentence breaks in a large section of text. - WtP models are trained to break up multilingual text without the need + WtP models are trained to split up multilingual text by sentence without the need of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 GB of GPU memory. On the other hand, spaCy has a single - multilingual sentence detection that appears to work better for breaking up English + multilingual sentence detection that appears to work better for splitting up English text in certain cases, unfortunately this model lacks support handling for Chinese punctuation. - `SENTENCE_MODEL` - Specifies the desired WtP or spaCy sentence detection model. diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index 0e2e816e..fe18557d 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -158,6 +158,9 @@ class UnsupportedSourceLanguage(Exception): class TranslationClient: + # ACS limits the number of characters that can be translated in a single /translate call. + # Taken from + # https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate DETECT_MAX_CHARS = 50_000 def __init__(self, job_properties: Mapping[str, str], sentence_model: TextSplitterModel): @@ -184,7 +187,7 @@ def __init__(self, job_properties: Mapping[str, str], sentence_model: TextSplitt acs_url = get_required_property('ACS_URL', job_properties) self._detect_url = create_url(acs_url, 'detect', {}) - self._break_sentence_client = BreakSentenceClient(job_properties, sentence_model) + self._sentence_splitter = SentenceSplitter(job_properties, sentence_model) prop_names = job_properties.get('FEED_FORWARD_PROP_TO_PROCESS', 'TEXT,TRANSCRIPT') self._props_to_translate = [p.strip() for p in prop_names.split(',')] @@ -241,7 +244,7 @@ def add_translations(self, detection_properties: Dict[str, str]) -> None: def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> TranslationResult: """ - Translates the given text. If the text is longer than ACS allows, we will break up the + Translates the given text. If the text is longer than ACS allows, we will split up the text and translate each part separately. If, during the current job, we have seen the exact text before, we return a cached result instead of making a REST call. 
""" @@ -272,7 +275,7 @@ def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> Tr text, DetectResult(from_lang, from_lang_confidence), skipped=True) else: text_replaced_newlines = self._newline_behavior(text, from_lang) - grouped_sentences = self._break_sentence_client.split_input_text( + grouped_sentences = self._sentence_splitter.split_input_text( text_replaced_newlines, from_lang, from_lang_confidence) if not detect_result and grouped_sentences.detected_language: assert grouped_sentences.detected_language_confidence is not None @@ -438,18 +441,13 @@ def _send_detect_request(self, text) -> 'AcsResponses.Detect': return response_body -class BreakSentenceClient: +class SentenceSplitter: """ - Class to break up large sections of text using WtP and spaCy. + Class to divide large sections of text at sentence breaks using WtP and spaCy. It is only used when the text to translate exceeds the translation endpoint's character limit. """ - # ACS limits the number of characters that can be translated in a single /translate call. - # Taken from - # https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate - TRANSLATION_MAX_CHARS = 50_000 - def __init__(self, job_properties: Mapping[str, str], sentence_model:TextSplitterModel): self._sentence_model = sentence_model @@ -466,21 +464,22 @@ def __init__(self, job_properties: Mapping[str, str], def split_input_text(self, text: str, from_lang: Optional[str], from_lang_confidence: Optional[float]) -> SplitTextResult: """ - Breaks up the given text in to chunks that are under TRANSLATION_MAX_CHARS. Each chunk - will contain one or more complete sentences as reported by the break sentence endpoint. + Splits up the given text in to chunks that are under TranslationClient.DETECT_MAX_CHARS. + Each chunk will contain one or more complete sentences as reported + by the (WtP or spaCy) sentence splitter. """ azure_char_count = get_azure_char_count(text) - if azure_char_count <= self.TRANSLATION_MAX_CHARS: + if azure_char_count <= TranslationClient.DETECT_MAX_CHARS: return SplitTextResult([text], from_lang, from_lang_confidence) log.info('Splitting input text because the translation endpoint allows a maximum of ' - f'{self.TRANSLATION_MAX_CHARS} Azure characters, but the text contained ' + f'{TranslationClient.DETECT_MAX_CHARS} Azure characters, but the text contained ' f'{azure_char_count} Azure characters.') if self._incl_input_lang: divided_text_list = TextSplitter.split( text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + TranslationClient.DETECT_MAX_CHARS, self._num_boundary_chars, get_azure_char_count, self._sentence_model, @@ -488,16 +487,14 @@ def split_input_text(self, text: str, from_lang: Optional[str], else: divided_text_list = TextSplitter.split( text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + TranslationClient.DETECT_MAX_CHARS, self._num_boundary_chars, get_azure_char_count, self._sentence_model) chunks = list(divided_text_list) - log.warning(f'Broke text up in to {len(chunks)} chunks. 
Each chunk will be sent to ' - 'the translation endpoint.') - log.info('Grouped sentences into %s chunks.', len(chunks)) + log.info('Grouped sentences into %s chunks for translation.', len(chunks)) return SplitTextResult(chunks, from_lang, from_lang_confidence) def get_n_azure_chars(input_str: str, begin: int, count: int) -> str: @@ -603,7 +600,7 @@ def get_required_property(property_name: str, job_properties: Mapping[str, str]) class NewLineBehavior: """ - The Azure translation service treats newlines a separator between sentences. This results in + The Azure translation service treats newlines as a separator between sentences. This results in incorrect translations. We can't simply replace newlines with spaces because not all languages put spaces between words. When testing with Chinese, spaces resulted in completely different translations. @@ -722,13 +719,6 @@ class _TranslateTextInfo(TypedDict): Translate = List[_TranslateTextInfo] - class _SentenceLengthInfo(TypedDict): - sentLen: List[int] - detectedLanguage: Optional[AcsResponses._DetectedLangInfo] - - BreakSentence = List[_SentenceLengthInfo] - - class _AlternativeDetectedLang(TypedDict): language: str score: float diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 52dc650c..6703b0b2 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -57,11 +57,11 @@ def update_model(self, model_name: str, model_setting: str = ""): if "wtp" in model_name: self._update_wtp_model(model_name, model_setting) self.split = self._split_wtp - log.info(f"Setup up WtP model: {model_name}") + log.info(f"Setup WtP model: {model_name}") else: self._update_spacy_model(model_name) self.split = self._split_spacy - log.info(f"Setup up spaCy model: {model_name}") + log.info(f"Setup spaCy model: {model_name}") def _update_wtp_model(self, wtp_model_name: str, model_setting: str = "cpu") -> None: @@ -96,8 +96,12 @@ def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: if lang: iso_lang = WtpLanguageSettings.convert_to_iso(lang) if iso_lang: - return self.wtp_model.split(text, lang_code=iso_lang) # type: ignore - return self.wtp_model.split(text) # type: ignore + return self.wtp_model.split(text, lang_code=iso_lang) + else: + log.warning(f"Warning: Language {lang} was used to train WtP model." + "Please consider using spaCy's sentence detection model by" + "setting `SENTENCE_MODEL='xx_sent_ud_sm'`.") + return self.wtp_model.split(text) def _update_spacy_model(self, spacy_model_name: str): self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) @@ -124,7 +128,7 @@ def __init__( self._get_text_size = get_text_size self._text = "" self._text_full_size = 0 - self._num_overhead_bytes = 0 + self._overhead_size = 0 self._soft_limit = self._limit self._in_lang = in_lang @@ -135,16 +139,16 @@ def set_text(self, text: str): self._text = text self._text_full_size = self._get_text_size(text) chars_per_size = len(text) / self._text_full_size - self._num_overhead_bytes = self._get_text_size('') + self._overhead_size = self._get_text_size('') - self._soft_limit = int(self._limit * chars_per_size) - self._num_overhead_bytes + self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size if self._soft_limit <= 1: # Caused by an unusually large overhead relative to text. # This is unlikely to occur except during testing of small text limits. 
# Recalculate overhead bytes with chars_per_size weighting. self._soft_limit = max(1, - int((self._limit - self._num_overhead_bytes) * chars_per_size)) + int((self._limit - self._overhead_size) * chars_per_size)) def _isolate_largest_section(self, text:str) -> str: # Using cached word splitting model, isolate largest section of text @@ -204,10 +208,11 @@ def _divide(self, text) -> Tuple[str, str]: char_per_size = len(left) / left_size - limit = int(self._limit * char_per_size) - self._num_overhead_bytes + limit = int(self._limit * char_per_size) - self._overhead_size if limit < 1: # Caused by an unusually large overhead relative to text. # This is unlikely to occur except during testing of small text limits. - # Recalculate overhead bytes with chars_per_size weighting. - limit = max(1, int((self._limit - self._num_overhead_bytes) * char_per_size)) \ No newline at end of file + # Recalculate soft limit by subtracting overhead from limit before + # applying chars_per_size weighting. + limit = max(1, int((self._limit - self._overhead_size) * char_per_size)) \ No newline at end of file diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index d2839218..0f4ec92e 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -83,7 +83,7 @@ }, { "name": "SENTENCE_SPLITTER_CHAR_COUNT", - "description": "Integer value specifying maximum number of characters to process through sentence splitter. Defaults to 500 characters.", + "description": "Integer value specifying maximum number of characters to process through sentence splitter.", "type": "INT", "defaultValue": "500" }, diff --git a/python/AzureTranslation/tests/data/NOTICE b/python/AzureTranslation/tests/data/NOTICE index 944aa424..9fc41f18 100644 --- a/python/AzureTranslation/tests/data/NOTICE +++ b/python/AzureTranslation/tests/data/NOTICE @@ -1,4 +1,4 @@ -# break-sentence/art-of-war.txt +# split-sentence/art-of-war.txt Contains the beginning of "The Art of War" by Sunzi in Traditional Chinese. 
Public Domain https://www.gutenberg.org/ebooks/12407 diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-1.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json rename to python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-1.json diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-2.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json rename to python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-2.json diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-3.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json rename to python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-3.json diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-4.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-4.json rename to python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-4.json diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war.txt b/python/AzureTranslation/tests/data/split-sentence/art-of-war.txt similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war.txt rename to python/AzureTranslation/tests/data/split-sentence/art-of-war.txt diff --git a/python/AzureTranslation/tests/data/break-sentence/art-war-translation.txt b/python/AzureTranslation/tests/data/split-sentence/art-war-translation.txt similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-war-translation.txt rename to python/AzureTranslation/tests/data/split-sentence/art-war-translation.txt diff --git a/python/AzureTranslation/tests/data/break-sentence/break-sentence-art-of-war-results.json b/python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/break-sentence-art-of-war-results.json rename to python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index fa81c1e4..49229a87 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -45,7 +45,7 @@ from nlp_text_splitter import TextSplitterModel, TextSplitter from acs_translation_component.acs_translation_component import (AcsTranslationComponent, get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints, - AcsTranslateUrlBuilder, BreakSentenceClient, get_n_azure_chars) + AcsTranslateUrlBuilder, SentenceSplitter, get_n_azure_chars) SEEN_TRACE_IDS = set() @@ -513,22 +513,23 @@ def assert_expected_url(job_properties, expected_to, expected_from, expected_que dict(ACS_URL='http://example.com/test?suggestedFrom=ru&category=whatever'), 'en', None, {'suggestedFrom': 
'ru', 'category': 'whatever'}) - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 150) - def test_split_text(self, _): + + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) + def test_split_wtp_known_language(self, _): self.set_results_file('traditional-chinese-detect-result.json') - self.set_results_file('break-sentence/art-of-war-translation-1.json') - self.set_results_file('break-sentence/art-of-war-translation-2.json') - self.set_results_file('break-sentence/art-of-war-translation-3.json') - self.set_results_file('break-sentence/art-of-war-translation-4.json') + self.set_results_file('split-sentence/art-of-war-translation-1.json') + self.set_results_file('split-sentence/art-of-war-translation-2.json') + self.set_results_file('split-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-4.json') - text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props) self.assertEqual(5, len(detection_props)) self.assertEqual(text, detection_props['TEXT']) - expected_translation = (TEST_DATA / 'break-sentence/art-war-translation.txt') \ + expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \ .read_text().strip() self.assertEqual(expected_translation, detection_props['TRANSLATION']) self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) @@ -538,20 +539,14 @@ def test_split_text(self, _): float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) detect_request_text = self.get_request_body()[0]['Text'] - self.assertEqual(text, detect_request_text) - behavior = NewLineBehavior.get({}) - actual = list(TextSplitter.split(behavior(text, 'zh-Hant'), - BreakSentenceClient.TRANSLATION_MAX_CHARS, - BreakSentenceClient.TRANSLATION_MAX_CHARS, - get_azure_char_count, - self.wtp_model, - 'zh-Hant')) - - self.assertEqual(4, len(actual)) + self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text) + expected_chunk_lengths = [86, 116, 104, 114] + self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n',''))) translation_request1 = self.get_request_body()[0]['Text'] self.assertTrue(translation_request1.startswith('兵者,')) self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertNotIn('\n', translation_request1, 'Newlines were not properly removed') self.assertNotIn(' ', translation_request1, @@ -559,6 +554,7 @@ def test_split_text(self, _): translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) self.assertTrue(translation_request2.startswith('天者,陰陽')) self.assertTrue(translation_request2.endswith('兵眾孰強?')) self.assertNotIn('\n', translation_request1, @@ -568,6 +564,7 @@ def test_split_text(self, _): translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) self.assertTrue(translation_request3.startswith('士卒孰練?')) self.assertTrue(translation_request3.endswith('遠而示之近。')) self.assertNotIn('\n', translation_request3, @@ -577,6 +574,7 @@ def test_split_text(self, _): translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) 
self.assertTrue(translation_request4.startswith('利而誘之,')) self.assertTrue(translation_request4.endswith('勝負見矣。')) self.assertNotIn('\n', translation_request4, @@ -585,25 +583,24 @@ def test_split_text(self, _): 'Spaces should not be added to Chinese text.') - - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 150) - def test_split_text_check_wtp_unusual_lang(self, _): + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) + def test_split_wtp_unknown_lang(self, _): # Check that the text splitter does not have an issue # processing an unknown detected language. self.set_results_file('invalid-lang-detect-result.json') - self.set_results_file('break-sentence/art-of-war-translation-1.json') - self.set_results_file('break-sentence/art-of-war-translation-2.json') - self.set_results_file('break-sentence/art-of-war-translation-3.json') - self.set_results_file('break-sentence/art-of-war-translation-4.json') + self.set_results_file('split-sentence/art-of-war-translation-1.json') + self.set_results_file('split-sentence/art-of-war-translation-2.json') + self.set_results_file('split-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-4.json') - text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props) self.assertEqual(5, len(detection_props)) self.assertEqual(text, detection_props['TEXT']) - expected_translation = (TEST_DATA / 'break-sentence/art-war-translation.txt') \ + expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \ .read_text().strip() self.assertEqual(expected_translation, detection_props['TRANSLATION']) self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) @@ -613,23 +610,30 @@ def test_split_text_check_wtp_unusual_lang(self, _): float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) detect_request_text = self.get_request_body()[0]['Text'] - self.assertEqual(text, detect_request_text) + self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text) + + expected_chunk_lengths = [88, 118, 116, 106] + self.assertEqual(sum(expected_chunk_lengths), len(text)) # WtP will split by newlines, so some of the chunks # here are different from the previous test. 
translation_request1 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertTrue(translation_request1.startswith('兵者,')) self.assertTrue(translation_request1.endswith('而不危也;')) translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) self.assertTrue(translation_request2.startswith('天者,陰陽')) self.assertTrue(translation_request2.endswith('兵眾孰強?')) translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) self.assertTrue(translation_request3.startswith('士卒孰練?')) self.assertTrue(translation_request3.endswith('亂而取之, ')) translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) self.assertTrue(translation_request4.startswith('實而備之,')) self.assertTrue(translation_request4.endswith('勝負見矣。 ')) @@ -750,7 +754,7 @@ def test_job_prop_overrides_from_lang(self): def test_chinese_japanese_char_detection(self): - art_of_war_text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() + art_of_war_text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() self.assertTrue(all(ChineseAndJapaneseCodePoints.check_char(ch) for ch in art_of_war_text if not ch.isspace())) @@ -848,60 +852,44 @@ def test_suggested_from(self): self.assertEqual(['en'], query_dict['to']) - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 5) - def test_guess_breaks_all_types(self, _): - input_text = 'a.bc,d.efg,hij kl\n\nmnopqrs.tu' + def test_guess_split_edge_cases(self): + input_text = ("This is a sentence (Dr.Test). Is this a sentence as well? " + "Maybe...maybe not?") + # Split using WtP model. actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 35, + 35, get_azure_char_count, self.wtp_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(6, len(actual)) - - # a.bc, - self.assertEqual('a.bc,', actual[0]) - # bc,d. - self.assertEqual('d.efg', actual[1]) - # efg,h - self.assertEqual(',hij ', actual[2]) - # hij k - self.assertEqual('kl\n\n', actual[3]) - # kl\n\nm - self.assertEqual('mnopq', actual[4]) - # mnopq - self.assertEqual('rs.tu', actual[5]) + self.assertEqual(3, len(actual)) + + # WtP should detect and split out each sentence + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not?", actual[2]) actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 35, + 35, get_azure_char_count, self.spacy_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(6, len(actual)) - - # a.bc, - self.assertEqual('a.bc', actual[0]) - # bc,d. - self.assertEqual(',d.ef', actual[1]) - # efg,h - self.assertEqual('g,hij', actual[2]) - # hij k - self.assertEqual(' kl\n\n', actual[3]) - # kl\n\nm - self.assertEqual('mnopq', actual[4]) - # mnopq - self.assertEqual('rs.tu', actual[5]) - - - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 28) - def test_guess_breaks_actual_sentence(self, _): + self.assertEqual(3, len(actual)) + + # Split using spaCy model. + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this a sentence as well? 
", actual[1]) + self.assertEqual("Maybe...maybe not?", actual[2]) + + + def test_guess_split_simple_sentence(self): input_text = 'Hello, what is your name? My name is John.' actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 28, + 28, get_azure_char_count, self.wtp_model)) self.assertEqual(input_text, ''.join(actual)) @@ -914,8 +902,8 @@ def test_guess_breaks_actual_sentence(self, _): input_text = 'Hello, what is your name? My name is John.' actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 28, + 28, get_azure_char_count, self.spacy_model)) self.assertEqual(input_text, ''.join(actual)) @@ -928,11 +916,10 @@ def test_guess_breaks_actual_sentence(self, _): - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 20) - def test_sentence_end_punctuation(self, _): + def test_split_sentence_end_punctuation(self): input_text = 'Hello. How are you? asdfasdf' actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 20, 10, get_azure_char_count, self.wtp_model)) @@ -944,7 +931,7 @@ def test_sentence_end_punctuation(self, _): self.assertEqual('asdfasdf', actual[1]) actual = list(TextSplitter.split(input_text, - BreakSentenceClient.TRANSLATION_MAX_CHARS, + 20, 10, get_azure_char_count, self.spacy_model)) From 303048fa7f1a8cd9f4aa2984cc6ad232686e41af Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 4 Apr 2024 00:47:23 -0400 Subject: [PATCH 03/22] Minor tooltip update. --- python/AzureTranslation/nlp_text_splitter/text_splitter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 6703b0b2..503723dc 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -98,8 +98,9 @@ def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: if iso_lang: return self.wtp_model.split(text, lang_code=iso_lang) else: - log.warning(f"Warning: Language {lang} was used to train WtP model." - "Please consider using spaCy's sentence detection model by" + log.warning(f"Warning: Language {lang} was not used to train WtP model." + "If text splitting is not working well with WtP, " + "consider trying spaCy's sentence detection model by" "setting `SENTENCE_MODEL='xx_sent_ud_sm'`.") return self.wtp_model.split(text) From 192fb29cf957de1635bb3b65bc05be69c1591856 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 4 Apr 2024 00:58:27 -0400 Subject: [PATCH 04/22] Minor tooltip update. --- python/AzureTranslation/nlp_text_splitter/text_splitter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 503723dc..3d73e99c 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -147,7 +147,8 @@ def set_text(self, text: str): if self._soft_limit <= 1: # Caused by an unusually large overhead relative to text. # This is unlikely to occur except during testing of small text limits. - # Recalculate overhead bytes with chars_per_size weighting. 
+ # Recalculate soft limit by subtracting overhead from limit + # before applying chars_per_size weighting. self._soft_limit = max(1, int((self._limit - self._overhead_size) * chars_per_size)) From a74b91916b300b1593f7bf1c2604606d86b29bff Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 4 Apr 2024 01:25:06 -0400 Subject: [PATCH 05/22] Update edge case for testing text splits. --- .../tests/test_acs_translation.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 49229a87..a1bc788e 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -853,23 +853,26 @@ def test_suggested_from(self): def test_guess_split_edge_cases(self): - input_text = ("This is a sentence (Dr.Test). Is this a sentence as well? " - "Maybe...maybe not?") + input_text = ("This is a sentence (Dr.Test). Is this," + " a sentence as well? Maybe...maybe not?" + " \n All done, I think!") # Split using WtP model. actual = list(TextSplitter.split(input_text, - 35, - 35, + 30, + 30, get_azure_char_count, self.wtp_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(3, len(actual)) + self.assertEqual(4, len(actual)) # WtP should detect and split out each sentence self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) - self.assertEqual("Is this a sentence as well? ", actual[1]) - self.assertEqual("Maybe...maybe not?", actual[2]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + actual = list(TextSplitter.split(input_text, 35, @@ -877,12 +880,14 @@ def test_guess_split_edge_cases(self): get_azure_char_count, self.spacy_model)) self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(3, len(actual)) + self.assertEqual(4, len(actual)) # Split using spaCy model. self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) - self.assertEqual("Is this a sentence as well? ", actual[1]) - self.assertEqual("Maybe...maybe not?", actual[2]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + def test_guess_split_simple_sentence(self): From 4c6fd7adc404745d8ec2b0808267b42e63e860ae Mon Sep 17 00:00:00 2001 From: jrobble Date: Tue, 9 Apr 2024 14:23:54 -0400 Subject: [PATCH 06/22] Improve formatting. --- python/AzureTranslation/README.md | 70 +++++++++++++++++-------------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 18e9d3b0..1a5922fd 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -44,14 +44,16 @@ must be provided. Neither has a default value. using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`. -# Important Job Properties: -- `TO_LANGUAGE`: The BCP-47 language code for language that the properties +# Primary Job Properties +- `TO_LANGUAGE`: The BCP-47 language code for the language that the properties should be translated to. + - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. 
If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. + - `FROM_LANGUAGE`: In most cases, this property should not be used. It should only be used when automatic language detection is detecting the wrong language. Providing this property prevents the translation endpoint from @@ -60,44 +62,48 @@ must be provided. Neither has a default value. input text unchanged. -# Listing Supported Languages -To list the supported languages replace `${ACS_URL}` and -`${ACS_SUBSCRIPTION_KEY}` in the following command and run it: -```shell script -curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" -``` +# Text Splitter Job Properties +The following settings control the behavior of dividing input text into acceptable chunks +for processing. +Through preliminary investigation, we identified the [WtP library ("Where's the +Point")](https://github.com/bminixhofer/wtpsplit) and spaCy's multilingual sentence +detection model for identifying sentence breaks in a large section of text. -# Secondary Job Properties - Text Splitter: - The following settings control the behavior of dividing input - text into acceptable chunks for processing. +WtP models are trained to split up multilingual text by sentence without the need of an +input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 +GB of GPU memory. On the other hand, spaCy has a single multilingual sentence detection +that appears to work better for splitting up English text in certain cases, unfortunately +this model lacks support handling for Chinese punctuation. - Through preliminary investigation, we identified the [WtP library ("Where's the Point")](https://github.com/bminixhofer/wtpsplit) - and spaCy's multilingual sentence detection model for identifying sentence breaks - in a large section of text. +- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU + and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More + advanced WtP models that use GPU resources (up to ~8 GB) are also available. See list of + WtP model names + [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The + only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. - WtP models are trained to split up multilingual text by sentence without the need - of an input language tag. The disadvantage is that the most accurate - WtP models will need ~3.5 GB of GPU memory. On the other hand, spaCy has a single - multilingual sentence detection that appears to work better for splitting up English - text in certain cases, unfortunately this model lacks support handling for Chinese punctuation. + Review list of languages supported by WtP + [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages). + Review models and languages supported by spaCy [here](https://spacy.io/models). - - `SENTENCE_MODEL` - Specifies the desired WtP or spaCy sentence detection model. - For CPU and runtime considerations, the author of WtP recommends using `wtp-bert-mini`; - more advanced WtP models that use GPU resources (up to ~8 GB) are also available [(see list of WtP model names here)](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). 
- The only available spaCy model (for text with unknown language) is `xx_sent_ud_sm`. +- `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies maximum number of characters to process + through sentence/text splitter. Default to 500 characters as we only need to process a + subsection of text to determine an appropriate split. (See discussion of potential char + lengths + [here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters). - [Review list of languages supported by WtP here.](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages) - [Review models and languages supported by spaCy here.](https://spacy.io/models) +- `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass input language to + sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by + input language. - - `SENTENCE_SPLITTER_CHAR_COUNT` - Specifies maximum number of characters to process - through sentence/text splitter. - Default to 500 characters as we only need to process a subsection of text to determine an appropriate - split [(see discussion of potential char lengths - Mozilla Common Voice)](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters). - - `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG` - Specifies whether to pass input language to - sentence splitter algorithm. Currently, only WtP supports model threshold adjustments - by input language. +# Listing Supported Languages +To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the +following command and run it: +```shell script +curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" +``` # Sample Program From ba61756aa38399fb1f6a5fc33541c9199561b093 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 12 Apr 2024 03:48:03 -0400 Subject: [PATCH 07/22] Tooltip update, and adding additional WtP text splitter support. --- python/AzureTranslation/README.md | 25 +-- .../acs_translation_component.py | 16 +- .../nlp_text_splitter/text_splitter.py | 33 +++- .../plugin-files/descriptor/descriptor.json | 12 ++ .../tests/test_acs_translation.py | 161 +++++++++++++----- 5 files changed, 183 insertions(+), 64 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 7fed5fc7..873fdb6b 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -31,14 +31,6 @@ In order for the component to process any jobs, the job properties listed below must be provided. Neither has a default value. - `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. -<<<<<<< HEAD - e.g. `https://api.cognitive.microsofttranslator.com` or - `https:///translator/text/v3.0`. The URL should - not end with `/translate` because two separate endpoints are - used. `ACS_URL + '/translate'` is used for translation. - This property can also be configured - using an environment variable named `MPF_PROP_ACS_URL`. -======= e.g. `https://api.cognitive.microsofttranslator.com` or `https:///translator/text/v3.0`. The URL should not end with `/translate` because two separate endpoints are @@ -46,7 +38,6 @@ must be provided. Neither has a default value. `ACS_URL + '/breaksentence'` is used to break up text when it is too long for a single translation request. This property can also be configured using an environment variable named `MPF_PROP_ACS_URL`. 
->>>>>>> origin - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services subscription key. To get one you will need to create an @@ -94,8 +85,9 @@ The following settings control the behavior of dividing input text into acceptab for processing. Through preliminary investigation, we identified the [WtP library ("Where's the -Point")](https://github.com/bminixhofer/wtpsplit) and spaCy's multilingual sentence -detection model for identifying sentence breaks in a large section of text. +Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence +detection model](https://spacy.io/models) for identifying sentence breaks +in a large section of text. WtP models are trained to split up multilingual text by sentence without the need of an input language tag. The disadvantage is that the most accurate WtP models will need ~3.5 @@ -124,6 +116,17 @@ this model lacks support handling for Chinese punctuation. sentence splitter algorithm. Currently, only WtP supports model threshold adjustments by input language. +- `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence + detection model. If set to FALSE, allow sentence model to also use GPU resources. + For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources + are not required. If using more advanced WtP models (i.e. `wtp-canine-s-12l`), it + is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` as such models can use up to + to ~3.5 GB of GPU memory. + +- `SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE`: More advanced WTP models will + require a target language. This property sets the default language to use for + sentence splitting, and is overwritten whenever `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, + or Azure language detection return a different, WtP-supported language option. 
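
Below is a minimal, illustrative sketch of how the two splitter back-ends described
above can be exercised directly. It assumes `wtpsplit`, `spacy`, and the
`xx_sent_ud_sm` model are installed as in this component's Dockerfile, and that
`wtp-bert-mini` can be fetched from Hugging Face or loaded from a local clone; it is
not part of the component API.

```python
from wtpsplit import WtP
import spacy

text = "This is a sentence (Dr.Test). Is this a sentence as well? Maybe...maybe not?"

# WtP: multilingual sentence splitting without a language tag.
# wtp-bert-mini is the CPU-friendly choice recommended above.
wtp = WtP("wtp-bert-mini")
print(wtp.split(text))

# Optionally pass a language code so WtP can apply a language-specific threshold,
# which is what SENTENCE_SPLITTER_INCLUDE_INPUT_LANG enables.
print(wtp.split(text, lang_code="en"))

# spaCy: the single multilingual sentence-detection pipeline.
nlp = spacy.load("xx_sent_ud_sm")
print([sentence.text for sentence in nlp(text).sents])
```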
# Listing Supported Languages To list the supported languages replace `${ACS_URL}` and diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index fe18557d..929c6fc3 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -53,7 +53,7 @@ class AcsTranslationComponent: def __init__(self) -> None: - self._cached_sent_model = TextSplitterModel("wtp-bert-mini", "cpu") + self._cached_sent_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: try: @@ -458,8 +458,18 @@ def __init__(self, job_properties: Mapping[str, str], self._incl_input_lang = mpf_util.get_property(job_properties, "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", True) - nlp_model_setting = "cpu" - self._sentence_model.update_model(nlp_model_name, nlp_model_setting) + + wtp_default_language = mpf_util.get_property(job_properties, + "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE", + "en") + nlp_model_setting = mpf_util.get_property(job_properties, "SENTENCE_MODEL_CPU_ONLY", True) + + if not nlp_model_setting: + nlp_model_setting = "gpu" + else: + nlp_model_setting = "cpu" + + self._sentence_model.update_model(nlp_model_name, nlp_model_setting, wtp_default_language) def split_input_text(self, text: str, from_lang: Optional[str], from_lang_confidence: Optional[float]) -> SplitTextResult: diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 3d73e99c..603c13b9 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -44,18 +44,27 @@ log = logging.getLogger(__name__) +# These models must have an specified language during sentence splitting. 
+WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l', + 'wtp-canine-s-3l', + 'wtp-canine-s-6l', + 'wtp-canine-s-9l', + 'wtp-canine-s-12l'] + class TextSplitterModel: # To hold spaCy, WtP, and other potential sentence detection models in cache - def __init__(self, model_name: str, model_setting: str) -> None: + def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None: self._model_name = "" + self._default_lang = default_lang + self._mandatory_wtp_language = False self.split = lambda t, **param: [t] self.update_model(model_name, model_setting) - def update_model(self, model_name: str, model_setting: str = ""): + def update_model(self, model_name: str, model_setting: str = "", default_lang: str="en"): if model_name: if "wtp" in model_name: - self._update_wtp_model(model_name, model_setting) + self._update_wtp_model(model_name, model_setting, default_lang) self.split = self._split_wtp log.info(f"Setup WtP model: {model_name}") else: @@ -64,7 +73,12 @@ def update_model(self, model_name: str, model_setting: str = ""): log.info(f"Setup spaCy model: {model_name}") def _update_wtp_model(self, wtp_model_name: str, - model_setting: str = "cpu") -> None: + model_setting: str = "cpu", + default_lang: str="en") -> None: + + if wtp_model_name in WTP_MANDATORY_ADAPTOR: + self._mandatory_wtp_language = True + self._default_lang = default_lang if self._model_name != wtp_model_name: self._model_name = wtp_model_name @@ -98,10 +112,15 @@ def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: if iso_lang: return self.wtp_model.split(text, lang_code=iso_lang) else: - log.warning(f"Warning: Language {lang} was not used to train WtP model." + log.warning(f"Language {lang} was not used to train WtP model. " "If text splitting is not working well with WtP, " - "consider trying spaCy's sentence detection model by" - "setting `SENTENCE_MODEL='xx_sent_ud_sm'`.") + "consider trying spaCy's sentence detection model." + ) + if self._mandatory_wtp_language: + log.warning("WtP model requires a language." + f"Using default language : {self._default_lang}.") + iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang) + return self.wtp_model.split(text, lang_code=iso_lang) return self.wtp_model.split(text) def _update_spacy_model(self, spacy_model_name: str): diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index d80bdd46..99f65b8e 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -93,6 +93,18 @@ "type": "STRING", "defaultValue": "wtp-bert-mini" }, + { + "name": "SENTENCE_MODEL_CPU_ONLY", + "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (larger WtP models will use up to ~3.5 GB of GPU memory).", + "type": "BOOLEAN", + "defaultValue": "TRUE" + }, + { + "name": "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE", + "description": "More advanced WTP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.", + "type": "STRING", + "defaultValue": "en" + }, { "name": "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", "description": "Specifies whether to pass input language to sentence splitter algorithm. 
Currently, only WtP supports model adjustments by input language.", diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index a330e666..1ed52bf4 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -49,7 +49,9 @@ from acs_translation_component.convert_language_code import iso_to_bcp - +# Set to true to test the WtP canine-s-1l model locally +# Note, this will download ~1 GB to your local storage. +LOCAL_TEST_WTP_MODEL = False SEEN_TRACE_IDS = set() CHINESE_SAMPLE_TEXT = '你好,你叫什么名字?' @@ -71,8 +73,10 @@ class TestAcsTranslation(unittest.TestCase): @classmethod def setUpClass(cls): cls.mock_server = MockServer() - cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu") - cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu") + cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") + if LOCAL_TEST_WTP_MODEL: + cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "en") + cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") @classmethod @@ -596,6 +600,78 @@ def test_split_wtp_known_language(self, _): 'Spaces should not be added to Chinese text.') + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) + def test_split_wtp_advanced_known_language(self, _): + # This test should only be run locally, as the WtP canine model is a bit large (~1 GB). + if not LOCAL_TEST_WTP_MODEL: + return + + self.set_results_file('traditional-chinese-detect-result.json') + self.set_results_file('split-sentence/art-of-war-translation-1.json') + self.set_results_file('split-sentence/art-of-war-translation-2.json') + self.set_results_file('split-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-4.json') + + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() + detection_props = dict(TEXT=text) + TranslationClient(get_test_properties(), self.wtp_adv_model).add_translations(detection_props) + + self.assertEqual(5, len(detection_props)) + self.assertEqual(text, detection_props['TEXT']) + + expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \ + .read_text().strip() + self.assertEqual(expected_translation, detection_props['TRANSLATION']) + self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) + + self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE']) + self.assertAlmostEqual(1.0, + float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + + detect_request_text = self.get_request_body()[0]['Text'] + self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text) + + expected_chunk_lengths = [86, 116, 104, 114] + self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n',''))) + translation_request1 = self.get_request_body()[0]['Text'] + self.assertTrue(translation_request1.startswith('兵者,')) + self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request1, + 'Spaces should not be added to Chinese text.') + + + translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + 
self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request1, + 'Spaces should not be added to Chinese text.') + + + translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('遠而示之近。')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request3, + 'Spaces should not be added to Chinese text.') + + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) + self.assertTrue(translation_request4.startswith('利而誘之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertNotIn(' ', translation_request4, + 'Spaces should not be added to Chinese text.') + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) def test_split_wtp_unknown_lang(self, _): # Check that the text splitter does not have an issue @@ -628,8 +704,8 @@ def test_split_wtp_unknown_lang(self, _): expected_chunk_lengths = [88, 118, 116, 106] self.assertEqual(sum(expected_chunk_lengths), len(text)) - # WtP will split by newlines, so some of the chunks - # here are different from the previous test. + # WtP will split by newlines (over the '。' character), + # so some of the chunks here are different from the previous test. translation_request1 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertTrue(translation_request1.startswith('兵者,')) @@ -865,44 +941,6 @@ def test_suggested_from(self): self.assertEqual(['en'], query_dict['to']) - def test_guess_split_edge_cases(self): - input_text = ("This is a sentence (Dr.Test). Is this," - " a sentence as well? Maybe...maybe not?" - " \n All done, I think!") - - # Split using WtP model. - actual = list(TextSplitter.split(input_text, - 30, - 30, - get_azure_char_count, - self.wtp_model)) - - self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(4, len(actual)) - - # WtP should detect and split out each sentence - self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) - self.assertEqual("Is this, a sentence as well? ", actual[1]) - self.assertEqual("Maybe...maybe not? \n ", actual[2]) - self.assertEqual("All done, I think!", actual[3]) - - - actual = list(TextSplitter.split(input_text, - 35, - 35, - get_azure_char_count, - self.spacy_model)) - self.assertEqual(input_text, ''.join(actual)) - self.assertEqual(4, len(actual)) - - # Split using spaCy model. - self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) - self.assertEqual("Is this, a sentence as well? ", actual[1]) - self.assertEqual("Maybe...maybe not? \n ", actual[2]) - self.assertEqual("All done, I think!", actual[3]) - - - def test_guess_split_simple_sentence(self): input_text = 'Hello, what is your name? My name is John.' actual = list(TextSplitter.split(input_text, @@ -961,6 +999,43 @@ def test_split_sentence_end_punctuation(self): self.assertEqual('asdfasdf', actual[1]) + def test_guess_split_edge_cases(self): + input_text = ("This is a sentence (Dr.Test). Is this," + " a sentence as well? Maybe...maybe not?" + " \n All done, I think!") + + # Split using WtP model. 
+ actual = list(TextSplitter.split(input_text, + 30, + 30, + get_azure_char_count, + self.wtp_model)) + + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(4, len(actual)) + + # WtP should detect and split out each sentence + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + + actual = list(TextSplitter.split(input_text, + 35, + 35, + get_azure_char_count, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(4, len(actual)) + + # Split using spaCy model. + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + def test_no_translate_no_detect_when_language_ff_prop_matches(self): ff_loc = mpf.ImageLocation(0, 0, 10, 10, -1, dict(TEXT='Hello', DECODED_LANGUAGE='eng')) job = mpf.ImageJob('Test', 'test.jpg', get_test_properties(), {}, ff_loc) From 7438731b6211124a2889a68cfa2f1f4401d9a7f3 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 12 Apr 2024 03:58:19 -0400 Subject: [PATCH 08/22] Tooltip update, and adding additional WtP text splitter support. --- python/AzureTranslation/README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 873fdb6b..0092baae 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -49,13 +49,11 @@ must be provided. Neither has a default value. - `TO_LANGUAGE`: The BCP-47 language code for language that the properties should be translated to. - - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. - - `FROM_LANGUAGE`: In most cases, this property should not be used. It should only be used when automatic language detection is detecting the wrong language: Users can provide a BCP-47 code to force the translation service @@ -131,7 +129,6 @@ this model lacks support handling for Chinese punctuation. # Listing Supported Languages To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the following command and run it: - ```shell script curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" ``` From 983dce7016bab8d531ee9c0690dc8bf4dc31f9df Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 12 Apr 2024 04:12:29 -0400 Subject: [PATCH 09/22] Tooltip update. --- python/AzureTranslation/README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 0092baae..19359b71 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -45,15 +45,16 @@ must be provided. Neither has a default value. using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`. -# Important Job Properties: -- `TO_LANGUAGE`: The BCP-47 language code for language that the properties - should be translated to. 
+# Primary Job Properties +- `TO_LANGUAGE`: The BCP-47 language code for the language that the properties +- should be translated to. - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. + - `FROM_LANGUAGE`: In most cases, this property should not be used. It should only be used when automatic language detection is detecting the wrong language: Users can provide a BCP-47 code to force the translation service @@ -126,9 +127,10 @@ this model lacks support handling for Chinese punctuation. sentence splitting, and is overwritten whenever `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option. + # Listing Supported Languages -To list the supported languages replace `${ACS_URL}` and -`${ACS_SUBSCRIPTION_KEY}` in the following command and run it: +To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the +following command and run it: ```shell script curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" ``` From dda479ba416b1a29de1213949f767ac3d697e1cb Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 16 Apr 2024 04:01:28 -0400 Subject: [PATCH 10/22] Tooltip updates and PyTorch cuda build. --- python/AzureTranslation/Dockerfile | 9 ++-- python/AzureTranslation/README.md | 16 ++++--- .../acs_translation_component.py | 2 +- .../nlp_text_splitter/text_splitter.py | 18 ++++++- .../plugin-files/descriptor/descriptor.json | 2 +- .../tests/test_acs_translation.py | 47 +++++++++++++++++-- 6 files changed, 77 insertions(+), 17 deletions(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index ac6999ae..4c95de8a 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -31,6 +31,7 @@ ARG BUILD_TAG=latest FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} ARG RUN_TESTS=false +ARG ADD_GPU_SUPPORT=false RUN pip install --no-cache-dir langcodes @@ -40,10 +41,12 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* # Install WtP and spaCy +# if ADD_GPU_SUPPORT is enabled, install PyTorch with cuda support. RUN pip install --upgrade pip && \ - pip install spacy>=3.7.4 && \ - pip install wtpsplit>=1.3.0 && \ - pip install torch --index-url https://download.pytorch.org/whl/cpu + pip install "spacy>=3.7.4" && \ + pip install "wtpsplit>=1.3.0"; \ + if [ "${ADD_GPU_SUPPORT,,}" == true ]; then pip install torch \ + else pip install torch --index-url https://download.pytorch.org/whl/cpu; fi # Modify to add downloads for other models of interest. RUN mkdir /wtp_models && cd /wtp_models && \ diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 19359b71..67ceff04 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -35,9 +35,8 @@ must be provided. Neither has a default value. `https:///translator/text/v3.0`. The URL should not end with `/translate` because two separate endpoints are used. `ACS_URL + '/translate'` is used for translation. - `ACS_URL + '/breaksentence'` is used to break up text when it is too long - for a single translation request. 
This property can also be configured - using an environment variable named `MPF_PROP_ACS_URL`. + This property can also be configured using an environment variable + named `MPF_PROP_ACS_URL`. - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services subscription key. To get one you will need to create an @@ -47,7 +46,7 @@ must be provided. Neither has a default value. # Primary Job Properties - `TO_LANGUAGE`: The BCP-47 language code for the language that the properties -- should be translated to. + should be translated to. - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider @@ -118,9 +117,12 @@ this model lacks support handling for Chinese punctuation. - `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence detection model. If set to FALSE, allow sentence model to also use GPU resources. For most runs using spaCy `xx_sent_ud_sm` or `wtp-bert-mini` models, GPU resources - are not required. If using more advanced WtP models (i.e. `wtp-canine-s-12l`), it - is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` as such models can use up to - to ~3.5 GB of GPU memory. + are not required. If using more advanced WtP models like `wtp-canine-s-12l`, + it is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance. + That model can use up to ~3.5 GB of GPU memory. + + Please note, to enable this option, you must also rebuild the Docker container + with the following change: Within the Dockerfile, set `ARG ADD_GPU_SUPPORT=true`. - `SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE`: More advanced WTP models will require a target language. This property sets the default language to use for diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index 929c6fc3..ccd9faf9 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -465,7 +465,7 @@ def __init__(self, job_properties: Mapping[str, str], nlp_model_setting = mpf_util.get_property(job_properties, "SENTENCE_MODEL_CPU_ONLY", True) if not nlp_model_setting: - nlp_model_setting = "gpu" + nlp_model_setting = "cuda" else: nlp_model_setting = "cpu" diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 603c13b9..e5fd644f 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -34,6 +34,8 @@ from .wtp_lang_settings import WtpLanguageSettings +import torch + DEFAULT_WTP_MODELS = "/wtp_models" @@ -51,6 +53,11 @@ 'wtp-canine-s-9l', 'wtp-canine-s-12l'] +GPU_AVAILABLE = False +if torch.cuda.is_available(): + GPU_AVAILABLE = True + + class TextSplitterModel: # To hold spaCy, WtP, and other potential sentence detection models in cache @@ -76,6 +83,15 @@ def _update_wtp_model(self, wtp_model_name: str, model_setting: str = "cpu", default_lang: str="en") -> None: + if model_setting == "gpu" or model_setting == "cuda": + if GPU_AVAILABLE: + model_setting = "cuda" + else: + log.warning("Warning, no cuda support for this installation of PyTorch. 
" + "Please reinstall PyTorch with GPU support by updating " + "`ADD_GPU_SUPPORT=true` to the Dockerfile for this component.") + model_setting = "cpu" + if wtp_model_name in WTP_MANDATORY_ADAPTOR: self._mandatory_wtp_language = True self._default_lang = default_lang @@ -117,7 +133,7 @@ def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: "consider trying spaCy's sentence detection model." ) if self._mandatory_wtp_language: - log.warning("WtP model requires a language." + log.warning("WtP model requires a language. " f"Using default language : {self._default_lang}.") iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang) return self.wtp_model.split(text, lang_code=iso_lang) diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 99f65b8e..41c12f90 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -95,7 +95,7 @@ }, { "name": "SENTENCE_MODEL_CPU_ONLY", - "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (larger WtP models will use up to ~3.5 GB of GPU memory).", + "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (consult README.md for details).", "type": "BOOLEAN", "defaultValue": "TRUE" }, diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 1ed52bf4..9b2ccc65 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -75,7 +75,7 @@ def setUpClass(cls): cls.mock_server = MockServer() cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") if LOCAL_TEST_WTP_MODEL: - cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cpu", "en") + cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "en") cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") @@ -600,9 +600,30 @@ def test_split_wtp_known_language(self, _): 'Spaces should not be added to Chinese text.') + def test_split_engine_difference(self): + # Note: we can only use the WtP models for subsequent tests + # involving Chinese text because only WtP's multilingual models + # can detect some of '。' characters used for this language. + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() + + text_without_newlines = text.replace('\n', '') + + actual = self.wtp_model._split_wtp(text_without_newlines) + self.assertEqual(3, len(actual)) + + actual = self.spacy_model._split_spacy(text_without_newlines) + self.assertEqual(1, len(actual)) + + # However, WtP prefers newlines over the '。' character. + actual = self.wtp_model._split_wtp(text) + self.assertEqual(10, len(actual)) + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) def test_split_wtp_advanced_known_language(self, _): - # This test should only be run locally, as the WtP canine model is a bit large (~1 GB). + # This test should only be run manually outside of a Docker build. + # The WtP canine model is ~1 GB and not worth downloading and adding to the pre-built Docker image. 
+ + if not LOCAL_TEST_WTP_MODEL: return @@ -704,27 +725,45 @@ def test_split_wtp_unknown_lang(self, _): expected_chunk_lengths = [88, 118, 116, 106] self.assertEqual(sum(expected_chunk_lengths), len(text)) - # WtP will split by newlines (over the '。' character), - # so some of the chunks here are different from the previous test. + # Due to an incorrect language detection, newlines are + # not properly replaced for Chinese text, and + # additional whitespace is present in the text. + # This alters the behavior of WtP sentence splitting. translation_request1 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertTrue(translation_request1.startswith('兵者,')) self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request1, + 'Spaces should be kept due to incorrect language detection.') translation_request2 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) self.assertTrue(translation_request2.startswith('天者,陰陽')) self.assertTrue(translation_request2.endswith('兵眾孰強?')) + self.assertNotIn('\n', translation_request2, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request2, + 'Spaces should be kept due to incorrect language detection.') translation_request3 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) self.assertTrue(translation_request3.startswith('士卒孰練?')) self.assertTrue(translation_request3.endswith('亂而取之, ')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request3, + 'Spaces should be kept due to incorrect language detection.') translation_request4 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) self.assertTrue(translation_request4.startswith('實而備之,')) self.assertTrue(translation_request4.endswith('勝負見矣。 ')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request4, + 'Spaces should be kept due to incorrect language detection.') def test_newline_removal(self): From 23ae3b3cf3ef22879c5503a65c1446fa0a7bf4bc Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 16 Apr 2024 04:42:56 -0400 Subject: [PATCH 11/22] Tooltip update. --- python/AzureTranslation/README.md | 4 +++- .../AzureTranslation/plugin-files/descriptor/descriptor.json | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 67ceff04..13eb1e88 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -121,8 +121,10 @@ this model lacks support handling for Chinese punctuation. it is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance. That model can use up to ~3.5 GB of GPU memory. - Please note, to enable this option, you must also rebuild the Docker container + Please note, to fully enable this option, you must also rebuild the Docker container with the following change: Within the Dockerfile, set `ARG ADD_GPU_SUPPORT=true`. + Otherwise, the PyTorch version installed will not come with cuda support and + component will always default to CPU processing. - `SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE`: More advanced WTP models will require a target language. 
This property sets the default language to use for diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 41c12f90..918f09e6 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -95,7 +95,7 @@ }, { "name": "SENTENCE_MODEL_CPU_ONLY", - "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (consult README.md for details).", + "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (for Docker deployments, please consult README for more info).", "type": "BOOLEAN", "defaultValue": "TRUE" }, From ddad147982df5ec5522046744cbb180052c4f79e Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 16 Apr 2024 10:42:04 -0400 Subject: [PATCH 12/22] Add minor check. --- python/AzureTranslation/tests/test_acs_translation.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 9b2ccc65..70d5a476 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -610,6 +610,8 @@ def test_split_engine_difference(self): actual = self.wtp_model._split_wtp(text_without_newlines) self.assertEqual(3, len(actual)) + for line in actual: + self.assertTrue(line.endswith('。')) actual = self.spacy_model._split_spacy(text_without_newlines) self.assertEqual(1, len(actual)) From 13ee66e7f282b91f1199b6430501666d89853d78 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Tue, 16 Apr 2024 11:32:01 -0400 Subject: [PATCH 13/22] Improved stage builds for gpu/cpu options. --- python/AzureTranslation/Dockerfile | 25 +++++++++++++++---- python/AzureTranslation/README.md | 4 +-- .../nlp_text_splitter/text_splitter.py | 4 +-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index 4c95de8a..ea6b1b6e 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -31,7 +31,10 @@ ARG BUILD_TAG=latest FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} ARG RUN_TESTS=false -ARG ADD_GPU_SUPPORT=false + +# To enable GPU resources, update +# below line to BUILD_MODE=gpu +ARG BUILD_MODE=cpu RUN pip install --no-cache-dir langcodes @@ -41,23 +44,35 @@ RUN apt-get update && \ rm -rf /var/lib/apt/lists/* # Install WtP and spaCy -# if ADD_GPU_SUPPORT is enabled, install PyTorch with cuda support. RUN pip install --upgrade pip && \ pip install "spacy>=3.7.4" && \ - pip install "wtpsplit>=1.3.0"; \ - if [ "${ADD_GPU_SUPPORT,,}" == true ]; then pip install torch \ - else pip install torch --index-url https://download.pytorch.org/whl/cpu; fi + pip install "wtpsplit>=1.3.0"\ # Modify to add downloads for other models of interest. 
RUN mkdir /wtp_models && cd /wtp_models && \ git clone https://huggingface.co/benjamin/wtp-bert-mini && \ python3 -m spacy download xx_sent_ud_sm +######################################################################## +FROM install_wget as cpu_component_build +RUN install torch --index-url https://download.pytorch.org/whl/cpu + +######################################################################## +FROM install_wget as gpu_component_build +RUN pip install torch + +######################################################################## +# Change to gpu to test out GPU build mode. +FROM cpu_component_build as run_tests_true + RUN --mount=target=.,readwrite \ install-component.sh; \ if [ "${RUN_TESTS,,}" == true ]; then python tests/test_acs_translation.py; fi +######################################################################## +FROM ${BUILD_TYPE}_component_build + LABEL org.label-schema.license="Apache 2.0" \ org.label-schema.name="OpenMPF Azure Translation" \ org.label-schema.schema-version="1.0" \ diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 13eb1e88..a73d1ebc 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -122,8 +122,8 @@ this model lacks support handling for Chinese punctuation. That model can use up to ~3.5 GB of GPU memory. Please note, to fully enable this option, you must also rebuild the Docker container - with the following change: Within the Dockerfile, set `ARG ADD_GPU_SUPPORT=true`. - Otherwise, the PyTorch version installed will not come with cuda support and + with the following change: Within the Dockerfile, set `ARG BUILD_MODE=gpu`. + Otherwise, PyTorch will be installed without cuda support and component will always default to CPU processing. - `SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE`: More advanced WTP models will diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index e5fd644f..3017a1de 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -88,8 +88,8 @@ def _update_wtp_model(self, wtp_model_name: str, model_setting = "cuda" else: log.warning("Warning, no cuda support for this installation of PyTorch. " - "Please reinstall PyTorch with GPU support by updating " - "`ADD_GPU_SUPPORT=true` to the Dockerfile for this component.") + "Please reinstall PyTorch with GPU support by setting " + "`ARGS BUILD_MODE=gpu` in the Dockerfile for this component.") model_setting = "cpu" if wtp_model_name in WTP_MANDATORY_ADAPTOR: From 16c7b51fd650921d5a6cb5af544aad62cf900328 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 17 Apr 2024 04:00:33 -0400 Subject: [PATCH 14/22] Submitting tested changes for Docker build. 
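
A quick way to confirm which PyTorch build ended up in the final image is to check
CUDA availability from inside the container. This is a minimal sketch, not part of
the component; how the container is started, and whether a GPU is exposed to it,
will affect the result.

```python
import torch

# True only when a CUDA-enabled PyTorch build *and* a visible GPU are present;
# the component's sentence model falls back to CPU when this is False.
print(torch.__version__)
print(torch.cuda.is_available())
```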
--- python/AzureTranslation/Dockerfile | 18 +++++++++--------- python/AzureTranslation/README.md | 2 +- .../nlp_text_splitter/text_splitter.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index ea6b1b6e..6b79781f 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -28,13 +28,14 @@ ARG BUILD_REGISTRY ARG BUILD_TAG=latest -FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} ARG RUN_TESTS=false - # To enable GPU resources, update -# below line to BUILD_MODE=gpu -ARG BUILD_MODE=cpu +# below line to BUILD_TYPE=gpu +ARG BUILD_TYPE=gpu + +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages + RUN pip install --no-cache-dir langcodes @@ -46,7 +47,7 @@ RUN apt-get update && \ # Install WtP and spaCy RUN pip install --upgrade pip && \ pip install "spacy>=3.7.4" && \ - pip install "wtpsplit>=1.3.0"\ + pip install "wtpsplit>=1.3.0" # Modify to add downloads for other models of interest. RUN mkdir /wtp_models && cd /wtp_models && \ @@ -54,22 +55,21 @@ RUN mkdir /wtp_models && cd /wtp_models && \ python3 -m spacy download xx_sent_ud_sm ######################################################################## -FROM install_wget as cpu_component_build +FROM download_python_packages as cpu_component_build RUN install torch --index-url https://download.pytorch.org/whl/cpu ######################################################################## -FROM install_wget as gpu_component_build +FROM download_python_packages as gpu_component_build RUN pip install torch ######################################################################## # Change to gpu to test out GPU build mode. -FROM cpu_component_build as run_tests_true +FROM gpu_component_build as run_tests_true RUN --mount=target=.,readwrite \ install-component.sh; \ if [ "${RUN_TESTS,,}" == true ]; then python tests/test_acs_translation.py; fi - ######################################################################## FROM ${BUILD_TYPE}_component_build diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index a73d1ebc..d1e4674d 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -122,7 +122,7 @@ this model lacks support handling for Chinese punctuation. That model can use up to ~3.5 GB of GPU memory. Please note, to fully enable this option, you must also rebuild the Docker container - with the following change: Within the Dockerfile, set `ARG BUILD_MODE=gpu`. + with the following change: Within the Dockerfile, set `ARG BUILD_TYPE=gpu`. Otherwise, PyTorch will be installed without cuda support and component will always default to CPU processing. diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 3017a1de..74b31262 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -89,7 +89,7 @@ def _update_wtp_model(self, wtp_model_name: str, else: log.warning("Warning, no cuda support for this installation of PyTorch. 
" "Please reinstall PyTorch with GPU support by setting " - "`ARGS BUILD_MODE=gpu` in the Dockerfile for this component.") + "`ARGS BUILD_TYPE=gpu` in the Dockerfile for this component.") model_setting = "cpu" if wtp_model_name in WTP_MANDATORY_ADAPTOR: From 9657eee6a2a2bca4a0fbad30bb95a5d82af24fba Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 17 Apr 2024 05:07:17 -0400 Subject: [PATCH 15/22] Final changes (cleanup + test cpu/gpu staged builds). --- python/AzureTranslation/Dockerfile | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index 6b79781f..c56a0197 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -32,7 +32,7 @@ ARG BUILD_TAG=latest ARG RUN_TESTS=false # To enable GPU resources, update # below line to BUILD_TYPE=gpu -ARG BUILD_TYPE=gpu +ARG BUILD_TYPE=cpu FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages @@ -56,23 +56,20 @@ RUN mkdir /wtp_models && cd /wtp_models && \ ######################################################################## FROM download_python_packages as cpu_component_build -RUN install torch --index-url https://download.pytorch.org/whl/cpu +RUN pip install torch --index-url https://download.pytorch.org/whl/cpu ######################################################################## FROM download_python_packages as gpu_component_build RUN pip install torch ######################################################################## -# Change to gpu to test out GPU build mode. -FROM gpu_component_build as run_tests_true + +FROM ${BUILD_TYPE}_component_build as component_final RUN --mount=target=.,readwrite \ install-component.sh; \ if [ "${RUN_TESTS,,}" == true ]; then python tests/test_acs_translation.py; fi -######################################################################## -FROM ${BUILD_TYPE}_component_build - LABEL org.label-schema.license="Apache 2.0" \ org.label-schema.name="OpenMPF Azure Translation" \ org.label-schema.schema-version="1.0" \ From d6e59eba58c2e319f26b0a01dca24d2d0c4713be Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Wed, 17 Apr 2024 22:20:52 -0400 Subject: [PATCH 16/22] Tooltip update. --- .../AzureTranslation/nlp_text_splitter/text_splitter.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 74b31262..f07265bd 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -87,9 +87,11 @@ def _update_wtp_model(self, wtp_model_name: str, if GPU_AVAILABLE: model_setting = "cuda" else: - log.warning("Warning, no cuda support for this installation of PyTorch. " - "Please reinstall PyTorch with GPU support by setting " - "`ARGS BUILD_TYPE=gpu` in the Dockerfile for this component.") + log.warning("PyTorch determined that CUDA is not available. " + "You may need to update the NVIDIA driver for the host system, " + "or reinstall PyTorch with GPU support by setting " + "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") + model_setting = "cpu" if wtp_model_name in WTP_MANDATORY_ADAPTOR: From 28b641b57de9dc20baa35f84f83b9644f47760c2 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 18 Apr 2024 03:13:26 -0400 Subject: [PATCH 17/22] Set NVIDIA environment variables in Docker image. 
--- python/AzureTranslation/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index c56a0197..a7f70bef 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -60,6 +60,11 @@ RUN pip install torch --index-url https://download.pytorch.org/whl/cpu ######################################################################## FROM download_python_packages as gpu_component_build + +# Environment variables required by nvidia runtime. +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility + RUN pip install torch ######################################################################## From cf4375ffe817d823fc13b81e112600db618fb0f7 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 18 Apr 2024 03:23:20 -0400 Subject: [PATCH 18/22] Toggling GPU mode (final check). --- python/AzureTranslation/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index a7f70bef..04a7e577 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -32,7 +32,7 @@ ARG BUILD_TAG=latest ARG RUN_TESTS=false # To enable GPU resources, update # below line to BUILD_TYPE=gpu -ARG BUILD_TYPE=cpu +ARG BUILD_TYPE=gpu FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages From 1a7bad17ce216eab9ce979d8af43ff0a3ad2382d Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 18 Apr 2024 11:05:26 -0400 Subject: [PATCH 19/22] GPU mode passed, reverting change back to CPU. --- python/AzureTranslation/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index 04a7e577..a7f70bef 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -32,7 +32,7 @@ ARG BUILD_TAG=latest ARG RUN_TESTS=false # To enable GPU resources, update # below line to BUILD_TYPE=gpu -ARG BUILD_TYPE=gpu +ARG BUILD_TYPE=cpu FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages From 9d4dfcb54fa6354cbdf26a792aaa2695e5bffc6e Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 18 Apr 2024 19:28:57 -0400 Subject: [PATCH 20/22] Minor Copyright Date Update. --- python/AzureTranslation/Dockerfile | 4 ++-- python/AzureTranslation/LICENSE | 4 ++-- python/AzureTranslation/acs_translation_component/__init__.py | 4 ++-- .../acs_translation_component/acs_translation_component.py | 4 ++-- .../acs_translation_component/convert_language_code.py | 4 ++-- python/AzureTranslation/pyproject.toml | 4 ++-- python/AzureTranslation/sample_acs_translator.py | 4 ++-- python/AzureTranslation/setup.cfg | 4 ++-- python/AzureTranslation/tests/test_acs_translation.py | 4 ++-- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index a7f70bef..4ae5195e 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -7,11 +7,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. 
# ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/LICENSE b/python/AzureTranslation/LICENSE index 42dfb8b2..2344b622 100644 --- a/python/AzureTranslation/LICENSE +++ b/python/AzureTranslation/LICENSE @@ -27,7 +27,7 @@ https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE MIT License -Copyright (c) 2023 Benjamin Minixhofer +Copyright (c) 2024 Benjamin Minixhofer Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -53,7 +53,7 @@ The spaCy Natural Language Processing library falls under the MIT License: The MIT License (MIT) -Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/python/AzureTranslation/acs_translation_component/__init__.py b/python/AzureTranslation/acs_translation_component/__init__.py index 92489ddf..37ccd093 100644 --- a/python/AzureTranslation/acs_translation_component/__init__.py +++ b/python/AzureTranslation/acs_translation_component/__init__.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py index ccd9faf9..2b089e31 100644 --- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py +++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. 
# diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index 402da4e4..967f0607 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/pyproject.toml b/python/AzureTranslation/pyproject.toml index 52c60148..bcd2b658 100644 --- a/python/AzureTranslation/pyproject.toml +++ b/python/AzureTranslation/pyproject.toml @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/sample_acs_translator.py b/python/AzureTranslation/sample_acs_translator.py index ac144a0d..628c044f 100644 --- a/python/AzureTranslation/sample_acs_translator.py +++ b/python/AzureTranslation/sample_acs_translator.py @@ -7,11 +7,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg index cb650a2d..b3fa4200 100644 --- a/python/AzureTranslation/setup.cfg +++ b/python/AzureTranslation/setup.cfg @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. 
# ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 70d5a476..ef8534aa 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -6,11 +6,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # From ed916634db06c155a112129afb93a84c70c23d68 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 19 Apr 2024 03:12:27 -0400 Subject: [PATCH 21/22] Additional adjustments, test fix. --- .../nlp_text_splitter/text_splitter.py | 18 +++++++---- .../tests/test_acs_translation.py | 32 ++++++++----------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index f07265bd..2cc1c704 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -66,9 +66,9 @@ def __init__(self, model_name: str, model_setting: str, default_lang: str = "en" self._default_lang = default_lang self._mandatory_wtp_language = False self.split = lambda t, **param: [t] - self.update_model(model_name, model_setting) + self.update_model(model_name, model_setting, default_lang) - def update_model(self, model_name: str, model_setting: str = "", default_lang: str="en"): + def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str="en"): if model_name: if "wtp" in model_name: self._update_wtp_model(model_name, model_setting, default_lang) @@ -80,8 +80,8 @@ def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: s log.info(f"Setup spaCy model: {model_name}") def _update_wtp_model(self, wtp_model_name: str, - model_setting: str = "cpu", - default_lang: str="en") -> None: + model_setting, + default_lang) -> None: if model_setting == "gpu" or model_setting == "cuda": if GPU_AVAILABLE: @@ -91,14 +91,20 @@ def _update_wtp_model(self, wtp_model_name: str, "You may need to update the NVIDIA driver for the host system, " "or reinstall PyTorch with GPU support by setting " "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") - model_setting = "cpu" + elif model_setting != "cpu": + log.warning("Invalid WtP model setting. Only `cpu` and `cuda` " + "(or `gpu`) WtP model options available at this time. 
" + "Defaulting to `cpu` mode.") + model_setting = "cpu" if wtp_model_name in WTP_MANDATORY_ADAPTOR: self._mandatory_wtp_language = True self._default_lang = default_lang - if self._model_name != wtp_model_name: + if self._model_name == wtp_model_name: + log.info(f"Using cached model: {self._model_name}") + else: self._model_name = wtp_model_name # Check if model has been downloaded if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)): diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index ef8534aa..c871fcdb 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -75,7 +75,7 @@ def setUpClass(cls): cls.mock_server = MockServer() cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") if LOCAL_TEST_WTP_MODEL: - cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "en") + cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "zh") cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") @@ -624,11 +624,11 @@ def test_split_engine_difference(self): def test_split_wtp_advanced_known_language(self, _): # This test should only be run manually outside of a Docker build. # The WtP canine model is ~1 GB and not worth downloading and adding to the pre-built Docker image. - - if not LOCAL_TEST_WTP_MODEL: return + # For this test, we're more interested in the changes in behavior + # caused by WtP split. So the translation files are mainly placeholders. self.set_results_file('traditional-chinese-detect-result.json') self.set_results_file('split-sentence/art-of-war-translation-1.json') self.set_results_file('split-sentence/art-of-war-translation-2.json') @@ -637,7 +637,7 @@ def test_split_wtp_advanced_known_language(self, _): text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) - TranslationClient(get_test_properties(), self.wtp_adv_model).add_translations(detection_props) + TranslationClient(get_test_properties(SENTENCE_MODEL="wtp-canine-s-1l"), self.wtp_adv_model).add_translations(detection_props) self.assertEqual(5, len(detection_props)) self.assertEqual(text, detection_props['TEXT']) @@ -646,7 +646,6 @@ def test_split_wtp_advanced_known_language(self, _): .read_text().strip() self.assertEqual(expected_translation, detection_props['TRANSLATION']) self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE']) - self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual(1.0, float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) @@ -654,41 +653,39 @@ def test_split_wtp_advanced_known_language(self, _): detect_request_text = self.get_request_body()[0]['Text'] self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text) - expected_chunk_lengths = [86, 116, 104, 114] + # Main test starts here: + expected_chunk_lengths = [61, 150, 61, 148] self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n',''))) translation_request1 = self.get_request_body()[0]['Text'] self.assertTrue(translation_request1.startswith('兵者,')) - self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertTrue(translation_request1.endswith('四曰將,五曰法。')) self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) self.assertNotIn('\n', translation_request1, 'Newlines were not properly removed') self.assertNotIn(' ', translation_request1, 'Spaces should not be added to Chinese text.') - 
translation_request2 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) - self.assertTrue(translation_request2.startswith('天者,陰陽')) - self.assertTrue(translation_request2.endswith('兵眾孰強?')) - self.assertNotIn('\n', translation_request1, + self.assertTrue(translation_request2.startswith('道者,令民於上同意')) + self.assertTrue(translation_request2.endswith('賞罰孰明')) + self.assertNotIn('\n', translation_request2, 'Newlines were not properly removed') - self.assertNotIn(' ', translation_request1, + self.assertNotIn(' ', translation_request2, 'Spaces should not be added to Chinese text.') - translation_request3 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) - self.assertTrue(translation_request3.startswith('士卒孰練?')) - self.assertTrue(translation_request3.endswith('遠而示之近。')) + self.assertTrue(translation_request3.startswith('?吾以此知勝')) + self.assertTrue(translation_request3.endswith('因利而制權也。')) self.assertNotIn('\n', translation_request3, 'Newlines were not properly removed') self.assertNotIn(' ', translation_request3, 'Spaces should not be added to Chinese text.') - translation_request4 = self.get_request_body()[0]['Text'] self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) - self.assertTrue(translation_request4.startswith('利而誘之,')) + self.assertTrue(translation_request4.startswith('兵者,詭道也。')) self.assertTrue(translation_request4.endswith('勝負見矣。')) self.assertNotIn('\n', translation_request4, 'Newlines were not properly removed') @@ -1061,7 +1058,6 @@ def test_guess_split_edge_cases(self): self.assertEqual("Maybe...maybe not? \n ", actual[2]) self.assertEqual("All done, I think!", actual[3]) - actual = list(TextSplitter.split(input_text, 35, 35, From 487b7e73caac3a23c54f4c6534a0da99d933b3e1 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Fri, 19 Apr 2024 11:01:23 -0400 Subject: [PATCH 22/22] Tooltip update. --- python/AzureTranslation/nlp_text_splitter/text_splitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py index 2cc1c704..5afaa740 100644 --- a/python/AzureTranslation/nlp_text_splitter/text_splitter.py +++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py @@ -80,8 +80,8 @@ def update_model(self, model_name: str, model_setting: str = "cpu", default_lang log.info(f"Setup spaCy model: {model_name}") def _update_wtp_model(self, wtp_model_name: str, - model_setting, - default_lang) -> None: + model_setting: str, + default_lang: str) -> None: if model_setting == "gpu" or model_setting == "cuda": if GPU_AVAILABLE:
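For reference, the GPU toggling exercised in the Dockerfile patches above ties into the `GPU_AVAILABLE` flag consulted by `_update_wtp_model`. The actual definition of `GPU_AVAILABLE` is not shown in these hunks, so the sketch below assumes the conventional PyTorch check, and `resolve_model_setting` is a hypothetical helper that only mirrors the cpu/gpu fallback behavior added in PATCH 21; it is not code from this series.

```python
# Rough sketch only: GPU_AVAILABLE's real definition is not shown in these
# hunks, and resolve_model_setting is a hypothetical helper mirroring the
# cpu/gpu fallback added to _update_wtp_model in PATCH 21.
import logging

import torch

log = logging.getLogger(__name__)

# Assumption: GPU availability is detected with the standard PyTorch check.
GPU_AVAILABLE = torch.cuda.is_available()


def resolve_model_setting(model_setting: str) -> str:
    if model_setting in ("gpu", "cuda"):
        if GPU_AVAILABLE:
            return "cuda"
        # CPU-only torch wheel (BUILD_TYPE=cpu) or no visible CUDA device.
        log.warning("CUDA requested but not available; falling back to CPU.")
        return "cpu"
    if model_setting != "cpu":
        log.warning("Invalid WtP model setting %r; defaulting to CPU.", model_setting)
    return "cpu"


print(resolve_model_setting("cuda"))
```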
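Similarly, a minimal usage sketch of the splitter models touched by PATCH 21/22, based on how the tests construct them in `setUpClass`. The import path follows the `nlp_text_splitter/text_splitter.py` module added earlier in this series; the sample sentence, the print call, and the commented-out `wtp-canine-s-1l` line are placeholders, since the large canine model is only expected to be present for the manual `LOCAL_TEST_WTP_MODEL` run.

```python
# Illustrative only; assumes the component package is installed and the
# wtp-bert-mini model was cloned into /wtp_models as done in the Dockerfile.
from nlp_text_splitter.text_splitter import TextSplitterModel

# CPU-backed WtP splitter; the third argument is the default language hint.
wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")

# Multilingual spaCy sentence model, also on CPU.
spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")

# Requesting "cuda" (or "gpu") only takes effect when the image was built
# with BUILD_TYPE=gpu; otherwise the splitter logs a warning and runs on CPU.
# wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "zh")

# split() is installed by update_model(); placeholder Chinese text for illustration.
sentences = wtp_model.split("兵者,國之大事。死生之地,存亡之道,不可不察也。")
print(sentences)
```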