diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile index 1e0308ea..4ae5195e 100644 --- a/python/AzureTranslation/Dockerfile +++ b/python/AzureTranslation/Dockerfile @@ -7,11 +7,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -28,17 +28,53 @@ ARG BUILD_REGISTRY ARG BUILD_TAG=latest -FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} ARG RUN_TESTS=false +# To enable GPU resources, update +# below line to BUILD_TYPE=gpu +ARG BUILD_TYPE=cpu + +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages + RUN pip install --no-cache-dir langcodes +RUN apt-get update && \ + apt-get install -y git git-lfs && \ + git lfs install && \ + rm -rf /var/lib/apt/lists/* + +# Install WtP and spaCy +RUN pip install --upgrade pip && \ + pip install "spacy>=3.7.4" && \ + pip install "wtpsplit>=1.3.0" + +# Modify to add downloads for other models of interest. +RUN mkdir /wtp_models && cd /wtp_models && \ + git clone https://huggingface.co/benjamin/wtp-bert-mini && \ + python3 -m spacy download xx_sent_ud_sm + +######################################################################## +FROM download_python_packages as cpu_component_build +RUN pip install torch --index-url https://download.pytorch.org/whl/cpu + +######################################################################## +FROM download_python_packages as gpu_component_build + +# Environment variables required by nvidia runtime. +ENV NVIDIA_VISIBLE_DEVICES=all +ENV NVIDIA_DRIVER_CAPABILITIES compute,utility + +RUN pip install torch + +######################################################################## + +FROM ${BUILD_TYPE}_component_build as component_final + RUN --mount=target=.,readwrite \ install-component.sh; \ if [ "${RUN_TESTS,,}" == true ]; then python tests/test_acs_translation.py; fi - LABEL org.label-schema.license="Apache 2.0" \ org.label-schema.name="OpenMPF Azure Translation" \ org.label-schema.schema-version="1.0" \ diff --git a/python/AzureTranslation/LICENSE b/python/AzureTranslation/LICENSE new file mode 100644 index 00000000..2344b622 --- /dev/null +++ b/python/AzureTranslation/LICENSE @@ -0,0 +1,74 @@ +/***************************************************************************** +* Copyright 2024 The MITRE Corporation * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
*
+******************************************************************************/
+
+This project contains content developed by The MITRE Corporation. If this code
+is used in a deployment or embedded within another project, it is requested
+that you send an email to opensource@mitre.org in order to let us know where
+this software is being used.
+
+*****************************************************************************
+
+The WtP ("Where's the Point") sentence segmentation library falls under the MIT License:
+
+https://github.com/bminixhofer/wtpsplit/blob/main/LICENSE
+
+MIT License
+
+Copyright (c) 2024 Benjamin Minixhofer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*****************************************************************************
+
+The spaCy Natural Language Processing library falls under the MIT License:
+
+The MIT License (MIT)
+
+Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md
index d58c4aa4..d1e4674d 100644
--- a/python/AzureTranslation/README.md
+++ b/python/AzureTranslation/README.md
@@ -35,9 +35,8 @@ must be provided. Neither has a default value.
   `https:///translator/text/v3.0`. The URL should not end with `/translate`
   because two separate endpoints are used. `ACS_URL + '/translate'` is used
   for translation.
-  `ACS_URL + '/breaksentence'` is used to break up text when it is too long
-  for a single translation request. This property can also be configured
-  using an environment variable named `MPF_PROP_ACS_URL`.
+  This property can also be configured using an environment variable
+  named `MPF_PROP_ACS_URL`.
 
 - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services
   subscription key. To get one you will need to create an
@@ -45,15 +44,16 @@ must be provided. Neither has a default value.
   using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`.
 
-# Important Job Properties:
-- `TO_LANGUAGE`: The BCP-47 language code for language that the properties
+# Primary Job Properties
+- `TO_LANGUAGE`: The BCP-47 language code for the language that the properties
   should be translated to.
-
+
 - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names
   indicating which properties in the feed-forward track or detection to
   consider translating. For example, `TEXT,TRANSCRIPT`. If the first property
   listed is present, then that property will be translated. If it's not, then
   the next property in the list is considered. At most, one property will be
   translated.
+
 - `FROM_LANGUAGE`: In most cases, this property should not be used. It should
   only be used when automatic language detection is detecting the wrong
   language: Users can provide a BCP-47 code to force the translation service
@@ -78,9 +78,63 @@ must be provided. Neither has a default value.
   to identify the source language of the incoming text.
 
+# Text Splitter Job Properties
+The following settings control how input text is divided into chunks that are small
+enough for the translation endpoint to process.
+
+Through preliminary investigation, we identified the [WtP library ("Where's the
+Point")](https://github.com/bminixhofer/wtpsplit) and [spaCy's multilingual sentence
+detection model](https://spacy.io/models) as suitable tools for identifying sentence
+breaks in a large section of text. A short usage sketch follows the property list
+below.
+
+WtP models are trained to split multilingual text into sentences without needing an
+input language tag. The disadvantage is that the most accurate WtP models need ~3.5
+GB of GPU memory. On the other hand, spaCy provides a single multilingual sentence
+detection model that appears to work better for splitting up English text in certain
+cases; unfortunately, that model lacks support for Chinese punctuation.
+
+- `SENTENCE_MODEL`: Specifies the desired WtP or spaCy sentence detection model. For CPU
+  and runtime considerations, the author of WtP recommends using `wtp-bert-mini`. More
+  advanced WtP models that use GPU resources (up to ~8 GB) are also available. See the
+  list of WtP model names
+  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#available-models). The
+  only available spaCy model (for text with an unknown language) is `xx_sent_ud_sm`.
+
+  Review the list of languages supported by WtP
+  [here](https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages).
+  Review the models and languages supported by spaCy [here](https://spacy.io/models).
+
+- `SENTENCE_SPLITTER_CHAR_COUNT`: Specifies the maximum number of characters to process
+  through the sentence splitter. Defaults to 500 characters, since only a subsection of
+  the text needs to be processed to determine an appropriate split. (See a discussion of
+  potential sentence lengths
+  [here](https://discourse.mozilla.org/t/proposal-sentences-lenght-limit-from-14-words-to-100-characters).)
+
+- `SENTENCE_SPLITTER_INCLUDE_INPUT_LANG`: Specifies whether to pass the input language to
+  the sentence splitter algorithm. Currently, only WtP supports model threshold
+  adjustments by input language.
+
+- `SENTENCE_MODEL_CPU_ONLY`: If set to TRUE, only use CPU resources for the sentence
+  detection model. If set to FALSE, allow the sentence model to also use GPU resources.
+  For most runs using the spaCy `xx_sent_ud_sm` or WtP `wtp-bert-mini` models, GPU
+  resources are not required. If using a more advanced WtP model like `wtp-canine-s-12l`,
+  it is recommended to set `SENTENCE_MODEL_CPU_ONLY=FALSE` to improve performance. That
+  model can use up to ~3.5 GB of GPU memory.
+
+  Please note that to fully enable this option, you must also rebuild the Docker
+  container with the following change: Within the Dockerfile, set `ARG BUILD_TYPE=gpu`.
+  Otherwise, PyTorch will be installed without CUDA support and the component will always
+  default to CPU processing.
+
+- `SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE`: More advanced WtP models require a
+  target language. This property sets the default language to use for sentence splitting,
+  and is overridden whenever `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure
+  language detection returns a different, WtP-supported language option.
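+
+As a quick illustration, here is a minimal sketch (not part of the component; the sample
+strings are made up, and it assumes the models were downloaded as shown in the
+Dockerfile) of how the two kinds of models split text when used directly:
+
+```python
+import spacy
+from wtpsplit import WtP
+
+# WtP can split without a language tag; passing one adjusts its thresholds.
+wtp = WtP("wtp-bert-mini")
+print(wtp.split("This is a test This is another test.", lang_code="en"))
+
+# spaCy's multilingual sentence detector. The model is installed with:
+#   python -m spacy download xx_sent_ud_sm
+nlp = spacy.load("xx_sent_ud_sm")
+doc = nlp("This is a test. This is another test.")
+print([sentence.text for sentence in doc.sents])
+```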
+
+
 # Listing Supported Languages
-To list the supported languages replace `${ACS_URL}` and
-`${ACS_SUBSCRIPTION_KEY}` in the following command and run it:
+To list the supported languages, replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in
+the following command and run it:
 ```shell script
 curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation"
 ```
diff --git a/python/AzureTranslation/acs_translation_component/__init__.py b/python/AzureTranslation/acs_translation_component/__init__.py
index 92489ddf..37ccd093 100644
--- a/python/AzureTranslation/acs_translation_component/__init__.py
+++ b/python/AzureTranslation/acs_translation_component/__init__.py
@@ -5,11 +5,11 @@
 #    under contract, and is subject to the Rights in Data-General Clause   #
 #    52.227-14, Alt. IV (DEC 2007).                                        #
 #                                                                          #
-#    Copyright 2023 The MITRE Corporation. All Rights Reserved.            #
+#    Copyright 2024 The MITRE Corporation. All Rights Reserved.            #
 #############################################################################
 
 #############################################################################
-# Copyright 2023 The MITRE Corporation                                     #
+# Copyright 2024 The MITRE Corporation                                     #
 #                                                                          #
 # Licensed under the Apache License, Version 2.0 (the "License");          #
 # you may not use this file except in compliance with the License.         #
diff --git a/python/AzureTranslation/acs_translation_component/acs_translation_component.py b/python/AzureTranslation/acs_translation_component/acs_translation_component.py
index c1d3f679..2b089e31 100644
--- a/python/AzureTranslation/acs_translation_component/acs_translation_component.py
+++ b/python/AzureTranslation/acs_translation_component/acs_translation_component.py
@@ -5,11 +5,11 @@
 #    under contract, and is subject to the Rights in Data-General Clause   #
 #    52.227-14, Alt. IV (DEC 2007).                                        #
 #                                                                          #
-#    Copyright 2023 The MITRE Corporation. All Rights Reserved.            #
+#    Copyright 2024 The MITRE Corporation. All Rights Reserved.            #
 #############################################################################
 
 #############################################################################
-# Copyright 2023 The MITRE Corporation                                     #
+# Copyright 2024 The MITRE Corporation                                     #
 #                                                                          #
 # Licensed under the Apache License, Version 2.0 (the "License");          #
 # you may not use this file except in compliance with the License.
# @@ -36,21 +36,26 @@ import urllib.parse import urllib.request import uuid -from typing import Callable, Dict, Iterator, List, Literal, Mapping, Match, NamedTuple, \ +from typing import Callable, Dict, List, Literal, Mapping, Match, NamedTuple, \ Optional, Sequence, TypedDict, TypeVar, Union import mpf_component_api as mpf import mpf_component_util as mpf_util +from nlp_text_splitter.text_splitter import TextSplitter, TextSplitterModel + from . import convert_language_code log = logging.getLogger('AcsTranslationComponent') + class AcsTranslationComponent: - @staticmethod - def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: + def __init__(self) -> None: + self._cached_sent_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") + + def get_detections_from_video(self, job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: try: log.info(f'Received video job: {job}') ff_track = job.feed_forward_track @@ -59,7 +64,7 @@ def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: 'Component can only process feed forward jobs, ' 'but no feed forward track provided. ') - tc = TranslationClient(job.job_properties) + tc = TranslationClient(job.job_properties, self._cached_sent_model) tc.add_translations(ff_track.detection_properties) for ff_location in ff_track.frame_locations.values(): tc.add_translations(ff_location.detection_properties) @@ -71,18 +76,24 @@ def get_detections_from_video(job: mpf.VideoJob) -> Sequence[mpf.VideoTrack]: log.exception('Failed to complete job due to the following exception:') raise - @staticmethod - def get_detections_from_image(job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: - return get_detections_from_non_composite(job, job.feed_forward_location) - @staticmethod - def get_detections_from_audio(job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: - return get_detections_from_non_composite(job, job.feed_forward_track) + def get_detections_from_image(self, job: mpf.ImageJob) -> Sequence[mpf.ImageLocation]: + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_location) - @staticmethod - def get_detections_from_generic(job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: + + def get_detections_from_audio(self, job: mpf.AudioJob) -> Sequence[mpf.AudioTrack]: + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_track) + + + def get_detections_from_generic(self, job: mpf.GenericJob) -> Sequence[mpf.GenericTrack]: if job.feed_forward_track: - return get_detections_from_non_composite(job, job.feed_forward_track) + return get_detections_from_non_composite(job, + self._cached_sent_model, + job.feed_forward_track) else: log.info('Job did not contain a feed forward track. 
Assuming media ' 'file is a plain text file containing the text to be translated.') @@ -90,13 +101,16 @@ def get_detections_from_generic(job: mpf.GenericJob) -> Sequence[mpf.GenericTrac track = mpf.GenericTrack(detection_properties=dict(TEXT=text)) modified_job_props = {**job.job_properties, 'FEED_FORWARD_PROP_TO_PROCESS': 'TEXT'} modified_job = job._replace(job_properties=modified_job_props) - return get_detections_from_non_composite(modified_job, track) + return get_detections_from_non_composite(modified_job, + self._cached_sent_model, + track) T_FF_OBJ = TypeVar('T_FF_OBJ', mpf.AudioTrack, mpf.GenericTrack, mpf.ImageLocation) def get_detections_from_non_composite( job: Union[mpf.AudioJob, mpf.GenericJob, mpf.ImageJob], + sentence_model: TextSplitterModel, ff_track: Optional[T_FF_OBJ]) -> Sequence[T_FF_OBJ]: try: log.info(f'Received job: {job}') @@ -105,7 +119,7 @@ def get_detections_from_non_composite( 'Component can only process feed forward jobs, ' 'but no feed forward track provided.') - tc = TranslationClient(job.job_properties) + tc = TranslationClient(job.job_properties, sentence_model) tc.add_translations(ff_track.detection_properties) log.info(f'Processing complete. Translated {tc.translation_count} properties.') return (ff_track,) @@ -144,9 +158,12 @@ class UnsupportedSourceLanguage(Exception): class TranslationClient: + # ACS limits the number of characters that can be translated in a single /translate call. + # Taken from + # https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate DETECT_MAX_CHARS = 50_000 - def __init__(self, job_properties: Mapping[str, str]): + def __init__(self, job_properties: Mapping[str, str], sentence_model: TextSplitterModel): self._subscription_key = get_required_property('ACS_SUBSCRIPTION_KEY', job_properties) self._http_retry = mpf_util.HttpRetry.from_properties(job_properties, log.warning) @@ -170,8 +187,7 @@ def __init__(self, job_properties: Mapping[str, str]): acs_url = get_required_property('ACS_URL', job_properties) self._detect_url = create_url(acs_url, 'detect', {}) - self._break_sentence_client = BreakSentenceClient(job_properties, self._subscription_key, - self._http_retry) + self._sentence_splitter = SentenceSplitter(job_properties, sentence_model) prop_names = job_properties.get('FEED_FORWARD_PROP_TO_PROCESS', 'TEXT,TRANSCRIPT') self._props_to_translate = [p.strip() for p in prop_names.split(',')] @@ -228,7 +244,7 @@ def add_translations(self, detection_properties: Dict[str, str]) -> None: def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> TranslationResult: """ - Translates the given text. If the text is longer than ACS allows, we will break up the + Translates the given text. If the text is longer than ACS allows, we will split up the text and translate each part separately. If, during the current job, we have seen the exact text before, we return a cached result instead of making a REST call. 
""" @@ -259,7 +275,7 @@ def _translate_text(self, text: str, detection_properties: Dict[str, str]) -> Tr text, DetectResult(from_lang, from_lang_confidence), skipped=True) else: text_replaced_newlines = self._newline_behavior(text, from_lang) - grouped_sentences = self._break_sentence_client.split_input_text( + grouped_sentences = self._sentence_splitter.split_input_text( text_replaced_newlines, from_lang, from_lang_confidence) if not detect_result and grouped_sentences.detected_language: assert grouped_sentences.detected_language_confidence is not None @@ -425,117 +441,71 @@ def _send_detect_request(self, text) -> 'AcsResponses.Detect': return response_body -class BreakSentenceClient: +class SentenceSplitter: """ - Class to interact with Azure's "/breaksentence" endpoint. It is only used when the text to - translate exceeds the translation endpoint's character limit. + Class to divide large sections of text at sentence breaks using WtP and spaCy. + It is only used when the text to translate exceeds + the translation endpoint's character limit. """ - # ACS limits the number of characters that can be translated in a single /translate call. - # Taken from https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate - TRANSLATION_MAX_CHARS = 10_000 - - # ACS limits the number of characters that can be processed in a single /breaksentence call. - # Taken from https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-break-sentence - BREAK_SENTENCE_MAX_CHARS = 50_000 - - - def __init__(self, job_properties: Mapping[str, str], subscription_key: str, - http_retry: mpf_util.HttpRetry): - self._acs_url = get_required_property('ACS_URL', job_properties) - self._subscription_key = subscription_key - self._http_retry = http_retry + def __init__(self, job_properties: Mapping[str, str], + sentence_model:TextSplitterModel): + self._sentence_model = sentence_model + self._num_boundary_chars = mpf_util.get_property(job_properties, + "SENTENCE_SPLITTER_CHAR_COUNT", + 500) + nlp_model_name = mpf_util.get_property(job_properties, "SENTENCE_MODEL", "wtp-bert-mini") + self._incl_input_lang = mpf_util.get_property(job_properties, + "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", + True) + + wtp_default_language = mpf_util.get_property(job_properties, + "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE", + "en") + nlp_model_setting = mpf_util.get_property(job_properties, "SENTENCE_MODEL_CPU_ONLY", True) + + if not nlp_model_setting: + nlp_model_setting = "cuda" + else: + nlp_model_setting = "cpu" + self._sentence_model.update_model(nlp_model_name, nlp_model_setting, wtp_default_language) def split_input_text(self, text: str, from_lang: Optional[str], from_lang_confidence: Optional[float]) -> SplitTextResult: """ - Breaks up the given text in to chunks that are under TRANSLATION_MAX_CHARS. Each chunk - will contain one or more complete sentences as reported by the break sentence endpoint. + Splits up the given text in to chunks that are under TranslationClient.DETECT_MAX_CHARS. + Each chunk will contain one or more complete sentences as reported + by the (WtP or spaCy) sentence splitter. 
""" azure_char_count = get_azure_char_count(text) - if azure_char_count <= self.TRANSLATION_MAX_CHARS: + if azure_char_count <= TranslationClient.DETECT_MAX_CHARS: return SplitTextResult([text], from_lang, from_lang_confidence) log.info('Splitting input text because the translation endpoint allows a maximum of ' - f'{self.TRANSLATION_MAX_CHARS} Azure characters, but the text contained ' + f'{TranslationClient.DETECT_MAX_CHARS} Azure characters, but the text contained ' f'{azure_char_count} Azure characters.') - if azure_char_count > self.BREAK_SENTENCE_MAX_CHARS: - log.warning('Guessing sentence breaks because the break sentence endpoint allows a ' - f'maximum of {self.BREAK_SENTENCE_MAX_CHARS} Azure characters, but the' - f'text contained {azure_char_count} Azure characters.') - chunks = list(SentenceBreakGuesser.guess_breaks(text)) - log.warning(f'Broke text up in to {len(chunks)} chunks. Each chunk will be sent to ' - 'the break sentence endpoint.') - else: - chunks = (text,) - - if from_lang: - break_sentence_url = create_url(self._acs_url, 'breaksentence', - dict(language=from_lang)) + if self._incl_input_lang: + divided_text_list = TextSplitter.split( + text, + TranslationClient.DETECT_MAX_CHARS, + self._num_boundary_chars, + get_azure_char_count, + self._sentence_model, + from_lang) else: - break_sentence_url = create_url(self._acs_url, 'breaksentence', {}) + divided_text_list = TextSplitter.split( + text, + TranslationClient.DETECT_MAX_CHARS, + self._num_boundary_chars, + get_azure_char_count, + self._sentence_model) - chunk_iter = iter(chunks) - chunk = next(chunk_iter) - response_body = self._send_break_sentence_request(break_sentence_url, chunk) - if not from_lang: - detected_lang_info = response_body[0].get('detectedLanguage') - if detected_lang_info: - from_lang = detected_lang_info['language'] - from_lang_confidence = detected_lang_info['score'] - grouped_sentences = list(self._process_break_sentence_response(chunk, response_body)) - - for chunk in chunk_iter: - response_body = self._send_break_sentence_request(break_sentence_url, chunk) - grouped_sentences.extend(self._process_break_sentence_response(chunk, response_body)) - - log.info('Grouped sentences into %s chunks.', len(grouped_sentences)) - return SplitTextResult(grouped_sentences, from_lang, from_lang_confidence) - - - def _send_break_sentence_request( - self, break_sentence_url: str, text: str) -> 'AcsResponses.BreakSentence': - request_body = [ - {'Text': text} - ] - encoded_body = json.dumps(request_body).encode('utf-8') - request = urllib.request.Request(break_sentence_url, encoded_body, - get_acs_headers(self._subscription_key)) - log.info(f'Sending POST {break_sentence_url}') - log_json(request_body) - with self._http_retry.urlopen(request) as response: - response_body: AcsResponses.BreakSentence = json.load(response) - log.info('Received break sentence response with %s sentences.', - len(response_body[0]['sentLen'])) - log_json(response_body) - return response_body - - - @classmethod - def _process_break_sentence_response( - cls, text: str, response_body: AcsResponses.BreakSentence) -> Iterator[str]: - current_chunk_length = 0 - current_chunk_begin = 0 - current_chunk_azure_char_count = 0 - for length in response_body[0]['sentLen']: - sentence_begin = current_chunk_begin + current_chunk_length - sentence = text[sentence_begin: sentence_begin + length] - sentence_azure_char_count = get_azure_char_count(sentence) - # The /breaksentence endpoint will return sentences <= 1000 characters, so the - # following 
condition will be true at least once. - if sentence_azure_char_count + current_chunk_azure_char_count <= cls.TRANSLATION_MAX_CHARS: - current_chunk_length += len(sentence) - current_chunk_azure_char_count += sentence_azure_char_count - else: - current_chunk_end = current_chunk_begin + current_chunk_length - yield text[current_chunk_begin:current_chunk_end] - current_chunk_begin = current_chunk_end - current_chunk_length = len(sentence) - current_chunk_azure_char_count = sentence_azure_char_count - yield text[current_chunk_begin:] + chunks = list(divided_text_list) + log.info('Grouped sentences into %s chunks for translation.', len(chunks)) + return SplitTextResult(chunks, from_lang, from_lang_confidence) def get_n_azure_chars(input_str: str, begin: int, count: int) -> str: substr = input_str[begin: begin + count] @@ -583,79 +553,7 @@ def set_query_params(url: str, query_params: Mapping[str, str]) -> str: return urllib.parse.urlunparse(replaced_parts) -class SentenceBreakGuesser: - @classmethod - def guess_breaks(cls, text: str) -> Iterator[str]: - """ - Splits text up in to substrings that are all at most - BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS in length. It is preferable to use the - /breaksentence endpoint because splitting a sentence in the middle will cause incorrect - translations. When the input text is too long for /breaksentence, our only option is to - use some heuristics to guess a good location to split the input text. - We attempt to do the minimal number of splits with this method. The substrings produced - by this method will be further split up using the much more accurate /breaksentence - endpoint. - - :param text: Text to split up - :return: Generator producing substrings of input text - """ - current_pos = 0 - max_chars = BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS - while True: - chunk = get_n_azure_chars(text, current_pos, max_chars) - is_last_chunk = len(text) <= current_pos + len(chunk) - if is_last_chunk: - yield chunk - return - else: - break_pos = cls._get_break_pos(chunk) - yield chunk[:break_pos] - current_pos += break_pos - - # Characters we know indicate the end of a sentence. The list is not exhaustive and may need to - # be updated if we come across others. - SENTENCE_END_PUNCTUATION = { - '.', '!', '?', # Latin scripts - '。', '!', '?'} # Chinese (full width) versions - - - @classmethod - def _get_break_pos(cls, text: str) -> int: - # Two newlines in a row result in a blank line. Blank lines are commonly used to delimit - # paragraphs. - double_newline_pos = text.rfind('\n\n') - if double_newline_pos > 0: - return double_newline_pos + 2 - - # Look for the last sentence breaking punctuation character in the text. - last_punctuation_pos = next( - (i for i in reversed(range(len(text))) if text[i] in cls.SENTENCE_END_PUNCTUATION), - -1) - if last_punctuation_pos > 0: - return last_punctuation_pos + 1 - - single_newline_pos = text.rfind('\n') - if single_newline_pos > 0: - return single_newline_pos + 1 - - # Look for last punctuation character in the text. - # This will catch non-sentence breaking punctuation, but we already made our best effort - # to use sentence breaking punctuation above. - last_punctuation_pos = next( - (i for i in reversed(range(len(text))) if cls._is_punctuation(text[i])), - -1) - if last_punctuation_pos > 0: - return last_punctuation_pos + 1 - - if (last_space_pos := text.rfind(' ')) > 0: - return last_space_pos + 1 - - # No suitable break found. Use entire input. 
- return len(text) - @staticmethod - def _is_punctuation(char): - return unicodedata.category(char) == 'Po' def get_acs_headers(subscription_key: str) -> Dict[str, str]: @@ -712,7 +610,7 @@ def get_required_property(property_name: str, job_properties: Mapping[str, str]) class NewLineBehavior: """ - The Azure translation service treats newlines a separator between sentences. This results in + The Azure translation service treats newlines as a separator between sentences. This results in incorrect translations. We can't simply replace newlines with spaces because not all languages put spaces between words. When testing with Chinese, spaces resulted in completely different translations. @@ -831,13 +729,6 @@ class _TranslateTextInfo(TypedDict): Translate = List[_TranslateTextInfo] - class _SentenceLengthInfo(TypedDict): - sentLen: List[int] - detectedLanguage: Optional[AcsResponses._DetectedLangInfo] - - BreakSentence = List[_SentenceLengthInfo] - - class _AlternativeDetectedLang(TypedDict): language: str score: float diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index 402da4e4..967f0607 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/nlp_text_splitter/__init__.py b/python/AzureTranslation/nlp_text_splitter/__init__.py new file mode 100644 index 00000000..09805b64 --- /dev/null +++ b/python/AzureTranslation/nlp_text_splitter/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. 
#
+#############################################################################
+
+from .text_splitter import TextSplitter, TextSplitterModel
\ No newline at end of file
diff --git a/python/AzureTranslation/nlp_text_splitter/text_splitter.py b/python/AzureTranslation/nlp_text_splitter/text_splitter.py
new file mode 100644
index 00000000..5afaa740
--- /dev/null
+++ b/python/AzureTranslation/nlp_text_splitter/text_splitter.py
@@ -0,0 +1,263 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government   #
+# under contract, and is subject to the Rights in Data-General Clause      #
+# 52.227-14, Alt. IV (DEC 2007).                                           #
+#                                                                           #
+# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2024 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.         #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software      #
+# distributed under the License is distributed on an "AS IS" BASIS,        #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
+# See the License for the specific language governing permissions and      #
+# limitations under the License.                                            #
+#############################################################################
+
+import logging
+import os
+import pkg_resources
+
+import spacy
+from wtpsplit import WtP
+from typing import Callable, List, Optional, Tuple
+
+from .wtp_lang_settings import WtpLanguageSettings
+
+import torch
+
+
+DEFAULT_WTP_MODELS = "/wtp_models"
+
+# Path used if we package model installation with this utility in the future:
+WTP_MODELS_PATH = pkg_resources.resource_filename(
+    __name__, "models"
+)
+
+log = logging.getLogger(__name__)
+
+# These models must have a specified language during sentence splitting.
+WTP_MANDATORY_ADAPTOR = ['wtp-canine-s-1l',
+                         'wtp-canine-s-3l',
+                         'wtp-canine-s-6l',
+                         'wtp-canine-s-9l',
+                         'wtp-canine-s-12l']
+
+GPU_AVAILABLE = torch.cuda.is_available()
+
+
+class TextSplitterModel:
+    # Holds spaCy, WtP, and other potential sentence detection models in cache.
+
+    def __init__(self, model_name: str, model_setting: str, default_lang: str = "en") -> None:
+        self._model_name = ""
+        self._default_lang = default_lang
+        self._mandatory_wtp_language = False
+        # Fallback: treat the whole text as one "sentence" until a model is set.
+        self.split = lambda t, **param: [t]
+        self.update_model(model_name, model_setting, default_lang)
+
+    def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str = "en"):
+        if model_name:
+            if "wtp" in model_name:
+                self._update_wtp_model(model_name, model_setting, default_lang)
+                self.split = self._split_wtp
+                log.info(f"Setup WtP model: {model_name}")
+            else:
+                self._update_spacy_model(model_name)
+                self.split = self._split_spacy
+                log.info(f"Setup spaCy model: {model_name}")
+
+    def _update_wtp_model(self, wtp_model_name: str,
+                          model_setting: str,
+                          default_lang: str) -> None:
+
+        if model_setting == "gpu" or model_setting == "cuda":
+            if GPU_AVAILABLE:
+                model_setting = "cuda"
+            else:
+                log.warning("PyTorch determined that CUDA is not available. "
" + "You may need to update the NVIDIA driver for the host system, " + "or reinstall PyTorch with GPU support by setting " + "ARGS BUILD_TYPE=gpu in the Dockerfile when building this component.") + model_setting = "cpu" + elif model_setting != "cpu": + log.warning("Invalid WtP model setting. Only `cpu` and `cuda` " + "(or `gpu`) WtP model options available at this time. " + "Defaulting to `cpu` mode.") + model_setting = "cpu" + + if wtp_model_name in WTP_MANDATORY_ADAPTOR: + self._mandatory_wtp_language = True + self._default_lang = default_lang + + if self._model_name == wtp_model_name: + log.info(f"Using cached model: {self._model_name}") + else: + self._model_name = wtp_model_name + # Check if model has been downloaded + if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)): + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name) + + elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name)): + + log.info(f"Using downloaded {wtp_model_name} model.") + wtp_model_name = os.path.join(DEFAULT_WTP_MODELS, + wtp_model_name) + + else: + log.warning(f"Model {wtp_model_name} not found, " + "downloading from hugging face.") + + self.wtp_model = WtP(wtp_model_name) + + if model_setting != "cpu" and model_setting != "cuda": + log.warning(f"Invalid setting for WtP runtime {model_setting}. " + "Defaulting to CPU mode.") + model_setting = "cpu" + self.wtp_model.to(model_setting) + + def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]: + if lang: + iso_lang = WtpLanguageSettings.convert_to_iso(lang) + if iso_lang: + return self.wtp_model.split(text, lang_code=iso_lang) + else: + log.warning(f"Language {lang} was not used to train WtP model. " + "If text splitting is not working well with WtP, " + "consider trying spaCy's sentence detection model." + ) + if self._mandatory_wtp_language: + log.warning("WtP model requires a language. " + f"Using default language : {self._default_lang}.") + iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang) + return self.wtp_model.split(text, lang_code=iso_lang) + return self.wtp_model.split(text) + + def _update_spacy_model(self, spacy_model_name: str): + self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"]) + self.spacy_model.enable_pipe("senter") + + def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]: + # TODO: We may add an auto model selection for spaCy in the future. + # However, the drawback is we will also need to + # download a large number of spaCy models beforehand. 
+ processed_text = self.spacy_model(text) + return [sent.text_with_ws for sent in processed_text.sents] + +class TextSplitter: + # Authors: Brian Rosenberg, Howard Huang + + def __init__( + self, text: str, limit: int, num_boundary_chars: int, + get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None) -> None: + self._sentence_model = sentence_model + self._limit = limit + self._num_boundary_chars = num_boundary_chars + self._get_text_size = get_text_size + self._text = "" + self._text_full_size = 0 + self._overhead_size = 0 + self._soft_limit = self._limit + self._in_lang = in_lang + + if text: + self.set_text(text) + + def set_text(self, text: str): + self._text = text + self._text_full_size = self._get_text_size(text) + chars_per_size = len(text) / self._text_full_size + self._overhead_size = self._get_text_size('') + + self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size + + if self._soft_limit <= 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit + # before applying chars_per_size weighting. + self._soft_limit = max(1, + int((self._limit - self._overhead_size) * chars_per_size)) + + def _isolate_largest_section(self, text:str) -> str: + # Using cached word splitting model, isolate largest section of text + string_length = len(text) + + if self._num_boundary_chars <= 0: + num_chars_to_process = string_length + else: + num_chars_to_process = self._num_boundary_chars + + start_indx = max(0, string_length - num_chars_to_process) + substring = text[start_indx: string_length] + substring_list = self._sentence_model.split(substring, lang = self._in_lang) + div_index = string_length - len(substring_list[-1]) + + if div_index==start_indx: + return text + + return text[0:div_index] + + @classmethod + def split(cls, + text: str, limit: int, num_boundary_chars: int, get_text_size: Callable[[str], int], + sentence_model: TextSplitterModel, + in_lang: Optional[str] = None + ): + return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split() + + + def _split(self): + if self._text_full_size <= self._limit: + yield self._text + else: + yield from self._split_internal(self._text) + + def _split_internal(self, text): + right = text + while True: + left, right = self._divide(right) + yield left + if not right: + return + + def _divide(self, text) -> Tuple[str, str]: + limit = self._soft_limit + while True: + left = text[:limit] + left_size = self._get_text_size(left) + + if left_size <= self._limit: + if left != text: + # If dividing into two parts + # Determine soft boundary for left segment + left = self._isolate_largest_section(left) + return left, text[len(left):] + + char_per_size = len(left) / left_size + + + limit = int(self._limit * char_per_size) - self._overhead_size + + if limit < 1: + # Caused by an unusually large overhead relative to text. + # This is unlikely to occur except during testing of small text limits. + # Recalculate soft limit by subtracting overhead from limit before + # applying chars_per_size weighting. 
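+                # Hypothetical numbers: with limit=100, overhead_size=80, and
+                # char_per_size=1.0, the next line yields limit = max(1, 20) = 20.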
+ limit = max(1, int((self._limit - self._overhead_size) * char_per_size)) \ No newline at end of file diff --git a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py new file mode 100644 index 00000000..c682fd3f --- /dev/null +++ b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py @@ -0,0 +1,259 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +from typing import Optional + +class WtpLanguageSettings: + # Supported languages and ISO 639-1, 639-2 codes for WtP models. + # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages + # https://www.loc.gov/standards/iso639-2/php/code_list.php + _wtp_lang_map = { + 'afrikaans': 'af', + 'afr': 'af', + 'amharic': 'am', + 'amh': 'am', + 'arabic': 'ar', + 'ara': 'ar', + 'azerbaijani': 'az', + 'aze': 'az', + 'belarusian': 'be', + 'bel': 'be', + 'bulgarian': 'bg', + 'bul': 'bg', + 'bengali': 'bn', + 'ben': 'bn', + 'catalan': 'ca', + 'valencian': 'ca', + 'cat': 'ca', + 'cebuano': 'ceb', # In some cases, ISO-639-1 is not available, use ISO-639-2 + 'ceb': 'ceb', + 'czech': 'cs', + 'cze': 'cs', + 'ces': 'cs', + 'welsh': 'cy', + 'wel': 'cy', + 'cym': 'cy', + 'danish': 'da', + 'dan': 'da', + 'german': 'de', + 'ger': 'de', + 'deu': 'de', + 'greek': 'el', + 'gre': 'el', + 'ell': 'el', + 'english': 'en', + 'eng': 'en', + 'esperanto': 'eo', + 'epo': 'eo', + 'spanish': 'es', + 'castilian': 'es', + 'spa': 'es', + 'estonian': 'et', + 'est': 'et', + 'basque': 'eu', + 'baq': 'eu', + 'eus': 'eu', + 'persian': 'fa', + 'per': 'fa', + 'fas': 'fa', + 'finnish': 'fi', + 'fin': 'fi', + 'french': 'fr', + 'fre': 'fr', + 'fra': 'fr', + 'western frisian': 'fy', + 'fry': 'fy', + 'irish': 'ga', + 'gle': 'ga', + 'gaelic': 'gd', + 'scottish gaelic': 'gd', + 'gla': 'gd', + 'galician': 'gl', + 'glg': 'gl', + 'gujarati': 'gu', + 'guj': 'gu', + 'hausa': 'ha', + 'hau': 'ha', + 'hebrew': 'he', + 'heb': 'he', + 'hindi': 'hi', + 'hin': 'hi', + 'hungarian': 'hu', + 'hun': 'hu', + 'armenian': 'hy', + 'arm': 'hy', + 'hye': 'hy', + 'indonesian': 'id', + 'ind': 'id', + 'igbo': 'ig', + 'ibo': 'ig', + 'icelandic': 'is', + 'ice': 'is', + 'isl': 'is', + 'italian': 'it', + 'ita': 'it', + 'japanese': 'ja', + 'jpn': 'ja', + 'javanese': 'jv', + 'jav': 'jv', + 'georgian': 'ka', + 'geo': 'ka', + 'kat': 'ka', + 'kazakh': 'kk', + 'kaz': 'kk', + 'central 
khmer': 'km', + 'khm': 'km', + 'kannada': 'kn', + 'kan': 'kn', + 'korean': 'ko', + 'kor': 'ko', + 'kurdish': 'ku', + 'kur': 'ku', + 'kirghiz': 'ky', + 'kyrgyz': 'ky', + 'kir': 'ky', + 'latin': 'la', + 'lat': 'la', + 'lithuanian': 'lt', + 'lit': 'lt', + 'latvian': 'lv', + 'lav': 'lv', + 'malagasy': 'mg', + 'mlg': 'mg', + 'macedonian': 'mk', + 'mac': 'mk', + 'mkd': 'mk', + 'malayalam': 'ml', + 'mal': 'ml', + 'mongolian': 'mn', + 'mon': 'mn', + 'marathi': 'mr', + 'mar': 'mr', + 'malay': 'ms', + 'may': 'ms', + 'msa': 'ms', + 'maltese': 'mt', + 'mlt': 'mt', + 'burmese': 'my', + 'bur': 'my', + 'mya': 'my', + 'nepali': 'ne', + 'nep': 'ne', + 'dutch': 'nl', + 'flemish': 'nl', + 'dut': 'nl', + 'nld': 'nl', + 'norwegian': 'no', + 'nor': 'no', + 'panjabi': 'pa', + 'punjabi': 'pa', + 'pan': 'pa', + 'polish': 'pl', + 'pol': 'pl', + 'pushto': 'ps', + 'pashto': 'ps', + 'pus': 'ps', + 'portuguese': 'pt', + 'por': 'pt', + 'romanian': 'ro', + 'moldavian': 'ro', + 'moldovan': 'ro', + 'rum': 'ro', + 'ron': 'ro', + 'russian': 'ru', + 'rus': 'ru', + 'sinhala': 'si', + 'sinhalese': 'si', + 'sin': 'si', + 'slovak': 'sk', + 'slo': 'sk', + 'slk': 'sk', + 'slovenian': 'sl', + 'slv': 'sl', + 'albanian': 'sq', + 'alb': 'sq', + 'sqi': 'sq', + 'serbian': 'sr', + 'srp': 'sr', + 'swedish': 'sv', + 'swe': 'sv', + 'tamil': 'ta', + 'tam': 'ta', + 'telugu': 'te', + 'tel': 'te', + 'tajik': 'tg', + 'tgk': 'tg', + 'thai': 'th', + 'tha': 'th', + 'turkish': 'tr', + 'tur': 'tr', + 'ukrainian': 'uk', + 'ukr': 'uk', + 'urdu': 'ur', + 'urd': 'ur', + 'uzbek': 'uz', + 'uzb': 'uz', + 'vietnamese': 'vi', + 'vie': 'vi', + 'xhosa': 'xh', + 'xho': 'xh', + 'yiddish': 'yi', + 'yid': 'yi', + 'yoruba': 'yo', + 'yor': 'yo', + 'chinese': 'zh', + 'chi': 'zh', + 'zho': 'zh', + 'zulu': 'zu', + 'zul': 'zu', + 'hans':'zh', # Also check for chinese scripts + 'hant': 'zh', + 'cmn':'zh' # In some cases we use 'cmn' = 'Mandarin' + } + + _wtp_iso_set = set(_wtp_lang_map.values()) + + @classmethod + def convert_to_iso(cls, lang: str) -> Optional[str]: + # ISO 639-2 (language) is sometimes paired with ISO 15924 (script). + # Extract the language portion and check if supported in WtP. + if not lang: + return None + + if '-' in lang: + lang = lang.split('-')[0] + if '_' in lang: + lang = lang.split('_')[0] + + lang = lang.strip().lower() + + if lang in cls._wtp_iso_set: + return lang + + if lang in cls._wtp_lang_map: + return cls._wtp_lang_map[lang] + + return None diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 24dd014f..918f09e6 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -80,6 +80,36 @@ "description": "Comma-separated list of property names indicating which properties in the feed-forward track or detection determine the language from which to translate. If the first property listed is present, then that property will be used. If it's not, then the next property in the list is considered. If none are present, fall back to FROM_LANGUAGE.", "type": "STRING", "defaultValue": "ISO_LANGUAGE,DECODED_LANGUAGE,LANGUAGE" + }, + { + "name": "SENTENCE_SPLITTER_CHAR_COUNT", + "description": "Integer value specifying maximum number of characters to process through sentence splitter.", + "type": "INT", + "defaultValue": "500" + }, + { + "name": "SENTENCE_MODEL", + "description": "Name of sentence segmentation model. 
Supported options are spaCy's multilingual `xx_sent_ud_sm` model and the Where's the Point (WtP) `wtp-bert-mini` model.", + "type": "STRING", + "defaultValue": "wtp-bert-mini" + }, + { + "name": "SENTENCE_MODEL_CPU_ONLY", + "description": "If set to true, only use CPU resources for the sentence detection model. If set to False, allow sentence model to also use GPU resources (for Docker deployments, please consult README for more info).", + "type": "BOOLEAN", + "defaultValue": "TRUE" + }, + { + "name": "SENTENCE_MODEL_WTP_DEFAULT_ADAPTOR_LANGUAGE", + "description": "More advanced WTP models will require a target language. This property sets the default language to use for sentence splitting, unless `FROM_LANGUAGE`, `SUGGESTED_FROM_LANGUAGE`, or Azure language detection return a different, WtP-supported language option.", + "type": "STRING", + "defaultValue": "en" + }, + { + "name": "SENTENCE_SPLITTER_INCLUDE_INPUT_LANG", + "description": "Specifies whether to pass input language to sentence splitter algorithm. Currently, only WtP supports model adjustments by input language.", + "type": "BOOLEAN", + "defaultValue": "TRUE" } ] } @@ -132,4 +162,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/python/AzureTranslation/pyproject.toml b/python/AzureTranslation/pyproject.toml index 52c60148..bcd2b658 100644 --- a/python/AzureTranslation/pyproject.toml +++ b/python/AzureTranslation/pyproject.toml @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # diff --git a/python/AzureTranslation/sample_acs_translator.py b/python/AzureTranslation/sample_acs_translator.py index 0a31a121..628c044f 100644 --- a/python/AzureTranslation/sample_acs_translator.py +++ b/python/AzureTranslation/sample_acs_translator.py @@ -7,11 +7,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. 
# @@ -29,6 +29,7 @@ import sys from acs_translation_component import TranslationClient +from nlp_text_splitter import TextSplitterModel def main(): @@ -40,10 +41,13 @@ def main(): detection_props = dict(TEXT=text) job_props = dict(TO_LANGUAGE=to_lang, ACS_URL=acs_url, ACS_SUBSCRIPTION_KEY=acs_subscription_key) - TranslationClient(job_props).add_translations(detection_props) + + wtp_model = TextSplitterModel("wtp-bert-mini", "cpu") + TranslationClient(job_props, wtp_model).add_translations(detection_props) print('TRANSLATION SOURCE LANGUAGE:', detection_props['TRANSLATION SOURCE LANGUAGE']) - print('TRANSLATION SOURCE LANGUAGE CONFIDENCE:', detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']) + print('TRANSLATION SOURCE LANGUAGE CONFIDENCE:', + detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']) print('TRANSLATION:') print(detection_props['TRANSLATION']) diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg index 36a5e35d..b3fa4200 100644 --- a/python/AzureTranslation/setup.cfg +++ b/python/AzureTranslation/setup.cfg @@ -5,11 +5,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -29,11 +29,14 @@ name = AzureTranslation version = 8.0 [options] -packages = acs_translation_component +packages = find: install_requires = mpf_component_api>=8.0 mpf_component_util>=8.0 langcodes + spacy>=3.7.4 + wtpsplit>=1.3.0 + torch>=2.2.0 [options.entry_points] mpf.exported_component = diff --git a/python/AzureTranslation/tests/data/NOTICE b/python/AzureTranslation/tests/data/NOTICE index 944aa424..9fc41f18 100644 --- a/python/AzureTranslation/tests/data/NOTICE +++ b/python/AzureTranslation/tests/data/NOTICE @@ -1,4 +1,4 @@ -# break-sentence/art-of-war.txt +# split-sentence/art-of-war.txt Contains the beginning of "The Art of War" by Sunzi in Traditional Chinese. Public Domain https://www.gutenberg.org/ebooks/12407 diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json deleted file mode 100644 index e91c2f42..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-1.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law. 
Taoists, so that the people agree, can die with it, can live with it, not dangerous also; heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json deleted file mode 100644 index 94ee875c..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-2.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers? A soldier's practice? Reward and punishment? I know the winner or loser in this way. Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. The powerful, for profit and power also. Soldiers, trickery too.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json deleted file mode 100644 index e39940c0..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/art-of-war-translation-3.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Therefore, can show can not, use and show not use, near and far, far and near. To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. The victory of this soldier cannot be passed on first. The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! 
I see it this way, win or lose.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json deleted file mode 100644 index 2cc23f6f..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-1.json +++ /dev/null @@ -1,9 +0,0 @@ -[ - { - "sentLen": [ - 24, - 37, - 81 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json deleted file mode 100644 index b937f291..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-2.json +++ /dev/null @@ -1,19 +0,0 @@ -[ - { - "sentLen": [ - 22, - 18, - 5, - 5, - 5, - 5, - 5, - 5, - 8, - 27, - 15, - 10, - 7 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json deleted file mode 100644 index 6c796889..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-break-sentence-3.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - { - "sentLen": [ - 27, - 50, - 12, - 28, - 13, - 11 - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json deleted file mode 100644 index 5ab5c472..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-2.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Taoists, so that the people agree, can die with it, can live with it, not dangerous also; heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json deleted file mode 100644 index 95ebeb51..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-3.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers? A soldier's practice? Reward and punishment? I know the winner or loser in this way.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json deleted file mode 100644 index 4cdf8adb..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-4.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. 
The powerful, for profit and power also. Soldiers, trickery too.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json deleted file mode 100644 index ab6da980..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-5.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "Therefore, can show can not, use and show not use, near and far, far and near. To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. The victory of this soldier cannot be passed on first.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json b/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json deleted file mode 100644 index 6b8ddc60..00000000 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-6.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - { - "translations": [ - { - "text": "The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! I see it this way, win or lose.", - "to": "en" - } - ] - } -] diff --git a/python/AzureTranslation/tests/data/invalid-lang-detect-result.json b/python/AzureTranslation/tests/data/invalid-lang-detect-result.json new file mode 100644 index 00000000..7fcfc64b --- /dev/null +++ b/python/AzureTranslation/tests/data/invalid-lang-detect-result.json @@ -0,0 +1,8 @@ +[ + { + "language": "fake-lang", + "score": 1.0, + "isTranslationSupported": true, + "isTransliterationSupported": true + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-1.json similarity index 66% rename from python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json rename to python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-1.json index f143b3a7..8d7d5f2e 100644 --- a/python/AzureTranslation/tests/data/break-sentence/with-guessing/art-of-war-translation-1.json +++ b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-1.json @@ -2,9 +2,9 @@ { "translations": [ { - "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law.", + "text": "Soldiers, great things of the country, the land of death, the way of survival, can not be ignored also. Therefore, the five things, the school to count, and the feelings: one road, two days, three earths, four will, five law. 
Taoists, so that the people agree, can die with it, can live with it, not dangerous also;", "to": "en" } ] } -] +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-2.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-2.json new file mode 100644 index 00000000..5a7b0316 --- /dev/null +++ b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-2.json @@ -0,0 +1,10 @@ +[ + { + "translations": [ + { + "text": "heaven, yin and yang, cold and summer, time system also; earth, far and near, dangerous, narrow, dead also; generals, wisdom, faith, benevolence, courage, strict also; law, music, official, main use also. Where these five, will not hear, know the winner, do not know the winner. So the school to count, and ask for its feelings, : The Lord has a way? Will you be able to? Heaven and earth? The law? Soldiers?", + "to": "en" + } + ] + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-3.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-3.json new file mode 100644 index 00000000..a1216f22 --- /dev/null +++ b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-3.json @@ -0,0 +1,10 @@ +[ + { + "translations": [ + { + "text": "A soldier's practice? Reward and punishment? I know the winner or loser in this way. Will listen to my plan, use it will win, stay, will not listen to my plan, use it will lose, go. Profit to listen, is the trend, to the outside. The powerful, for profit and power also. Soldiers, trickery too. Therefore, can show can not, use and show not use, near and far, far and near.", + "to": "en" + } + ] + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-4.json b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-4.json new file mode 100644 index 00000000..8fe82471 --- /dev/null +++ b/python/AzureTranslation/tests/data/split-sentence/art-of-war-translation-4.json @@ -0,0 +1,10 @@ +[ + { + "translations": [ + { + "text": "To be tempted, to take it in disorder, to be prepared, to be strong and to avoid, to be angry and scratched, to be humble and proud, to work, to leave, to attack it un prepared, to be satisfactory. The victory of this soldier cannot be passed on first. The husband does not fight and the temple counts the winner, the more also; More odds, less chances, but nothing! 
I see it this way, win or lose.", + "to": "en" + } + ] + } +] \ No newline at end of file diff --git a/python/AzureTranslation/tests/data/break-sentence/art-of-war.txt b/python/AzureTranslation/tests/data/split-sentence/art-of-war.txt similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-of-war.txt rename to python/AzureTranslation/tests/data/split-sentence/art-of-war.txt diff --git a/python/AzureTranslation/tests/data/break-sentence/art-war-translation.txt b/python/AzureTranslation/tests/data/split-sentence/art-war-translation.txt similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/art-war-translation.txt rename to python/AzureTranslation/tests/data/split-sentence/art-war-translation.txt diff --git a/python/AzureTranslation/tests/data/break-sentence/break-sentence-art-of-war-results.json b/python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json similarity index 100% rename from python/AzureTranslation/tests/data/break-sentence/break-sentence-art-of-war-results.json rename to python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 83eac939..c871fcdb 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -6,11 +6,11 @@ # under contract, and is subject to the Rights in Data-General Clause # # 52.227-14, Alt. IV (DEC 2007). # # # -# Copyright 2023 The MITRE Corporation. All Rights Reserved. # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # ############################################################################# ############################################################################# -# Copyright 2023 The MITRE Corporation # +# Copyright 2024 The MITRE Corporation # # # # Licensed under the Apache License, Version 2.0 (the "License"); # # you may not use this file except in compliance with the License. # @@ -42,14 +42,16 @@ import mpf_component_api as mpf sys.path.insert(0, str(pathlib.Path(__file__).parent.parent)) +from nlp_text_splitter import TextSplitterModel, TextSplitter from acs_translation_component.acs_translation_component import (AcsTranslationComponent, get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints, - AcsTranslateUrlBuilder, BreakSentenceClient, SentenceBreakGuesser, get_n_azure_chars) + AcsTranslateUrlBuilder, get_n_azure_chars) from acs_translation_component.convert_language_code import iso_to_bcp - - +# Set to true to test the WtP canine-s-1l model locally +# Note, this will download ~1 GB to your local storage. +LOCAL_TEST_WTP_MODEL = False SEEN_TRACE_IDS = set() CHINESE_SAMPLE_TEXT = '你好,你叫什么名字?' 
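For context, the TextSplitterModel used throughout these tests wraps two sentence-segmentation backends: WtP (via the wtpsplit package) and spaCy's multilingual xx_sent_ud_sm pipeline. A minimal sketch of the equivalent direct library calls follows; the wrapper's internals are not shown in this diff, so treat this as illustrative only (model names match those installed in the Dockerfile):

    import spacy
    from wtpsplit import WtP

    text = 'Hello, what is your name? My name is John.'

    # WtP: a neural multilingual sentence-boundary model; it can split
    # text even when terminal punctuation is missing or non-Latin.
    wtp = WtP('wtp-bert-mini')  # or a local path, e.g. the /wtp_models clone
    print(wtp.split(text))

    # spaCy: a lightweight UD-based multilingual sentence segmenter.
    # Requires: python -m spacy download xx_sent_ud_sm
    nlp = spacy.load('xx_sent_ud_sm')
    print([sent.text_with_ws for sent in nlp(text).sents])
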
@@ -65,10 +67,17 @@ class TestAcsTranslation(unittest.TestCase): mock_server: ClassVar['MockServer'] + wtp_model: ClassVar['TextSplitterModel'] + spacy_model: ClassVar['TextSplitterModel'] @classmethod def setUpClass(cls): cls.mock_server = MockServer() + cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en") + if LOCAL_TEST_WTP_MODEL: + cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "zh") + cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en") + @classmethod def tearDownClass(cls): @@ -113,9 +122,11 @@ def validate_results(results): result.detection_properties['TRANSLATION']) self.assertEqual('EN', result.detection_properties['TRANSLATION TO LANGUAGE']) - self.assertEqual('zh-Hans', result.detection_properties['TRANSLATION SOURCE LANGUAGE']) + self.assertEqual('zh-Hans', + result.detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( - 1.0, float(result.detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + 1.0, + float(result.detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) self.assertNotIn('SKIPPED TRANSLATION', result.detection_properties) detect_request_body = self.get_request_body() @@ -195,29 +206,31 @@ def test_video_job(self): self.assertEqual(CHINESE_SAMPLE_TEXT, - result.frame_locations[0].detection_properties['TEXT']) + result.frame_locations[0].detection_properties['TEXT']) self.assertEqual(CHINESE_SAMPLE_TEXT_ENG_TRANSLATE, - result.frame_locations[0].detection_properties['TRANSLATION']) + result.frame_locations[0].detection_properties['TRANSLATION']) self.assertEqual('EN', - result.frame_locations[0].detection_properties['TRANSLATION TO LANGUAGE']) + result.frame_locations[0].detection_properties['TRANSLATION TO LANGUAGE']) self.assertEqual('zh-Hans', - result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) + result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( 1.0, - float(result.frame_locations[0].detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + float(result.frame_locations[0]\ + .detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) self.assertEqual(SPANISH_SAMPLE_TEXT, - result.frame_locations[1].detection_properties['TEXT']) + result.frame_locations[1].detection_properties['TEXT']) self.assertEqual(SPANISH_SAMPLE_TEXT_ENG_TRANSLATE, - result.frame_locations[1].detection_properties['TRANSLATION']) + result.frame_locations[1].detection_properties['TRANSLATION']) self.assertEqual('EN', - result.frame_locations[1].detection_properties['TRANSLATION TO LANGUAGE']) + result.frame_locations[1].detection_properties['TRANSLATION TO LANGUAGE']) self.assertEqual('es', - result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) + result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE']) self.assertAlmostEqual( 1.0, - float(result.frame_locations[1].detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) + float(result.frame_locations[1]\ + .detection_properties['TRANSLATION SOURCE LANGUAGE CONFIDENCE'])) request_body1 = self.get_request_body() self.assertEqual(1, len(request_body1)) @@ -273,14 +286,14 @@ def test_detect_lang_disabled(self): def test_no_feed_forward_location(self): job = mpf.ImageJob('Test', 'test.jpg', get_test_properties(), {}) with self.assertRaises(mpf.DetectionException) as cm: - list(AcsTranslationComponent.get_detections_from_image(job)) + list(AcsTranslationComponent().get_detections_from_image(job)) 
self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) def test_no_feed_forward_track(self): job = mpf.VideoJob('test', 'test.jpg', 0, 1, get_test_properties(), {}) with self.assertRaises(mpf.DetectionException) as cm: - list(AcsTranslationComponent.get_detections_from_video(job)) + list(AcsTranslationComponent().get_detections_from_video(job)) self.assertEqual(mpf.DetectionError.UNSUPPORTED_DATA_TYPE, cm.exception.error_code) @@ -296,7 +309,7 @@ def test_reports_error_when_server_error(self, _): ff_track) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_video(job) + AcsTranslationComponent().get_detections_from_video(job) self.assertEqual(mpf.DetectionError.NETWORK_ERROR, cm.exception.error_code) @@ -307,14 +320,14 @@ def test_reports_error_when_missing_acs_props(self): del test_props['ACS_URL'] job = mpf.ImageJob('Test', 'test.jpg', test_props, {}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) test_props = get_test_properties() del test_props['ACS_SUBSCRIPTION_KEY'] job = mpf.ImageJob('Test', 'test.jpg', test_props, {}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) @@ -327,7 +340,7 @@ def test_missing_required_properties(self): job = mpf.ImageJob('Test', 'test.jpg', test_props, {}, ff_loc) with self.assertRaises(mpf.DetectionException) as cm: - AcsTranslationComponent.get_detections_from_image(job) + AcsTranslationComponent().get_detections_from_image(job) self.assertEqual(mpf.DetectionError.MISSING_PROPERTY, cm.exception.error_code) @@ -376,7 +389,7 @@ def test_translation_cache(self): job = mpf.VideoJob('test', 'test.jpg', 0, 1, get_test_properties(), {}, ff_track) - results = list(AcsTranslationComponent.get_detections_from_video(job)) + results = list(AcsTranslationComponent().get_detections_from_video(job)) self.assertEqual(1, len(results)) result = results[0] @@ -518,110 +531,238 @@ def assert_expected_url(job_properties, expected_to, expected_from, expected_que 'en', None, {'suggestedFrom': 'ru', 'category': 'whatever'}) - - - @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 150) - def test_split_text(self, _): + @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150) + def test_split_wtp_known_language(self, _): self.set_results_file('traditional-chinese-detect-result.json') - self.set_results_file('break-sentence/break-sentence-art-of-war-results.json') - self.set_results_file('break-sentence/art-of-war-translation-1.json') - self.set_results_file('break-sentence/art-of-war-translation-2.json') - self.set_results_file('break-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-1.json') + self.set_results_file('split-sentence/art-of-war-translation-2.json') + self.set_results_file('split-sentence/art-of-war-translation-3.json') + self.set_results_file('split-sentence/art-of-war-translation-4.json') - text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() + text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() detection_props = dict(TEXT=text) - 
TranslationClient(get_test_properties()).add_translations(detection_props)
+        TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props)
 
         self.assertEqual(5, len(detection_props))
         self.assertEqual(text, detection_props['TEXT'])
 
-        expected_translation = (TEST_DATA / 'break-sentence/art-war-translation.txt') \
+        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
             .read_text().strip()
         self.assertEqual(expected_translation, detection_props['TRANSLATION'])
 
         self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
 
         self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE'])
-        self.assertAlmostEqual(1.0, float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
+        self.assertAlmostEqual(1.0,
+                               float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
 
         detect_request_text = self.get_request_body()[0]['Text']
-        self.assertEqual(text, detect_request_text)
-
-        break_sentence_url, break_sentence_response = self.get_request()
-        self.assertIn('language=zh-Hant', break_sentence_url)
-        break_sentence_request_text = break_sentence_response[0]['Text']
-
-        self.assertNotIn('\n', break_sentence_request_text, 'Newlines were not properly removed')
-        self.assertNotIn(' ', break_sentence_request_text,
-                         'Spaces should not be added to Chinese text.')
-
-        expected_chunk_lengths = [142, 137, 141]
-        self.assertEqual(sum(expected_chunk_lengths), len(break_sentence_request_text))
+        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
+        expected_chunk_lengths = [86, 116, 104, 114]
+        self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n', '')))
 
         translation_request1 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
         self.assertTrue(translation_request1.startswith('兵者,'))
-        self.assertTrue(translation_request1.endswith('主用也。'))
+        self.assertTrue(translation_request1.endswith('而不危也;'))
+        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
+        self.assertNotIn('\n', translation_request1,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request1,
+                         'Spaces should not be added to Chinese text.')
+
         translation_request2 = self.get_request_body()[0]['Text']
         self.assertEqual(expected_chunk_lengths[1], len(translation_request2))
-        self.assertTrue(translation_request2.startswith('凡此五'))
-        self.assertTrue(translation_request2.endswith('詭道也。'))
+        self.assertTrue(translation_request2.startswith('天者,陰陽'))
+        self.assertTrue(translation_request2.endswith('兵眾孰強?'))
+        self.assertNotIn('\n', translation_request2,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request2,
+                         'Spaces should not be added to Chinese text.')
+
         translation_request3 = self.get_request_body()[0]['Text']
         self.assertEqual(expected_chunk_lengths[2], len(translation_request3))
-        self.assertTrue(translation_request3.startswith('故能而'))
-        self.assertTrue(translation_request3.endswith('勝負見矣。'))
+        self.assertTrue(translation_request3.startswith('士卒孰練?'))
+        self.assertTrue(translation_request3.endswith('遠而示之近。'))
+        self.assertNotIn('\n', translation_request3,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request3,
+                         'Spaces should not be added to Chinese text.')
+
+        translation_request4 = self.get_request_body()[0]['Text']
+        self.assertEqual(expected_chunk_lengths[3], len(translation_request4))
+        self.assertTrue(translation_request4.startswith('利而誘之,'))
+        self.assertTrue(translation_request4.endswith('勝負見矣。'))
+        self.assertNotIn('\n', translation_request4,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request4,
+                         'Spaces should not be added to Chinese text.')
+
+
+    def test_split_engine_difference(self):
+        # Note: we can only use the WtP models for the subsequent tests
+        # involving Chinese text because only WtP's multilingual models
+        # can detect some of the '。' characters used for this language.
+        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
+
+        text_without_newlines = text.replace('\n', '')
+
+        actual = self.wtp_model._split_wtp(text_without_newlines)
+        self.assertEqual(3, len(actual))
+        for line in actual:
+            self.assertTrue(line.endswith('。'))
+        actual = self.spacy_model._split_spacy(text_without_newlines)
+        self.assertEqual(1, len(actual))
 
-    @mock.patch.object(BreakSentenceClient, 'TRANSLATION_MAX_CHARS', new_callable=lambda: 100)
-    @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 150)
-    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 20)
-    def test_guess_break_with_break_sentence(self, _, __, ___):
+        # However, WtP prefers newlines over the '。' character.
+        actual = self.wtp_model._split_wtp(text)
+        self.assertEqual(10, len(actual))
+
+    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
+    def test_split_wtp_advanced_known_language(self, _):
+        # This test should only be run manually, outside of a Docker build.
+        # The WtP canine model is ~1 GB and not worth downloading and
+        # adding to the pre-built Docker image.
+        if not LOCAL_TEST_WTP_MODEL:
+            return
+
+        # For this test we're mainly interested in the changes in behavior
+        # caused by the WtP split, so the translation files are just placeholders.
         self.set_results_file('traditional-chinese-detect-result.json')
-        for i in range(1, 4):
-            self.set_results_file(
-                f'break-sentence/with-guessing/art-of-war-break-sentence-{i}.json')
-        for i in range(1, 7):
-            self.set_results_file(f'break-sentence/with-guessing/art-of-war-translation-{i}.json')
+        self.set_results_file('split-sentence/art-of-war-translation-1.json')
+        self.set_results_file('split-sentence/art-of-war-translation-2.json')
+        self.set_results_file('split-sentence/art-of-war-translation-3.json')
+        self.set_results_file('split-sentence/art-of-war-translation-4.json')
 
-        text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text()
+        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
         detection_props = dict(TEXT=text)
-        TranslationClient(get_test_properties()).add_translations(detection_props)
+        TranslationClient(get_test_properties(SENTENCE_MODEL="wtp-canine-s-1l"),
+                          self.wtp_adv_model).add_translations(detection_props)
 
         self.assertEqual(5, len(detection_props))
         self.assertEqual(text, detection_props['TEXT'])
 
-        expected_translation = (TEST_DATA / 'break-sentence/art-war-translation.txt') \
+        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
             .read_text().strip()
         self.assertEqual(expected_translation, detection_props['TRANSLATION'])
         self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
 
-        self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE'])
         self.assertAlmostEqual(1.0,
-            float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
+                               float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
 
         detect_request_text = self.get_request_body()[0]['Text']
-        self.assertEqual(text[:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
-
-        for i in range(3):
-            break_sentence_url, break_sentence_request = self.get_request()
-            self.assertIn('language=zh-Hant', break_sentence_url)
-            break_sentence_request_text = break_sentence_request[0]['Text']
-            self.assertNotIn('\n', break_sentence_request_text,
-                             'Newlines were not properly removed')
-            self.assertNotIn(' ', break_sentence_request_text,
-                             'Spaces should not be added to Chinese text.')
-            self.assertEqual('。', break_sentence_request_text[-1])
-
-        for i in range(6):
-            translate_url, translate_request = self.get_request()
-            self.assertIn('from=zh-Hant', translate_url)
-            translate_request_text = translate_request[0]['Text']
-            self.assertNotIn('\n', translate_request_text,
-                             'Newlines were not properly removed')
-            self.assertNotIn(' ', translate_request_text,
-                             'Spaces should not be added to Chinese text.')
-            self.assertEqual('。', translate_request_text[-1])
+        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
+
+        # Main test starts here:
+        expected_chunk_lengths = [61, 150, 61, 148]
+        self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n', '')))
+        translation_request1 = self.get_request_body()[0]['Text']
+        self.assertTrue(translation_request1.startswith('兵者,'))
+        self.assertTrue(translation_request1.endswith('四曰將,五曰法。'))
+        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
+        self.assertNotIn('\n', translation_request1,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request1,
+                         'Spaces should not be added to Chinese text.')
+
+        translation_request2 = self.get_request_body()[0]['Text']
+        self.assertEqual(expected_chunk_lengths[1], len(translation_request2))
+        self.assertTrue(translation_request2.startswith('道者,令民於上同意'))
+        self.assertTrue(translation_request2.endswith('賞罰孰明'))
+        self.assertNotIn('\n', translation_request2,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request2,
+                         'Spaces should not be added to Chinese text.')
+
+        translation_request3 = self.get_request_body()[0]['Text']
+        self.assertEqual(expected_chunk_lengths[2], len(translation_request3))
+        self.assertTrue(translation_request3.startswith('?吾以此知勝'))
+        self.assertTrue(translation_request3.endswith('因利而制權也。'))
+        self.assertNotIn('\n', translation_request3,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request3,
+                         'Spaces should not be added to Chinese text.')
+
+        translation_request4 = self.get_request_body()[0]['Text']
+        self.assertEqual(expected_chunk_lengths[3], len(translation_request4))
+        self.assertTrue(translation_request4.startswith('兵者,詭道也。'))
+        self.assertTrue(translation_request4.endswith('勝負見矣。'))
+        self.assertNotIn('\n', translation_request4,
+                         'Newlines were not properly removed')
+        self.assertNotIn(' ', translation_request4,
+                         'Spaces should not be added to Chinese text.')
+
+    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
+    def test_split_wtp_unknown_lang(self, _):
+        # Check that the text splitter handles an unknown detected
+        # language without issue.
+        self.set_results_file('invalid-lang-detect-result.json')
+        self.set_results_file('split-sentence/art-of-war-translation-1.json')
+        self.set_results_file('split-sentence/art-of-war-translation-2.json')
+        self.set_results_file('split-sentence/art-of-war-translation-3.json')
+        self.set_results_file('split-sentence/art-of-war-translation-4.json')
+
+        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
+        detection_props = dict(TEXT=text)
+        TranslationClient(get_test_properties(), self.wtp_model).add_translations(detection_props)
+
+        self.assertEqual(5, len(detection_props))
+        self.assertEqual(text, detection_props['TEXT'])
+
+        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
+            .read_text().strip()
+        self.assertEqual(expected_translation, detection_props['TRANSLATION'])
+        self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
+
+        self.assertEqual('fake-lang', detection_props['TRANSLATION SOURCE LANGUAGE'])
+        self.assertAlmostEqual(1.0,
+                               float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
+
+        detect_request_text = self.get_request_body()[0]['Text']
+        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
+
+        expected_chunk_lengths = [88, 118, 116, 106]
+        self.assertEqual(sum(expected_chunk_lengths), len(text))
+
+        # Because the detected language is not recognized, newlines are
+        # replaced with spaces instead of being removed as they would be
+        # for Chinese text. The extra whitespace changes where WtP
+        # splits sentences.
+ translation_request1 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[0], len(translation_request1)) + self.assertTrue(translation_request1.startswith('兵者,')) + self.assertTrue(translation_request1.endswith('而不危也;')) + self.assertNotIn('\n', translation_request1, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request1, + 'Spaces should be kept due to incorrect language detection.') + + translation_request2 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[1], len(translation_request2)) + self.assertTrue(translation_request2.startswith('天者,陰陽')) + self.assertTrue(translation_request2.endswith('兵眾孰強?')) + self.assertNotIn('\n', translation_request2, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request2, + 'Spaces should be kept due to incorrect language detection.') + + translation_request3 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[2], len(translation_request3)) + self.assertTrue(translation_request3.startswith('士卒孰練?')) + self.assertTrue(translation_request3.endswith('亂而取之, ')) + self.assertNotIn('\n', translation_request3, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request3, + 'Spaces should be kept due to incorrect language detection.') + + translation_request4 = self.get_request_body()[0]['Text'] + self.assertEqual(expected_chunk_lengths[3], len(translation_request4)) + self.assertTrue(translation_request4.startswith('實而備之,')) + self.assertTrue(translation_request4.endswith('勝負見矣。 ')) + self.assertNotIn('\n', translation_request4, + 'Newlines were not properly removed') + self.assertIn(' ', translation_request4, + 'Spaces should be kept due to incorrect language detection.') def test_newline_removal(self): @@ -631,7 +772,7 @@ def replace(text): self.set_results_file('results-chinese.json') props = get_test_properties(DETECT_BEFORE_TRANSLATE='FALSE') - TranslationClient(props).add_translations(dict(TEXT=text)) + TranslationClient(props, self.wtp_model).add_translations(dict(TEXT=text)) return self.get_request_body()[0]['Text'] with self.subTest('English'): @@ -740,7 +881,7 @@ def test_job_prop_overrides_from_lang(self): def test_chinese_japanese_char_detection(self): - art_of_war_text = (TEST_DATA / 'break-sentence/art-of-war.txt').read_text() + art_of_war_text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text() self.assertTrue(all(ChineseAndJapaneseCodePoints.check_char(ch) for ch in art_of_war_text if not ch.isspace())) @@ -781,7 +922,7 @@ def test_category_and_explicit_from_language(self): get_test_properties(FROM_LANGUAGE='zh-Hans', CATEGORY='My category'), {}, ff_loc) - results = list(AcsTranslationComponent.get_detections_from_image(job)) + results = list(AcsTranslationComponent().get_detections_from_image(job)) self.assertEqual(1, len(results)) result = results[0] @@ -816,7 +957,7 @@ def test_suggested_from(self): ff_loc = mpf.ImageLocation(0, 0, 10, 20, -1, dict(TEXT=input_text)) props = get_test_properties(SUGGESTED_FROM_LANGUAGE='ja', DETECT_BEFORE_TRANSLATE='false') job = mpf.ImageJob('Test', 'test.jpg', props, {}, ff_loc) - results = list(AcsTranslationComponent.get_detections_from_image(job)) + results = list(AcsTranslationComponent().get_detections_from_image(job)) self.assertEqual(1, len(results)) result = results[0] @@ -838,53 +979,98 @@ def test_suggested_from(self): self.assertEqual(['en'], query_dict['to']) - @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', 
new_callable=lambda: 5)
-    def test_guess_breaks_all_types(self, _):
-        input_text = 'a.bc,d.efg,hij kl\n\nmnopqrs.tu'
-        actual = list(SentenceBreakGuesser.guess_breaks(input_text))
+    def test_guess_split_simple_sentence(self):
+        input_text = 'Hello, what is your name? My name is John.'
+        actual = list(TextSplitter.split(input_text,
+                                         28,
+                                         28,
+                                         get_azure_char_count,
+                                         self.wtp_model))
 
         self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(7, len(actual))
-
-        # a.bc,
-        self.assertEqual('a.', actual[0])
-        # bc,d.
-        self.assertEqual('bc,d.', actual[1])
-        # efg,h
-        self.assertEqual('efg,', actual[2])
-        # hij k
-        self.assertEqual('hij ', actual[3])
-        # kl\n\nm
-        self.assertEqual('kl\n\n', actual[4])
-        # mnopq
-        self.assertEqual('mnopq', actual[5])
-        # rs.tu
-        self.assertEqual('rs.tu', actual[6], 'Should not divide final segment of text.')
-
-
-    @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 20)
-    def test_guess_breaks_actual_sentence(self, _):
+        self.assertEqual(2, len(actual))
+
+        # "Hello, what is your name? "
+        self.assertEqual('Hello, what is your name? ', actual[0])
+        # "My name is John."
+        self.assertEqual('My name is John.', actual[1])
+
         input_text = 'Hello, what is your name? My name is John.'
-        actual = list(SentenceBreakGuesser.guess_breaks(input_text))
+        actual = list(TextSplitter.split(input_text,
+                                         28,
+                                         28,
+                                         get_azure_char_count,
+                                         self.spacy_model))
 
         self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(3, len(actual))
+        self.assertEqual(2, len(actual))
 
-        # "Hello, what is your "
-        self.assertEqual('Hello,', actual[0])
-        # " what is your name? "
-        self.assertEqual(' what is your name?', actual[1])
-        # " My name is John."
-        self.assertEqual(' My name is John.', actual[2])
+        # "Hello, what is your name? "
+        self.assertEqual('Hello, what is your name? ', actual[0])
+        # "My name is John."
+        self.assertEqual('My name is John.', actual[1])
+
 
-    @mock.patch.object(BreakSentenceClient, 'BREAK_SENTENCE_MAX_CHARS', new_callable=lambda: 20)
-    def test_sentence_end_punctuation(self, _):
+    def test_split_sentence_end_punctuation(self):
         input_text = 'Hello. How are you? asdfasdf'
-        actual = list(SentenceBreakGuesser.guess_breaks(input_text))
+        actual = list(TextSplitter.split(input_text,
+                                         20,
+                                         10,
+                                         get_azure_char_count,
+                                         self.wtp_model))
+
+        self.assertEqual(input_text, ''.join(actual))
+        self.assertEqual(2, len(actual))
+
+        self.assertEqual('Hello. How are you? ', actual[0])
+        self.assertEqual('asdfasdf', actual[1])
+
+        actual = list(TextSplitter.split(input_text,
+                                         20,
+                                         10,
+                                         get_azure_char_count,
+                                         self.spacy_model))
+
         self.assertEqual(input_text, ''.join(actual))
         self.assertEqual(2, len(actual))
-        self.assertEqual('Hello. How are you?', actual[0])
-        self.assertEqual(' asdfasdf', actual[1])
+        self.assertEqual('Hello. How are you? ', actual[0])
+        self.assertEqual('asdfasdf', actual[1])
+
+
+    def test_guess_split_edge_cases(self):
+        input_text = ("This is a sentence (Dr.Test). Is this,"
+                      " a sentence as well? Maybe...maybe not?"
+                      " \n All done, I think!")
+
+        # Split using WtP model.
+        actual = list(TextSplitter.split(input_text,
+                                         30,
+                                         30,
+                                         get_azure_char_count,
+                                         self.wtp_model))
+
+        self.assertEqual(input_text, ''.join(actual))
+        self.assertEqual(4, len(actual))
+
+        # WtP should detect and split out each sentence.
+        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
+        self.assertEqual("Is this, a sentence as well? ", actual[1])
+        self.assertEqual("Maybe...maybe not? \n ", actual[2])
\n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) + + actual = list(TextSplitter.split(input_text, + 35, + 35, + get_azure_char_count, + self.spacy_model)) + self.assertEqual(input_text, ''.join(actual)) + self.assertEqual(4, len(actual)) + + # Split using spaCy model. + self.assertEqual("This is a sentence (Dr.Test). ", actual[0]) + self.assertEqual("Is this, a sentence as well? ", actual[1]) + self.assertEqual("Maybe...maybe not? \n ", actual[2]) + self.assertEqual("All done, I think!", actual[3]) def test_no_translate_no_detect_when_language_ff_prop_matches(self): @@ -1111,19 +1297,13 @@ def do_POST(self): is_detect = url_parts.path == '/translator/detect' is_translate = url_parts.path == '/translator/translate' - is_break_sentence = url_parts.path == '/translator/breaksentence' - if not is_detect and not is_translate and not is_break_sentence: + if not is_detect and not is_translate: self._send_error(404, 000, 'NOT FOUND') return self._validate_headers() self._validate_query_string(url_parts.query, is_translate) - if is_detect: - max_chars = TranslationClient.DETECT_MAX_CHARS - elif is_translate: - max_chars = BreakSentenceClient.TRANSLATION_MAX_CHARS - else: - max_chars = BreakSentenceClient.BREAK_SENTENCE_MAX_CHARS + max_chars = TranslationClient.DETECT_MAX_CHARS self._validate_body(max_chars) self.send_response(200)