From 4cc6dfe5a06eea3fa8c379180c7877d4185f24bd Mon Sep 17 00:00:00 2001
From: Brian Rosenberg
Date: Fri, 26 Apr 2024 11:32:34 -0400
Subject: [PATCH 1/2] Install text splitter as a module

---
 python/AzureTranslation/Dockerfile              |  54 ++--
 .../acs_translation_component.py                |   2 +-
 .../nlp_text_splitter/__init__.py               |  27 --
 .../nlp_text_splitter/text_splitter.py          | 263 ------------------
 .../nlp_text_splitter/wtp_lang_settings.py      | 259 -----------------
 python/AzureTranslation/setup.cfg               |   1 -
 6 files changed, 22 insertions(+), 584 deletions(-)
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/__init__.py
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/text_splitter.py
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py

diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile
index 4ae5195e..3f95614d 100644
--- a/python/AzureTranslation/Dockerfile
+++ b/python/AzureTranslation/Dockerfile
@@ -1,4 +1,4 @@
-# syntax=docker/dockerfile:1.2
+# syntax=docker/dockerfile:1.4
 
 #############################################################################
 # NOTICE                                                                    #
@@ -29,47 +29,28 @@
 ARG BUILD_REGISTRY
 ARG BUILD_TAG=latest
-ARG RUN_TESTS=false
 
 # To enable GPU resources, update
 # below line to BUILD_TYPE=gpu
 ARG BUILD_TYPE=cpu
 
-FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages
-
-RUN pip install --no-cache-dir langcodes
-
-RUN apt-get update && \
-    apt-get install -y git git-lfs && \
-    git lfs install && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install WtP and spaCy
-RUN pip install --upgrade pip && \
-    pip install "spacy>=3.7.4" && \
-    pip install "wtpsplit>=1.3.0"
-
-# Modify to add downloads for other models of interest.
-RUN mkdir /wtp_models && cd /wtp_models && \
-    git clone https://huggingface.co/benjamin/wtp-bert-mini && \
-    python3 -m spacy download xx_sent_ud_sm
+FROM ${BUILD_REGISTRY}openmpf_python_component_build:${BUILD_TAG} AS python_build
 
-########################################################################
-FROM download_python_packages as cpu_component_build
-RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-########################################################################
-FROM download_python_packages as gpu_component_build
+FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} AS cpu_component
 
-# Environment variables required by nvidia runtime.
-ENV NVIDIA_VISIBLE_DEVICES=all
-ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 
+ARG BUILD_TYPE
 
-RUN pip install torch
+RUN --mount=from=python_build,source=/home/mpf/openmpf-projects/openmpf-python-component-sdk/detection/nlp_text_splitter,target=/tmp/nlp_text_splitter \
< None:
-        self._model_name = ""
-        self._default_lang = default_lang
-        self._mandatory_wtp_language = False
-        self.split = lambda t, **param: [t]
-        self.update_model(model_name, model_setting, default_lang)
-
-    def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str = "en"):
-        if model_name:
-            if "wtp" in model_name:
-                self._update_wtp_model(model_name, model_setting, default_lang)
-                self.split = self._split_wtp
-                log.info(f"Setup WtP model: {model_name}")
-            else:
-                self._update_spacy_model(model_name)
-                self.split = self._split_spacy
-                log.info(f"Setup spaCy model: {model_name}")
-
-    def _update_wtp_model(self, wtp_model_name: str,
-                          model_setting: str,
-                          default_lang: str) -> None:
-
-        if model_setting == "gpu" or model_setting == "cuda":
-            if GPU_AVAILABLE:
-                model_setting = "cuda"
-            else:
-                log.warning("PyTorch determined that CUDA is not available. "
-                            "You may need to update the NVIDIA driver for the host system, "
-                            "or reinstall PyTorch with GPU support by setting "
-                            "ARG BUILD_TYPE=gpu in the Dockerfile when building this component.")
-                model_setting = "cpu"
-        elif model_setting != "cpu":
-            log.warning("Invalid WtP model setting. Only `cpu` and `cuda` "
-                        "(or `gpu`) WtP model options available at this time. "
-                        "Defaulting to `cpu` mode.")
-            model_setting = "cpu"
-
-        if wtp_model_name in WTP_MANDATORY_ADAPTOR:
-            self._mandatory_wtp_language = True
-            self._default_lang = default_lang
-
-        if self._model_name == wtp_model_name:
-            log.info(f"Using cached model: {self._model_name}")
-        else:
-            self._model_name = wtp_model_name
-            # Check if model has been downloaded
-            if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)):
-                log.info(f"Using downloaded {wtp_model_name} model.")
-                wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name)
-
-            elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS,
-                                             wtp_model_name)):
-
-                log.info(f"Using downloaded {wtp_model_name} model.")
-                wtp_model_name = os.path.join(DEFAULT_WTP_MODELS,
-                                              wtp_model_name)
-
-            else:
-                log.warning(f"Model {wtp_model_name} not found, "
-                            "downloading from hugging face.")
-
-            self.wtp_model = WtP(wtp_model_name)
-
-        if model_setting != "cpu" and model_setting != "cuda":
-            log.warning(f"Invalid setting for WtP runtime {model_setting}. "
-                        "Defaulting to CPU mode.")
-            model_setting = "cpu"
-        self.wtp_model.to(model_setting)
-
-    def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]:
-        if lang:
-            iso_lang = WtpLanguageSettings.convert_to_iso(lang)
-            if iso_lang:
-                return self.wtp_model.split(text, lang_code=iso_lang)
-            else:
-                log.warning(f"Language {lang} was not used to train WtP model. "
-                            "If text splitting is not working well with WtP, "
-                            "consider trying spaCy's sentence detection model."
-                            )
-        if self._mandatory_wtp_language:
-            log.warning("WtP model requires a language. "
-                        f"Using default language: {self._default_lang}.")
-            iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang)
-            return self.wtp_model.split(text, lang_code=iso_lang)
-        return self.wtp_model.split(text)
-
-    def _update_spacy_model(self, spacy_model_name: str):
-        self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"])
-        self.spacy_model.enable_pipe("senter")
-
-    def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]:
-        # TODO: We may add an auto model selection for spaCy in the future.
-        # However, the drawback is we will also need to
-        # download a large number of spaCy models beforehand.
-        processed_text = self.spacy_model(text)
-        return [sent.text_with_ws for sent in processed_text.sents]
-
-
-class TextSplitter:
-    # Authors: Brian Rosenberg, Howard Huang
-
-    def __init__(
-            self, text: str, limit: int, num_boundary_chars: int,
-            get_text_size: Callable[[str], int],
-            sentence_model: TextSplitterModel,
-            in_lang: Optional[str] = None) -> None:
-        self._sentence_model = sentence_model
-        self._limit = limit
-        self._num_boundary_chars = num_boundary_chars
-        self._get_text_size = get_text_size
-        self._text = ""
-        self._text_full_size = 0
-        self._overhead_size = 0
-        self._soft_limit = self._limit
-        self._in_lang = in_lang
-
-        if text:
-            self.set_text(text)
-
-    def set_text(self, text: str):
-        self._text = text
-        self._text_full_size = self._get_text_size(text)
-        chars_per_size = len(text) / self._text_full_size
-        self._overhead_size = self._get_text_size('')
-
-        self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size
-
-        if self._soft_limit <= 1:
-            # Caused by an unusually large overhead relative to text.
-            # This is unlikely to occur except during testing of small text limits.
-            # Recalculate soft limit by subtracting overhead from limit
-            # before applying chars_per_size weighting.
-            self._soft_limit = max(1,
-                int((self._limit - self._overhead_size) * chars_per_size))
-
-    def _isolate_largest_section(self, text: str) -> str:
-        # Using cached word splitting model, isolate largest section of text
-        string_length = len(text)
-
-        if self._num_boundary_chars <= 0:
-            num_chars_to_process = string_length
-        else:
-            num_chars_to_process = self._num_boundary_chars
-
-        start_indx = max(0, string_length - num_chars_to_process)
-        substring = text[start_indx: string_length]
-        substring_list = self._sentence_model.split(substring, lang=self._in_lang)
-        div_index = string_length - len(substring_list[-1])
-
-        if div_index == start_indx:
-            return text
-
-        return text[0:div_index]
-
-    @classmethod
-    def split(cls,
-              text: str, limit: int, num_boundary_chars: int,
-              get_text_size: Callable[[str], int],
-              sentence_model: TextSplitterModel,
-              in_lang: Optional[str] = None
-              ):
-        return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split()
-
-    def _split(self):
-        if self._text_full_size <= self._limit:
-            yield self._text
-        else:
-            yield from self._split_internal(self._text)
-
-    def _split_internal(self, text):
-        right = text
-        while True:
-            left, right = self._divide(right)
-            yield left
-            if not right:
-                return
-
-    def _divide(self, text) -> Tuple[str, str]:
-        limit = self._soft_limit
-        while True:
-            left = text[:limit]
-            left_size = self._get_text_size(left)
-
-            if left_size <= self._limit:
-                if left != text:
-                    # If dividing into two parts,
-                    # determine soft boundary for left segment
-                    left = self._isolate_largest_section(left)
-                return left, text[len(left):]
-
-            char_per_size = len(left) / left_size
-
-            limit = int(self._limit * char_per_size) - self._overhead_size
-
-            if limit < 1:
-                # Caused by an unusually large overhead relative to text.
-                # This is unlikely to occur except during testing of small text limits.
-                # Recalculate soft limit by subtracting overhead from limit before
-                # applying chars_per_size weighting.
-                limit = max(1, int((self._limit - self._overhead_size) * char_per_size))
\ No newline at end of file
diff --git a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py
deleted file mode 100644
index c682fd3f..00000000
--- a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py
+++ /dev/null
@@ -1,259 +0,0 @@
-#############################################################################
-# NOTICE                                                                    #
-#                                                                           #
-# This software (or technical data) was produced for the U.S. Government   #
-# under contract, and is subject to the Rights in Data-General Clause      #
-# 52.227-14, Alt. IV (DEC 2007).                                           #
-#                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
-#############################################################################
-
-#############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
-#                                                                           #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                   #
-#                                                                           #
-#    http://www.apache.org/licenses/LICENSE-2.0                             #
-#                                                                           #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                            #
-#############################################################################
-
-from typing import Optional
-
-
-class WtpLanguageSettings:
-    # Supported languages and ISO 639-1, 639-2 codes for WtP models.
-    # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages
-    # https://www.loc.gov/standards/iso639-2/php/code_list.php
-    _wtp_lang_map = {
-        'afrikaans': 'af',
-        'afr': 'af',
-        'amharic': 'am',
-        'amh': 'am',
-        'arabic': 'ar',
-        'ara': 'ar',
-        'azerbaijani': 'az',
-        'aze': 'az',
-        'belarusian': 'be',
-        'bel': 'be',
-        'bulgarian': 'bg',
-        'bul': 'bg',
-        'bengali': 'bn',
-        'ben': 'bn',
-        'catalan': 'ca',
-        'valencian': 'ca',
-        'cat': 'ca',
-        'cebuano': 'ceb',  # In some cases, ISO-639-1 is not available, use ISO-639-2
-        'ceb': 'ceb',
-        'czech': 'cs',
-        'cze': 'cs',
-        'ces': 'cs',
-        'welsh': 'cy',
-        'wel': 'cy',
-        'cym': 'cy',
-        'danish': 'da',
-        'dan': 'da',
-        'german': 'de',
-        'ger': 'de',
-        'deu': 'de',
-        'greek': 'el',
-        'gre': 'el',
-        'ell': 'el',
-        'english': 'en',
-        'eng': 'en',
-        'esperanto': 'eo',
-        'epo': 'eo',
-        'spanish': 'es',
-        'castilian': 'es',
-        'spa': 'es',
-        'estonian': 'et',
-        'est': 'et',
-        'basque': 'eu',
-        'baq': 'eu',
-        'eus': 'eu',
-        'persian': 'fa',
-        'per': 'fa',
-        'fas': 'fa',
-        'finnish': 'fi',
-        'fin': 'fi',
-        'french': 'fr',
-        'fre': 'fr',
-        'fra': 'fr',
-        'western frisian': 'fy',
-        'fry': 'fy',
-        'irish': 'ga',
-        'gle': 'ga',
-        'gaelic': 'gd',
-        'scottish gaelic': 'gd',
-        'gla': 'gd',
-        'galician': 'gl',
-        'glg': 'gl',
-        'gujarati': 'gu',
-        'guj': 'gu',
-        'hausa': 'ha',
-        'hau': 'ha',
-        'hebrew': 'he',
-        'heb': 'he',
-        'hindi': 'hi',
-        'hin': 'hi',
-        'hungarian': 'hu',
-        'hun': 'hu',
-        'armenian': 'hy',
-        'arm': 'hy',
-        'hye': 'hy',
-        'indonesian': 'id',
-        'ind': 'id',
-        'igbo': 'ig',
-        'ibo': 'ig',
-        'icelandic': 'is',
-        'ice': 'is',
-        'isl': 'is',
-        'italian': 'it',
-        'ita': 'it',
-        'japanese': 'ja',
-        'jpn': 'ja',
-        'javanese': 'jv',
-        'jav': 'jv',
-        'georgian': 'ka',
-        'geo': 'ka',
-        'kat': 'ka',
-        'kazakh': 'kk',
-        'kaz': 'kk',
-        'central khmer': 'km',
-        'khm': 'km',
-        'kannada': 'kn',
-        'kan': 'kn',
-        'korean': 'ko',
-        'kor': 'ko',
-        'kurdish': 'ku',
-        'kur': 'ku',
-        'kirghiz': 'ky',
-        'kyrgyz': 'ky',
-        'kir': 'ky',
-        'latin': 'la',
-        'lat': 'la',
-        'lithuanian': 'lt',
-        'lit': 'lt',
-        'latvian': 'lv',
-        'lav': 'lv',
-        'malagasy': 'mg',
-        'mlg': 'mg',
-        'macedonian': 'mk',
-        'mac': 'mk',
-        'mkd': 'mk',
-        'malayalam': 'ml',
-        'mal': 'ml',
-        'mongolian': 'mn',
-        'mon': 'mn',
-        'marathi': 'mr',
-        'mar': 'mr',
-        'malay': 'ms',
-        'may': 'ms',
-        'msa': 'ms',
-        'maltese': 'mt',
-        'mlt': 'mt',
-        'burmese': 'my',
-        'bur': 'my',
-        'mya': 'my',
-        'nepali': 'ne',
-        'nep': 'ne',
-        'dutch': 'nl',
-        'flemish': 'nl',
-        'dut': 'nl',
-        'nld': 'nl',
-        'norwegian': 'no',
-        'nor': 'no',
-        'panjabi': 'pa',
-        'punjabi': 'pa',
-        'pan': 'pa',
-        'polish': 'pl',
-        'pol': 'pl',
-        'pushto': 'ps',
-        'pashto': 'ps',
-        'pus': 'ps',
-        'portuguese': 'pt',
-        'por': 'pt',
-        'romanian': 'ro',
-        'moldavian': 'ro',
-        'moldovan': 'ro',
-        'rum': 'ro',
-        'ron': 'ro',
-        'russian': 'ru',
-        'rus': 'ru',
-        'sinhala': 'si',
-        'sinhalese': 'si',
-        'sin': 'si',
-        'slovak': 'sk',
-        'slo': 'sk',
-        'slk': 'sk',
-        'slovenian': 'sl',
-        'slv': 'sl',
-        'albanian': 'sq',
-        'alb': 'sq',
-        'sqi': 'sq',
-        'serbian': 'sr',
-        'srp': 'sr',
-        'swedish': 'sv',
-        'swe': 'sv',
-        'tamil': 'ta',
-        'tam': 'ta',
-        'telugu': 'te',
-        'tel': 'te',
-        'tajik': 'tg',
-        'tgk': 'tg',
-        'thai': 'th',
-        'tha': 'th',
-        'turkish': 'tr',
-        'tur': 'tr',
-        'ukrainian': 'uk',
-        'ukr': 'uk',
-        'urdu': 'ur',
-        'urd': 'ur',
-        'uzbek': 'uz',
-        'uzb': 'uz',
-        'vietnamese': 'vi',
-        'vie': 'vi',
-        'xhosa': 'xh',
-        'xho': 'xh',
-        'yiddish': 'yi',
-        'yid': 'yi',
-        'yoruba': 'yo',
-        'yor': 'yo',
-        'chinese': 'zh',
-        'chi': 'zh',
-        'zho': 'zh',
-        'zulu': 'zu',
-        'zul': 'zu',
-        'hans': 'zh',  # Also check for Chinese scripts
-        'hant': 'zh',
-        'cmn': 'zh'  # In some cases we use 'cmn' = 'Mandarin'
-    }
-
-    _wtp_iso_set = set(_wtp_lang_map.values())
-
-    @classmethod
-    def convert_to_iso(cls, lang: str) -> Optional[str]:
-        # ISO 639-2 (language) is sometimes paired with ISO 15924 (script).
-        # Extract the language portion and check if supported in WtP.
-        if not lang:
-            return None
-
-        if '-' in lang:
-            lang = lang.split('-')[0]
-        if '_' in lang:
-            lang = lang.split('_')[0]
-
-        lang = lang.strip().lower()
-
-        if lang in cls._wtp_iso_set:
-            return lang
-
-        if lang in cls._wtp_lang_map:
-            return cls._wtp_lang_map[lang]
-
-        return None
diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg
index b3fa4200..1b31b155 100644
--- a/python/AzureTranslation/setup.cfg
+++ b/python/AzureTranslation/setup.cfg
@@ -36,7 +36,6 @@ install_requires =
     langcodes
    spacy>=3.7.4
     wtpsplit>=1.3.0
-    torch>=2.2.0
 
 [options.entry_points]
 mpf.exported_component =

From 7d4fda9772d68f4a3dfd2b747a552eecc711c837 Mon Sep 17 00:00:00 2001
From: Howard Huang
Date: Tue, 30 Apr 2024 04:20:18 -0400
Subject: [PATCH 2/2] Minor code cleanup.

---
 .../tests/test_acs_translation.py               | 191 +-----------------
 1 file changed, 2 insertions(+), 189 deletions(-)

diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py
index c871fcdb..d2297f71 100644
--- a/python/AzureTranslation/tests/test_acs_translation.py
+++ b/python/AzureTranslation/tests/test_acs_translation.py
@@ -42,16 +42,13 @@
 import mpf_component_api as mpf
 
 sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
-from nlp_text_splitter import TextSplitterModel, TextSplitter
+from nlp_text_splitter import TextSplitterModel
 
 from acs_translation_component.acs_translation_component import (AcsTranslationComponent,
     get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints,
     AcsTranslateUrlBuilder, get_n_azure_chars)
 from acs_translation_component.convert_language_code import iso_to_bcp
 
-# Set to true to test the WtP canine-s-1l model locally
-# Note, this will download ~1 GB to your local storage.
-LOCAL_TEST_WTP_MODEL = False
 
 SEEN_TRACE_IDS = set()
 CHINESE_SAMPLE_TEXT = '你好，你叫什么名字？'
@@ -74,8 +71,6 @@ class TestAcsTranslation(unittest.TestCase):
     def setUpClass(cls):
         cls.mock_server = MockServer()
         cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
-        if LOCAL_TEST_WTP_MODEL:
-            cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "zh")
         cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")
 
@@ -600,97 +595,6 @@ def test_split_wtp_known_language(self, _):
                          'Spaces should not be added to Chinese text.')
 
 
-    def test_split_engine_difference(self):
-        # Note: we can only use the WtP models for subsequent tests
-        # involving Chinese text because only WtP's multilingual models
-        # can detect some of the '。' characters used for this language.
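-        # (For instance, per the assertions below: with newlines stripped,
-        # WtP splits the passage into three '。'-terminated sentences, while
-        # spaCy's xx_sent_ud_sm model returns it as a single sentence.)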
-        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
-
-        text_without_newlines = text.replace('\n', '')
-
-        actual = self.wtp_model._split_wtp(text_without_newlines)
-        self.assertEqual(3, len(actual))
-        for line in actual:
-            self.assertTrue(line.endswith('。'))
-
-        actual = self.spacy_model._split_spacy(text_without_newlines)
-        self.assertEqual(1, len(actual))
-
-        # However, WtP prefers newlines over the '。' character.
-        actual = self.wtp_model._split_wtp(text)
-        self.assertEqual(10, len(actual))
-
-    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
-    def test_split_wtp_advanced_known_language(self, _):
-        # This test should only be run manually outside of a Docker build.
-        # The WtP canine model is ~1 GB and not worth downloading and adding to the pre-built Docker image.
-        if not LOCAL_TEST_WTP_MODEL:
-            return
-
-        # For this test, we're more interested in the changes in behavior
-        # caused by WtP split. So the translation files are mainly placeholders.
-        self.set_results_file('traditional-chinese-detect-result.json')
-        self.set_results_file('split-sentence/art-of-war-translation-1.json')
-        self.set_results_file('split-sentence/art-of-war-translation-2.json')
-        self.set_results_file('split-sentence/art-of-war-translation-3.json')
-        self.set_results_file('split-sentence/art-of-war-translation-4.json')
-
-        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
-        detection_props = dict(TEXT=text)
-        TranslationClient(get_test_properties(SENTENCE_MODEL="wtp-canine-s-1l"), self.wtp_adv_model).add_translations(detection_props)
-
-        self.assertEqual(5, len(detection_props))
-        self.assertEqual(text, detection_props['TEXT'])
-
-        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
-            .read_text().strip()
-        self.assertEqual(expected_translation, detection_props['TRANSLATION'])
-        self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
-        self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE'])
-        self.assertAlmostEqual(1.0,
-            float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
-
-        detect_request_text = self.get_request_body()[0]['Text']
-        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
-
-        # Main test starts here:
-        expected_chunk_lengths = [61, 150, 61, 148]
-        self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n', '')))
-        translation_request1 = self.get_request_body()[0]['Text']
-        self.assertTrue(translation_request1.startswith('兵者，'))
-        self.assertTrue(translation_request1.endswith('四曰將，五曰法。'))
-        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
-        self.assertNotIn('\n', translation_request1,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request1,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request2 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[1], len(translation_request2))
-        self.assertTrue(translation_request2.startswith('道者，令民於上同意'))
-        self.assertTrue(translation_request2.endswith('賞罰孰明'))
-        self.assertNotIn('\n', translation_request2,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request2,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request3 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[2], len(translation_request3))
-        self.assertTrue(translation_request3.startswith('?吾以此知勝'))
-        self.assertTrue(translation_request3.endswith('因利而制權也。'))
-        self.assertNotIn('\n', translation_request3,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request3,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request4 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[3], len(translation_request4))
-        self.assertTrue(translation_request4.startswith('兵者，詭道也。'))
-        self.assertTrue(translation_request4.endswith('勝負見矣。'))
-        self.assertNotIn('\n', translation_request4,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request4,
-                         'Spaces should not be added to Chinese text.')
 
 @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
 def test_split_wtp_unknown_lang(self, _):
@@ -979,98 +883,7 @@ def test_suggested_from(self):
         self.assertEqual(['en'], query_dict['to'])
 
 
-    def test_guess_split_simple_sentence(self):
-        input_text = 'Hello, what is your name? My name is John.'
-        actual = list(TextSplitter.split(input_text,
-                                         28,
-                                         28,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        # "Hello, what is your name?"
-        self.assertEqual('Hello, what is your name? ', actual[0])
-        # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
-
-        input_text = 'Hello, what is your name? My name is John.'
-        actual = list(TextSplitter.split(input_text,
-                                         28,
-                                         28,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        # "Hello, what is your name?"
-        self.assertEqual('Hello, what is your name? ', actual[0])
-        # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
-
-
-    def test_split_sentence_end_punctuation(self):
-        input_text = 'Hello. How are you? asdfasdf'
-        actual = list(TextSplitter.split(input_text,
-                                         20,
-                                         10,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        self.assertEqual('Hello. How are you? ', actual[0])
-        self.assertEqual('asdfasdf', actual[1])
-
-        actual = list(TextSplitter.split(input_text,
-                                         20,
-                                         10,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        self.assertEqual('Hello. How are you? ', actual[0])
-        self.assertEqual('asdfasdf', actual[1])
-
-
-    def test_guess_split_edge_cases(self):
-        input_text = ("This is a sentence (Dr.Test). Is this,"
-                      " a sentence as well? Maybe...maybe not?"
-                      " \n All done, I think!")
-
-        # Split using WtP model.
-        actual = list(TextSplitter.split(input_text,
-                                         30,
-                                         30,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(4, len(actual))
-
-        # WtP should detect and split out each sentence
-        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
-        self.assertEqual("Is this, a sentence as well? ", actual[1])
-        self.assertEqual("Maybe...maybe not? \n ", actual[2])
-        self.assertEqual("All done, I think!", actual[3])
-
-        # Split using spaCy model.
-        actual = list(TextSplitter.split(input_text,
-                                         35,
-                                         35,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(4, len(actual))
-
-        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
-        self.assertEqual("Is this, a sentence as well? ", actual[1])
-        self.assertEqual("Maybe...maybe not? \n ", actual[2])
-        self.assertEqual("All done, I think!", actual[3])
+    def test_no_translate_no_detect_when_language_ff_prop_matches(self):