From 4cc6dfe5a06eea3fa8c379180c7877d4185f24bd Mon Sep 17 00:00:00 2001
From: Brian Rosenberg
Date: Fri, 26 Apr 2024 11:32:34 -0400
Subject: [PATCH 1/2] Install text splitter as a module

---
 python/AzureTranslation/Dockerfile              |  54 ++--
 .../acs_translation_component.py                |   2 +-
 .../nlp_text_splitter/__init__.py               |  27 --
 .../nlp_text_splitter/text_splitter.py          | 263 ------------------
 .../nlp_text_splitter/wtp_lang_settings.py      | 259 -----------------
 python/AzureTranslation/setup.cfg               |   1 -
 6 files changed, 22 insertions(+), 584 deletions(-)
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/__init__.py
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/text_splitter.py
 delete mode 100644 python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py

diff --git a/python/AzureTranslation/Dockerfile b/python/AzureTranslation/Dockerfile
index 4ae5195e..3f95614d 100644
--- a/python/AzureTranslation/Dockerfile
+++ b/python/AzureTranslation/Dockerfile
@@ -1,4 +1,4 @@
-# syntax=docker/dockerfile:1.2
+# syntax=docker/dockerfile:1.4
 
 #############################################################################
 # NOTICE                                                                    #
@@ -29,47 +29,28 @@
 ARG BUILD_REGISTRY
 ARG BUILD_TAG=latest
-ARG RUN_TESTS=false
 
 # To enable GPU resources, update
 # below line to BUILD_TYPE=gpu
 ARG BUILD_TYPE=cpu
 
-FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} as download_python_packages
-
-RUN pip install --no-cache-dir langcodes
-
-RUN apt-get update && \
-    apt-get install -y git git-lfs && \
-    git lfs install && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install WtP and spaCy
-RUN pip install --upgrade pip && \
-    pip install "spacy>=3.7.4" && \
-    pip install "wtpsplit>=1.3.0"
-
-# Modify to add downloads for other models of interest.
-RUN mkdir /wtp_models && cd /wtp_models && \
-    git clone https://huggingface.co/benjamin/wtp-bert-mini && \
-    python3 -m spacy download xx_sent_ud_sm
+FROM ${BUILD_REGISTRY}openmpf_python_component_build:${BUILD_TAG} AS python_build
 
-########################################################################
-FROM download_python_packages as cpu_component_build
-RUN pip install torch --index-url https://download.pytorch.org/whl/cpu
 
-########################################################################
-FROM download_python_packages as gpu_component_build
+FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} AS cpu_component
 
-# Environment variables required by nvidia runtime.
-ENV NVIDIA_VISIBLE_DEVICES=all
-ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
 
+ARG BUILD_TYPE
 
-RUN pip install torch
+RUN --mount=from=python_build,source=/home/mpf/openmpf-projects/openmpf-python-component-sdk/detection/nlp_text_splitter,target=/tmp/nlp_text_splitter \
< None:
-        self._model_name = ""
-        self._default_lang = default_lang
-        self._mandatory_wtp_language = False
-        self.split = lambda t, **param: [t]
-        self.update_model(model_name, model_setting, default_lang)
-
-    def update_model(self, model_name: str, model_setting: str = "cpu", default_lang: str = "en"):
-        if model_name:
-            if "wtp" in model_name:
-                self._update_wtp_model(model_name, model_setting, default_lang)
-                self.split = self._split_wtp
-                log.info(f"Setup WtP model: {model_name}")
-            else:
-                self._update_spacy_model(model_name)
-                self.split = self._split_spacy
-                log.info(f"Setup spaCy model: {model_name}")
-
-    def _update_wtp_model(self, wtp_model_name: str,
-                          model_setting: str,
-                          default_lang: str) -> None:
-
-        if model_setting == "gpu" or model_setting == "cuda":
-            if GPU_AVAILABLE:
-                model_setting = "cuda"
-            else:
-                log.warning("PyTorch determined that CUDA is not available. "
-                            "You may need to update the NVIDIA driver for the host system, "
-                            "or reinstall PyTorch with GPU support by setting "
-                            "ARG BUILD_TYPE=gpu in the Dockerfile when building this component.")
-                model_setting = "cpu"
-        elif model_setting != "cpu":
-            log.warning("Invalid WtP model setting. Only `cpu` and `cuda` "
-                        "(or `gpu`) WtP model options available at this time. "
-                        "Defaulting to `cpu` mode.")
-            model_setting = "cpu"
-
-        if wtp_model_name in WTP_MANDATORY_ADAPTOR:
-            self._mandatory_wtp_language = True
-            self._default_lang = default_lang
-
-        if self._model_name == wtp_model_name:
-            log.info(f"Using cached model: {self._model_name}")
-        else:
-            self._model_name = wtp_model_name
-            # Check if model has been downloaded
-            if os.path.exists(os.path.join(WTP_MODELS_PATH, wtp_model_name)):
-                log.info(f"Using downloaded {wtp_model_name} model.")
-                wtp_model_name = os.path.join(WTP_MODELS_PATH, wtp_model_name)
-
-            elif os.path.exists(os.path.join(DEFAULT_WTP_MODELS,
-                                             wtp_model_name)):
-
-                log.info(f"Using downloaded {wtp_model_name} model.")
-                wtp_model_name = os.path.join(DEFAULT_WTP_MODELS,
-                                              wtp_model_name)
-
-            else:
-                log.warning(f"Model {wtp_model_name} not found, "
-                            "downloading from hugging face.")
-
-            self.wtp_model = WtP(wtp_model_name)
-
-        if model_setting != "cpu" and model_setting != "cuda":
-            log.warning(f"Invalid setting for WtP runtime {model_setting}. "
-                        "Defaulting to CPU mode.")
-            model_setting = "cpu"
-        self.wtp_model.to(model_setting)
-
-    def _split_wtp(self, text: str, lang: Optional[str] = None) -> List[str]:
-        if lang:
-            iso_lang = WtpLanguageSettings.convert_to_iso(lang)
-            if iso_lang:
-                return self.wtp_model.split(text, lang_code=iso_lang)
-            else:
-                log.warning(f"Language {lang} was not used to train WtP model. "
-                            "If text splitting is not working well with WtP, "
-                            "consider trying spaCy's sentence detection model."
-                            )
-        if self._mandatory_wtp_language:
-            log.warning("WtP model requires a language. "
-                        f"Using default language: {self._default_lang}.")
-            iso_lang = WtpLanguageSettings.convert_to_iso(self._default_lang)
-            return self.wtp_model.split(text, lang_code=iso_lang)
-        return self.wtp_model.split(text)
-
-    def _update_spacy_model(self, spacy_model_name: str):
-        self.spacy_model = spacy.load(spacy_model_name, exclude=["parser"])
-        self.spacy_model.enable_pipe("senter")
-
-    def _split_spacy(self, text: str, lang: Optional[str] = None) -> List[str]:
-        # TODO: We may add an auto model selection for spaCy in the future.
-        # However, the drawback is we will also need to
-        # download a large number of spaCy models beforehand.
-        processed_text = self.spacy_model(text)
-        return [sent.text_with_ws for sent in processed_text.sents]
-
-
-class TextSplitter:
-    # Authors: Brian Rosenberg, Howard Huang
-
-    def __init__(
-            self, text: str, limit: int, num_boundary_chars: int,
-            get_text_size: Callable[[str], int],
-            sentence_model: TextSplitterModel,
-            in_lang: Optional[str] = None) -> None:
-        self._sentence_model = sentence_model
-        self._limit = limit
-        self._num_boundary_chars = num_boundary_chars
-        self._get_text_size = get_text_size
-        self._text = ""
-        self._text_full_size = 0
-        self._overhead_size = 0
-        self._soft_limit = self._limit
-        self._in_lang = in_lang
-
-        if text:
-            self.set_text(text)
-
-    def set_text(self, text: str):
-        self._text = text
-        self._text_full_size = self._get_text_size(text)
-        chars_per_size = len(text) / self._text_full_size
-        self._overhead_size = self._get_text_size('')
-
-        self._soft_limit = int(self._limit * chars_per_size) - self._overhead_size
-
-        if self._soft_limit <= 1:
-            # Caused by an unusually large overhead relative to text.
-            # This is unlikely to occur except during testing of small text limits.
-            # Recalculate soft limit by subtracting overhead from limit
-            # before applying chars_per_size weighting.
-            self._soft_limit = max(1,
-                int((self._limit - self._overhead_size) * chars_per_size))
-
-    def _isolate_largest_section(self, text: str) -> str:
-        # Using cached word splitting model, isolate largest section of text
-        string_length = len(text)
-
-        if self._num_boundary_chars <= 0:
-            num_chars_to_process = string_length
-        else:
-            num_chars_to_process = self._num_boundary_chars
-
-        start_indx = max(0, string_length - num_chars_to_process)
-        substring = text[start_indx: string_length]
-        substring_list = self._sentence_model.split(substring, lang=self._in_lang)
-        div_index = string_length - len(substring_list[-1])
-
-        if div_index == start_indx:
-            return text
-
-        return text[0:div_index]
-
-    @classmethod
-    def split(cls,
-              text: str, limit: int, num_boundary_chars: int,
-              get_text_size: Callable[[str], int],
-              sentence_model: TextSplitterModel,
-              in_lang: Optional[str] = None
-              ):
-        return cls(text, limit, num_boundary_chars, get_text_size, sentence_model, in_lang)._split()
-
-    def _split(self):
-        if self._text_full_size <= self._limit:
-            yield self._text
-        else:
-            yield from self._split_internal(self._text)
-
-    def _split_internal(self, text):
-        right = text
-        while True:
-            left, right = self._divide(right)
-            yield left
-            if not right:
-                return
-
-    def _divide(self, text) -> Tuple[str, str]:
-        limit = self._soft_limit
-        while True:
-            left = text[:limit]
-            left_size = self._get_text_size(left)
-
-            if left_size <= self._limit:
-                if left != text:
-                    # If dividing into two parts,
-                    # determine soft boundary for left segment
-                    left = self._isolate_largest_section(left)
-                return left, text[len(left):]
-
-            char_per_size = len(left) / left_size
-
-            limit = int(self._limit * char_per_size) - self._overhead_size
-
-            if limit < 1:
-                # Caused by an unusually large overhead relative to text.
-                # This is unlikely to occur except during testing of small text limits.
-                # Recalculate soft limit by subtracting overhead from limit before
-                # applying chars_per_size weighting.
-                limit = max(1, int((self._limit - self._overhead_size) * char_per_size))
\ No newline at end of file
diff --git a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py b/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py
deleted file mode 100644
index c682fd3f..00000000
--- a/python/AzureTranslation/nlp_text_splitter/wtp_lang_settings.py
+++ /dev/null
@@ -1,259 +0,0 @@
-#############################################################################
-# NOTICE                                                                    #
-#                                                                           #
-# This software (or technical data) was produced for the U.S. Government   #
-# under contract, and is subject to the Rights in Data-General Clause      #
-# 52.227-14, Alt. IV (DEC 2007).                                           #
-#                                                                           #
-# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
-#############################################################################
-
-#############################################################################
-# Copyright 2024 The MITRE Corporation                                      #
-#                                                                           #
-# Licensed under the Apache License, Version 2.0 (the "License");          #
-# you may not use this file except in compliance with the License.         #
-# You may obtain a copy of the License at                                   #
-#                                                                           #
-#    http://www.apache.org/licenses/LICENSE-2.0                             #
-#                                                                           #
-# Unless required by applicable law or agreed to in writing, software      #
-# distributed under the License is distributed on an "AS IS" BASIS,        #
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
-# See the License for the specific language governing permissions and      #
-# limitations under the License.                                            #
-#############################################################################
-
-from typing import Optional
-
-
-class WtpLanguageSettings:
-    # Supported languages and ISO 639-1, 639-2 codes for WtP models.
-    # https://github.com/bminixhofer/wtpsplit?tab=readme-ov-file#supported-languages
-    # https://www.loc.gov/standards/iso639-2/php/code_list.php
-    _wtp_lang_map = {
-        'afrikaans': 'af',
-        'afr': 'af',
-        'amharic': 'am',
-        'amh': 'am',
-        'arabic': 'ar',
-        'ara': 'ar',
-        'azerbaijani': 'az',
-        'aze': 'az',
-        'belarusian': 'be',
-        'bel': 'be',
-        'bulgarian': 'bg',
-        'bul': 'bg',
-        'bengali': 'bn',
-        'ben': 'bn',
-        'catalan': 'ca',
-        'valencian': 'ca',
-        'cat': 'ca',
-        'cebuano': 'ceb',  # In some cases, ISO-639-1 is not available, use ISO-639-2
-        'ceb': 'ceb',
-        'czech': 'cs',
-        'cze': 'cs',
-        'ces': 'cs',
-        'welsh': 'cy',
-        'wel': 'cy',
-        'cym': 'cy',
-        'danish': 'da',
-        'dan': 'da',
-        'german': 'de',
-        'ger': 'de',
-        'deu': 'de',
-        'greek': 'el',
-        'gre': 'el',
-        'ell': 'el',
-        'english': 'en',
-        'eng': 'en',
-        'esperanto': 'eo',
-        'epo': 'eo',
-        'spanish': 'es',
-        'castilian': 'es',
-        'spa': 'es',
-        'estonian': 'et',
-        'est': 'et',
-        'basque': 'eu',
-        'baq': 'eu',
-        'eus': 'eu',
-        'persian': 'fa',
-        'per': 'fa',
-        'fas': 'fa',
-        'finnish': 'fi',
-        'fin': 'fi',
-        'french': 'fr',
-        'fre': 'fr',
-        'fra': 'fr',
-        'western frisian': 'fy',
-        'fry': 'fy',
-        'irish': 'ga',
-        'gle': 'ga',
-        'gaelic': 'gd',
-        'scottish gaelic': 'gd',
-        'gla': 'gd',
-        'galician': 'gl',
-        'glg': 'gl',
-        'gujarati': 'gu',
-        'guj': 'gu',
-        'hausa': 'ha',
-        'hau': 'ha',
-        'hebrew': 'he',
-        'heb': 'he',
-        'hindi': 'hi',
-        'hin': 'hi',
-        'hungarian': 'hu',
-        'hun': 'hu',
-        'armenian': 'hy',
-        'arm': 'hy',
-        'hye': 'hy',
-        'indonesian': 'id',
-        'ind': 'id',
-        'igbo': 'ig',
-        'ibo': 'ig',
-        'icelandic': 'is',
-        'ice': 'is',
-        'isl': 'is',
-        'italian': 'it',
-        'ita': 'it',
-        'japanese': 'ja',
-        'jpn': 'ja',
-        'javanese': 'jv',
-        'jav': 'jv',
-        'georgian': 'ka',
-        'geo': 'ka',
-        'kat': 'ka',
-        'kazakh': 'kk',
-        'kaz': 'kk',
-        'central khmer': 'km',
-        'khm': 'km',
-        'kannada': 'kn',
-        'kan': 'kn',
-        'korean': 'ko',
-        'kor': 'ko',
-        'kurdish': 'ku',
-        'kur': 'ku',
-        'kirghiz': 'ky',
-        'kyrgyz': 'ky',
-        'kir': 'ky',
-        'latin': 'la',
-        'lat': 'la',
-        'lithuanian': 'lt',
-        'lit': 'lt',
-        'latvian': 'lv',
-        'lav': 'lv',
-        'malagasy': 'mg',
-        'mlg': 'mg',
-        'macedonian': 'mk',
-        'mac': 'mk',
-        'mkd': 'mk',
-        'malayalam': 'ml',
-        'mal': 'ml',
-        'mongolian': 'mn',
-        'mon': 'mn',
-        'marathi': 'mr',
-        'mar': 'mr',
-        'malay': 'ms',
-        'may': 'ms',
-        'msa': 'ms',
-        'maltese': 'mt',
-        'mlt': 'mt',
-        'burmese': 'my',
-        'bur': 'my',
-        'mya': 'my',
-        'nepali': 'ne',
-        'nep': 'ne',
-        'dutch': 'nl',
-        'flemish': 'nl',
-        'dut': 'nl',
-        'nld': 'nl',
-        'norwegian': 'no',
-        'nor': 'no',
-        'panjabi': 'pa',
-        'punjabi': 'pa',
-        'pan': 'pa',
-        'polish': 'pl',
-        'pol': 'pl',
-        'pushto': 'ps',
-        'pashto': 'ps',
-        'pus': 'ps',
-        'portuguese': 'pt',
-        'por': 'pt',
-        'romanian': 'ro',
-        'moldavian': 'ro',
-        'moldovan': 'ro',
-        'rum': 'ro',
-        'ron': 'ro',
-        'russian': 'ru',
-        'rus': 'ru',
-        'sinhala': 'si',
-        'sinhalese': 'si',
-        'sin': 'si',
-        'slovak': 'sk',
-        'slo': 'sk',
-        'slk': 'sk',
-        'slovenian': 'sl',
-        'slv': 'sl',
-        'albanian': 'sq',
-        'alb': 'sq',
-        'sqi': 'sq',
-        'serbian': 'sr',
-        'srp': 'sr',
-        'swedish': 'sv',
-        'swe': 'sv',
-        'tamil': 'ta',
-        'tam': 'ta',
-        'telugu': 'te',
-        'tel': 'te',
-        'tajik': 'tg',
-        'tgk': 'tg',
-        'thai': 'th',
-        'tha': 'th',
-        'turkish': 'tr',
-        'tur': 'tr',
-        'ukrainian': 'uk',
-        'ukr': 'uk',
-        'urdu': 'ur',
-        'urd': 'ur',
-        'uzbek': 'uz',
-        'uzb': 'uz',
-        'vietnamese': 'vi',
-        'vie': 'vi',
-        'xhosa': 'xh',
-        'xho': 'xh',
-        'yiddish': 'yi',
-        'yid': 'yi',
-        'yoruba': 'yo',
-        'yor': 'yo',
-        'chinese': 'zh',
-        'chi': 'zh',
-        'zho': 'zh',
-        'zulu': 'zu',
-        'zul': 'zu',
-        'hans': 'zh',  # Also check for Chinese scripts
-        'hant': 'zh',
-        'cmn': 'zh'  # In some cases we use 'cmn' = 'Mandarin'
-    }
-
-    _wtp_iso_set = set(_wtp_lang_map.values())
-
-    @classmethod
-    def convert_to_iso(cls, lang: str) -> Optional[str]:
-        # ISO 639-2 (language) is sometimes paired with ISO 15924 (script).
-        # Extract the language portion and check if supported in WtP.
-        if not lang:
-            return None
-
-        if '-' in lang:
-            lang = lang.split('-')[0]
-        if '_' in lang:
-            lang = lang.split('_')[0]
-
-        lang = lang.strip().lower()
-
-        if lang in cls._wtp_iso_set:
-            return lang
-
-        if lang in cls._wtp_lang_map:
-            return cls._wtp_lang_map[lang]
-
-        return None
diff --git a/python/AzureTranslation/setup.cfg b/python/AzureTranslation/setup.cfg
index b3fa4200..1b31b155 100644
--- a/python/AzureTranslation/setup.cfg
+++ b/python/AzureTranslation/setup.cfg
@@ -36,7 +36,6 @@ install_requires =
     langcodes
    spacy>=3.7.4
     wtpsplit>=1.3.0
-    torch>=2.2.0
 
 [options.entry_points]
 mpf.exported_component =

From 7d4fda9772d68f4a3dfd2b747a552eecc711c837 Mon Sep 17 00:00:00 2001
From: Howard Huang
Date: Tue, 30 Apr 2024 04:20:18 -0400
Subject: [PATCH 2/2] Minor code cleanup.

---
 .../tests/test_acs_translation.py               | 191 +-----------------
 1 file changed, 2 insertions(+), 189 deletions(-)

diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py
index c871fcdb..d2297f71 100644
--- a/python/AzureTranslation/tests/test_acs_translation.py
+++ b/python/AzureTranslation/tests/test_acs_translation.py
@@ -42,16 +42,13 @@
 import mpf_component_api as mpf
 
 sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))
-from nlp_text_splitter import TextSplitterModel, TextSplitter
+from nlp_text_splitter import TextSplitterModel
 
 from acs_translation_component.acs_translation_component import (AcsTranslationComponent,
     get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints,
     AcsTranslateUrlBuilder, get_n_azure_chars)
 from acs_translation_component.convert_language_code import iso_to_bcp
 
-# Set to true to test the WtP canine-s-1l model locally
-# Note, this will download ~1 GB to your local storage.
-LOCAL_TEST_WTP_MODEL = False
 
 SEEN_TRACE_IDS = set()
 CHINESE_SAMPLE_TEXT = '你好，你叫什么名字？'
@@ -74,8 +71,6 @@ class TestAcsTranslation(unittest.TestCase):
     def setUpClass(cls):
         cls.mock_server = MockServer()
         cls.wtp_model = TextSplitterModel("wtp-bert-mini", "cpu", "en")
-        if LOCAL_TEST_WTP_MODEL:
-            cls.wtp_adv_model = TextSplitterModel("wtp-canine-s-1l", "cuda", "zh")
         cls.spacy_model = TextSplitterModel("xx_sent_ud_sm", "cpu", "en")
 
@@ -600,97 +595,6 @@ def test_split_wtp_known_language(self, _):
                          'Spaces should not be added to Chinese text.')
 
 
-    def test_split_engine_difference(self):
-        # Note: we can only use the WtP models for subsequent tests
-        # involving Chinese text because only WtP's multilingual models
-        # can detect some of the '。' characters used for this language.
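-        # (For instance, per the assertions below: with newlines stripped,
-        # WtP splits the passage into three '。'-terminated sentences, while
-        # spaCy's xx_sent_ud_sm model returns it as a single sentence.)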
-        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
-
-        text_without_newlines = text.replace('\n', '')
-
-        actual = self.wtp_model._split_wtp(text_without_newlines)
-        self.assertEqual(3, len(actual))
-        for line in actual:
-            self.assertTrue(line.endswith('。'))
-
-        actual = self.spacy_model._split_spacy(text_without_newlines)
-        self.assertEqual(1, len(actual))
-
-        # However, WtP prefers newlines over the '。' character.
-        actual = self.wtp_model._split_wtp(text)
-        self.assertEqual(10, len(actual))
-
-    @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
-    def test_split_wtp_advanced_known_language(self, _):
-        # This test should only be run manually outside of a Docker build.
-        # The WtP canine model is ~1 GB and not worth downloading and adding to the pre-built Docker image.
-        if not LOCAL_TEST_WTP_MODEL:
-            return
-
-        # For this test, we're more interested in the changes in behavior
-        # caused by WtP split. So the translation files are mainly placeholders.
-        self.set_results_file('traditional-chinese-detect-result.json')
-        self.set_results_file('split-sentence/art-of-war-translation-1.json')
-        self.set_results_file('split-sentence/art-of-war-translation-2.json')
-        self.set_results_file('split-sentence/art-of-war-translation-3.json')
-        self.set_results_file('split-sentence/art-of-war-translation-4.json')
-
-        text = (TEST_DATA / 'split-sentence/art-of-war.txt').read_text()
-        detection_props = dict(TEXT=text)
-        TranslationClient(get_test_properties(SENTENCE_MODEL="wtp-canine-s-1l"), self.wtp_adv_model).add_translations(detection_props)
-
-        self.assertEqual(5, len(detection_props))
-        self.assertEqual(text, detection_props['TEXT'])
-
-        expected_translation = (TEST_DATA / 'split-sentence/art-war-translation.txt') \
-            .read_text().strip()
-        self.assertEqual(expected_translation, detection_props['TRANSLATION'])
-        self.assertEqual('EN', detection_props['TRANSLATION TO LANGUAGE'])
-        self.assertEqual('zh-Hant', detection_props['TRANSLATION SOURCE LANGUAGE'])
-        self.assertAlmostEqual(1.0,
-            float(detection_props['TRANSLATION SOURCE LANGUAGE CONFIDENCE']))
-
-        detect_request_text = self.get_request_body()[0]['Text']
-        self.assertEqual(text[0:TranslationClient.DETECT_MAX_CHARS], detect_request_text)
-
-        # Main test starts here:
-        expected_chunk_lengths = [61, 150, 61, 148]
-        self.assertEqual(sum(expected_chunk_lengths), len(text.replace('\n', '')))
-        translation_request1 = self.get_request_body()[0]['Text']
-        self.assertTrue(translation_request1.startswith('兵者，'))
-        self.assertTrue(translation_request1.endswith('四曰將，五曰法。'))
-        self.assertEqual(expected_chunk_lengths[0], len(translation_request1))
-        self.assertNotIn('\n', translation_request1,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request1,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request2 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[1], len(translation_request2))
-        self.assertTrue(translation_request2.startswith('道者，令民於上同意'))
-        self.assertTrue(translation_request2.endswith('賞罰孰明'))
-        self.assertNotIn('\n', translation_request2,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request2,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request3 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[2], len(translation_request3))
-        self.assertTrue(translation_request3.startswith('?吾以此知勝'))
-        self.assertTrue(translation_request3.endswith('因利而制權也。'))
-        self.assertNotIn('\n', translation_request3,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request3,
-                         'Spaces should not be added to Chinese text.')
-
-        translation_request4 = self.get_request_body()[0]['Text']
-        self.assertEqual(expected_chunk_lengths[3], len(translation_request4))
-        self.assertTrue(translation_request4.startswith('兵者，詭道也。'))
-        self.assertTrue(translation_request4.endswith('勝負見矣。'))
-        self.assertNotIn('\n', translation_request4,
-                         'Newlines were not properly removed')
-        self.assertNotIn(' ', translation_request4,
-                         'Spaces should not be added to Chinese text.')
 
 @mock.patch.object(TranslationClient, 'DETECT_MAX_CHARS', new_callable=lambda: 150)
 def test_split_wtp_unknown_lang(self, _):
@@ -979,98 +883,7 @@ def test_suggested_from(self):
         self.assertEqual(['en'], query_dict['to'])
 
 
-    def test_guess_split_simple_sentence(self):
-        input_text = 'Hello, what is your name? My name is John.'
-        actual = list(TextSplitter.split(input_text,
-                                         28,
-                                         28,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        # "Hello, what is your name?"
-        self.assertEqual('Hello, what is your name? ', actual[0])
-        # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
-
-        input_text = 'Hello, what is your name? My name is John.'
-        actual = list(TextSplitter.split(input_text,
-                                         28,
-                                         28,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        # "Hello, what is your name?"
-        self.assertEqual('Hello, what is your name? ', actual[0])
-        # " My name is John."
-        self.assertEqual('My name is John.', actual[1])
-
-
-    def test_split_sentence_end_punctuation(self):
-        input_text = 'Hello. How are you? asdfasdf'
-        actual = list(TextSplitter.split(input_text,
-                                         20,
-                                         10,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        self.assertEqual('Hello. How are you? ', actual[0])
-        self.assertEqual('asdfasdf', actual[1])
-
-        actual = list(TextSplitter.split(input_text,
-                                         20,
-                                         10,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(2, len(actual))
-
-        self.assertEqual('Hello. How are you? ', actual[0])
-        self.assertEqual('asdfasdf', actual[1])
-
-
-    def test_guess_split_edge_cases(self):
-        input_text = ("This is a sentence (Dr.Test). Is this,"
-                      " a sentence as well? Maybe...maybe not?"
-                      " \n All done, I think!")
-
-        # Split using WtP model.
-        actual = list(TextSplitter.split(input_text,
-                                         30,
-                                         30,
-                                         get_azure_char_count,
-                                         self.wtp_model))
-
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(4, len(actual))
-
-        # WtP should detect and split out each sentence
-        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
-        self.assertEqual("Is this, a sentence as well? ", actual[1])
-        self.assertEqual("Maybe...maybe not? \n ", actual[2])
-        self.assertEqual("All done, I think!", actual[3])
-
-        # Split using spaCy model.
-        actual = list(TextSplitter.split(input_text,
-                                         35,
-                                         35,
-                                         get_azure_char_count,
-                                         self.spacy_model))
-        self.assertEqual(input_text, ''.join(actual))
-        self.assertEqual(4, len(actual))
-
-        self.assertEqual("This is a sentence (Dr.Test). ", actual[0])
-        self.assertEqual("Is this, a sentence as well? ", actual[1])
-        self.assertEqual("Maybe...maybe not? \n ", actual[2])
-        self.assertEqual("All done, I think!", actual[3])
+    def test_no_translate_no_detect_when_language_ff_prop_matches(self):