diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md index 3201aa20..16ab9da3 100644 --- a/python/AzureSpeechDetection/README.md +++ b/python/AzureSpeechDetection/README.md @@ -1,33 +1,50 @@ # Overview -This repository contains source code for the OpenMPF Azure Cognitive Services Speech-to-Text Component. This component utilizes the [Azure Cognitive Services Batch Transcription REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription) to transcribe speech from audio and video files. +This repository contains source code for the OpenMPF Azure Cognitive Services +Speech-to-Text Component. This component utilizes the [Azure Cognitive Services Batch +Transcription REST +endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription) +to transcribe speech from audio and video files. # Required Job Properties -In order for the component to process any jobs, the job properties listed below must be provided. These properties have no default value, but can be set through environment variables of the same name. If both environment variable and job property are provided, the job property will be used. +In order for the component to process any jobs, the job properties listed below must be +provided. These properties have no default value, but can be set through environment +variables of the same name. If both environment variable and job property are provided, +the job property will be used. -- `ACS_URL`: URL for the Azure Cognitive Services Endpoint. - e.g. `https://virginia.cris.azure.us/api/speechtotext/v2.0/transcriptions`. - The component has only been tested against v2.0 of the API. +- `ACS_URL`: URL for the Azure Cognitive Services Endpoint. For example, + `https://virginia.cris.azure.us/api/speechtotext/v3.1/transcriptions`. The component has + been tested against v3.1 of the API. 
- - `ACS_SUBSCRIPTION_KEY`: A string containing your subscription key for the speech service. +- `ACS_SUBSCRIPTION_KEY`: A string containing your subscription key for the speech + service. -- `ACS_BLOB_CONTAINER_URL`: URL for an Azure Storage Blob container in which to store files during processing. - e.g. `https://myaccount.blob.core.windows.net/mycontainer`. - See Microsoft's [documentation on Azure storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-container-create) for details. +- `ACS_BLOB_CONTAINER_URL`: URL for an Azure Storage Blob container in which to store + files during processing. e.g. `https://myaccount.blob.core.windows.net/mycontainer`. See + Microsoft's [documentation on Azure + storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-container-create) + for details. -- `ACS_BLOB_SERVICE_KEY`: A string containing your Azure Cognitive Services storage access key. +- `ACS_BLOB_SERVICE_KEY`: A string containing your Azure Cognitive Services storage access + key. # Optional Job Properties The below properties can be optionally provided to alter the behavior of the component. -- `LANGUAGE`: The locale to use for transcription. Defaults to `en-US`. A complete list of available locales can be found in Microsoft's [Speech service documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). +- `LANGUAGE`: The BCP-47 locale to use for transcription. Defaults to `en-US`. A complete + list of available locales can be found in Microsoft's [Speech service + documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). -- `DIARIZE`: Whether to assign utterances to different speakers. Currently, this component supports only two-speaker diarization. Diarization is enabled by default. +- `DIARIZE`: Whether to assign utterances to different speakers. Currently, this component + supports only two-speaker diarization. 
Diarization is enabled by default. -- `CLEANUP`: Whether to delete files from Azure Blob storage container when processing is complete. It is recommended to always keep this enabled, unless it is expected that the same piece of media will be processed multiple times. +- `CLEANUP`: Whether to delete files from Azure Blob storage container when processing is + complete. It is recommended to always keep this enabled, unless it is expected that the + same piece of media will be processed multiple times. -- `BLOB_ACCESS_TIME`: The amount of time in minutes for which the Azure Speech service will have access to the file in blob storage. +- `BLOB_ACCESS_TIME`: The amount of time in minutes for which the Azure Speech service + will have access to the file in blob storage. @@ -50,120 +67,275 @@ Returned `AudioTrack` objects have the following members in their `detection_pro | `MISSING_LANGUAGE_MODELS` | All languages for which transcription was considered, but which were either invalid ISO 639-3 codes, did not have a corresponding BCP-47 code, or were not supported by the Azure Speech endpoint. | -AudioTracks also have the `start_time` and `stop_time` of their associated utterance's voiced segment, and the utterance `confidence`, as returned by Azure. +AudioTracks also have the `start_time` and `stop_time` of their associated utterance's +voiced segment, and the utterance `confidence`, as returned by Azure. # Sample Program -`sample_acs_speech_detector.py` can be used to quickly test with the Azure endpoint. Run with the `-h` flag to see accepted command-line arguments. +`sample_acs_speech_detector.py` can be used to quickly test with the Azure endpoint. Run +with the `-h` flag to see accepted command-line arguments. # Language Identifiers -The following are the BCP-47 codes and their corresponding languages which Azure Speech-to-Text supports. 
- - -| Language | BCP-47 | Language | BCP-47 | -|--------------------------------|--------|------------------------------|---------| -| Afrikaans | `af-ZA` | Hungarian | `hu-HU` | -| Amharic | `am-ET` | Icelandic | `is-IS` | -| Arabic (Algeria) | `ar-DZ` | Indonesian | `id-ID` | -| Arabic (Bahrain) | `ar-BH` | Irish | `ga-IE` | -| Arabic (Egypt) | `ar-EG` | Italian | `it-IT` | -| Arabic (Iraq) | `ar-IQ` | Japanese | `ja-JP` | -| Arabic (Israel) | `ar-IL` | Javanese | `jv-ID` | -| Arabic (Jordan) | `ar-JO` | Kannada | `kn-IN` | -| Arabic (Kuwait) | `ar-KW` | Khmer | `km-KH` | -| Arabic (Lebanon) | `ar-LB` | Korean | `ko-KR` | -| Arabic (Libya) | `ar-LY` | Lao | `lo-LA` | -| Arabic (Morocco) | `ar-MA` | Latvian | `lv-LV` | -| Arabic (Oman) | `ar-OM` | Lithuanian | `lt-LT` | -| Arabic (Palestinian Authority) | `ar-PS` | Macedonian | `mk-MK` | -| Arabic (Qatar) | `ar-QA` | Malay | `ms-MY` | -| Arabic (Saudi Arabia) | `ar-SA` | Maltese | `mt-MT` | -| Arabic (Syria) | `ar-SY` | Marathi | `mr-IN` | -| Arabic (Tunisia) | `ar-TN` | Norwegian | `nb-NO` | -| Arabic (United Arab Emirates) | `ar-AE` | Polish | `pl-PL` | -| Arabic (Yemen) | `ar-YE` | Portuguese (Brazil) | `pt-BR` | -| Bulgarian | `bg-BG` | Portuguese (Portugal) | `pt-PT` | -| Burmese | `my-MM` | Romanian | `ro-RO` | -| Catalan | `ca-ES` | Russian | `ru-RU` | -| Chinese (Cantonese) | `zh-HK` | Serbian | `sr-RS` | -| Chinese (Mandarin) | `zh-CN` | Sinhala | `si-LK` | -| Chinese (Taiwan) | `zh-TW` | Slovak | `sk-SK` | -| Croatian | `hr-HR` | Slovenian | `sl-SI` | -| Czech | `cs-CZ` | Spanish (Argentina) | `es-AR` | -| Danish | `da-DK` | Spanish (Bolivia) | `es-BO` | -| Dutch (Belgium) | `nl-BE` | Spanish (Chile) | `es-CL` | -| Dutch (Netherlands) | `nl-NL` | Spanish (Colombia) | `es-CO` | -| English (Australia) | `en-AU` | Spanish (Costa Rica) | `es-CR` | -| English (Canada) | `en-CA` | Spanish (Cuba) | `es-CU` | -| English (Ghana) | `en-GH` | Spanish (Dominican Republic) | `es-DO` | -| English (Hong Kong) | 
`en-HK` | Spanish (Ecuador) | `es-EC` | -| English (India) | `en-IN` | Spanish (El Salvador) | `es-SV` | -| English (Ireland) | `en-IE` | Spanish (Equatorial Guinea) | `es-GQ` | -| English (Kenya) | `en-KE` | Spanish (Guatemala) | `es-GT` | -| English (New Zealand) | `en-NZ` | Spanish (Honduras) | `es-HN` | -| English (Nigeria) | `en-NG` | Spanish (Mexico) | `es-MX` | -| English (Philippines) | `en-PH` | Spanish (Nicaragua) | `es-NI` | -| English (Singapore) | `en-SG` | Spanish (Panama) | `es-PA` | -| English (South Africa) | `en-ZA` | Spanish (Paraguay) | `es-PY` | -| English (Tanzania) | `en-TZ` | Spanish (Peru) | `es-PE` | -| English (United Kingdom) | `en-GB` | Spanish (Puerto Rico) | `es-PR` | -| English (United States) | `en-US` | Spanish (Spain) | `es-ES` | -| Estonian | `et-EE` | Spanish (United States) | `es-US` | -| Farsi | `fa-IR` | Spanish (Uruguay) | `es-UY` | -| Finnish | `fi-FI` | Spanish (Venezuela) | `es-VE` | -| Filipino | `fil-P` | Swahili (Kenya) | `sw-KE` | -| French (Belgium) | `fr-BE` | Swahili (Tanzania) | `sw-TZ` | -| French (Canada) | `fr-CA` | Swedish | `sv-SE` | -| French (France) | `fr-FR` | Tamil | `ta-IN` | -| French (Switzerland) | `fr-CH` | Telugu | `te-IN` | -| German (Austria) | `de-AT` | Thai | `th-TH` | -| German (Germany) | `de-DE` | Turkish | `tr-TR` | -| German (Switzerland) | `de-CH` | Ukrainian | `uk-UA` | -| Greek | `el-GR` | Uzbek | `uz-UZ` | -| Gujarati | `gu-IN` | Vietnamese | `vi-VN` | -| Hebrew | `he-IL` | Zulu | `zu-ZA` | - | Hindi | `hi-IN` | | | +The following are the BCP-47 codes and their corresponding languages which Azure +Speech-to-Text supports. 
+ + +| Language | Locale (BCP-47) | +| ------------------------------------------- | --------------- | +| Afrikaans (South Africa) | af-ZA | +| Amharic (Ethiopia) | am-ET | +| Arabic (United Arab Emirates) | ar-AE | +| Arabic (Bahrain) | ar-BH | +| Arabic (Algeria) | ar-DZ | +| Arabic (Egypt) | ar-EG | +| Arabic (Israel) | ar-IL | +| Arabic (Iraq) | ar-IQ | +| Arabic (Jordan) | ar-JO | +| Arabic (Kuwait) | ar-KW | +| Arabic (Lebanon) | ar-LB | +| Arabic (Libya) | ar-LY | +| Arabic (Morocco) | ar-MA | +| Arabic (Oman) | ar-OM | +| Arabic (Palestinian Authority) | ar-PS | +| Arabic (Qatar) | ar-QA | +| Arabic (Saudi Arabia) | ar-SA | +| Arabic (Syria) | ar-SY | +| Arabic (Tunisia) | ar-TN | +| Arabic (Yemen) | ar-YE | +| Azerbaijani (Latin, Azerbaijan) | az-AZ | +| Bulgarian (Bulgaria) | bg-BG | +| Bengali (India) | bn-IN | +| Bosnian (Bosnia and Herzegovina) | bs-BA | +| Catalan | ca-ES | +| Czech (Czechia) | cs-CZ | +| Welsh (United Kingdom) | cy-GB | +| Danish (Denmark) | da-DK | +| German (Austria) | de-AT | +| German (Switzerland) | de-CH | +| German (Germany) | de-DE | +| Greek (Greece) | el-GR | +| English (Australia) | en-AU | +| English (Canada) | en-CA | +| English (United Kingdom) | en-GB | +| English (Ghana) | en-GH | +| English (Hong Kong SAR) | en-HK | +| English (Ireland) | en-IE | +| English (India) | en-IN | +| English (Kenya) | en-KE | +| English (Nigeria) | en-NG | +| English (New Zealand) | en-NZ | +| English (Philippines) | en-PH | +| English (Singapore) | en-SG | +| English (Tanzania) | en-TZ | +| English (United States) | en-US | +| English (South Africa) | en-ZA | +| Spanish (Argentina) | es-AR | +| Spanish (Bolivia) | es-BO | +| Spanish (Chile) | es-CL | +| Spanish (Colombia) | es-CO | +| Spanish (Costa Rica) | es-CR | +| Spanish (Cuba) | es-CU | +| Spanish (Dominican Republic) | es-DO | +| Spanish (Ecuador) | es-EC | +| Spanish (Spain) | es-ES | +| Spanish (Equatorial Guinea) | es-GQ | +| Spanish (Guatemala) | es-GT | +| Spanish (Honduras) | 
es-HN | +| Spanish (Mexico) | es-MX | +| Spanish (Nicaragua) | es-NI | +| Spanish (Panama) | es-PA | +| Spanish (Peru) | es-PE | +| Spanish (Puerto Rico) | es-PR | +| Spanish (Paraguay) | es-PY | +| Spanish (El Salvador) | es-SV | +| Spanish (United States) | es-US | +| Spanish (Uruguay) | es-UY | +| Spanish (Venezuela) | es-VE | +| Estonian (Estonia) | et-EE | +| Basque | eu-ES | +| Persian (Iran) | fa-IR | +| Finnish (Finland) | fi-FI | +| Filipino (Philippines) | fil-PH | +| French (Belgium) | fr-BE | +| French (Canada) | fr-CA | +| French (Switzerland) | fr-CH | +| French (France) | fr-FR | +| Irish (Ireland) | ga-IE | +| Galician | gl-ES | +| Gujarati (India) | gu-IN | +| Hebrew (Israel) | he-IL | +| Hindi (India) | hi-IN | +| Croatian (Croatia) | hr-HR | +| Hungarian (Hungary) | hu-HU | +| Armenian (Armenia) | hy-AM | +| Indonesian (Indonesia) | id-ID | +| Icelandic (Iceland) | is-IS | +| Italian (Switzerland) | it-CH | +| Italian (Italy) | it-IT | +| Japanese (Japan) | ja-JP | +| Javanese (Latin, Indonesia) | jv-ID | +| Georgian (Georgia) | ka-GE | +| Kazakh (Kazakhstan) | kk-KZ | +| Khmer (Cambodia) | km-KH | +| Kannada (India) | kn-IN | +| Korean (Korea) | ko-KR | +| Lao (Laos) | lo-LA | +| Lithuanian (Lithuania) | lt-LT | +| Latvian (Latvia) | lv-LV | +| Macedonian (North Macedonia) | mk-MK | +| Malayalam (India) | ml-IN | +| Mongolian (Mongolia) | mn-MN | +| Marathi (India) | mr-IN | +| Malay (Malaysia) | ms-MY | +| Maltese (Malta) | mt-MT | +| Burmese (Myanmar) | my-MM | +| Norwegian Bokmål (Norway) | nb-NO | +| Nepali (Nepal) | ne-NP | +| Dutch (Belgium) | nl-BE | +| Dutch (Netherlands) | nl-NL | +| Punjabi (India) | pa-IN | +| Polish (Poland) | pl-PL | +| Pashto (Afghanistan) | ps-AF | +| Portuguese (Brazil) | pt-BR | +| Portuguese (Portugal) | pt-PT | +| Romanian (Romania) | ro-RO | +| Russian (Russia) | ru-RU | +| Sinhala (Sri Lanka) | si-LK | +| Slovak (Slovakia) | sk-SK | +| Slovenian (Slovenia) | sl-SI | +| Somali (Somalia) | so-SO | +|
Albanian (Albania) | sq-AL | +| Serbian (Cyrillic, Serbia) | sr-RS | +| Swedish (Sweden) | sv-SE | +| Swahili (Kenya) | sw-KE | +| Swahili (Tanzania) | sw-TZ | +| Tamil (India) | ta-IN | +| Telugu (India) | te-IN | +| Thai (Thailand) | th-TH | +| Turkish (Türkiye) | tr-TR | +| Ukrainian (Ukraine) | uk-UA | +| Urdu (India) | ur-IN | +| Uzbek (Latin, Uzbekistan) | uz-UZ | +| Vietnamese (Vietnam) | vi-VN | +| Chinese (Wu, Simplified) | wuu-CN | +| Chinese (Cantonese, Simplified) | yue-CN | +| Chinese (Mandarin, Simplified) | zh-CN | +| Chinese (Jilu Mandarin, Simplified) | zh-CN-shandong | +| Chinese (Southwestern Mandarin, Simplified) | zh-CN-sichuan | +| Chinese (Cantonese, Traditional) | zh-HK | +| Chinese (Taiwanese Mandarin, Traditional) | zh-TW | +| Zulu (South Africa) | zu-ZA | ## Dynamic Speech Selection -The below table describes the component's default behavior when supplied an ISO 639-3 language code by an upstream language identification component in a feed-forward track. For languages with multiple dialects (indicated by an asterisk), a BCP-47 locale was chosen according to internal data, which may not be desirable in all cases. This selection can be altered by editing `acs_speech_component/azure_utils.py`. - -If the language code supplied by a feed-forward track is not handled in `acs_speech_component/azure_utils.py`, the component will raise an `INVALID_PROPERTY` exception. 
- -| ISO 639-3 | Language | BCP-47 | -|:---------:|---------------------|----------| -| `AMH` | Amharic | `am-ET` | -| `ARA` | Arabic | `ar-EG`* | -| `BUL` | Bulgarian | `bg-BG` | -| `CES` | Czech | `cs-CZ` | -| `CMN` | Chinese (Mandarin) | `zh-CN`* | -| `ELL` | Greek | `el-GR` | -| `ENG` | English | `en-US`* | -| `FRE` | French | `fr-FR`* | -| `HIN` | Hindi | `hi-IN` | -| `IND` | Indonesian | `id-ID` | -| `JAV` | Javanese | `jv-ID` | -| `JPN` | Japanese | `ja-JP` | -| `KOR` | Korean | `ko-KR` | -| `LAO` | Lao | `lo-LA` | -| `LIT` | Lithuanian | `lt-LT` | -| `MKD` | Macedonian | `mk-MK` | -| `MYA` | Burmese | `my-MM` | -| `NAN` | Chinese (Taiwan) | `zh-TW`* | -| `PES` | Farsi | `fa-IR` | -| `POL` | Polish | `pl-PL` | -| `POR` | Portuguese | `pt-BR` | -| `RON` | Romanian | `ro-RO` | -| `RUS` | Russian | `ru-RU` | -| `SLK` | Slovak | `sk-SK` | -| `SPA` | Spanish | `es-MX`* | -| `SWA` | Swahili | `sw-KE`* | -| `TAM` | Tamil | `ta-IN` | -| `THA` | Thai | `th-TH` | -| `TUR` | Turkish | `tr-TR` | -| `UKR` | Ukrainian | `uk-UA` | -| `UZB` | Uzbek | `uz-UZ` | -| `VIE` | Vietnamese | `vi-VN` | -| `YUE` | Chinese (Cantonese) | `zh-HK`* | -| `ZUL` | Zulu | `zu-ZA` | +The below table describes the component's default behavior when supplied an ISO 639-3 +language code by an upstream language identification component in a feed-forward track. +For languages with multiple dialects (for example, Arabic, English, and Spanish), the +table lists the default BCP-47 locale, which was chosen according to internal data and +may not be desirable in all cases. This selection can be altered by editing +`acs_speech_component/azure_utils.py`. + +If the language code supplied by a feed-forward track is not handled in +`acs_speech_component/azure_utils.py`, the component will raise an `INVALID_PROPERTY` +exception.
+ +| ISO 639-3 | Language | BCP-47 | +| ---------- | ---------------------------- | ------ | +| afr | Afrikaans | af-ZA | +| amh | Amharic | am-ET | +| ara | Arabic | ar-EG | +| aze | Azerbaijani | az-AZ | +| azj | North Azerbaijani | az-AZ | +| azb | South Azerbaijani | az-AZ | +| ben | Bengali | bn-IN | +| bul | Bulgarian | bg-BG | +| bos | Bosnian | bs-BA | +| cat | Catalan | ca-ES | +| ces | Czech | cs-CZ | +| cze | Czech | cs-CZ | +| cym | Welsh | cy-GB | +| wel | Welsh | cy-GB | +| dan | Danish | da-DK | +| jut | Jutish | da-DK | +| deu | German | de-DE | +| gsw | Swiss German | de-CH | +| bar | Bavarian | de-AT | +| ell | Modern Greek (1453-) | el-GR | +| eng | English | en-US | +| est | Estonian | et-EE | +| ekk | Standard Estonian | et-EE | +| eus | Basque | eu-ES | +| fas | Persian | fa-IR | +| fin | Finnish | fi-FI | +| fil | Filipino | fil-PH | +| fra | French | fr-FR | +| gle | Irish | ga-IE | +| glg | Galician | gl-ES | +| guj | Gujarati | gu-IN | +| heb | Hebrew | he-IL | +| hin | Hindi | hi-IN | +| hrv | Croatian | hr-HR | +| hun | Hungarian | hu-HU | +| ita | Italian | it-IT | +| ind | Indonesian | id-ID | +| ice | Icelandic | is-IS | +| isl | Icelandic | is-IS | +| jav | Javanese | jv-ID | +| jpn | Japanese | ja-JP | +| kat | Georgian | ka-GE | +| kaz | Kazakh | kk-KZ | +| khm | Khmer | km-KH | +| kxm | Northern Khmer | km-KH | +| kan | Kannada | kn-IN | +| kor | Korean | ko-KR | +| lao | Lao | lo-LA | +| lit | Lithuanian | lt-LT | +| lav | Latvian | lv-LV | +| lvs | Standard Latvian | lv-LV | +| mkd | Macedonian | mk-MK | +| mya | Burmese | my-MM | +| mal | Malayalam | ml-IN | +| mon | Mongolian | mn-MN | +| khk | Halh Mongolian | mn-MN | +| mvf | Peripheral Mongolian | mn-MN | +| mar | Marathi | mr-IN | +| zsm | Standard Malay | ms-MY | +| mlt | Maltese | mt-MT | +| nob | Norwegian Bokmål | nb-NO | +| nep | Nepali (macrolanguage) | ne-NP | +| npi | Nepali (individual language) | ne-NP | +| nld | Dutch | nl-NL | +| pan | Panjabi | pa-IN | +| pes
| Iranian Persian | fa-IR | +| pol | Polish | pl-PL | +| por | Portuguese | pt-BR | +| pus | Pushto | ps-AF | +| pbu | Northern Pashto | ps-AF | +| pst | Central Pashto | ps-AF | +| pbt | Southern Pashto | ps-AF | +| sin | Sinhala | si-LK | +| ron | Romanian | ro-RO | +| rus | Russian | ru-RU | +| slk | Slovak | sk-SK | +| slv | Slovenian | sl-SI | +| som | Somali | so-SO | +| spa | Spanish | es-MX | +| sqi | Albanian | sq-AL | +| swa | Swahili (macrolanguage) | sw-KE | +| swe | Swedish | sv-SE | +| srp | Serbian | sr-RS | +| tam | Tamil | ta-IN | +| tel | Telugu | te-IN | +| tgl | Tagalog | fil-PH | +| tha | Thai | th-TH | +| tur | Turkish | tr-TR | +| ukr | Ukrainian | uk-UA | +| urd | Urdu | ur-IN | +| uzb | Uzbek | uz-UZ | +| vie | Vietnamese | vi-VN | +| cmn | Mandarin Chinese | zh-CN | +| zho | Chinese | zh-CN | +| yue | Yue Chinese | yue-CN | +| wuu | Wu Chinese | wuu-CN | +| nan | Min Nan Chinese | zh-TW | +| zul | Zulu | zu-ZA | \ No newline at end of file diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py index bda7d24b..d1c2fbb0 100644 --- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py +++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py @@ -70,6 +70,19 @@ class AcsSpeechDetectionProcessor(object): def __init__(self): self.acs = AzureConnection() + @staticmethod + def _convert_case_bcp(bcp:str)->str: + if not bcp: + return bcp + sep = '-' + if '_' in bcp: + sep = '_' + elif '-' not in bcp: + return bcp + + lang, script = bcp.split(sep) + return f'{lang.lower()}{sep}{script.upper()}' + @staticmethod def convert_word_timing( recognized_phrases: Iterable[Mapping[str, Any]], @@ -209,10 +222,10 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: ) missing_models = set() - default_locale = job_config.language + default_locale = self._convert_case_bcp(job_config.language) 
if (lang := job_config.override_default_language) is not None: - if lang in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: if locale in self.acs.supported_locales: logger.debug( f"Override default language ('{lang}') detected, " @@ -241,18 +254,19 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: locale = default_locale if job_config.speaker is not None: speaker_language_valid = False - if (lang := job_config.speaker.language) in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: - if locale in self.acs.supported_locales: - speaker_language_valid = True - break + if (lang := job_config.speaker.language): + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: + if locale in self.acs.supported_locales: + speaker_language_valid = True + break if not speaker_language_valid: missing_models.add(job_config.speaker.language) ldict = job_config.speaker.language_scores for lang in sorted(ldict.keys(), key=ldict.get, reverse=True): - if lang in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: if locale in self.acs.supported_locales: logger.warning( f"Language supplied in feed-forward track " @@ -283,6 +297,7 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: ) locale = default_locale + locale = self._convert_case_bcp(locale) if locale not in self.acs.supported_locales: raise mpf.DetectionException( f"Selected locale ('{locale}') is not supported by Azure " diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index 835d19ac..753fcd48 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -27,74 +27,141 @@ # 
Dict of conversions from ISO639-3 language codes to BCP-47 codes. The first # BCP-47 code in each value list will be used (the rest are primarily for # reference in case of later changes or customization) + +# Supported languages can be found here: +# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt ISO6393_TO_BCP47 = dict( + afr=["af-ZA"], amh=["am-ET"], ara=["ar-EG", "ar-SA", "ar-IQ", "ar-IL", "ar-AE", "ar-SY", "ar-LY", "ar-DZ", "ar-BH", "ar-JO", "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS", "ar-QA", "ar-TN", "ar-YE"], aze=["az-AZ"], - bel=["be-BY"], - ben=["bn-BD", "bn-IN"], + azj=["az-AZ"], # North Azerbaijani + azb=["az-AZ"], # South Azerbaijani + # bel=["be-BY"], # Deprecated + ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been deprecated # bod=["bo"], # Deprecated bul=["bg-BG"], + bos=["bs-BA"], + cat=["ca-ES"], # ceb=["ceb"], ces=["cs-CZ"], - cmn=["zh-CN"], + cze=["cs-CZ"],# ISO-639-2 Variant + cym=["cy-GB"], + wel=["cy-GB"],# ISO-639-2 Variant + dan=["da-DK"], # Note: There is a related dialect JUT - Jutlandic + jut=["da-DK"], # Upon further research, Jutlandic is present in Denmark + # but declining over time.
+ deu=["de-DE", "de-AT", "de-CH"], + # Many other forms of German exist + gsw=["de-CH"], # Swiss German + bar=["de-AT"], # Bavarian / Upper German variant common in most of Austria ell=["el-GR"], eng=["en-US", "en-CA", "en-GB", "en-AU", "en-GH", "en-HK", "en-IN", "en-IE", "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"], - fra=["fr-FR", "fr-CA", "fr-CH"], + est=["et-EE"], # Estonian (Inclusive) + ekk=["et-EE"], # Standard Estonian + # vro=["et-EE"], Voro, doesn't seem to be direct match + eus=["eu-ES"], + fas=["fa-IR"], + fin=["fi-FI"], + fil=["fil-PH"], + fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], + gle=["ga-IE"], + glg=["gl-ES"], + guj=["gu-IN"], + heb=["he-IL"], + hin=["hi-IN"], + hrv=["hr-HR"], + hun=["hu-HU"], + # ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU" # gug=["gn"], # Deprecated # hat=[], # hau=["ha"], # Deprecated # hbs=["sh"], # Deprecated - hin=["hi-IN"], # hye=["hy"], + ita=["it-IT", "it-CH"], ind=["id-ID"], + ice=["is-IS"], + isl=["is-IS"], jav=["jv-ID"], jpn=["ja-JP"], kat=["ka-GE"], kaz=["kk-KZ"], - kir=["ky-KG"], + khm=["km-KH"], + kxm=["km-KH"], # Northern Khmer, might not work as well. 
+ kan=["kn-IN"], + # kir=["ky-KG"], # Deprecated kor=["ko-KR"], # kur=["ku"], # Deprecated lao=["lo-LA"], lit=["lt-LT"], + lav=["lv-LV"], + lvs=["lv-LV"], # Standard Latvian # luo=[], mkd=["mk-MK"], mya=["my-MM"], - nan=["zh-TW", "nan-TW"], + mal=["ml-IN"], + mon=["mn-MN"], # Mongolian (Inclusive) + khk=["mn-MN"], # Khalkha Mongolian (Predominant) + mvf=["mn-MN"], # Peripheral Mongolian (Part) + mar=["mr-IN"], + zsm=["ms-MY"], + mlt=["mt-MT"], + nob=["nb-NO"], + nep=["ne-NP"], # Nepali (Macrolanguage) + npi=["ne-NP"], # Nepali + nld=["nl-NL", "nl-BE"], # Netherlands and Belgium + # omr=["mr-IN"], # Old Marathi, might not work # nde=["nd"], # orm=["om"], pan=["pa-IN"], pes=["fa-IR"], pol=["pl-PL"], - por=["pt-BR", "pt-PT"], - prs=["prs-AF"], - pus=["pa-AF"], - ron=["ro-RO", "ro-MD"], + por=["pt-BR", "pt-PT"], # pt-BR = Portuguese Brazil, pt-PT = Portuguese Portugal + pus=["ps-AF"], # Pashto, Pushto (Inclusive) + pbu=["ps-AF"], # Northern Pashto + pst=["ps-AF"], # Central Pashto + pbt=["ps-AF"], # Southern Pashto + sin=["si-LK"], + # prs=["prs-AF"], # Deprecated + # pus=["pa-AF"], # Deprecated + ron=["ro-RO"], # ro-MD deprecated # run=[], rus=["ru-RU"], slk=["sk-SK"], + slv=["sl-SI"], # sna=["sn"], som=["so-SO"], spa=["es-MX", "es-US", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU", "es-DO", "es-EC", "es-SV", "es-GQ", "es-GT", "es-HN", "es-NI", "es-PA", "es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"], + sqi=["sq-AL"], swa=["sw-KE", "sw-TZ"], + swe=["sv-SE"], + srp=["sr-RS"], tam=["ta-IN"], + tel=["te-IN"], + # wbq = ["te-IN"], Waddar/Vadari is related to Telugu.
# tat=[], - tgk=["tg-TJ"], - tgl=["fil-PH", "tl-PH"], + # tgk=["tg-TJ"], # Deprecated + tgl=["fil-PH"], # "tl-PH" deprecated tha=["th-TH"], # tir=[], - tpi=["tpi-PG"], + # tpi=["tpi-PG"], # Deprecated tur=["tr-TR"], ukr=["uk-UA"], urd=["ur-IN"], uzb=["uz-UZ"], vie=["vi-VN"], - yue=["zh-HK", "yue-CN"], + cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], + zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], + yue=["yue-CN", "zh-HK"], # Cantonese + wuu=["wuu-CN"], + nan=["zh-TW"], # nan-TW deprecated + # Note, Taiwanese has one standard + one major dialect, + # not sure which is covered better by Azure. zul=["zu-ZA"] ) \ No newline at end of file diff --git a/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json b/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json index 9c9b6f51..7dec6ef0 100644 --- a/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json +++ b/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json @@ -10,7 +10,7 @@ "description": "Uses Azure Cognitive Services to perform speech-to-text.", "actionType": "DETECTION", "trackType": "SPEECH", - "outputChangedCounter" : 2, + "outputChangedCounter": 2, "requiresCollection": { "states": [] }, @@ -59,7 +59,7 @@ }, { "name": "LANGUAGE", - "description": "The language/locale to use for transcription.", + "description": "The language/locale, in BCP-47 format, to use for transcription. 
Please consult README to review Azure's supported list of BCP-47 codes.", "type": "STRING", "defaultValue": "en-US" }, @@ -193,4 +193,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py index 2d899120..d1bf47bb 100644 --- a/python/AzureSpeechDetection/tests/test_acs_speech.py +++ b/python/AzureSpeechDetection/tests/test_acs_speech.py @@ -112,7 +112,7 @@ def test_audio_file(self): stop_time=-1, job_properties=get_test_properties( DIARIZE='FALSE', - LANGUAGE='en-US', + LANGUAGE='EN-us', USE_SAS_AUTH='TRUE' ), media_properties={}, @@ -137,7 +137,7 @@ def test_video_file(self): stop_frame=-1, job_properties=get_test_properties( DIARIZE='FALSE', - LANGUAGE='en-US' + LANGUAGE='En-Us' ), media_properties=dict( FPS='24' @@ -204,7 +204,7 @@ def test_language(self): stop_time=-1, job_properties=get_test_properties( DIARIZE='TRUE', - LANGUAGE='en-US' + LANGUAGE='en-us' ), media_properties={}, feed_forward_track=None diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 2740de41..d58c4aa4 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -1,68 +1,85 @@ # Overview This repository contains source code for the OpenMPF Azure Cognitive Services -Translation Component. This component utilizes the [Azure Cognitive Services +Translation Component. This component utilizes the [Azure Cognitive Services Translator REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate) -to translate the content of detection properties. It has only been tested -against v3.0 of the API. +to translate the content of detection properties. It has been tested against v3.0 of the +API. 
This component translates the content of existing detection properties, -so it only makes sense to use it with -[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and +so it only makes sense to use it with +[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and when it isn't the first element of a pipeline. - -When a detection property is translated, the translation is put in to a new -detection property named `TRANSLATION`. The original detection property is not -modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 + +When a detection property is translated, the translation is put in to a new +detection property named `TRANSLATION`. The original detection property is not +modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 language code of the translated text will also be added. If the language of the input text is detected to be the same as the `TO_LANGUAGE` job property, -then no translation will occur. When translation is skipped because of -matching languages, the `TRANSLATION` detection property will be omitted and +then no translation will occur. When translation is skipped because of +matching languages, the `TRANSLATION` detection property will be omitted and `SKIPPED TRANSLATION=TRUE` will be added to the detection properties. When the source text is multiple languages, the translation endpoint will only -translate one of the languages. For example, translating -"你叫什么名字? ¿Cómo te llamas?" to English results in +translate one of the languages. For example, translating +"你叫什么名字? ¿Cómo te llamas?" to English results in "What is your name? The Cómo te llamas?". # Required Job Properties In order for the component to process any jobs, the job properties listed below -must be provided. Neither has a default value. - -- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. - e.g. 
`https://api.cognitive.microsofttranslator.com` or - `https:///translator/text/v3.0`. The URL should - not end with `/translate` because two separate endpoints are - used. `ACS_URL + '/translate'` is used for translation. - `ACS_URL + '/breaksentence'` is used to break up text when it is too long - for a single translation request. This property can also be configured - using an environment variable named `MPF_PROP_ACS_URL`. - +must be provided. Neither has a default value. + +- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. + e.g. `https://api.cognitive.microsofttranslator.com` or + `https:///translator/text/v3.0`. The URL should + not end with `/translate` because two separate endpoints are + used. `ACS_URL + '/translate'` is used for translation. + `ACS_URL + '/breaksentence'` is used to break up text when it is too long + for a single translation request. This property can also be configured + using an environment variable named `MPF_PROP_ACS_URL`. + - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services - subscription key. To get one you will need to create an + subscription key. To get one you will need to create an Azure Cognitive Services account. This property can also be configured using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`. - - + + # Important Job Properties: -- `TO_LANGUAGE`: The BCP-47 language code for language that the properties - should be translated to. -- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating - which properties in the feed-forward track or detection to consider +- `TO_LANGUAGE`: The BCP-47 language code for language that the properties + should be translated to. + +- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating + which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. 
If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. - `FROM_LANGUAGE`: In most cases, this property should not be used. It should - only be used when automatic language detection is detecting the wrong - language. Providing this property prevents the translation endpoint from - doing automatic language detection. If `FROM_LANGUAGE` is provided, and the - text is actually another language, the translation endpoint will return the + only be used when automatic language detection is detecting the wrong + language: Users can provide a BCP-47 code to force the translation service + to translate text with a corrected source language. + For instance, if incoming text is incorrectly being detected by Azure as + Spanish instead of English, users can set `FROM_LANGUAGE=en` + to force the service to treat all submitted text in the current job as English. + + Providing this property prevents the translation endpoint from + doing automatic language detection. If `FROM_LANGUAGE` is provided, and the + text is actually another language, the translation endpoint will return the input text unchanged. - + +- `SUGGESTED_FROM_LANGUAGE`: Optional property that indicates the fallback source + BCP-47 language code to use when automatic language detection fails. + The value from this property is only used when automatic language detection fails. + `SUGGESTED_FROM_LANGUAGE` is the preferred setting to adjust when users know + they are processing a large amount of text in a particular language, but other + source languages may be present in individual pieces of text. + For instance, setting `SUGGESTED_FROM_LANGUAGE=es` would allow the component to + default to translating from Spanish, whenever Azure's language detector fails + to identify the source language of the incoming text. 
+ # Listing Supported Languages -To list the supported languages replace `${ACS_URL}` and +To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the following command and run it: ```shell script curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index abfdabdf..402da4e4 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -24,70 +24,322 @@ # limitations under the License. # ############################################################################# +import logging from typing import Optional import langcodes +logger = logging.getLogger('AcsTranslationComponent') + +# A full list of supported languages can be found here: +# https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support + +# For some cases, we'll need to distinguish incoming script info +# As general practice these script codes are attached +# to the ISO-639. +ISO639_WITH_SCRIPT_TO_BCP47 = { + "ZHO-HANS":"zh-hans", + "ZHO-HANT":"zh-hant" +} + ISO6393_TO_BCP47 = dict( + AFK='af', AMH='am', - ARA='ar', + ARA='ar', # Note, Large number of variants + ASM='as', AZE='az', + BAK='bk', + BEN='bn', + BHO='bho', # Azure uses ISO code. BOD='bo', + BOS='bs', + BRX='brx', # Azure uses ISO code. BUL='bg', + CAT='ca', CES='cs', + ZHO='zh-hans', # Choosing to associate baseline Chinese as simplfied variant. + ZH='zh-hans',# Choosing to associate baseline Chinese as simplfied variant. + # Change to zh-hant if needed. 
CMN='zh-hans', + CYM='cy', + DAN='da', # Insular Danish + DEU='de', # German, note: a lot of variants exist + DIV='dv', + DOI='doi', # Two other variants + DSB='dsb', + EUS='eu', ELL='el', ENG='en', + EST='et', # Two other variants + FAO='fo', + FIJ='fj', + FIL='fil', + FIN='fi', FRA='fr', + GLE='ga', + GLG='gl', + GUJ='gu', HAT='ht', + HAU='ha', + HEB='he', # Several archaic forms: https://en.wikipedia.org/wiki/Hebrew_language HIN='hi', + HRV='hr', + HSB='hsb', + HUN='hu', # Old Hungarian also exists as code `OHU` HYE='hy', + IBO='ig', IND='id', + IKT='ikt', + IKU='iu', + ISL='is', + ITA='it', JPN='ja', KAT='ka', + KAN='kn', + KAS='ks', KAZ='kk', + KHM='km', + KIN='rw', KIR='ky', + LUG='lug', + GOM='gom', # Goan Konkani, other two ISO variants redirected to this KOR='ko', KUR='ku', + CKB='ku', # Azure noted Central Kurdish is supported as Ku + KMR='kmr', # Northern Kurdish + # There are two other variants of Kurdish, but they don't seem to be directly supported. LAO='lo', - LIT='lt', + LAV='lv', + LIN='ln', + LIT='lt', # There is an old Lithuanian variant (OLT) + LZH='lzh', + MAI='mai', + MAL='ml', + MAR='mr', # There's also an old Marathi variant (OMR) MKD='mk', - MYA='my', + MLG='mg', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malagasy_language + MLT='mt', + MON = 'mn-cyrl', # Note: Azure also supports the traditional Mongolian script as `mn-Mong` + # The primary script these days is Cyrillic/Latin. + # From https://en.wikipedia.org/wiki/Mongolian_writing_systems: + # "In March 2020, the Government of Mongolia announced plans to use the + # traditional Mongolian script alongside Cyrillic in official documents starting from 2025." + KHK='mn-cyrl', # Khalkha Mongolian + MVF='mn-mong', # Peripheral Mongolian (part) + MWW='mww', + MRI='mi', + MSA='ms', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malay_language + MYA='my', # Several variants exist.
NAN='zh-hant', + NEP='ne', + NPI='ne', + NLD='nl', + NOR='no', + NOB='no', # Two subtypes of Norwegian in active use. + NNO='no', + NYA='ny', + NSO='nso', + ORI='or', # Several variants exist. + ORY='or', + OTQ='otq', PAN='pa', PES='fa', POL='pl', - POR='pt', + POR='pt', # Defaulting to Portuguese (Brazil) - Other variant below PRS='prs', - PUS='ps', + PUS='ps', # Several variants exist. RON='ro', + RUN='rn', RUS='ru', + SIN='si', SLK='sk', + SLV='sl', + SMO='sm', # Samoan Latin SOM='so', + SOT='st', + SRP='sr-Cyrl', # Note: Serbian language is fully digraphic, two popular script forms exist + # Cyrillic is the official version adopted by Serbia SPA='es', + SNA='sn', # Three other variants exist under Shona language. + SND='sd', SQI='sq', SWA='sw', - TAM='ta', + SWE='sv', + TAH='ty', + TAM='ta', # Old Tamil variant exists as OTY TAT='tt', + TEL='te', # Related language: wbq – Waddar (Vadari), not included. THA='th', TIR='ti', + TSN='tn', + TUK='tk', TUR='tr', + TON='to', + UIG='ug', UKR='uk', URD='ur', UZB='uz', VIE='vi', + XHO='xh', + YOR='yo', YUE='yue', - ZUL='zu' + YUA='yua', + ZUL='zu', ) -BCP_CODES = set(ISO6393_TO_BCP47.values()) +# These cover conflicting 639-2 codes and less common variants +# A warning will be issued if these are used. +ISO639_VAR_TO_BCP47 = dict( + # ISO Code Variant = Same language but two ISO-639 codes match to it. + # Variant = May be different from primary ISO code. + # Note: We're avoiding adding in archaic/extinct variants, attaching a note to + # languages with those present. 
+ + ALB = 'sq', # 639-2 Code Variant + ARM = 'hy', # 639-2 Code Variant + AZJ = 'az', # North Azerbaijani Variant + AZB = 'az', # South Azerbaijani Variant + BAQ = 'eu', # 639-2 Code Variant + CZE = 'cs', # 639-2 Code Variant Czech + + TWL = 'sn', # 639-3 Variants of Shona + MXC = 'sn', # 639-3 Variants of Shona + TWX = 'sn', # 639-3 Variants of Shona + + JUT = 'da', # 639-3: Jutish - Danish Dialect + + DGO = 'doi', # Dogri proper + XNR = 'doi', # Kangri + + DUT = 'nl', # 639-2 Code Variant of Dutch + + EKK = 'et', # Standard Estonian + VRO = 'et', # Võro - Estonian Dialect (Debated) + + FRE = 'fr', # 639-2 Code Variant of French + + GEO = 'ka', # 639-2 Code Variant of Georgian + + GER = 'de', # 639-2 Code Variant of German + # Warning: There's many other variants of old and regional forms. + # https://en.wikipedia.org/wiki/German_language + + GRE = 'el', # 639-2 Code Variant of Greek + # Note: There's several other variants + # https://en.wikipedia.org/wiki/Greek_language + + ICE = 'is', # 639-2 Code Variant of Icelandic + + IKE = 'iu', # Eastern Canadian Inuktitut + + KXM = 'km', # Northern Khmer + + KOK = 'gom', # Kokani + KNN = 'gom', # Maharashtrian Konkani + + TTS = 'lo', # Isan (Thailand Lao) + + LVS = 'lv', # Standard Latvian language + # LTG = 'lv' # Latgalian language (Historical Form) + + MAC = 'mk', # 639-2 Code Variant of Macedonian + + MAY = 'ms', # 639-2 Code Variant of Malay + ZLM = 'ms', # Malay (individual language) + ZSM = 'ms', # Malaysian Malay + + MAO = 'mi', # 639-2 Code Variant of Maori + + BUR = 'my', # 639-2 Code Variant of Burmese (Myanmar) + # Several other closely related Burmese variants exist below: + INT = 'my', # Intha + TCO = 'my', # Taungyo + RKI = 'my', # Rakhine + RMZ = 'my', # Marma + TAY = 'my', # Tavoyan dialects + + # Variants of Odia + SPV = 'or', # Sambalpuri + ORT = 'or', # Adivasi Odia (Kotia) + DSO = 'or', # Desiya + + + # Variants of Pashto + PST = 'ps', # Central Pashto + PBU = 'ps', # Northern Pashto + PBT = 'ps', # 
Southern Pashto + # WNE - Archaic + + # Persian has many variants + # Only including top three + # PES = 'fa' # Iranian Persian - Default below + # PRS = 'fa' # Dari - Default Below + TGK = 'fa', # Tajik + + PNB = 'pa', # Western Punjabi/Panjabi + + RUM = 'ro', # 639-2 Code Variant of Romanian + + SLO = 'sk', # 639-2 Code Variant of Slovak + + # Swahili Variants + SWC='sw', # Congo Swahili + SWH='sw', # Coastal Swahili + # YMK='sw', # Makwe (?) + # WMW='sw', # Mwani (?) + + TIB='bo', # 639-2 Code Variant of Tibetan + + UZN='uz', # Northern Uzbek + UZS='uz', # Southern Uzbek + + WEL='cy', # 639-2 Code Variant of Welsh + ) + +BCP_CODES_ONLY = { + 'iu-latn', # Inuktitut (Latin) + 'fr-ca', # French Canadian + 'tlh-latn', # Klingon Latin + 'tlh-piqd', # Klingon (pIqaD) + 'mn-cyrl', # Mongolian (Cyrillic) + 'mn-mong', # Mongolian (Traditional) + 'pt-pt', # Portuguese (Portugal) + 'sr-latn', # Serbian (Latin) +} + + + +BCP_CODES = BCP_CODES_ONLY | \ + set(ISO6393_TO_BCP47.values()) | \ + set(ISO639_WITH_SCRIPT_TO_BCP47.values()) def iso_to_bcp(language_code: str) -> Optional[str]: - if bcp_code := ISO6393_TO_BCP47.get(language_code.upper()): + # First check if we have matching scripts/regional variants + language_code = language_code.strip() + if bcp_code := ISO639_WITH_SCRIPT_TO_BCP47.get(language_code.upper()): return bcp_code elif language_code.lower() in BCP_CODES: - return language_code - elif lang_info := langcodes.get(language_code): + return language_code.lower() + + lang_code = language_code.upper() + + # Remove attached script/variant info, + # Check language portion of ISO code next.
+ if '-' in lang_code: + lang_code = lang_code.split('-')[0] + if bcp_code := ISO6393_TO_BCP47.get(lang_code): + return bcp_code + elif lang_code.lower() in BCP_CODES: + return lang_code.lower() + elif lang_info := langcodes.get(lang_code): + # TODO, after langcodes conversion, we may want to consider double checking the BCP codes again + # discard if value does not match supported codes. return lang_info.language + elif bcp_code_var := ISO639_VAR_TO_BCP47.get(lang_code): + logger.warning( + f"Unable to find direct a BCP code match for {language_code}. " + f"Found a potential BCP match or variant: {bcp_code_var}. " + f"Using `{bcp_code_var}` as input language.") + return bcp_code_var else: return None diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 2533d996..24dd014f 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -10,7 +10,7 @@ "description": "Uses Azure Cognitive Services to perform translation.", "actionType": "DETECTION", "trackType": "TRANSLATION", - "outputChangedCounter" : 1, + "outputChangedCounter": 1, "requiresCollection": { "states": [] }, @@ -41,19 +41,19 @@ }, { "name": "TO_LANGUAGE", - "description": "The BCP-47 language code for language that the properties should be translated to.", + "description": "The BCP-47 language code for language that the properties should be translated to. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "en" }, { "name": "FROM_LANGUAGE", - "description": "Optional property that indicates the source language of the text. When provided, it disables automatic language detection. 
If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified.", + "description": "Optional property that indicates the source BCP-47 language code of the text (e.g. 'es' to translate from text confirmed to be in Spanish). When provided, it disables automatic language detection. If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, { "name": "SUGGESTED_FROM_LANGUAGE", - "description": "Optional property that indicates the fallback source language to use when automatic language detection fails. The value from this property is only used when automatic language detection fails.", + "description": "Optional property that indicates the fallback source BCP-47 language code to use when automatic language detection fails (e.g. 'es' to translate text that is suspected to be in Spanish). The value from this property is only used when automatic language detection fails.
Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, @@ -104,8 +104,7 @@ "name": "AZURE TRANSLATION TEXT FILE ACTION", "description": "Uses Azure Cognitive Services to perform translation on a plain text file.", "algorithm": "AZURETRANSLATION", - "properties": [ - ] + "properties": [] } ], "tasks": [ diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index d9651d1b..83eac939 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -46,6 +46,8 @@ get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints, AcsTranslateUrlBuilder, BreakSentenceClient, SentenceBreakGuesser, get_n_azure_chars) +from acs_translation_component.convert_language_code import iso_to_bcp + SEEN_TRACE_IDS = set() @@ -88,6 +90,17 @@ def get_request_body(cls) -> List['AcsRequestEntry']: def tearDown(self): self.mock_server.drain_queues() + def test_iso_code_checker(self): + self.assertEqual('zh-hans', iso_to_bcp("ZH")) + self.assertEqual('zh-hans', iso_to_bcp("Zh")) + self.assertEqual('zh-hans', iso_to_bcp("zh")) + self.assertEqual('zh-hans', iso_to_bcp("ZHO")) + + self.assertEqual('zh-hant', iso_to_bcp("ZHO-HANT")) + self.assertEqual('zh-hant', iso_to_bcp("Zho-haNT")) + self.assertEqual('zh-hant', iso_to_bcp("ZH-Hant")) + self.assertEqual('zh-hans', iso_to_bcp("zh-HANS")) + self.assertEqual('fr-ca', iso_to_bcp("fr-ca")) def test_simple_jobs(self): def validate_results(results):