From d8f17b34c46d3cd693713b10e54e68a326368971 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Thu, 4 Apr 2024 20:32:36 -0400 Subject: [PATCH 1/6] Updating language maps for Azure STT and Translation. Updating translation mapping behavior. --- .../acs_speech_component/azure_utils.py | 88 +++++- .../convert_language_code.py | 268 +++++++++++++++++- .../tests/test_acs_translation.py | 10 + 3 files changed, 340 insertions(+), 26 deletions(-) diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index 835d19ac..cba3c8b5 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -28,73 +28,131 @@ # BCP-47 code in each value list will be used (the rest are primarily for # reference in case of later changes or customization) ISO6393_TO_BCP47 = dict( + afr=["af-ZA"], amh=["am-ET"], ara=["ar-EG", "ar-SA", "ar-IQ", "ar-IL", "ar-AE", "ar-SY", "ar-LY", "ar-DZ", "ar-BH", "ar-JO", "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS", "ar-QA", "ar-TN", "ar-YE"], aze=["az-AZ"], - bel=["be-BY"], - ben=["bn-BD", "bn-IN"], + # bel=["be-BY"], # Depreciated + # ben=["bn-BD", "bn-IN"], # Bengali-Bangladesh has been depreciated + ben=["bn-IN"], # bod=["bo"], # Deprecated bul=["bg-BG"], + bos=["bs-BA"], + cat=["ca-ES"], # ceb=["ceb"], ces=["cs-CZ"], - cmn=["zh-CN"], + cym=["cy-GB"], + cze=["cs-CZ"], + dan=["da-DK"], + deu=["de-DE", "de-AT", "de-CH"], ell=["el-GR"], eng=["en-US", "en-CA", "en-GB", "en-AU", "en-GH", "en-HK", "en-IN", "en-IE", "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"], - fra=["fr-FR", "fr-CA", "fr-CH"], + est=["et-EE"], # Estonian (Inclusive) + ekk=["et-EE"], # Standard Estonian + #vro=["et-EE"], Voro, doesn't seem to be direct match + eus=["eu-ES"], + fas=["fa-IR"], + fin=["fi-FI"], + fil=["fil-PH"], + fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], + gle=["ga-IE"], + glg=["gl-ES"], + 
guj=["gu-IN"], + heb=["he-IL"], + hin=["hi-IN"], + hrv=["hr-HR"], + hun=["hu-HU"], + #ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU" # gug=["gn"], # Deprecated # hat=[], # hau=["ha"], # Deprecated # hbs=["sh"], # Deprecated - hin=["hi-IN"], # hye=["hy"], + ita=["it-IT", "it-CH"], ind=["id-ID"], + ice=["is-IS"], + isl=["is-IS"], jav=["jv-ID"], jpn=["ja-JP"], kat=["ka-GE"], kaz=["kk-KZ"], - kir=["ky-KG"], + khm=["km-KH"], + kxm=["km-KH"], # Northern Khmer, might not work as well. + kan=["kn-IN"], + # kir=["ky-KG"], # Deprecated kor=["ko-KR"], # kur=["ku"], # Deprecated lao=["lo-LA"], lit=["lt-LT"], + lav=["lv-LV"], + lvs=["lv-LV"], # Standard Latvian # luo=[], mkd=["mk-MK"], mya=["my-MM"], - nan=["zh-TW", "nan-TW"], + mal=["ml-IN"], + mon=["mn-MN"], # Mongolian (Inclusive) + khk=["mn-MN"], # Khalkha Mongolian (Predominant) + mvf=["mn-MN"], # Peripheral Mongolian (Part) + mar=["mr-IN"], + zsm=["ms-MY"], + mlt=["mt-MT"], + nob=["nb-NO"], + nep=["ne-NP"], # Nepali (Macrolanguage) + npi=["ne-NP"], # Nepali + nld=["nl-NL", "nl-BE"], # Netherlands and Belgium + # omr=["mr-IN"], # Old Maranthi, might not work # nde=["nd"], # orm=["om"], pan=["pa-IN"], pes=["fa-IR"], pol=["pl-PL"], - por=["pt-BR", "pt-PT"], - prs=["prs-AF"], - pus=["pa-AF"], - ron=["ro-RO", "ro-MD"], + por=["pt-BR", "pt-PT"], # pt-BR = Portuguese Brazil, pt-PT = Portuguese Portugal + pus=["ps-AF"], # Pashto, Pushto (Inclusive) + pbu=["ps-AF"], # Northern Pahsto + pst=["ps-AF"], # Central Pahsto + pbt=["ps-AF"], # Southern Pahsto + sin=["si-LK"], + #prs=["prs-AF"], # Deprecated + #pus=["pa-AF"], # Deprecated + #ron=["ro-RO", "ro-MD"], # ro-MD depreciated + ron=["ro-RO"], # run=[], rus=["ru-RU"], slk=["sk-SK"], + slv=["sl-SI"], # sna=["sn"], som=["so-SO"], spa=["es-MX", "es-US", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU", "es-DO", "es-EC", "es-SV", "es-GQ", "es-GT", "es-HN", "es-NI", "es-PA", "es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"], + sqi=["sq-AL"], 
swa=["sw-KE", "sw-TZ"], + swe=["sv-SE"], + srp=["sr-RS"], tam=["ta-IN"], + tel=["te-IN"], + # wbq = ["te-IN"], Waddar/Vadari is related to Telugu. # tat=[], - tgk=["tg-TJ"], - tgl=["fil-PH", "tl-PH"], + #tgk=["tg-TJ"], # Depreciated + #tgl=["fil-PH", "tl-PH"], # "tl-PH" Depreciated tha=["th-TH"], # tir=[], - tpi=["tpi-PG"], + #tpi=["tpi-PG"], # Depreciated tur=["tr-TR"], ukr=["uk-UA"], urd=["ur-IN"], uzb=["uz-UZ"], vie=["vi-VN"], - yue=["zh-HK", "yue-CN"], + cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], + zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], + yue=["yue-CN", "zh-HK"], # Cantonese + wuu=["wuu-CN"], + # nan=["zh-TW", "nan-TW"], # nan-TW depreciated + nan=["zh-TW"], # Note, Taiwanese has one standard + one major dialect, + # not sure which is covered better by Azure. zul=["zu-ZA"] ) \ No newline at end of file diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index abfdabdf..a3080bee 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -24,70 +24,316 @@ # limitations under the License. # ############################################################################# +import logging from typing import Optional import langcodes +logger = logging.getLogger('AcsTranslationComponent') + +# These cover conflicting 639-2 codes and less common variants +# A warning will be issued if these are used. +ISO639_VAR_TO_BCP47 = dict( + # ISO Code Variant = Same language but two ISO-639 codes match to it. + # Variant = May be different from primary ISO code. + # Note: We're avoiding adding in archaic/extinct variants, attaching a note to + # languages with those present. 
+ + ALB = 'sq', # 639-2 Code Variant + ARM = 'hy', # 639-2 Code Variant + AZJ = 'az', # North Azerbaijani Variant + AZB = 'az', # South Azerbaijani Variant + BAQ = 'eu', # 639-2 Code Variant + CZE = 'cs', # 639-2 Code Variant Czech + + TWL = 'sn', # 639-3 Variants of Shona + MXC = 'sn', # 639-3 Variants of Shona + TWX = 'sn', # 639-3 Variants of Shona + + JUT = 'da', # 639-3: Jutish - Danish Dialect + + DGO = 'doi', # Dogri proper + XNR = 'doi', # Kangri + + DUT = 'nl', # 639-2 Code Variant of Dutch + + EKK = 'et', # Standard Estonian + VRO = 'et', # Võro - Estonian Dialect (Debated) + + FRE = 'fr', # 639-2 Code Variant of French + + GEO = 'ka', # 639-2 Code Variant of Georgian + + GER = 'de', # 639-2 Code Variant of German + # Warning: There's many other variants of old and regional forms. + # https://en.wikipedia.org/wiki/German_language + + GRE = 'el', # 639-2 Code Variant of Greek + # Note: There's several other variants + # https://en.wikipedia.org/wiki/Greek_language + + ICE = 'is', # 639-2 Code Variant of Icelandic + + IKE = 'iu', # Eastern Canadian Inuktitut + + KXM = 'km', # Northern Khmer + + KOK = 'gom', # Kokani + KNN = 'gom', # Maharashtrian Konkani + + TTS = 'lo', # Isan (Thailand Lao) + + LVS = 'lv', # Standard Latvian language + #LTG = 'lv' # Latgalian language (Historical Form) + + MAC = 'mk', # 639-2 Code Variant of Macedonian + + MAY = 'ms', # 639-2 Code Variant of Malay + ZLM = 'ms', # Malay (individual language) + ZSM = 'ms', # Malaysian Malay + + MAO = 'mi', # 639-2 Code Variant of Maori + + BUR = 'my', # 639-2 Code Variant of Burmese (Myanmar) + # Several other closely related Burmese variants exist below: + INT = 'my', # Intha + TCO = 'my', # Taungyo + RKI = 'my', # Rakhine + RMZ = 'my', # Marma + TAY = 'my', # Tavoyan dialects + + # Variants of Odia + SPV = 'or', # Sambalpuri + ORT = 'or', # Adivasi Odia (Kotia) + DSO = 'or', # Desiya + + + # Variants of Pashto + PST = 'ps', # Central Pashto + PBU = 'ps', # Northern Pashto + PBT = 'ps', # 
Southern Pashto + # WNE - Archaic + + # Persian has many variants + # Only including top three + #PES = 'fa' # Iranian Persian - Default below + #PRS = 'fa' # Dari - Default Below + TGK = 'fa', # Tajik + + PNB = 'pa', # Western Punjabi/Panjabi + + RUM = 'ro', # 639-2 Code Variant of Romanian + + SLO = 'sk', # 639-2 Code Variant of Slovak + + # Swahili Variants + SWC='sw', # Congo Swahili + SWH='sw', # Coastal Swahili + #YMK='sw', # Makwe (?) + #WMW='sw', # Mwani (?) + + TIB='bo', # 639-2 Code Variant of Tibetan + + UZN='uz', # Northern Uzbek + UZS='uz', # Southern Uzbek + + WEL='cy', # 639-2 Code Variant of Welsh + ) + + +# For some cases, we'll need to distinguish incoming script info +# As general practice these script codes are attached +# to the ISO-639. +ISO639_WITH_SCRIPT_TO_BCP47 = { + "ZHO-HANS":"zh-hans", + "ZHO-HANT":"zh-hant" +} + ISO6393_TO_BCP47 = dict( + AFK='af', AMH='am', - ARA='ar', + ARA='ar', # Note, Large number of variants + ASM='as', AZE='az', + BAK='bk', + BEN='bn', + BHO='bho', # Azure uses ISO code. BOD='bo', + BOS='bs', + BRX='brx', # Azure uses ISO code. BUL='bg', + CAT='ca', CES='cs', + ZHO='zh-hans', # Choosing to associate baseline Chinese as simplfied variant. + ZH='zh-hans',# Choosing to associate baseline Chinese as simplfied variant. + # Change to zh-hant if needed. 
CMN='zh-hans', + CYM='cy', + DAN='da', # Insular Danish + DEU='de', # German, note: a lot of variants exist + DIV='dv', + DOI='doi', # Two other variants + DSB='dsb', + EUS='eu', ELL='el', ENG='en', + EST='et', # Two other variants + FAO='fo', + FIJ='fj', + FIL='fil', + FIN='fi', FRA='fr', + GLE='ga', + GLG='gl', + GUJ='gu', HAT='ht', + HAU='ha', + HEB='he', # Several archaic forms: https://en.wikipedia.org/wiki/Hebrew_language HIN='hi', + HRV='hr', + HSB='hsb', + HUN='hu', # Old Hungarian also exists as code `OHU` HYE='hy', + IBO='ig', IND='id', + IKT='ikt', + IKU='iu', + ISL='is', + ITA='it', JPN='ja', KAT='ka', + KAN='kn', + KAS='ks', KAZ='kk', + KHM='km', + KIN='rw', KIR='ky', + LUG='lug', + GOM='gom', # Goan Konkani, other two ISO variants redirected to this KOR='ko', KUR='ku', + CKB='ku', # Azure noted Central Kurdish is supported as Ku + KMR='kmr', # Northern Kurdish + # There areNorthern two other variants of Kurdish but them don't seem to be directly supported. LAO='lo', - LIT='lt', + LAV='lv', + LIN='ln', + LIT='lt', # There is an old Lithuanian variant (OLT) + LZH='lzh', + MAI='mai', + MAL='ml', + MAR='mr', # There's also an old variant Marathi variant (OMR) MKD='mk', - MYA='my', + MLG='mg', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malagasy_language + MLT='mt', + MON = 'mn-cyrl', # Note: Azure also supports the traditional Mongolian script as `mn-Mong` + # The primary script these days is Cyrllic/Latin. + # From https://en.wikipedia.org/wiki/Mongolian_writing_systems: + # "In March 2020, the Government of Mongolia announced plans to use the + # traditional Mongolian script alongside Cyrillic in official documents starting from 2025." + KHK='mn-cyrl', # Khalkha Mongolian + MVF='mn-mong', # Peripheral Mongolian (part) + MWW='mww', + MRI='mi', + MSA='ms', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malay_language + MYA='my', # Several variants exist. 
NAN='zh-hant', + NEP='ne', + NPI='ne', + NLD='nl', + NOR='no', + NOB='no', # Two subtypes of Norwegian in active use. + NNO='no', + NYA='ny', + NSO='nso', + ORI='or', # Several variants exist. + ORY='or', + OTQ='otq', PAN='pa', PES='fa', POL='pl', - POR='pt', + POR='pt', # Defaulting to Portuguese (Brazil) - Other variant below PRS='prs', - PUS='ps', + PUS='ps', # Several variants exist. RON='ro', + RUN='rn', RUS='ru', + SIN='si', SLK='sk', + SLV='sl', + SMO='sm', # Samoan Latin SOM='so', + SOT='st', + SRP='sr-Cyrl', # Note: Serbian language is fully digraphic, two popular script forms exist + # Cyrillic is the official version adopted by Serbia SPA='es', + SNA='sn', # Three other variants exist under Shona language. + SND='sd', SQI='sq', SWA='sw', - TAM='ta', + SWE='sv', + TAH='ty', + TAM='ta', # Old Tamil variant exists as OTY TAT='tt', + TEL='te', # Related language: wbq – Waddar (Vadari), not included. THA='th', TIR='ti', + TSN='tn', + TUK='tk', TUR='tr', + TON='to', + UIG='ug', UKR='uk', URD='ur', UZB='uz', VIE='vi', + XHO='xh', + YOR='yo', YUE='yue', - ZUL='zu' + YUA='yua', + ZUL='zu', ) +BCP_CODES_ONLY = { + 'iu-latn', # Inuktitut (Latin) + 'fr-ca', # French Canadian + 'tlh-latn', # Klingon Latin + 'tlh-piqd', # Klingon (plqaD) + 'mn-cyrl', # Mongolian (Cyrllic) + 'mn-mong', # Mongolian (Traditional) + 'pt-pt', # Portuguese (Portugal) + 'sr-latn', # Serbian (Latin) +} -BCP_CODES = set(ISO6393_TO_BCP47.values()) +BCP_CODES = BCP_CODES_ONLY | \ + set(ISO6393_TO_BCP47.values()) | \ + set(ISO639_WITH_SCRIPT_TO_BCP47.values()) def iso_to_bcp(language_code: str) -> Optional[str]: - if bcp_code := ISO6393_TO_BCP47.get(language_code.upper()): + # First check if we have matching scripts/regional variants + if bcp_code := ISO639_WITH_SCRIPT_TO_BCP47.get(language_code.upper()): return bcp_code elif language_code.lower() in BCP_CODES: - return language_code - elif lang_info := langcodes.get(language_code): + return language_code.lower() + + lang_code = 
language_code.upper().strip() + + # Remove attached script/variant info, + # Check language portion of ISO code next. + if '-' in lang_code: + lang_code = lang_code.split('-')[0] + if bcp_code := ISO6393_TO_BCP47.get(lang_code): + return bcp_code + elif lang_code.lower() in BCP_CODES: + return lang_code.lower() + elif lang_info := langcodes.get(lang_code): + # TODO, after langcodes conversion, we may want to consider double checking the BCP codes again + # discard if value does not match supported codes. return lang_info.language + elif bcp_code_var := ISO639_VAR_TO_BCP47.get(lang_code): + logger.warning( + f"Unable to find direct a BCP code match for {language_code}\n" + f"Found a potential BCP match or variant: {bcp_code_var}\n" + f"Using `{bcp_code_var}` as input language ") + return bcp_code_var else: return None diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index d9651d1b..88643d05 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -46,6 +46,8 @@ get_azure_char_count, TranslationClient, NewLineBehavior, ChineseAndJapaneseCodePoints, AcsTranslateUrlBuilder, BreakSentenceClient, SentenceBreakGuesser, get_n_azure_chars) +from acs_translation_component.convert_language_code import iso_to_bcp + SEEN_TRACE_IDS = set() @@ -88,6 +90,14 @@ def get_request_body(cls) -> List['AcsRequestEntry']: def tearDown(self): self.mock_server.drain_queues() + def test_iso_code_checker(self): + self.assertEqual('zh-hans', iso_to_bcp("ZH")) + self.assertEqual('zh-hans', iso_to_bcp("ZHO")) + + self.assertEqual('zh-hant', iso_to_bcp("ZHO-HANT")) + self.assertEqual('zh-hant', iso_to_bcp("ZH-HANT")) + self.assertEqual('zh-hans', iso_to_bcp("ZH-HANS")) + self.assertEqual('fr-ca', iso_to_bcp("fr-ca")) def test_simple_jobs(self): def validate_results(results): From 4a317ba6b09742092aca013900518e419aa59164 Mon Sep 17 00:00:00 2001 From: 
Howard Huang Date: Fri, 5 Apr 2024 02:48:29 -0400 Subject: [PATCH 2/6] Adding info, minor cleanup and lang update. --- .../acs_speech_component/azure_utils.py | 27 ++++++++++++------- .../convert_language_code.py | 3 +++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index cba3c8b5..77fb05b9 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -27,6 +27,9 @@ # Dict of conversions from ISO639-3 language codes to BCP-47 codes. The first # BCP-47 code in each value list will be used (the rest are primarily for # reference in case of later changes or customization) + +# Supported languages can be found here: +# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt ISO6393_TO_BCP47 = dict( afr=["af-ZA"], amh=["am-ET"], @@ -34,19 +37,26 @@ "ar-BH", "ar-JO", "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS", "ar-QA", "ar-TN", "ar-YE"], aze=["az-AZ"], + azj=["az-AZ"], # North Azerbaijani + azb=["az-AZ"], # South Azerbaijani # bel=["be-BY"], # Depreciated - # ben=["bn-BD", "bn-IN"], # Bengali-Bangladesh has been depreciated - ben=["bn-IN"], + ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been depreciated # bod=["bo"], # Deprecated bul=["bg-BG"], bos=["bs-BA"], cat=["ca-ES"], # ceb=["ceb"], ces=["cs-CZ"], + cze=["cs-CZ"],# ISO-639-2 Variant cym=["cy-GB"], - cze=["cs-CZ"], - dan=["da-DK"], + wel=["cy-GB"],# ISO-639-2 Variant + dan=["da-DK"], # Note: There is a related dialect JUT - Jutlandic + jut=["da-DK"], # Upon further research, Jutlantic is present in Denmark + # but declining over time. 
deu=["de-DE", "de-AT", "de-CH"], + # Many other forms of German exist + gsw=["de-CH"], # Swiss German + bar=["de-AT"], # Bavarian / Upper German variant common in most of Austria ell=["el-GR"], eng=["en-US", "en-CA", "en-GB", "en-AU", "en-GH", "en-HK", "en-IN", "en-IE", "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"], @@ -117,8 +127,7 @@ sin=["si-LK"], #prs=["prs-AF"], # Deprecated #pus=["pa-AF"], # Deprecated - #ron=["ro-RO", "ro-MD"], # ro-MD depreciated - ron=["ro-RO"], + ron=["ro-RO"], # ro-MD depreciated # run=[], rus=["ru-RU"], slk=["sk-SK"], @@ -138,7 +147,7 @@ # wbq = ["te-IN"], Waddar/Vadari is related to Telugu. # tat=[], #tgk=["tg-TJ"], # Depreciated - #tgl=["fil-PH", "tl-PH"], # "tl-PH" Depreciated + tgl=["fil-PH"], # "tl-PH" Depreciated tha=["th-TH"], # tir=[], #tpi=["tpi-PG"], # Depreciated @@ -151,8 +160,8 @@ zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], yue=["yue-CN", "zh-HK"], # Cantonese wuu=["wuu-CN"], - # nan=["zh-TW", "nan-TW"], # nan-TW depreciated - nan=["zh-TW"], # Note, Taiwanese has one standard + one major dialect, + nan=["zh-TW"], # nan-TW depreciated + # Note, Taiwanese has one standard + one major dialect, # not sure which is covered better by Azure. zul=["zu-ZA"] ) \ No newline at end of file diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index a3080bee..c2789be9 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -31,6 +31,9 @@ logger = logging.getLogger('AcsTranslationComponent') +# A full list of supported languages can be found here: +# https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support + # These cover conflicting 639-2 codes and less common variants # A warning will be issued if these are used. 
ISO639_VAR_TO_BCP47 = dict( From aa1e8111885ecc49d69ea0b8942e9d9195e11236 Mon Sep 17 00:00:00 2001 From: Howard Huang Date: Sun, 7 Apr 2024 04:54:24 -0400 Subject: [PATCH 3/6] Documentation update and language check improvements. --- python/AzureSpeechDetection/README.md | 346 +++++++++++++----- .../acs_speech_processor.py | 35 +- .../acs_speech_component/azure_utils.py | 14 +- .../plugin-files/descriptor/descriptor.json | 6 +- .../tests/test_acs_speech.py | 6 +- python/AzureTranslation/README.md | 69 ++-- .../convert_language_code.py | 231 ++++++------ .../plugin-files/descriptor/descriptor.json | 13 +- .../tests/test_acs_translation.py | 7 +- 9 files changed, 454 insertions(+), 273 deletions(-) diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md index 3201aa20..b9b08113 100644 --- a/python/AzureSpeechDetection/README.md +++ b/python/AzureSpeechDetection/README.md @@ -21,7 +21,7 @@ In order for the component to process any jobs, the job properties listed below # Optional Job Properties The below properties can be optionally provided to alter the behavior of the component. -- `LANGUAGE`: The locale to use for transcription. Defaults to `en-US`. A complete list of available locales can be found in Microsoft's [Speech service documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). +- `LANGUAGE`: The BCP-47 locale to use for transcription. Defaults to `en-US`. A complete list of available locales can be found in Microsoft's [Speech service documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). - `DIARIZE`: Whether to assign utterances to different speakers. Currently, this component supports only two-speaker diarization. Diarization is enabled by default. 
@@ -61,69 +61,151 @@ AudioTracks also have the `start_time` and `stop_time` of their associated utter The following are the BCP-47 codes and their corresponding languages which Azure Speech-to-Text supports. -| Language | BCP-47 | Language | BCP-47 | -|--------------------------------|--------|------------------------------|---------| -| Afrikaans | `af-ZA` | Hungarian | `hu-HU` | -| Amharic | `am-ET` | Icelandic | `is-IS` | -| Arabic (Algeria) | `ar-DZ` | Indonesian | `id-ID` | -| Arabic (Bahrain) | `ar-BH` | Irish | `ga-IE` | -| Arabic (Egypt) | `ar-EG` | Italian | `it-IT` | -| Arabic (Iraq) | `ar-IQ` | Japanese | `ja-JP` | -| Arabic (Israel) | `ar-IL` | Javanese | `jv-ID` | -| Arabic (Jordan) | `ar-JO` | Kannada | `kn-IN` | -| Arabic (Kuwait) | `ar-KW` | Khmer | `km-KH` | -| Arabic (Lebanon) | `ar-LB` | Korean | `ko-KR` | -| Arabic (Libya) | `ar-LY` | Lao | `lo-LA` | -| Arabic (Morocco) | `ar-MA` | Latvian | `lv-LV` | -| Arabic (Oman) | `ar-OM` | Lithuanian | `lt-LT` | -| Arabic (Palestinian Authority) | `ar-PS` | Macedonian | `mk-MK` | -| Arabic (Qatar) | `ar-QA` | Malay | `ms-MY` | -| Arabic (Saudi Arabia) | `ar-SA` | Maltese | `mt-MT` | -| Arabic (Syria) | `ar-SY` | Marathi | `mr-IN` | -| Arabic (Tunisia) | `ar-TN` | Norwegian | `nb-NO` | -| Arabic (United Arab Emirates) | `ar-AE` | Polish | `pl-PL` | -| Arabic (Yemen) | `ar-YE` | Portuguese (Brazil) | `pt-BR` | -| Bulgarian | `bg-BG` | Portuguese (Portugal) | `pt-PT` | -| Burmese | `my-MM` | Romanian | `ro-RO` | -| Catalan | `ca-ES` | Russian | `ru-RU` | -| Chinese (Cantonese) | `zh-HK` | Serbian | `sr-RS` | -| Chinese (Mandarin) | `zh-CN` | Sinhala | `si-LK` | -| Chinese (Taiwan) | `zh-TW` | Slovak | `sk-SK` | -| Croatian | `hr-HR` | Slovenian | `sl-SI` | -| Czech | `cs-CZ` | Spanish (Argentina) | `es-AR` | -| Danish | `da-DK` | Spanish (Bolivia) | `es-BO` | -| Dutch (Belgium) | `nl-BE` | Spanish (Chile) | `es-CL` | -| Dutch (Netherlands) | `nl-NL` | Spanish (Colombia) | `es-CO` | -| English (Australia) | 
`en-AU` | Spanish (Costa Rica) | `es-CR` | -| English (Canada) | `en-CA` | Spanish (Cuba) | `es-CU` | -| English (Ghana) | `en-GH` | Spanish (Dominican Republic) | `es-DO` | -| English (Hong Kong) | `en-HK` | Spanish (Ecuador) | `es-EC` | -| English (India) | `en-IN` | Spanish (El Salvador) | `es-SV` | -| English (Ireland) | `en-IE` | Spanish (Equatorial Guinea) | `es-GQ` | -| English (Kenya) | `en-KE` | Spanish (Guatemala) | `es-GT` | -| English (New Zealand) | `en-NZ` | Spanish (Honduras) | `es-HN` | -| English (Nigeria) | `en-NG` | Spanish (Mexico) | `es-MX` | -| English (Philippines) | `en-PH` | Spanish (Nicaragua) | `es-NI` | -| English (Singapore) | `en-SG` | Spanish (Panama) | `es-PA` | -| English (South Africa) | `en-ZA` | Spanish (Paraguay) | `es-PY` | -| English (Tanzania) | `en-TZ` | Spanish (Peru) | `es-PE` | -| English (United Kingdom) | `en-GB` | Spanish (Puerto Rico) | `es-PR` | -| English (United States) | `en-US` | Spanish (Spain) | `es-ES` | -| Estonian | `et-EE` | Spanish (United States) | `es-US` | -| Farsi | `fa-IR` | Spanish (Uruguay) | `es-UY` | -| Finnish | `fi-FI` | Spanish (Venezuela) | `es-VE` | -| Filipino | `fil-P` | Swahili (Kenya) | `sw-KE` | -| French (Belgium) | `fr-BE` | Swahili (Tanzania) | `sw-TZ` | -| French (Canada) | `fr-CA` | Swedish | `sv-SE` | -| French (France) | `fr-FR` | Tamil | `ta-IN` | -| French (Switzerland) | `fr-CH` | Telugu | `te-IN` | -| German (Austria) | `de-AT` | Thai | `th-TH` | -| German (Germany) | `de-DE` | Turkish | `tr-TR` | -| German (Switzerland) | `de-CH` | Ukrainian | `uk-UA` | -| Greek | `el-GR` | Uzbek | `uz-UZ` | -| Gujarati | `gu-IN` | Vietnamese | `vi-VN` | -| Hebrew | `he-IL` | Zulu | `zu-ZA` | - | Hindi | `hi-IN` | | | +| Language | Locale (BCP-47) | +| ------------------------------------------- | --------------- | +| Afrikaans (South Africa) | af-ZA | +| Amharic (Ethiopia) | am-ET | +| Arabic (United Arab Emirates) | ar-AE | +| Arabic (Bahrain) | ar-BH | +| Arabic (Algeria) | ar-DZ | +| 
Arabic (Egypt) | ar-EG | +| Arabic (Israel) | ar-IL | +| Arabic (Iraq) | ar-IQ | +| Arabic (Jordan) | ar-JO | +| Arabic (Kuwait) | ar-KW | +| Arabic (Lebanon) | ar-LB | +| Arabic (Libya) | ar-LY | +| Arabic (Morocco) | ar-MA | +| Arabic (Oman) | ar-OM | +| Arabic (Palestinian Authority) | ar-PS | +| Arabic (Qatar) | ar-QA | +| Arabic (Saudi Arabia) | ar-SA | +| Arabic (Syria) | ar-SY | +| Arabic (Tunisia) | ar-TN | +| Arabic (Yemen) | ar-YE | +| Azerbaijani (Latin, Azerbaijan) | az-AZ | +| Bulgarian (Bulgaria) | bg-BG | +| Bengali (India) | bn-IN | +| Bosnian (Bosnia and Herzegovina) | bs-BA | +| Catalan | ca-ES | +| Czech (Czechia) | cs-CZ | +| Welsh (United Kingdom) | cy-GB | +| Danish (Denmark) | da-DK | +| German (Austria) | de-AT | +| German (Switzerland) | de-CH | +| German (Germany) | de-DE | +| Greek (Greece) | el-GR | +| English (Australia) | en-AU | +| English (Canada) | en-CA | +| English (United Kingdom) | en-GB | +| English (Ghana) | en-GH | +| English (Hong Kong SAR) | en-HK | +| English (Ireland) | en-IE | +| English (India) | en-IN | +| English (Kenya) | en-KE | +| English (Nigeria) | en-NG | +| English (New Zealand) | en-NZ | +| English (Philippines) | en-PH | +| English (Singapore) | en-SG | +| English (Tanzania) | en-TZ | +| English (United States) | en-US | +| English (South Africa) | en-ZA | +| Spanish (Argentina) | es-AR | +| Spanish (Bolivia) | es-BO | +| Spanish (Chile) | es-CL | +| Spanish (Colombia) | es-CO | +| Spanish (Costa Rica) | es-CR | +| Spanish (Cuba) | es-CU | +| Spanish (Dominican Republic) | es-DO | +| Spanish (Ecuador) | es-EC | +| Spanish (Spain) | es-ES | +| Spanish (Equatorial Guinea) | es-GQ | +| Spanish (Guatemala) | es-GT | +| Spanish (Honduras) | es-HN | +| Spanish (Mexico) | es-MX | +| Spanish (Nicaragua) | es-NI | +| Spanish (Panama) | es-PA | +| Spanish (Peru) | es-PE | +| Spanish (Puerto Rico) | es-PR | +| Spanish (Paraguay) | es-PY | +| Spanish (El Salvador) | es-SV | +| Spanish (United States)1 | es-US | +| 
Spanish (Uruguay) | es-UY | +| Spanish (Venezuela) | es-VE | +| Estonian (Estonia) | et-EE | +| Basque | eu-ES | +| Persian (Iran) | fa-IR | +| Finnish (Finland) | fi-FI | +| Filipino (Philippines) | fil-PH | +| French (Belgium) | fr-BE | +| French (Canada)1 | fr-CA | +| French (Switzerland) | fr-CH | +| French (France) | fr-FR | +| Irish (Ireland) | ga-IE | +| Galician | gl-ES | +| Gujarati (India) | gu-IN | +| Hebrew (Israel) | he-IL | +| Hindi (India) | hi-IN | +| Croatian (Croatia) | hr-HR | +| Hungarian (Hungary) | hu-HU | +| Armenian (Armenia) | hy-AM | +| Indonesian (Indonesia) | id-ID | +| Icelandic (Iceland) | is-IS | +| Italian (Switzerland) | it-CH | +| Italian (Italy) | it-IT | +| Japanese (Japan) | ja-JP | +| Javanese (Latin, Indonesia) | jv-ID | +| Georgian (Georgia) | ka-GE | +| Kazakh (Kazakhstan) | kk-KZ | +| Khmer (Cambodia) | km-KH | +| Kannada (India) | kn-IN | +| Korean (Korea) | ko-KR | +| Lao (Laos) | lo-LA | +| Lithuanian (Lithuania) | lt-LT | +| Latvian (Latvia) | lv-LV | +| Macedonian (North Macedonia) | mk-MK | +| Malayalam (India) | ml-IN | +| Mongolian (Mongolia) | mn-MN | +| Marathi (India) | mr-IN | +| Malay (Malaysia) | ms-MY | +| Maltese (Malta) | mt-MT | +| Burmese (Myanmar) | my-MM | +| Norwegian Bokmål (Norway) | nb-NO | +| Nepali (Nepal) | ne-NP | +| Dutch (Belgium) | nl-BE | +| Dutch (Netherlands) | nl-NL | +| Punjabi (India) | pa-IN | +| Polish (Poland) | pl-PL | +| Pashto (Afghanistan) | ps-AF | +| Portuguese (Brazil) | pt-BR | +| Portuguese (Portugal) | pt-PT | +| Romanian (Romania) | ro-RO | +| Russian (Russia) | ru-RU | +| Sinhala (Sri Lanka) | si-LK | +| Slovak (Slovakia) | sk-SK | +| Slovenian (Slovenia) | sl-SI | +| Somali (Somalia) | so-SO | +| Albanian (Albania) | sq-AL | +| Serbian (Cyrillic, Serbia) | sr-RS | +| Swedish (Sweden) | sv-SE | +| Swahili (Kenya) | sw-KE | +| Swahili (Tanzania) | sw-TZ | +| Tamil (India) | ta-IN | +| Telugu (India) | te-IN | +| Thai (Thailand) | th-TH | +| Turkish (Türkiye) | tr-TR | +| 
Ukrainian (Ukraine) | uk-UA | +| Urdu (India) | ur-IN | +| Uzbek (Latin, Uzbekistan) | uz-UZ | +| Vietnamese (Vietnam) | vi-VN | +| Chinese (Wu, Simplified) | wuu-CN | +| Chinese (Cantonese, Simplified) | yue-CN | +| Chinese (Mandarin, Simplified) | zh-CN | +| Chinese (Jilu Mandarin, Simplified) | zh-CN-shandong | +| Chinese (Southwestern Mandarin, Simplified) | zh-CN-sichuan | +| Chinese (Cantonese, Traditional) | zh-HK | +| Chinese (Taiwanese Mandarin, Traditional) | zh-TW | +| Zulu (South Africa) | zu-ZA | ## Dynamic Speech Selection @@ -131,39 +213,103 @@ The below table describes the component's default behavior when supplied an ISO If the language code supplied by a feed-forward track is not handled in `acs_speech_component/azure_utils.py`, the component will raise an `INVALID_PROPERTY` exception. -| ISO 639-3 | Language | BCP-47 | -|:---------:|---------------------|----------| -| `AMH` | Amharic | `am-ET` | -| `ARA` | Arabic | `ar-EG`* | -| `BUL` | Bulgarian | `bg-BG` | -| `CES` | Czech | `cs-CZ` | -| `CMN` | Chinese (Mandarin) | `zh-CN`* | -| `ELL` | Greek | `el-GR` | -| `ENG` | English | `en-US`* | -| `FRE` | French | `fr-FR`* | -| `HIN` | Hindi | `hi-IN` | -| `IND` | Indonesian | `id-ID` | -| `JAV` | Javanese | `jv-ID` | -| `JPN` | Japanese | `ja-JP` | -| `KOR` | Korean | `ko-KR` | -| `LAO` | Lao | `lo-LA` | -| `LIT` | Lithuanian | `lt-LT` | -| `MKD` | Macedonian | `mk-MK` | -| `MYA` | Burmese | `my-MM` | -| `NAN` | Chinese (Taiwan) | `zh-TW`* | -| `PES` | Farsi | `fa-IR` | -| `POL` | Polish | `pl-PL` | -| `POR` | Portuguese | `pt-BR` | -| `RON` | Romanian | `ro-RO` | -| `RUS` | Russian | `ru-RU` | -| `SLK` | Slovak | `sk-SK` | -| `SPA` | Spanish | `es-MX`* | -| `SWA` | Swahili | `sw-KE`* | -| `TAM` | Tamil | `ta-IN` | -| `THA` | Thai | `th-TH` | -| `TUR` | Turkish | `tr-TR` | -| `UKR` | Ukrainian | `uk-UA` | -| `UZB` | Uzbek | `uz-UZ` | -| `VIE` | Vietnamese | `vi-VN` | -| `YUE` | Chinese (Cantonese) | `zh-HK`* | -| `ZUL` | Zulu | `zu-ZA` | +| ISO 
639--3 | Language | BCP-47 | +| ---------- | ---------------------------- | ------ | +| afr | Afrikaans | af-ZA | +| amh | Amharic | am-ET | +| ara | Arabic | ar-EG | +| aze | Azerbaijani | az-AZ | +| azj | North Azerbaijani | az-AZ | +| azb | South Azerbaijani | az-AZ | +| ben | Bengali | bn-IN | +| bul | Bulgarian | bg-BG | +| bos | Bosnian | bs-BA | +| cat | Catalan | ca-ES | +| ces | Czech | cs-CZ | +| cze | Czech | cs-CZ | +| cym | Welsh | cy-GB | +| wel | Welsh | cy-GB | +| dan | Danish | da-DK | +| jut | Jutish | da-DK | +| deu | German | de-DE | +| gsw | Swiss German | de-CH | +| bar | Bavarian | de-AT | +| ell | Modern Greek (1453-) | el-GR | +| eng | English | en-US | +| est | Estonian | et-EE | +| ekk | Standard Estonian | et-EE | +| eus | Basque | eu-ES | +| fas | Persian | fa-IR | +| fin | Finnish | fi-FI | +| fil | Filipino | fil-PH | +| fra | French | fr-FR | +| gle | Irish | ga-IE | +| glg | Galician | gl-ES | +| guj | Gujarati | gu-IN | +| heb | Hebrew | he-IL | +| hin | Hindi | hi-IN | +| hrv | Croatian | hr-HR | +| hun | Hungarian | hu-HU | +| ita | Italian | it-IT | +| ind | Indonesian | id-ID | +| ice | Icelandic | is-IS | +| isl | Icelandic | is-IS | +| jav | Javanese | jv-ID | +| jpn | Japanese | ja-JP | +| kat | Georgian | ka-GE | +| kaz | Kazakh | kk-KZ | +| khm | Khmer | km-KH | +| kxm | Northern Khmer | km-KH | +| kan | Kannada | kn-IN | +| kor | Korean | ko-KR | +| lao | Lao | lo-LA | +| lit | Lithuanian | lt-LT | +| lav | Latvian | lv-LV | +| lvs | Standard Latvian | lv-LV | +| mkd | Macedonian | mk-MK | +| mya | Burmese | my-MM | +| mal | Malayalam | ml-IN | +| mon | Mongolian | mn-MN | +| khk | Halh Mongolian | mn-MN | +| mvf | Peripheral Mongolian | mn-MN | +| mar | Marathi | mr-IN | +| zsm | Standard Malay | ms-MY | +| mlt | Maltese | mt-MT | +| nob | Norwegian Bokmål | nb-NO | +| nep | Nepali (macrolanguage) | ne-NP | +| npi | Nepali (individual language) | ne-NP | +| nld | Dutch | nl-NL | +| pan | Panjabi | pa-IN | +| pes | 
Iranian Persian | fa-IR | +| pol | Polish | pl-PL | +| por | Portuguese | pt-BR | +| pus | Pushto | ps-AF | +| pbu | Northern Pashto | ps-AF | +| pst | Central Pashto | ps-AF | +| pbt | Southern Pashto | ps-AF | +| sin | Sinhala | si-LK | +| ron | Romanian | ro-RO | +| rus | Russian | ru-RU | +| slk | Slovak | sk-SK | +| slv | Slovenian | sl-SI | +| som | Somali | so-SO | +| spa | Spanish | es-MX | +| sqi | Albanian | sq-AL | +| swa | Swahili (macrolanguage) | sw-KE | +| swe | Swedish | sv-SE | +| srp | Serbian | sr-RS | +| tam | Tamil | ta-IN | +| tel | Telugu | te-IN | +| tgl | Tagalog | fil-PH | +| tha | Thai | th-TH | +| tur | Turkish | tr-TR | +| ukr | Ukrainian | uk-UA | +| urd | Urdu | ur-IN | +| uzb | Uzbek | uz-UZ | +| vie | Vietnamese | vi-VN | +| cmn | Mandarin Chinese | zh-CN | +| zho | Chinese | zh-CN | +| yue | Yue Chinese | yue-CN | +| wuu | Wu Chinese | wuu-CN | +| nan | Min Nan Chinese | zh-TW | +| zul | Zulu | zu-ZA | \ No newline at end of file diff --git a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py index bda7d24b..d1c2fbb0 100644 --- a/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py +++ b/python/AzureSpeechDetection/acs_speech_component/acs_speech_processor.py @@ -70,6 +70,19 @@ class AcsSpeechDetectionProcessor(object): def __init__(self): self.acs = AzureConnection() + @staticmethod + def _convert_case_bcp(bcp:str)->str: + if not bcp: + return bcp + sep = '-' + if '_' in bcp: + sep = '_' + elif '-' not in bcp: + return bcp + + lang, script = bcp.split(sep) + return f'{lang.lower()}{sep}{script.upper()}' + @staticmethod def convert_word_timing( recognized_phrases: Iterable[Mapping[str, Any]], @@ -209,10 +222,10 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: ) missing_models = set() - default_locale = job_config.language + default_locale = self._convert_case_bcp(job_config.language) if 
(lang := job_config.override_default_language) is not None: - if lang in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: if locale in self.acs.supported_locales: logger.debug( f"Override default language ('{lang}') detected, " @@ -241,18 +254,19 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: locale = default_locale if job_config.speaker is not None: speaker_language_valid = False - if (lang := job_config.speaker.language) in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: - if locale in self.acs.supported_locales: - speaker_language_valid = True - break + if (lang := job_config.speaker.language): + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: + if locale in self.acs.supported_locales: + speaker_language_valid = True + break if not speaker_language_valid: missing_models.add(job_config.speaker.language) ldict = job_config.speaker.language_scores for lang in sorted(ldict.keys(), key=ldict.get, reverse=True): - if lang in ISO6393_TO_BCP47: - for locale in ISO6393_TO_BCP47[lang]: + if lang.lower() in ISO6393_TO_BCP47: + for locale in ISO6393_TO_BCP47[lang.lower()]: if locale in self.acs.supported_locales: logger.warning( f"Language supplied in feed-forward track " @@ -283,6 +297,7 @@ def process_audio(self, job_config: AzureJobConfig) -> List[mpf.AudioTrack]: ) locale = default_locale + locale = self._convert_case_bcp(locale) if locale not in self.acs.supported_locales: raise mpf.DetectionException( f"Selected locale ('{locale}') is not supported by Azure " diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index 77fb05b9..4bb07242 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -39,8 +39,8 @@ 
aze=["az-AZ"], azj=["az-AZ"], # North Azerbaijani azb=["az-AZ"], # South Azerbaijani - # bel=["be-BY"], # Depreciated - ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been depreciated + # bel=["be-BY"], # Deprecated + ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been deprecated # bod=["bo"], # Deprecated bul=["bg-BG"], bos=["bs-BA"], @@ -127,7 +127,7 @@ sin=["si-LK"], #prs=["prs-AF"], # Deprecated #pus=["pa-AF"], # Deprecated - ron=["ro-RO"], # ro-MD depreciated + ron=["ro-RO"], # ro-MD deprecated # run=[], rus=["ru-RU"], slk=["sk-SK"], @@ -146,11 +146,11 @@ tel=["te-IN"], # wbq = ["te-IN"], Waddar/Vadari is related to Telugu. # tat=[], - #tgk=["tg-TJ"], # Depreciated - tgl=["fil-PH"], # "tl-PH" Depreciated + #tgk=["tg-TJ"], # Deprecated + tgl=["fil-PH"], # "tl-PH" deprecated tha=["th-TH"], # tir=[], - #tpi=["tpi-PG"], # Depreciated + #tpi=["tpi-PG"], # Deprecated tur=["tr-TR"], ukr=["uk-UA"], urd=["ur-IN"], @@ -160,7 +160,7 @@ zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], yue=["yue-CN", "zh-HK"], # Cantonese wuu=["wuu-CN"], - nan=["zh-TW"], # nan-TW depreciated + nan=["zh-TW"], # nan-TW deprecated # Note, Taiwanese has one standard + one major dialect, # not sure which is covered better by Azure. 
zul=["zu-ZA"] diff --git a/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json b/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json index 9c9b6f51..7dec6ef0 100644 --- a/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json +++ b/python/AzureSpeechDetection/plugin-files/descriptor/descriptor.json @@ -10,7 +10,7 @@ "description": "Uses Azure Cognitive Services to perform speech-to-text.", "actionType": "DETECTION", "trackType": "SPEECH", - "outputChangedCounter" : 2, + "outputChangedCounter": 2, "requiresCollection": { "states": [] }, @@ -59,7 +59,7 @@ }, { "name": "LANGUAGE", - "description": "The language/locale to use for transcription.", + "description": "The language/locale, in BCP-47 format, to use for transcription. Please consult README to review Azure's supported list of BCP-47 codes.", "type": "STRING", "defaultValue": "en-US" }, @@ -193,4 +193,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/python/AzureSpeechDetection/tests/test_acs_speech.py b/python/AzureSpeechDetection/tests/test_acs_speech.py index 2d899120..d1bf47bb 100644 --- a/python/AzureSpeechDetection/tests/test_acs_speech.py +++ b/python/AzureSpeechDetection/tests/test_acs_speech.py @@ -112,7 +112,7 @@ def test_audio_file(self): stop_time=-1, job_properties=get_test_properties( DIARIZE='FALSE', - LANGUAGE='en-US', + LANGUAGE='EN-us', USE_SAS_AUTH='TRUE' ), media_properties={}, @@ -137,7 +137,7 @@ def test_video_file(self): stop_frame=-1, job_properties=get_test_properties( DIARIZE='FALSE', - LANGUAGE='en-US' + LANGUAGE='En-Us' ), media_properties=dict( FPS='24' @@ -204,7 +204,7 @@ def test_language(self): stop_time=-1, job_properties=get_test_properties( DIARIZE='TRUE', - LANGUAGE='en-US' + LANGUAGE='en-us' ), media_properties={}, feed_forward_track=None diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index 2740de41..e78a9408 100644 --- a/python/AzureTranslation/README.md +++ 
b/python/AzureTranslation/README.md @@ -1,68 +1,83 @@ # Overview This repository contains source code for the OpenMPF Azure Cognitive Services -Translation Component. This component utilizes the [Azure Cognitive Services +Translation Component. This component utilizes the [Azure Cognitive Services Translator REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate) to translate the content of detection properties. It has only been tested against v3.0 of the API. This component translates the content of existing detection properties, -so it only makes sense to use it with -[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and +so it only makes sense to use it with +[feed forward](https://openmpf.github.io/docs/site/Feed-Forward-Guide) and when it isn't the first element of a pipeline. - -When a detection property is translated, the translation is put in to a new -detection property named `TRANSLATION`. The original detection property is not -modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 + +When a detection property is translated, the translation is put in to a new +detection property named `TRANSLATION`. The original detection property is not +modified. A property named `TRANSLATION TO LANGUAGE` containing the BCP-47 language code of the translated text will also be added. If the language of the input text is detected to be the same as the `TO_LANGUAGE` job property, -then no translation will occur. When translation is skipped because of -matching languages, the `TRANSLATION` detection property will be omitted and +then no translation will occur. When translation is skipped because of +matching languages, the `TRANSLATION` detection property will be omitted and `SKIPPED TRANSLATION=TRUE` will be added to the detection properties. When the source text is multiple languages, the translation endpoint will only -translate one of the languages. 
For example, translating -"你叫什么名字? ¿Cómo te llamas?" to English results in +translate one of the languages. For example, translating +"你叫什么名字? ¿Cómo te llamas?" to English results in "What is your name? The Cómo te llamas?". # Required Job Properties In order for the component to process any jobs, the job properties listed below -must be provided. Neither has a default value. +must be provided. Neither has a default value. -- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. - e.g. `https://api.cognitive.microsofttranslator.com` or +- `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. + e.g. `https://api.cognitive.microsofttranslator.com` or `https:///translator/text/v3.0`. The URL should not end with `/translate` because two separate endpoints are used. `ACS_URL + '/translate'` is used for translation. `ACS_URL + '/breaksentence'` is used to break up text when it is too long for a single translation request. This property can also be configured using an environment variable named `MPF_PROP_ACS_URL`. - + - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services - subscription key. To get one you will need to create an + subscription key. To get one you will need to create an Azure Cognitive Services account. This property can also be configured using an environment variable named `MPF_PROP_ACS_SUBSCRIPTION_KEY`. - - + + # Important Job Properties: -- `TO_LANGUAGE`: The BCP-47 language code for language that the properties +- `TO_LANGUAGE`: The BCP-47 language code for language that the properties should be translated to. -- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating - which properties in the feed-forward track or detection to consider +- `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating + which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. 
If the first property listed is present, then that property will be translated. If it's not, then the next property in the list is considered. At most, one property will be translated. - `FROM_LANGUAGE`: In most cases, this property should not be used. It should - only be used when automatic language detection is detecting the wrong - language. Providing this property prevents the translation endpoint from - doing automatic language detection. If `FROM_LANGUAGE` is provided, and the - text is actually another language, the translation endpoint will return the + only be used when automatic language detection is detecting the wrong + language: Users can provide a BCP-47 code to force the translation service + to translate text with a corrected source language. + For instance, if incoming text is incorrectly being detected by Azure as + Spanish instead of English, users can set `FROM_LANGUAGE=en` + to force the service to treat all submitted text in the current job as English. + + Providing this property prevents the translation endpoint from + doing automatic language detection. If `FROM_LANGUAGE` is provided, and the + text is actually another language, the translation endpoint will return the input text unchanged. - + +- `SUGGESTED_FROM_LANGUAGE`: Optional property that indicates the fallback source + BCP-47 language code to use when automatic language detection fails. + The value from this property is only used when automatic language detection fails. `SUGGESTED_FROM_LANGUAGE` is the preferred setting to adjust when users know + they are processing a large amount of text in a particular language, but other + source languages may be present in individual pieces of text. + For instance, setting `SUGGESTED_FROM_LANGUAGE=es` would allow the component to + default to translating from Spanish, whenever Azure's language detector fails + to identify the source language of the incoming text. 
+ # Listing Supported Languages -To list the supported languages replace `${ACS_URL}` and +To list the supported languages replace `${ACS_URL}` and `${ACS_SUBSCRIPTION_KEY}` in the following command and run it: ```shell script curl -H "Ocp-Apim-Subscription-Key: ${ACS_SUBSCRIPTION_KEY}" "https://${ACS_URL}/languages?api-version=3.0&scope=translation" diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index c2789be9..37a42411 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -34,116 +34,6 @@ # A full list of supported languages can be found here: # https://learn.microsoft.com/en-us/azure/ai-services/translator/language-support -# These cover conflicting 639-2 codes and less common variants -# A warning will be issued if these are used. -ISO639_VAR_TO_BCP47 = dict( - # ISO Code Variant = Same language but two ISO-639 codes match to it. - # Variant = May be different from primary ISO code. - # Note: We're avoiding adding in archaic/extinct variants, attaching a note to - # languages with those present. 
- - ALB = 'sq', # 639-2 Code Variant - ARM = 'hy', # 639-2 Code Variant - AZJ = 'az', # North Azerbaijani Variant - AZB = 'az', # South Azerbaijani Variant - BAQ = 'eu', # 639-2 Code Variant - CZE = 'cs', # 639-2 Code Variant Czech - - TWL = 'sn', # 639-3 Variants of Shona - MXC = 'sn', # 639-3 Variants of Shona - TWX = 'sn', # 639-3 Variants of Shona - - JUT = 'da', # 639-3: Jutish - Danish Dialect - - DGO = 'doi', # Dogri proper - XNR = 'doi', # Kangri - - DUT = 'nl', # 639-2 Code Variant of Dutch - - EKK = 'et', # Standard Estonian - VRO = 'et', # Võro - Estonian Dialect (Debated) - - FRE = 'fr', # 639-2 Code Variant of French - - GEO = 'ka', # 639-2 Code Variant of Georgian - - GER = 'de', # 639-2 Code Variant of German - # Warning: There's many other variants of old and regional forms. - # https://en.wikipedia.org/wiki/German_language - - GRE = 'el', # 639-2 Code Variant of Greek - # Note: There's several other variants - # https://en.wikipedia.org/wiki/Greek_language - - ICE = 'is', # 639-2 Code Variant of Icelandic - - IKE = 'iu', # Eastern Canadian Inuktitut - - KXM = 'km', # Northern Khmer - - KOK = 'gom', # Kokani - KNN = 'gom', # Maharashtrian Konkani - - TTS = 'lo', # Isan (Thailand Lao) - - LVS = 'lv', # Standard Latvian language - #LTG = 'lv' # Latgalian language (Historical Form) - - MAC = 'mk', # 639-2 Code Variant of Macedonian - - MAY = 'ms', # 639-2 Code Variant of Malay - ZLM = 'ms', # Malay (individual language) - ZSM = 'ms', # Malaysian Malay - - MAO = 'mi', # 639-2 Code Variant of Maori - - BUR = 'my', # 639-2 Code Variant of Burmese (Myanmar) - # Several other closely related Burmese variants exist below: - INT = 'my', # Intha - TCO = 'my', # Taungyo - RKI = 'my', # Rakhine - RMZ = 'my', # Marma - TAY = 'my', # Tavoyan dialects - - # Variants of Odia - SPV = 'or', # Sambalpuri - ORT = 'or', # Adivasi Odia (Kotia) - DSO = 'or', # Desiya - - - # Variants of Pashto - PST = 'ps', # Central Pashto - PBU = 'ps', # Northern Pashto - PBT = 'ps', # 
Southern Pashto - # WNE - Archaic - - # Persian has many variants - # Only including top three - #PES = 'fa' # Iranian Persian - Default below - #PRS = 'fa' # Dari - Default Below - TGK = 'fa', # Tajik - - PNB = 'pa', # Western Punjabi/Panjabi - - RUM = 'ro', # 639-2 Code Variant of Romanian - - SLO = 'sk', # 639-2 Code Variant of Slovak - - # Swahili Variants - SWC='sw', # Congo Swahili - SWH='sw', # Coastal Swahili - #YMK='sw', # Makwe (?) - #WMW='sw', # Mwani (?) - - TIB='bo', # 639-2 Code Variant of Tibetan - - UZN='uz', # Northern Uzbek - UZS='uz', # Southern Uzbek - - WEL='cy', # 639-2 Code Variant of Welsh - ) - - # For some cases, we'll need to distinguish incoming script info # As general practice these script codes are attached # to the ISO-639. @@ -296,6 +186,116 @@ YUA='yua', ZUL='zu', ) + +# These cover conflicting 639-2 codes and less common variants +# A warning will be issued if these are used. +ISO639_VAR_TO_BCP47 = dict( + # ISO Code Variant = Same language but two ISO-639 codes match to it. + # Variant = May be different from primary ISO code. + # Note: We're avoiding adding in archaic/extinct variants, attaching a note to + # languages with those present. + + ALB = 'sq', # 639-2 Code Variant + ARM = 'hy', # 639-2 Code Variant + AZJ = 'az', # North Azerbaijani Variant + AZB = 'az', # South Azerbaijani Variant + BAQ = 'eu', # 639-2 Code Variant + CZE = 'cs', # 639-2 Code Variant Czech + + TWL = 'sn', # 639-3 Variants of Shona + MXC = 'sn', # 639-3 Variants of Shona + TWX = 'sn', # 639-3 Variants of Shona + + JUT = 'da', # 639-3: Jutish - Danish Dialect + + DGO = 'doi', # Dogri proper + XNR = 'doi', # Kangri + + DUT = 'nl', # 639-2 Code Variant of Dutch + + EKK = 'et', # Standard Estonian + VRO = 'et', # Võro - Estonian Dialect (Debated) + + FRE = 'fr', # 639-2 Code Variant of French + + GEO = 'ka', # 639-2 Code Variant of Georgian + + GER = 'de', # 639-2 Code Variant of German + # Warning: There's many other variants of old and regional forms. 
+ # https://en.wikipedia.org/wiki/German_language + + GRE = 'el', # 639-2 Code Variant of Greek + # Note: There's several other variants + # https://en.wikipedia.org/wiki/Greek_language + + ICE = 'is', # 639-2 Code Variant of Icelandic + + IKE = 'iu', # Eastern Canadian Inuktitut + + KXM = 'km', # Northern Khmer + + KOK = 'gom', # Kokani + KNN = 'gom', # Maharashtrian Konkani + + TTS = 'lo', # Isan (Thailand Lao) + + LVS = 'lv', # Standard Latvian language + #LTG = 'lv' # Latgalian language (Historical Form) + + MAC = 'mk', # 639-2 Code Variant of Macedonian + + MAY = 'ms', # 639-2 Code Variant of Malay + ZLM = 'ms', # Malay (individual language) + ZSM = 'ms', # Malaysian Malay + + MAO = 'mi', # 639-2 Code Variant of Maori + + BUR = 'my', # 639-2 Code Variant of Burmese (Myanmar) + # Several other closely related Burmese variants exist below: + INT = 'my', # Intha + TCO = 'my', # Taungyo + RKI = 'my', # Rakhine + RMZ = 'my', # Marma + TAY = 'my', # Tavoyan dialects + + # Variants of Odia + SPV = 'or', # Sambalpuri + ORT = 'or', # Adivasi Odia (Kotia) + DSO = 'or', # Desiya + + + # Variants of Pashto + PST = 'ps', # Central Pashto + PBU = 'ps', # Northern Pashto + PBT = 'ps', # Southern Pashto + # WNE - Archaic + + # Persian has many variants + # Only including top three + #PES = 'fa' # Iranian Persian - Default below + #PRS = 'fa' # Dari - Default Below + TGK = 'fa', # Tajik + + PNB = 'pa', # Western Punjabi/Panjabi + + RUM = 'ro', # 639-2 Code Variant of Romanian + + SLO = 'sk', # 639-2 Code Variant of Slovak + + # Swahili Variants + SWC='sw', # Congo Swahili + SWH='sw', # Coastal Swahili + #YMK='sw', # Makwe (?) + #WMW='sw', # Mwani (?) 
+ + TIB='bo', # 639-2 Code Variant of Tibetan + + UZN='uz', # Northern Uzbek + UZS='uz', # Southern Uzbek + + WEL='cy', # 639-2 Code Variant of Welsh + ) + BCP_CODES_ONLY = { 'iu-latn', # Inuktitut (Latin) 'fr-ca', # French Canadian @@ -307,18 +307,21 @@ 'sr-latn', # Serbian (Latin) } + + BCP_CODES = BCP_CODES_ONLY | \ set(ISO6393_TO_BCP47.values()) | \ set(ISO639_WITH_SCRIPT_TO_BCP47.values()) def iso_to_bcp(language_code: str) -> Optional[str]: # First check if we have matching scripts/regional variants + language_code = language_code.strip() if bcp_code := ISO639_WITH_SCRIPT_TO_BCP47.get(language_code.upper()): return bcp_code elif language_code.lower() in BCP_CODES: return language_code.lower() - lang_code = language_code.upper().strip() + lang_code = language_code.upper() # Remove attached script/variant info, # Check language portion of ISO code next. @@ -334,9 +337,9 @@ def iso_to_bcp(language_code: str) -> Optional[str]: return lang_info.language elif bcp_code_var := ISO639_VAR_TO_BCP47.get(lang_code): logger.warning( - f"Unable to find direct a BCP code match for {language_code}\n" - f"Found a potential BCP match or variant: {bcp_code_var}\n" - f"Using `{bcp_code_var}` as input language ") + f"Unable to find direct a BCP code match for {language_code}. " + f"Found a potential BCP match or variant: {bcp_code_var}. 
" + f"Using `{bcp_code_var}` as input language.") return bcp_code_var else: return None diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 2533d996..359d0bab 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -10,7 +10,7 @@ "description": "Uses Azure Cognitive Services to perform translation.", "actionType": "DETECTION", "trackType": "TRANSLATION", - "outputChangedCounter" : 1, + "outputChangedCounter": 1, "requiresCollection": { "states": [] }, @@ -41,19 +41,19 @@ }, { "name": "TO_LANGUAGE", - "description": "The BCP-47 language code for language that the properties should be translated to.", + "description": "The BCP-47 language code for language that the properties should be translated to. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "en" }, { "name": "FROM_LANGUAGE", - "description": "Optional property that indicates the source language of the text. When provided, it disables automatic language detection. If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified.", + "description": "Optional property that indicates the source BCP-47 language code of the text (i.e. 'es' for input Spanish text). When provided, it disables automatic language detection. If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, { "name": "SUGGESTED_FROM_LANGUAGE", - "description": "Optional property that indicates the fallback source language to use when automatic language detection fails. 
The value from this property is only used when automatic language detection fails.", + "description": "Optional property that indicates the fallback source BCP-47 language code to use when automatic language detection fails (i.e. 'es' to translate text from suspected Spanish text). The value from this property is only used when automatic language detection fails. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, @@ -104,8 +104,7 @@ "name": "AZURE TRANSLATION TEXT FILE ACTION", "description": "Uses Azure Cognitive Services to perform translation on a plain text file.", "algorithm": "AZURETRANSLATION", - "properties": [ - ] + "properties": [] } ], "tasks": [ @@ -133,4 +132,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/python/AzureTranslation/tests/test_acs_translation.py b/python/AzureTranslation/tests/test_acs_translation.py index 88643d05..83eac939 100644 --- a/python/AzureTranslation/tests/test_acs_translation.py +++ b/python/AzureTranslation/tests/test_acs_translation.py @@ -92,11 +92,14 @@ def tearDown(self): def test_iso_code_checker(self): self.assertEqual('zh-hans', iso_to_bcp("ZH")) + self.assertEqual('zh-hans', iso_to_bcp("Zh")) + self.assertEqual('zh-hans', iso_to_bcp("zh")) self.assertEqual('zh-hans', iso_to_bcp("ZHO")) self.assertEqual('zh-hant', iso_to_bcp("ZHO-HANT")) - self.assertEqual('zh-hant', iso_to_bcp("ZH-HANT")) - self.assertEqual('zh-hans', iso_to_bcp("ZH-HANS")) + self.assertEqual('zh-hant', iso_to_bcp("Zho-haNT")) + self.assertEqual('zh-hant', iso_to_bcp("ZH-Hant")) + self.assertEqual('zh-hans', iso_to_bcp("zh-HANS")) self.assertEqual('fr-ca', iso_to_bcp("fr-ca")) def test_simple_jobs(self): From 4680b04f79a3029ba8091884754d8a5ecb2718fd Mon Sep 17 00:00:00 2001 From: Howard W Huang <40070840+hhuangMITRE@users.noreply.github.com> Date: Mon, 8 Apr 2024 01:35:09 -0400 Subject: [PATCH 4/6] Update descriptor.json --- 
.../AzureTranslation/plugin-files/descriptor/descriptor.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 359d0bab..185d95f7 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -47,7 +47,7 @@ }, { "name": "FROM_LANGUAGE", - "description": "Optional property that indicates the source BCP-47 language code of the text (i.e. 'es' for input Spanish text). When provided, it disables automatic language detection. If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified. Please consult README to query Azure's supported list of BCP-47 codes for translation.", + "description": "Optional property that indicates the source BCP-47 language code of the text (e.g. 'es' to translate from text confirmed to be in Spanish). When provided, it disables automatic language detection. If the text isn't actually the specified FROM_LANGUAGE, the translation endpoint returns the text unmodified.
Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, @@ -132,4 +132,4 @@ ] } ] -} \ No newline at end of file +} From 73c0a4d45387e8fdbcf2d519db62d50ac4d63ad0 Mon Sep 17 00:00:00 2001 From: Howard W Huang <40070840+hhuangMITRE@users.noreply.github.com> Date: Mon, 8 Apr 2024 01:37:31 -0400 Subject: [PATCH 5/6] Update descriptor.json --- python/AzureTranslation/plugin-files/descriptor/descriptor.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/AzureTranslation/plugin-files/descriptor/descriptor.json b/python/AzureTranslation/plugin-files/descriptor/descriptor.json index 185d95f7..24dd014f 100644 --- a/python/AzureTranslation/plugin-files/descriptor/descriptor.json +++ b/python/AzureTranslation/plugin-files/descriptor/descriptor.json @@ -53,7 +53,7 @@ }, { "name": "SUGGESTED_FROM_LANGUAGE", - "description": "Optional property that indicates the fallback source BCP-47 language code to use when automatic language detection fails (i.e. 'es' to translate text from suspected Spanish text). The value from this property is only used when automatic language detection fails. Please consult README to query Azure's supported list of BCP-47 codes for translation.", + "description": "Optional property that indicates the fallback source BCP-47 language code to use when automatic language detection fails (e.g. 'es' to translate text that is suspected to be in Spanish). The value from this property is only used when automatic language detection fails. Please consult README to query Azure's supported list of BCP-47 codes for translation.", "type": "STRING", "defaultValue": "" }, From 2ef2ed0a470377ef1bb6aaebfe8de019449bc4a2 Mon Sep 17 00:00:00 2001 From: jrobble Date: Mon, 8 Apr 2024 11:05:05 -0400 Subject: [PATCH 6/6] Formatting changes. * Update supported version of Azure STT.
--- python/AzureSpeechDetection/README.md | 64 +++++++++++++------ .../acs_speech_component/azure_utils.py | 12 ++-- python/AzureTranslation/README.md | 24 +++---- .../convert_language_code.py | 10 +-- 4 files changed, 69 insertions(+), 41 deletions(-) diff --git a/python/AzureSpeechDetection/README.md b/python/AzureSpeechDetection/README.md index b9b08113..16ab9da3 100644 --- a/python/AzureSpeechDetection/README.md +++ b/python/AzureSpeechDetection/README.md @@ -1,33 +1,50 @@ # Overview -This repository contains source code for the OpenMPF Azure Cognitive Services Speech-to-Text Component. This component utilizes the [Azure Cognitive Services Batch Transcription REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription) to transcribe speech from audio and video files. +This repository contains source code for the OpenMPF Azure Cognitive Services +Speech-to-Text Component. This component utilizes the [Azure Cognitive Services Batch +Transcription REST +endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/batch-transcription) +to transcribe speech from audio and video files. # Required Job Properties -In order for the component to process any jobs, the job properties listed below must be provided. These properties have no default value, but can be set through environment variables of the same name. If both environment variable and job property are provided, the job property will be used. +In order for the component to process any jobs, the job properties listed below must be +provided. These properties have no default value, but can be set through environment +variables of the same name. If both environment variable and job property are provided, +the job property will be used. -- `ACS_URL`: URL for the Azure Cognitive Services Endpoint. - e.g. `https://virginia.cris.azure.us/api/speechtotext/v2.0/transcriptions`. - The component has only been tested against v2.0 of the API. 
+- `ACS_URL`: URL for the Azure Cognitive Services Endpoint. For example, + `https://virginia.cris.azure.us/api/speechtotext/v3.1/transcriptions`. The component has + been tested against v3.1 of the API. - - `ACS_SUBSCRIPTION_KEY`: A string containing your subscription key for the speech service. +- `ACS_SUBSCRIPTION_KEY`: A string containing your subscription key for the speech + service. -- `ACS_BLOB_CONTAINER_URL`: URL for an Azure Storage Blob container in which to store files during processing. - e.g. `https://myaccount.blob.core.windows.net/mycontainer`. - See Microsoft's [documentation on Azure storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-container-create) for details. +- `ACS_BLOB_CONTAINER_URL`: URL for an Azure Storage Blob container in which to store + files during processing. e.g. `https://myaccount.blob.core.windows.net/mycontainer`. See + Microsoft's [documentation on Azure + storage](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-container-create) + for details. -- `ACS_BLOB_SERVICE_KEY`: A string containing your Azure Cognitive Services storage access key. +- `ACS_BLOB_SERVICE_KEY`: A string containing your Azure Cognitive Services storage access + key. # Optional Job Properties The below properties can be optionally provided to alter the behavior of the component. -- `LANGUAGE`: The BCP-47 locale to use for transcription. Defaults to `en-US`. A complete list of available locales can be found in Microsoft's [Speech service documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). +- `LANGUAGE`: The BCP-47 locale to use for transcription. Defaults to `en-US`. A complete + list of available locales can be found in Microsoft's [Speech service + documentation](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support). -- `DIARIZE`: Whether to assign utterances to different speakers. 
Currently, this component supports only two-speaker diarization. Diarization is enabled by default. +- `DIARIZE`: Whether to assign utterances to different speakers. Currently, this component + supports only two-speaker diarization. Diarization is enabled by default. -- `CLEANUP`: Whether to delete files from Azure Blob storage container when processing is complete. It is recommended to always keep this enabled, unless it is expected that the same piece of media will be processed multiple times. +- `CLEANUP`: Whether to delete files from Azure Blob storage container when processing is + complete. It is recommended to always keep this enabled, unless it is expected that the + same piece of media will be processed multiple times. -- `BLOB_ACCESS_TIME`: The amount of time in minutes for which the Azure Speech service will have access to the file in blob storage. +- `BLOB_ACCESS_TIME`: The amount of time in minutes for which the Azure Speech service + will have access to the file in blob storage. @@ -50,15 +67,18 @@ Returned `AudioTrack` objects have the following members in their `detection_pro | `MISSING_LANGUAGE_MODELS` | All languages for which transcription was considered, but which were either invalid ISO 639-3 codes, did not have a corresponding BCP-47 code, or were not supported by the Azure Speech endpoint. | -AudioTracks also have the `start_time` and `stop_time` of their associated utterance's voiced segment, and the utterance `confidence`, as returned by Azure. +AudioTracks also have the `start_time` and `stop_time` of their associated utterance's +voiced segment, and the utterance `confidence`, as returned by Azure. # Sample Program -`sample_acs_speech_detector.py` can be used to quickly test with the Azure endpoint. Run with the `-h` flag to see accepted command-line arguments. +`sample_acs_speech_detector.py` can be used to quickly test with the Azure endpoint. Run +with the `-h` flag to see accepted command-line arguments. 
# Language Identifiers -The following are the BCP-47 codes and their corresponding languages which Azure Speech-to-Text supports. +The following are the BCP-47 codes and their corresponding languages which Azure +Speech-to-Text supports. | Language | Locale (BCP-47) | @@ -209,9 +229,15 @@ The following are the BCP-47 codes and their corresponding languages which Azure ## Dynamic Speech Selection -The below table describes the component's default behavior when supplied an ISO 639-3 language code by an upstream language identification component in a feed-forward track. For languages with multiple dialects (indicated by an asterisk), a BCP-47 locale was chosen according to internal data, which may not be desirable in all cases. This selection can be altered by editing `acs_speech_component/azure_utils.py`. +The below table describes the component's default behavior when supplied an ISO 639-3 +language code by an upstream language identification component in a feed-forward track. +For languages with multiple dialects (indicated by an asterisk), a BCP-47 locale was +chosen according to internal data, which may not be desirable in all cases. This selection +can be altered by editing `acs_speech_component/azure_utils.py`. -If the language code supplied by a feed-forward track is not handled in `acs_speech_component/azure_utils.py`, the component will raise an `INVALID_PROPERTY` exception. +If the language code supplied by a feed-forward track is not handled in +`acs_speech_component/azure_utils.py`, the component will raise an `INVALID_PROPERTY` +exception. 
| ISO 639--3 | Language | BCP-47 | | ---------- | ---------------------------- | ------ | diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index 4bb07242..753fcd48 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -62,7 +62,7 @@ "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"], est=["et-EE"], # Estonian (Inclusive) ekk=["et-EE"], # Standard Estonian - #vro=["et-EE"], Voro, doesn't seem to be direct match + # vro=["et-EE"], Voro, doesn't seem to be a direct match eus=["eu-ES"], fas=["fa-IR"], fin=["fi-FI"], @@ -75,7 +75,7 @@ hin=["hi-IN"], hrv=["hr-HR"], hun=["hu-HU"], - #ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU" + # ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU" # gug=["gn"], # Deprecated # hat=[], # hau=["ha"], # Deprecated @@ -125,8 +125,8 @@ pst=["ps-AF"], # Central Pahsto pbt=["ps-AF"], # Southern Pahsto sin=["si-LK"], - #prs=["prs-AF"], # Deprecated - #pus=["pa-AF"], # Deprecated + # prs=["prs-AF"], # Deprecated + # pus=["pa-AF"], # Deprecated ron=["ro-RO"], # ro-MD deprecated # run=[], rus=["ru-RU"], @@ -146,11 +146,11 @@ tel=["te-IN"], # wbq = ["te-IN"], Waddar/Vadari is related to Telugu. # tat=[], - #tgk=["tg-TJ"], # Deprecated + # tgk=["tg-TJ"], # Deprecated tgl=["fil-PH"], # "tl-PH" deprecated tha=["th-TH"], # tir=[], - #tpi=["tpi-PG"], # Deprecated + # tpi=["tpi-PG"], # Deprecated tur=["tr-TR"], ukr=["uk-UA"], urd=["ur-IN"], diff --git a/python/AzureTranslation/README.md b/python/AzureTranslation/README.md index e78a9408..d58c4aa4 100644 --- a/python/AzureTranslation/README.md +++ b/python/AzureTranslation/README.md @@ -3,8 +3,8 @@ This repository contains source code for the OpenMPF Azure Cognitive Services Translation Component.
This component utilizes the [Azure Cognitive Services Translator REST endpoint](https://docs.microsoft.com/en-us/azure/cognitive-services/translator/reference/v3-0-translate) -to translate the content of detection properties. It has only been tested -against v3.0 of the API. +to translate the content of detection properties. It has been tested against v3.0 of the +API. This component translates the content of existing detection properties, so it only makes sense to use it with @@ -31,13 +31,13 @@ In order for the component to process any jobs, the job properties listed below must be provided. Neither has a default value. - `ACS_URL`: Base URL for the Azure Cognitive Services Translator Endpoint. - e.g. `https://api.cognitive.microsofttranslator.com` or - `https://<custom-translate-host>/translator/text/v3.0`. The URL should - not end with `/translate` because two separate endpoints are - used. `ACS_URL + '/translate'` is used for translation. - `ACS_URL + '/breaksentence'` is used to break up text when it is too long - for a single translation request. This property can also be configured - using an environment variable named `MPF_PROP_ACS_URL`. + e.g. `https://api.cognitive.microsofttranslator.com` or + `https://<custom-translate-host>/translator/text/v3.0`. The URL should + not end with `/translate` because two separate endpoints are + used. `ACS_URL + '/translate'` is used for translation. + `ACS_URL + '/breaksentence'` is used to break up text when it is too long + for a single translation request. This property can also be configured + using an environment variable named `MPF_PROP_ACS_URL`. - `ACS_SUBSCRIPTION_KEY`: A string containing your Azure Cognitive Services subscription key. To get one you will need to create an @@ -47,7 +47,8 @@ must be provided. Neither has a default value. # Important Job Properties: - `TO_LANGUAGE`: The BCP-47 language code for language that the properties - should be translated to. + should be translated to.
+ - `FEED_FORWARD_PROP_TO_PROCESS`: Comma-separated list of property names indicating which properties in the feed-forward track or detection to consider translating. For example, `TEXT,TRANSCRIPT`. If the first property listed is @@ -68,7 +69,8 @@ must be provided. Neither has a default value. - `SUGGESTED_FROM_LANGUAGE`: Optional property that indicates the fallback source BCP-47 language code to use when automatic language detection fails. - The value from this property is only used when automatic language detection fails. `SUGGESTED_FROM_LANGUAGE` is the preferred setting to adjust when users know + The value from this property is only used when automatic language detection fails. + `SUGGESTED_FROM_LANGUAGE` is the preferred setting to adjust when users know they are processing a large amount of text in a particular language, but other source languages may be present in individual pieces of text. For instance, setting `SUGGESTED_FROM_LANGUAGE=es` would allow the component to diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index 37a42411..402da4e4 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -240,7 +240,7 @@ TTS = 'lo', # Isan (Thailand Lao) LVS = 'lv', # Standard Latvian language - #LTG = 'lv' # Latgalian language (Historical Form) + # LTG = 'lv' # Latgalian language (Historical Form) MAC = 'mk', # 639-2 Code Variant of Macedonian @@ -272,8 +272,8 @@ # Persian has many variants # Only including top three - #PES = 'fa' # Iranian Persian - Default below - #PRS = 'fa' # Dari - Default Below + # PES = 'fa' # Iranian Persian - Default below + # PRS = 'fa' # Dari - Default Below TGK = 'fa', # Tajik PNB = 'pa', # Western Punjabi/Panjabi @@ -285,8 +285,8 @@ # Swahili Variants SWC='sw', # Congo Swahili SWH='sw', # Coastal Swahili 
- #YMK='sw', # Makwe (?) - #WMW='sw', # Mwani (?) + # YMK='sw', # Makwe (?) + # WMW='sw', # Mwani (?) TIB='bo', # 639-2 Code Variant of Tibetan