diff --git a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py index 753fcd48..37a061a9 100644 --- a/python/AzureSpeechDetection/acs_speech_component/azure_utils.py +++ b/python/AzureSpeechDetection/acs_speech_component/azure_utils.py @@ -28,140 +28,159 @@ # BCP-47 code in each value list will be used (the rest are primarily for # reference in case of later changes or customization) +# This mapping is also used to generate `ISO_LANGUAGE` codes after +# Speech-to-Text conversion. The last ISO code listed in the key list will be used. + # Supported languages can be found here: # https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt ISO6393_TO_BCP47 = dict( - afr=["af-ZA"], - amh=["am-ET"], + afr=["af-ZA"], # Afrikaans + amh=["am-ET"], # Amharic ara=["ar-EG", "ar-SA", "ar-IQ", "ar-IL", "ar-AE", "ar-SY", "ar-LY", "ar-DZ", "ar-BH", "ar-JO", "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS", "ar-QA", - "ar-TN", "ar-YE"], - aze=["az-AZ"], + "ar-TN", "ar-YE"], # Arabic azj=["az-AZ"], # North Azerbaijani azb=["az-AZ"], # South Azerbaijani + aze=["az-AZ"], # Azerbaijani (Inclusive) # bel=["be-BY"], # Deprecated ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been deprecated # bod=["bo"], # Deprecated - bul=["bg-BG"], - bos=["bs-BA"], - cat=["ca-ES"], + bul=["bg-BG"], # Bulgarian + bos=["bs-BA"], # Bosnian + cat=["ca-ES"], # Catalan, Valencian # ceb=["ceb"], - ces=["cs-CZ"], - cze=["cs-CZ"],# ISO-639-2 Variant - cym=["cy-GB"], - wel=["cy-GB"],# ISO-639-2 Variant + cze=["cs-CZ"], # Czech ISO-639-2 Variant + ces=["cs-CZ"], # Czech + wel=["cy-GB"], # Welsh ISO-639-2 Variant + cym=["cy-GB"], # Welsh dan=["da-DK"], # Note: There is a related dialect JUT - Jutlandic jut=["da-DK"], # Upon further research, Jutlantic is present in Denmark # but declining over time. 
- deu=["de-DE", "de-AT", "de-CH"], + ger=["de-DE", "de-AT", "de-CH"], # German ISO-639-2 Variant + deu=["de-DE", "de-AT", "de-CH"], # German # Many other forms of German exist gsw=["de-CH"], # Swiss German bar=["de-AT"], # Bavarian / Upper German variant common in most of Austria - ell=["el-GR"], + gre=["el-GR"], # Greek ISO-639-2 Variant + ell=["el-GR"], # Greek eng=["en-US", "en-CA", "en-GB", "en-AU", "en-GH", "en-HK", "en-IN", "en-IE", "en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"], - est=["et-EE"], # Estonian (Inclusive) ekk=["et-EE"], # Standard Estonian + est=["et-EE"], # Estonian (Inclusive) # vro=["et-EE"], Voro, doesn't seem to be direct match - eus=["eu-ES"], - fas=["fa-IR"], - fin=["fi-FI"], - fil=["fil-PH"], - fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], - gle=["ga-IE"], - glg=["gl-ES"], - guj=["gu-IN"], - heb=["he-IL"], - hin=["hi-IN"], - hrv=["hr-HR"], - hun=["hu-HU"], + baq=["eu-ES"], # Basque ISO-639-2 Variant + eus=["eu-ES"], # Basque + fin=["fi-FI"], # Finnish + fre=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], # French ISO-639-2 Variant + fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], # French + gle=["ga-IE"], # Irish + glg=["gl-ES"], # Galician + guj=["gu-IN"], # Gujarati + heb=["he-IL"], # Hebrew + hin=["hi-IN"], # Hindi + hrv=["hr-HR"], # Croatian + hun=["hu-HU"], # Hungarian # ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU" # gug=["gn"], # Deprecated # hat=[], # hau=["ha"], # Deprecated # hbs=["sh"], # Deprecated - # hye=["hy"], - ita=["it-IT", "it-CH"], - ind=["id-ID"], - ice=["is-IS"], - isl=["is-IS"], - jav=["jv-ID"], - jpn=["ja-JP"], - kat=["ka-GE"], - kaz=["kk-KZ"], - khm=["km-KH"], + arm=['hy-AM'], # Armenian ISO-639-2 Variant + hye=['hy-AM'], # Armenian + ita=["it-IT", "it-CH"], # Italian + ind=["id-ID"], # Indonesian + ice=["is-IS"], # Icelandic ISO-639-2 Variant + isl=["is-IS"], # Icelandic + jav=["jv-ID"], # Javanese + jpn=["ja-JP"], # Japanese + geo=["ka-GE"], # Georgian ISO-639-2 Variant + kat=["ka-GE"], 
# Georgian + kaz=["kk-KZ"], # Kazakh kxm=["km-KH"], # Northern Khmer, might not work as well. - kan=["kn-IN"], + khm=["km-KH"], # Central Khmer + kan=["kn-IN"], # Kannada # kir=["ky-KG"], # Deprecated - kor=["ko-KR"], + kor=["ko-KR"], # Korean # kur=["ku"], # Deprecated - lao=["lo-LA"], - lit=["lt-LT"], - lav=["lv-LV"], + lao=["lo-LA"], # Lao + lit=["lt-LT"], # Lithuanian lvs=["lv-LV"], # Standard Latvian + lav=["lv-LV"], # Latvian (Inclusive) # luo=[], - mkd=["mk-MK"], - mya=["my-MM"], - mal=["ml-IN"], - mon=["mn-MN"], # Mongolian (Inclusive) + mac=["mk-MK"], # Macedonian ISO 639-2 Variant + mkd=["mk-MK"], # Macedonian + bur=["my-MM"], # Burmese / Myanmar ISO 639-2 Variant + mya=["my-MM"], # Burmese / Myanmar + mal=["ml-IN"], # Malayalam khk=["mn-MN"], # Khalkha Mongolian (Predominant) mvf=["mn-MN"], # Peripheral Mongolian (Part) - mar=["mr-IN"], - zsm=["ms-MY"], - mlt=["mt-MT"], - nob=["nb-NO"], - nep=["ne-NP"], # Nepali (Macrolanguage) + mon=["mn-MN"], # Mongolian (Inclusive) + mar=["mr-IN"], # Marathi + may=["ms-MY"], # Malay ISO 639-2 Variant + msa=["ms-MY"], # Malay (Inclusive, Macrolanguage) + zsm=["ms-MY"], # Standard Malay (Malaysian Malay) + # In this case, the ms-MY code indicates ZSM (Standard Malay) + mlt=["mt-MT"], # Maltese + nob=["nb-NO"], # Norwegian Bokmål (Norway) npi=["ne-NP"], # Nepali - nld=["nl-NL", "nl-BE"], # Netherlands and Belgium + nep=["ne-NP"], # Nepali (Inclusive, Macrolanguage) + dut=["nl-NL", "nl-BE"], # Dutch ISO 639-2 Variant + nld=["nl-NL", "nl-BE"], # Dutch - Netherlands and Belgium # omr=["mr-IN"], # Old Maranthi, might not work # nde=["nd"], # orm=["om"], - pan=["pa-IN"], - pes=["fa-IR"], - pol=["pl-PL"], + pan=["pa-IN"], # Punjabi, Panjabi + per=["fa-IR"], # Persian ISO-639-2 Variant + fas=["fa-IR"], # Persian + pes=["fa-IR"], # Iranian Persian + pol=["pl-PL"], # Polish por=["pt-BR", "pt-PT"], # pt-BR = Portuguese Brazil, pt-PT = Portuguese Portugal - pus=["ps-AF"], # Pashto, Pushto (Inclusive) pbu=["ps-AF"], # Northern 
Pahsto pst=["ps-AF"], # Central Pahsto pbt=["ps-AF"], # Southern Pahsto - sin=["si-LK"], + pus=["ps-AF"], # Pashto, Pushto (Inclusive) + sin=["si-LK"], # Sinhala, Sinhalese # prs=["prs-AF"], # Deprecated # pus=["pa-AF"], # Deprecated - ron=["ro-RO"], # ro-MD deprecated + rum=["ro-RO"], # Romanian ISO-639-2 Variant + ron=["ro-RO"], # Romanian, Moldavian, Moldovan + # ro-MD deprecated # run=[], - rus=["ru-RU"], - slk=["sk-SK"], - slv=["sl-SI"], + rus=["ru-RU"], # Russian + slo=["sk-SK"], # Slovak ISO-639-2 Variant + slk=["sk-SK"], # Slovak + slv=["sl-SI"], # Slovenian # sna=["sn"], - som=["so-SO"], + som=["so-SO"], # Somali spa=["es-MX", "es-US", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU", "es-DO", "es-EC", "es-SV", "es-GQ", "es-GT", "es-HN", "es-NI", "es-PA", - "es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"], - - sqi=["sq-AL"], - swa=["sw-KE", "sw-TZ"], - swe=["sv-SE"], - srp=["sr-RS"], - tam=["ta-IN"], - tel=["te-IN"], + "es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"], # Spanish + alb=["sq-AL"], # Albanian ISO-639-2 Variant + sqi=["sq-AL"], # Albanian + swa=["sw-KE", "sw-TZ"], # Swahili + swe=["sv-SE"], # Swedish + srp=["sr-RS"], # Serbian + tam=["ta-IN"], # Tamil + tel=["te-IN"], # Telugu # wbq = ["te-IN"], Waddar/Vadari is related to Telugu. 
# tat=[], # tgk=["tg-TJ"], # Deprecated - tgl=["fil-PH"], # "tl-PH" deprecated - tha=["th-TH"], + tgl=["fil-PH"], # "tl-PH" deprecated, Tagalog + fil=["fil-PH"], # Filipino (Standardized form of Tagalog) + tha=["th-TH"], # Thai # tir=[], # tpi=["tpi-PG"], # Deprecated - tur=["tr-TR"], - ukr=["uk-UA"], - urd=["ur-IN"], - uzb=["uz-UZ"], - vie=["vi-VN"], - cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], - zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], + tur=["tr-TR"], # Turkish + ukr=["uk-UA"], # Ukrainian + urd=["ur-IN"], # Urdu + uzb=["uz-UZ"], # Uzbek + vie=["vi-VN"], # Vietnamese + cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], # Mandarin Chinese + zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], # Chinese yue=["yue-CN", "zh-HK"], # Cantonese - wuu=["wuu-CN"], + wuu=["wuu-CN"], # Chinese (Wu, Simplified) nan=["zh-TW"], # nan-TW deprecated # Note, Taiwanese has one standard + one major dialect, # not sure which is covered better by Azure. - zul=["zu-ZA"] + zul=["zu-ZA"] # Zulu ) \ No newline at end of file diff --git a/python/AzureTranslation/acs_translation_component/convert_language_code.py b/python/AzureTranslation/acs_translation_component/convert_language_code.py index 967f0607..7cf30c1b 100644 --- a/python/AzureTranslation/acs_translation_component/convert_language_code.py +++ b/python/AzureTranslation/acs_translation_component/convert_language_code.py @@ -43,82 +43,82 @@ } ISO6393_TO_BCP47 = dict( - AFK='af', - AMH='am', - ARA='ar', # Note, Large number of variants - ASM='as', - AZE='az', - BAK='bk', - BEN='bn', - BHO='bho', # Azure uses ISO code. - BOD='bo', - BOS='bs', - BRX='brx', # Azure uses ISO code. 
- BUL='bg', - CAT='ca', - CES='cs', + AFR='af', # Afrikaans + AMH='am', # Amharic + ARA='ar', # Note, Large number of variants for Arabic + ASM='as', # Assamese + AZE='az', # Azerbaijani + BAK='ba', # Bashkir + BEN='bn', # Bengali + BHO='bho', # Azure uses ISO code for Bhojpuri. + BOD='bo', # Tibetan + BOS='bs', # Bosnian + BRX='brx', # Azure uses ISO code for Boro/Bodo. + BUL='bg', # Bulgarian + CAT='ca', # Catalan, Valencian + CES='cs', # Czech ZHO='zh-hans', # Choosing to associate baseline Chinese as simplfied variant. ZH='zh-hans',# Choosing to associate baseline Chinese as simplfied variant. # Change to zh-hant if needed. - CMN='zh-hans', - CYM='cy', + CMN='zh-hans', # Mandarin Chinese + CYM='cy', # Welsh DAN='da', # Insular Danish DEU='de', # German, note: a lot of variants exist - DIV='dv', - DOI='doi', # Two other variants - DSB='dsb', - EUS='eu', - ELL='el', - ENG='en', - EST='et', # Two other variants - FAO='fo', - FIJ='fj', - FIL='fil', - FIN='fi', - FRA='fr', - GLE='ga', - GLG='gl', - GUJ='gu', - HAT='ht', - HAU='ha', + DIV='dv', # Divehi, Dhivehi, Maldivian + DOI='doi', # Two other variants for Dogri. + DSB='dsb', # Lower Sorbian + EUS='eu', # Basque + ELL='el', # Modern Greek + ENG='en', # English + EST='et', # Two other variants of Estonian + FAO='fo', # Faroese + FIJ='fj', # Fijian + FIL='fil', # Filipino + FIN='fi', # Finnish + FRA='fr', # French + GLE='ga', # Irish + GLG='gl', # Galician + GUJ='gu', # Gujarati + HAT='ht', # Haitian, Haitian Creole + HAU='ha', # Hausa HEB='he', # Several archaic forms: https://en.wikipedia.org/wiki/Hebrew_language - HIN='hi', - HRV='hr', - HSB='hsb', - HUN='hu', # Old Hungarian also exists as code `OHU` - HYE='hy', - IBO='ig', - IND='id', - IKT='ikt', - IKU='iu', - ISL='is', - ITA='it', - JPN='ja', - KAT='ka', - KAN='kn', - KAS='ks', - KAZ='kk', - KHM='km', - KIN='rw', - KIR='ky', - LUG='lug', + HIN='hi', # Hindi + HRV='hr', # Croatian + HSB='hsb', # Upper Sorbian + HUN='hu', # Hungarian. 
Note, Old Hungarian also exists as code `OHU` + HYE='hy', # Armenian + IBO='ig', # Igbo + IND='id', # Indonesian + IKT='ikt', # Inuinnaqtun + IKU='iu', # Inuktitut + ISL='is', # Icelandic + ITA='it', # Italian + JPN='ja', # Japanese + KAT='ka', # Georgian + KAN='kn', # Kannada + KAS='ks', # Kashmiri + KAZ='kk', # Kazakh + KHM='km', # Central Khmer + KIN='rw', # Kinyarwanda + KIR='ky', # Kirghiz, Kyrgyz + LUG='lug', # Luganda GOM='gom', # Goan Konkani, other two ISO variants redirected to this - KOR='ko', - KUR='ku', + KOR='ko', # Korean + KUR='ku', # Kurdish CKB='ku', # Azure noted Central Kurdish is supported as Ku KMR='kmr', # Northern Kurdish - # There areNorthern two other variants of Kurdish but them don't seem to be directly supported. - LAO='lo', - LAV='lv', - LIN='ln', - LIT='lt', # There is an old Lithuanian variant (OLT) - LZH='lzh', - MAI='mai', - MAL='ml', - MAR='mr', # There's also an old variant Marathi variant (OMR) - MKD='mk', - MLG='mg', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malagasy_language - MLT='mt', + # There are two other variants of Kurdish but they don't seem to be directly supported. + LAO='lo', # Lao + LAV='lv', # Latvian + LIN='ln', # Lingala + LIT='lt', # Lithuanian. Note, there is an old Lithuanian variant (OLT) + LZH='lzh', # Chinese (Literary) + MAI='mai', # Maithili + MAL='ml', # Malayalam + MAR='mr', # Marathi. Note, there's also an old Marathi variant (OMR) + MKD='mk', # Macedonian + MLG='mg', # Malagasy. Note: Many regional forms: https://en.wikipedia.org/wiki/Malagasy_language + MLT='mt', # Maltese MON = 'mn-cyrl', # Note: Azure also supports the traditional Mongolian script as `mn-Mong` # The primary script these days is Cyrllic/Latin. # From https://en.wikipedia.org/wiki/Mongolian_writing_systems: @@ -126,65 +126,66 @@ # traditional Mongolian script alongside Cyrillic in official documents starting from 2025." 
KHK='mn-cyrl', # Khalkha Mongolian MVF='mn-mong', # Peripheral Mongolian (part) - MWW='mww', - MRI='mi', + MWW='mww', # Hmong Daw (Latin) + MRI='mi', # Maori MSA='ms', # Note: Many regional forms: https://en.wikipedia.org/wiki/Malay_language - MYA='my', # Several variants exist. - NAN='zh-hant', - NEP='ne', - NPI='ne', - NLD='nl', - NOR='no', - NOB='no', # Two subtypes of Norwegian in active use. - NNO='no', - NYA='ny', - NSO='nso', - ORI='or', # Several variants exist. - ORY='or', - OTQ='otq', - PAN='pa', - PES='fa', - POL='pl', + MYA='my', # Burmese/Myanmar. Note, several variants exist. + NAN='zh-hant', # Southern Min Chinese + NEP='ne', # Nepali + NPI='ne', # Nepali (individual language) + NLD='nl', # Dutch, Flemish + NOR='nb', # Note: `nn`, `no`, and `nb` correspond to variants of Norwegian. But Azure + # only supports `nb`. Also redirecting default Norwegian. + NOB='nb', # Norwegian Bokmål (Norway) + #NNO='nb', # Norwegian Nynorsk is not directly supported. + NYA='nya', # Azure uses the ISO-639-3 code for Nyanja instead of ISO-639-1 (`ny`) + NSO='nso', # Sesotho sa Leboa + ORI='or', # Several variants exist for Odia + ORY='or', # Odia + OTQ='otq', # Queretaro Otomi + PAN='pa', # Punjabi, Panjabi + PES='fa', # Iranian Persian + POL='pl', # Polish POR='pt', # Defaulting to Portuguese (Brazil) - Other variant below - PRS='prs', - PUS='ps', # Several variants exist. - RON='ro', - RUN='rn', - RUS='ru', - SIN='si', - SLK='sk', - SLV='sl', + PRS='prs', # Dari + PUS='ps', # Several variants exist for Pashto, Pushto + RON='ro', # Romanian, Moldavian, Moldovan + RUN='run', # Azure uses ISO-639-3 instead of ISO-639-1 (`rn`) for Rundi. 
+ RUS='ru', # Russian + SIN='si', # Sinhala, Sinhalese + SLK='sk', # Slovak + SLV='sl', # Slovenian SMO='sm', # Samoan Latin - SOM='so', - SOT='st', + SOM='so', # Somali + SOT='st', # Southern Sotho SRP='sr-Cyrl', # Note: Serbian language is fully digraphic, two popular script forms exist # Cyrillic is the official version adopted by Serbia - SPA='es', + SPA='es', # Spanish SNA='sn', # Three other variants exist under Shona language. - SND='sd', - SQI='sq', - SWA='sw', - SWE='sv', - TAH='ty', - TAM='ta', # Old Tamil variant exists as OTY - TAT='tt', - TEL='te', # Related language: wbq – Waddar (Vadari), not included. - THA='th', - TIR='ti', - TSN='tn', - TUK='tk', - TUR='tr', - TON='to', - UIG='ug', - UKR='uk', - URD='ur', - UZB='uz', - VIE='vi', - XHO='xh', - YOR='yo', - YUE='yue', - YUA='yua', - ZUL='zu', + SND='sd', # Sindhi + SQI='sq', # Albanian + SWA='sw', # Swahili + SWE='sv', # Swedish + TAH='ty', # Tahitian + TAM='ta', # Tamil. Note, Old Tamil variant exists as OTY + TAT='tt', # Tatar + TEL='te', # Telugu. Related language: wbq – Waddar (Vadari), not included. 
+ THA='th', # Thai + TIR='ti', # Tigrinya + TSN='tn', # Tswana + TUK='tk', # Turkmen + TUR='tr', # Turkish + TON='to', # Tonga (Tonga Islands) + UIG='ug', # Uighur, Uyghur + UKR='uk', # Ukrainian + URD='ur', # Urdu + UZB='uz', # Uzbek + VIE='vi', # Vietnamese + XHO='xh', # Xhosa + YOR='yo', # Yoruba + YUE='yue', # Cantonese (Traditional) + YUA='yua', # Yucatec Maya + ZUL='zu', # Zulu ) # These cover conflicting 639-2 codes and less common variants @@ -274,6 +275,8 @@ # Only including top three # PES = 'fa' # Iranian Persian - Default below # PRS = 'fa' # Dari - Default Below + PER = 'fa', # Persian (Inclusive) + FAS = 'fa', # Persian (Inclusive) TGK = 'fa', # Tajik PNB = 'pa', # Western Punjabi/Panjabi @@ -308,7 +311,6 @@ } - BCP_CODES = BCP_CODES_ONLY | \ set(ISO6393_TO_BCP47.values()) | \ set(ISO639_WITH_SCRIPT_TO_BCP47.values()) @@ -332,8 +334,8 @@ def iso_to_bcp(language_code: str) -> Optional[str]: elif lang_code.lower() in BCP_CODES: return lang_code.lower() elif lang_info := langcodes.get(lang_code): - # TODO, after langcodes conversion, we may want to consider double checking the BCP codes again - # discard if value does not match supported codes. + # TODO, after langcodes conversion, we may want to consider double checking the BCP codes + # again and discarding any value that does not match supported codes. return lang_info.language elif bcp_code_var := ISO639_VAR_TO_BCP47.get(lang_code): logger.warning( diff --git a/python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json b/python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json deleted file mode 100644 index 43dc2324..00000000 --- a/python/AzureTranslation/tests/data/split-sentence/break-sentence-art-of-war-results.json +++ /dev/null @@ -1,5 +0,0 @@ -[ - { - "sentLen": [24, 37, 81, 22, 18, 5, 5, 5, 5, 5, 5, 8, 27, 15, 10, 7, 27, 50, 12, 28, 13, 11] - } -]