Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
179 changes: 99 additions & 80 deletions python/AzureSpeechDetection/acs_speech_component/azure_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,140 +28,159 @@
# BCP-47 code in each value list will be used (the rest are primarily for
# reference in case of later changes or customization)

# This mapping is also used to generate `ISO_LANGUAGE` codes after
# Speech-to-Text conversion. The last ISO code listed in the key list will be used.

# Supported languages can be found here:
# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt
ISO6393_TO_BCP47 = dict(
afr=["af-ZA"],
amh=["am-ET"],
afr=["af-ZA"], # Afrikaans
amh=["am-ET"], # Amharic
ara=["ar-EG", "ar-SA", "ar-IQ", "ar-IL", "ar-AE", "ar-SY", "ar-LY", "ar-DZ",
"ar-BH", "ar-JO", "ar-KW", "ar-LB", "ar-MA", "ar-OM", "ar-PS", "ar-QA",
"ar-TN", "ar-YE"],
aze=["az-AZ"],
"ar-TN", "ar-YE"], # Arabic
azj=["az-AZ"], # North Azerbaijani
azb=["az-AZ"], # South Azerbaijani
aze=["az-AZ"], # Azerbaijani (Inclusive)
# bel=["be-BY"], # Deprecated
ben=["bn-IN"], # "bn-BD" Bengali-Bangladesh has been deprecated
# bod=["bo"], # Deprecated
bul=["bg-BG"],
bos=["bs-BA"],
cat=["ca-ES"],
bul=["bg-BG"], # Bulgarian
bos=["bs-BA"], # Bosnian
cat=["ca-ES"], # Catalan, Valencian
# ceb=["ceb"],
ces=["cs-CZ"],
cze=["cs-CZ"],# ISO-639-2 Variant
cym=["cy-GB"],
wel=["cy-GB"],# ISO-639-2 Variant
cze=["cs-CZ"], # Czech ISO-639-2 Variant
ces=["cs-CZ"], # Czech
wel=["cy-GB"], # Welsh ISO-639-2 Variant
cym=["cy-GB"], # Welsh
dan=["da-DK"], # Note: There is a related dialect JUT - Jutlandic
jut=["da-DK"], # Upon further research, Jutlandic is present in Denmark
# but declining over time.
deu=["de-DE", "de-AT", "de-CH"],
ger=["de-DE", "de-AT", "de-CH"], # German ISO-639-2 Variant
deu=["de-DE", "de-AT", "de-CH"], # German
# Many other forms of German exist
gsw=["de-CH"], # Swiss German
bar=["de-AT"], # Bavarian / Upper German variant common in most of Austria
ell=["el-GR"],
gre=["el-GR"], # Greek ISO-639-2 Variant
ell=["el-GR"], # Greek
eng=["en-US", "en-CA", "en-GB", "en-AU", "en-GH", "en-HK", "en-IN", "en-IE",
"en-KE", "en-NZ", "en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ"],
est=["et-EE"], # Estonian (Inclusive)
ekk=["et-EE"], # Standard Estonian
est=["et-EE"], # Estonian (Inclusive)
# vro=["et-EE"], # Võro, does not appear to be a direct match for et-EE
eus=["eu-ES"],
fas=["fa-IR"],
fin=["fi-FI"],
fil=["fil-PH"],
fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"],
gle=["ga-IE"],
glg=["gl-ES"],
guj=["gu-IN"],
heb=["he-IL"],
hin=["hi-IN"],
hrv=["hr-HR"],
hun=["hu-HU"],
baq=["eu-ES"], # Basque ISO-639-2 Variant
eus=["eu-ES"], # Basque
fin=["fi-FI"], # Finnish
fre=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], # French ISO-639-2 Variant
fra=["fr-FR", "fr-BE", "fr-CA", "fr-CH"], # French
gle=["ga-IE"], # Irish
glg=["gl-ES"], # Galician
guj=["gu-IN"], # Gujarati
heb=["he-IL"], # Hebrew
hin=["hi-IN"], # Hindi
hrv=["hr-HR"], # Croatian
hun=["hu-HU"], # Hungarian
# ohu=["hu-HU"], # Note: Old-Hungarian, might not fully work with modern "hu-HU"
# gug=["gn"], # Deprecated
# hat=[],
# hau=["ha"], # Deprecated
# hbs=["sh"], # Deprecated
# hye=["hy"],
ita=["it-IT", "it-CH"],
ind=["id-ID"],
ice=["is-IS"],
isl=["is-IS"],
jav=["jv-ID"],
jpn=["ja-JP"],
kat=["ka-GE"],
kaz=["kk-KZ"],
khm=["km-KH"],
arm=['hy-AM'], # Armenian ISO-639-2 Variant
hye=['hy-AM'], # Armenian
ita=["it-IT", "it-CH"], # Italian
ind=["id-ID"], # Indonesian
ice=["is-IS"], # Icelandic ISO-639-2 Variant
isl=["is-IS"], # Icelandic
jav=["jv-ID"], # Javanese
jpn=["ja-JP"], # Japanese
geo=["ka-GE"], # Georgian ISO-639-2 Variant
kat=["ka-GE"], # Georgian
kaz=["kk-KZ"], # Kazakh
kxm=["km-KH"], # Northern Khmer, might not work as well.
kan=["kn-IN"],
khm=["km-KH"], # Central Khmer
kan=["kn-IN"], # Kannada
# kir=["ky-KG"], # Deprecated
kor=["ko-KR"],
kor=["ko-KR"], # Korean
# kur=["ku"], # Deprecated
lao=["lo-LA"],
lit=["lt-LT"],
lav=["lv-LV"],
lao=["lo-LA"], # Lao
lit=["lt-LT"], # Lithuanian
lvs=["lv-LV"], # Standard Latvian
lav=["lv-LV"], # Latvian (Inclusive)
# luo=[],
mkd=["mk-MK"],
mya=["my-MM"],
mal=["ml-IN"],
mon=["mn-MN"], # Mongolian (Inclusive)
mac=["mk-MK"], # Macedonian ISO 639-2 Variant
mkd=["mk-MK"], # Macedonian
bur=["my-MM"], # Burmese / Myanmar ISO 639-2 Variant
mya=["my-MM"], # Burmese / Myanmar
mal=["ml-IN"], # Malayalam
khk=["mn-MN"], # Khalkha Mongolian (Predominant)
mvf=["mn-MN"], # Peripheral Mongolian (Part)
mar=["mr-IN"],
zsm=["ms-MY"],
mlt=["mt-MT"],
nob=["nb-NO"],
nep=["ne-NP"], # Nepali (Macrolanguage)
mon=["mn-MN"], # Mongolian (Inclusive)
mar=["mr-IN"], # Marathi
may=["ms-MY"], # Malay ISO 639-2 Variant
msa=["ms-MY"], # Malay (Inclusive, Macrolanguage)
zsm=["ms-MY"], # Standard Malay (Malaysian Malay)
# In this case, the ms-MY code indicates ZSM (Standard Malay)
mlt=["mt-MT"], # Maltese
nob=["nb-NO"], # Norwegian Bokmål (Norway)
npi=["ne-NP"], # Nepali
nld=["nl-NL", "nl-BE"], # Netherlands and Belgium
nep=["ne-NP"], # Nepali (Inclusive, Macrolanguage)
dut=["nl-NL", "nl-BE"], # Dutch ISO 639-2 Variant
nld=["nl-NL", "nl-BE"], # Dutch - Netherlands and Belgium
# omr=["mr-IN"], # Old Marathi, might not work
# nde=["nd"],
# orm=["om"],
pan=["pa-IN"],
pes=["fa-IR"],
pol=["pl-PL"],
pan=["pa-IN"], # Punjabi, Panjabi
per=["fa-IR"], # Persian ISO-639-2 Variant
fas=["fa-IR"], # Persian
pes=["fa-IR"], # Iranian Persian
pol=["pl-PL"], # Polish
por=["pt-BR", "pt-PT"], # pt-BR = Portuguese Brazil, pt-PT = Portuguese Portugal
pus=["ps-AF"], # Pashto, Pushto (Inclusive)
pbu=["ps-AF"], # Northern Pashto
pst=["ps-AF"], # Central Pashto
pbt=["ps-AF"], # Southern Pashto
sin=["si-LK"],
pus=["ps-AF"], # Pashto, Pushto (Inclusive)
sin=["si-LK"], # Sinhala, Sinhalese
# prs=["prs-AF"], # Deprecated
# pus=["pa-AF"], # Deprecated
ron=["ro-RO"], # ro-MD deprecated
rum=["ro-RO"], # Romanian ISO-639-2 Variant
ron=["ro-RO"], # Romanian, Moldavian, Moldovan
# ro-MD deprecated
# run=[],
rus=["ru-RU"],
slk=["sk-SK"],
slv=["sl-SI"],
rus=["ru-RU"], # Russian
slo=["sk-SK"], # Slovak ISO-639-2 Variant
slk=["sk-SK"], # Slovak
slv=["sl-SI"], # Slovenian
# sna=["sn"],
som=["so-SO"],
som=["so-SO"], # Somali
spa=["es-MX", "es-US", "es-AR", "es-BO", "es-CL", "es-CO", "es-CR", "es-CU",
"es-DO", "es-EC", "es-SV", "es-GQ", "es-GT", "es-HN", "es-NI", "es-PA",
"es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"],

sqi=["sq-AL"],
swa=["sw-KE", "sw-TZ"],
swe=["sv-SE"],
srp=["sr-RS"],
tam=["ta-IN"],
tel=["te-IN"],
"es-PY", "es-PE", "es-PR", "es-ES", "es-UY", "es-VE"], # Spanish
alb=["sq-AL"], # Albanian ISO-639-2 Variant
sqi=["sq-AL"], # Albanian
swa=["sw-KE", "sw-TZ"], # Swahili
swe=["sv-SE"], # Swedish
srp=["sr-RS"], # Serbian
tam=["ta-IN"], # Tamil
tel=["te-IN"], # Telugu
# wbq = ["te-IN"], Waddar/Vadari is related to Telugu.
# tat=[],
# tgk=["tg-TJ"], # Deprecated
tgl=["fil-PH"], # "tl-PH" deprecated
tha=["th-TH"],
tgl=["fil-PH"], # "tl-PH" deprecated, Tagalog
fil=["fil-PH"], # Filipino (Standardized form of Tagalog)
tha=["th-TH"], # Thai
# tir=[],
# tpi=["tpi-PG"], # Deprecated
tur=["tr-TR"],
ukr=["uk-UA"],
urd=["ur-IN"],
uzb=["uz-UZ"],
vie=["vi-VN"],
cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"],
zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"],
tur=["tr-TR"], # Turkish
ukr=["uk-UA"], # Ukrainian
urd=["ur-IN"], # Urdu
uzb=["uz-UZ"], # Uzbek
vie=["vi-VN"], # Vietnamese
cmn=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], # Mandarin Chinese
zho=["zh-CN", "zh-CN-shandong", "zh-CN-sichuan", "zh-HK", "zh-TW"], # Chinese
yue=["yue-CN", "zh-HK"], # Cantonese
wuu=["wuu-CN"],
wuu=["wuu-CN"], # Chinese (Wu, Simplified)
nan=["zh-TW"], # nan-TW deprecated
# Note, Taiwanese has one standard + one major dialect,
# not sure which is covered better by Azure.
zul=["zu-ZA"]
zul=["zu-ZA"] # Zulu
)
Loading