In [1]:
import pandas as pd
import numpy as np
from cv_languages import LANGUAGES
from cv_release_stats import STATS

## Get the iso639-3 language code table

In [2]:
# Import the table of iso639 codes
iso639 = pd.read_table('iso-639-3.tab', sep = '\t')
iso639 = iso639.drop(['Part2b', 'Part2t', 'Scope', 'Language_Type', 'Comment'], axis = 1).rename(columns = {'Ref_Name': 'name', 'Id': 'code_iso639-3'})
iso639.rename(columns = {'Part1': 'code_iso639-1'}, inplace=True)
print(iso639)

     code_iso639-3 code_iso639-1                name
0              aaa           NaN              Ghotuo
1              aab           NaN          Alumu-Tesu
2              aac           NaN                 Ari
3              aad           NaN                Amal
4              aae           NaN  Arbëreshë Albanian
...            ...           ...                 ...
7914           zyj           NaN     Youjiang Zhuang
7915           zyn           NaN      Yongnan Zhuang
7916           zyp           NaN          Zyphe Chin
7917           zza           NaN                Zaza
7918           zzj           NaN     Zuojiang Zhuang

[7919 rows x 3 columns]


## Get the information of Common Voice corpus

In [3]:
# Load info from Common Voice corpus
cv_lang = pd.DataFrame.from_dict(LANGUAGES, orient = 'index')
cv_log = pd.DataFrame.from_dict(STATS['locales'], orient = 'index')
cv_log['validClips'] = cv_log.buckets.apply(pd.Series)['validated']
cv_log = cv_log[['clips', 'validClips', 'totalHrs', 'validHrs']]
cv_track = pd.concat([cv_lang, cv_log], join = 'inner', axis = 1)
cv_track.rename(columns={0: "name"}, inplace = True)
cv_track.index.name = 'code'

cv_lang_code = cv_track['name'].reset_index()
cv_lang_code[['code', 'code_cv_suffix']] = cv_lang_code.code.str.split('-', expand=True)
cv_lang_code['code_cv_suffix'] = cv_lang_code['code_cv_suffix'].fillna('')

cv_lang2 = cv_lang_code[cv_lang_code['code'].str.len() != 3]
cv_lang2.set_index('code', inplace=True)
cv_lang3 = cv_lang_code[cv_lang_code['code'].str.len() == 3]
cv_lang3.set_index('code', inplace=True)
print('Two-letter coded languages in Common Voice:')
print(cv_lang2)
print('\n')
print('Three-letter coded languages in Common Voice:')
print(cv_lang3)

Two-letter coded languages in Common Voice:
                     name code_cv_suffix
code                                    
ab                 Abkhaz               
af              Afrikaans               
am                Amharic               
ar                 Arabic               
as               Assamese               
...                   ...            ...
yo                 Yoruba               
zh        Chinese (China)             CN
zh    Chinese (Hong Kong)             HK
zh       Chinese (Taiwan)             TW
zu                   Zulu               

[99 rows x 2 columns]


Three-letter coded languages in Common Voice:
                               name code_cv_suffix
code                                              
ast                        Asturian               
bas                           Basaa               
ckb                 Central Kurdish               
cnh                      Hakha Chin               
dyu                          Dioula           

## Find the iso639-3 3-letter codes for languages in Common Voice

In [4]:
# For languages in Common Voice that used a non-three-letter code:
iso639_2 = iso639.rename(columns = {'code_iso639-1': 'code'})
iso639_2.set_index('code', inplace=True)

# Join it with iso639 table
cv_res2 = iso639_2.join(cv_lang2, how = 'right', lsuffix = '_iso639-3', rsuffix = '_cv')
cv_res2.reset_index(inplace=True)

# Update the codes with iso639-3 codes
cv_res2['code_update'] = cv_res2['code_iso639-3']

cv_res2.drop(['code_iso639-3', 'code'], axis = 1, inplace=True)
cv_res2.set_index('code_update', inplace=True)
print('The updated language codes:')
print(cv_res2)

The updated language codes:
            name_iso639-3              name_cv code_cv_suffix
code_update                                                  
abk             Abkhazian               Abkhaz               
afr             Afrikaans            Afrikaans               
amh               Amharic              Amharic               
ara                Arabic               Arabic               
asm              Assamese             Assamese               
...                   ...                  ...            ...
yor                Yoruba               Yoruba               
zho               Chinese      Chinese (China)             CN
zho               Chinese  Chinese (Hong Kong)             HK
zho               Chinese     Chinese (Taiwan)             TW
zul                  Zulu                 Zulu               

[99 rows x 3 columns]


In [5]:
# For languages in Common Voice that used a non-three-letter code:
iso639_3 = iso639.rename(columns = {'code_iso639-3': 'code'})
iso639_3.set_index('code', inplace=True)

# Join it with iso639 table
cv_res3 = iso639_3.join(cv_lang3, how = 'right', lsuffix = '_iso639-3', rsuffix = '_cv')
cv_res3.reset_index(inplace=True)

# Update the codes with iso639-3 codes
cv_res3['code_update'] = cv_res3['code']
cv_res3.drop(['code_iso639-1', 'code'], axis = 1, inplace=True)
cv_res3.set_index('code_update', inplace=True)
print('The updated language codes:')
print(cv_res3)

The updated language codes:
                                       name_iso639-3  \
code_update                                            
ast                                         Asturian   
bas                                  Basa (Cameroon)   
ckb                                  Central Kurdish   
cnh                                       Hakha Chin   
dyu                                            Dyula   
hsb                                    Upper Sorbian   
kab                                           Kabyle   
kmr                                 Northern Kurdish   
lij                                         Ligurian   
ltg                                        Latgalian   
mdf                                           Moksha   
mhr                                     Eastern Mari   
mrj                                     Western Mari   
myv                                            Erzya   
nan                                              NaN   
nhi          Zacatlá

In [6]:
# Get the updated codes
cv_res = pd.concat([cv_res2, cv_res3], axis=0)
cv_res.rename(columns = {'name_cv': 'name'}, inplace=True)
print(cv_res)
del cv_res2, cv_res3, cv_lang2, cv_lang3

                           name_iso639-3       name code_cv_suffix
code_update                                                       
abk                            Abkhazian     Abkhaz               
afr                            Afrikaans  Afrikaans               
amh                              Amharic    Amharic               
ara                               Arabic     Arabic               
asm                             Assamese   Assamese               
...                                  ...        ...            ...
zgh          Standard Moroccan Tamazight  Tamazight               
zza                                 Zaza       Zaza               
luo             Luo (Kenya and Tanzania)     Dholuo               
kln                             Kalenjin   Kalenjin               
dav                                Taita   Kidawida               

[129 rows x 3 columns]


In [7]:
# Update the language code in Common Voice
cv_track.reset_index(inplace=True)
cv_track.set_index('name', inplace=True)
print(cv_track)

cv_res.reset_index(inplace=True)
cv_res.set_index('name',inplace=True)
print(cv_res)

cv_update = cv_track.join(cv_res, how = 'left')
print(cv_update)

del cv_track, cv_res, cv_lang, cv_lang_code, cv_log

          code   clips  validClips  totalHrs  validHrs
name                                                  
Abkhaz      ab   59210       41992     84.41     59.86
Afrikaans   af     405         198      0.56      0.27
Amharic     am    1528         903      2.63      1.55
Arabic      ar  134892       78156    155.81     90.27
Assamese    as    2072        1726      3.27      2.72
...        ...     ...         ...       ...       ...
Zulu        zu      35           0      0.04      0.00
Zaza       zza     412         312      0.44      0.33
Dholuo     luo   59210       41992     84.41     59.86
Kalenjin   kln   59210       41992     84.41     59.86
Kidawida   dav   59210       41992     84.41     59.86

[129 rows x 5 columns]
          code_update                name_iso639-3 code_cv_suffix
name                                                             
Abkhaz            abk                    Abkhazian               
Afrikaans         afr                    Afrikaans             

## Find the iso639-3 codes for the languages in XPF

In [8]:
# Load info from XPF corpus
xpf_list = pd.read_table('xpf_langs-list.tsv')
xpf_list = xpf_list[['code', 'name', 'compromised']]
xpf_list['xpf'] = np.where(pd.isna(xpf_list['compromised']), 'yes', 'compromised')
xpf_list.rename(columns={'name': 'name_xpf'}, inplace=True)
xpf_list = xpf_list.drop(['compromised'], axis = 1)

xpf_list2 = xpf_list[xpf_list['code'].str.len() != 3]
xpf_list2.set_index('code', inplace=True)
xpf_list3 = xpf_list[xpf_list['code'].str.len() == 3]
xpf_list3.set_index('code', inplace=True)

In [9]:
# For languages in Common Voice that used a non-three-letter code:
iso639_2 = iso639.rename(columns = {'code_iso639-1': 'code'})
iso639_2 = iso639_2[pd.notna(iso639_2['code'])]
iso639_2.set_index('code', inplace=True)

# Join it with iso639 table
xpf_res2 = iso639_2.join(xpf_list2, how = 'right', lsuffix = '_iso639-3', rsuffix = '_xpf')
xpf_res2.reset_index(inplace=True)

# Update the codes with iso639-3 codes
xpf_res2.rename(columns = {'code_iso639-3': 'code_update'}, inplace=True)
xpf_res2.set_index('code_update', inplace=True)
xpf_res2.rename(columns = {'code': 'code_xpf'}, inplace=True)
print('The updated language codes:')
print(xpf_res2)

The updated language codes:
            code_xpf                     name         name_xpf          xpf
code_update                                                                
abk               ab                Abkhazian           Abkhaz          yes
arg               an                Aragonese        Aragonese          yes
aym               ay                   Aymara           Aymara          yes
aze               az              Azerbaijani      Azerbaijani          yes
bak               ba                  Bashkir          Bashkir          yes
bel               be               Belarusian        Belarusan          yes
bul               bg                Bulgarian        Bulgarian          yes
bis               bi                  Bislama          Bislama          yes
ces               cs                    Czech            Czech          yes
chv               cv                  Chuvash          Chuvash          yes
div               dv                  Dhivehi        Maldivi

In [10]:
# For languages in Common Voice that used a three-letter code:
iso639_3 = iso639.rename(columns = {'code_iso639-3': 'code'})
iso639_3.set_index('code', inplace=True)

# Join it with iso639 table
xpf_res3 = iso639_3.join(xpf_list3, how = 'right', lsuffix = '_iso639-3', rsuffix = '_xpf')
xpf_res3.reset_index(inplace=True)


# Update the codes with iso639-3 codes
xpf_res3['code_update'] = xpf_res3['code']
xpf_res3.rename(columns = {'code': 'code_xpf'}, inplace=True)
xpf_res3.drop(['code_iso639-1'], axis = 1, inplace=True)
xpf_res3.set_index('code_update', inplace=True)
print('The updated language codes:')
print(xpf_res3)

The updated language codes:
            code_xpf                        name                    name_xpf  \
code_update                                                                    
aak              aak                      Ankave                      Ankave   
aau              aau                        Abau                        Abau   
acf              acf  Saint Lucian Creole French  Saint Lucian Creole French   
aey              aey                       Amele                       Amele   
agg              agg                       Angor                       Angor   
...              ...                         ...                         ...   
tzj              tzj                   Tz'utujil                   Tz'utujil   
tzm              tzm     Central Atlas Tamazight     Central Atlas Tamazight   
wmw              wmw                       Mwani                       Mwani   
zsm              zsm              Standard Malay              Standard Malay   
zza         

In [11]:
xpf_res = pd.concat([xpf_res2, xpf_res3], axis = 0)
xpf_res.drop('name', axis = 1, inplace=True)
print(xpf_res)
del xpf_res2, xpf_res3, xpf_list2, xpf_list3

            code_xpf                 name_xpf          xpf
code_update                                               
abk               ab                   Abkhaz          yes
arg               an                Aragonese          yes
aym               ay                   Aymara          yes
aze               az              Azerbaijani          yes
bak               ba                  Bashkir          yes
...              ...                      ...          ...
tzj              tzj                Tz'utujil          yes
tzm              tzm  Central Atlas Tamazight  compromised
wmw              wmw                    Mwani  compromised
zsm              zsm           Standard Malay  compromised
zza              zza                     Zaza  compromised

[201 rows x 3 columns]


In [12]:
# Load info from Epitran
epi_list = pd.read_csv('epi_langs-list.csv')
epi_list[['code', 'epi_ortho_typ']] = epi_list.Code.str.split('-', n=1, expand = True)
epi_list.rename(columns = {'Code': 'code_epi'}, inplace=True)
epi_list = epi_list.rename(columns={'Language (Script)' : 'name_epi'})
epi_list = epi_list.set_index('code')
epi_list['name_epi'] = epi_list.name_epi.str.replace(' \(.+\)', '', regex=True)
epi_list['epitran'] = 'yes'
epi_list = epi_list.drop('epi_ortho_typ', axis=1).drop_duplicates()

epi_list.index.name = 'code_update'
print(epi_list)

                 code_epi              name_epi epitran
code_update                                            
aar              aar-Latn                  Afar     yes
aii              aii-Syrc  Assyrian Neo-Aramaic     yes
amh              amh-Ethi               Amharic     yes
amh           amh-Ethi-pp               Amharic     yes
amh          amh-Ethi-red               Amharic     yes
...                   ...                   ...     ...
wuu              wuu-Latn       Shanghainese Wu     yes
xho              xho-Latn                 Xhosa     yes
yor              yor-Latn                Yoruba     yes
zha              zha-Latn                Zhuang     yes
zul              zul-Latn                  Zulu     yes

[117 rows x 3 columns]


In [13]:
cv_update.rename(columns={"name": "name_cv"}, inplace= True)
xpf_res.rename(columns={"name": "name_iso"}, inplace= True)
cv_update.reset_index(inplace=True)
cv_update.set_index('code_update', inplace=True)
xpf_res.reset_index(inplace=True)
xpf_res.set_index('code_update', inplace=True)

print("Epitran:\n", epi_list)
print("\nCommon Voice:\n", cv_update)
print("\nXPF:\n", xpf_res)

Epitran:
                  code_epi              name_epi epitran
code_update                                            
aar              aar-Latn                  Afar     yes
aii              aii-Syrc  Assyrian Neo-Aramaic     yes
amh              amh-Ethi               Amharic     yes
amh           amh-Ethi-pp               Amharic     yes
amh          amh-Ethi-red               Amharic     yes
...                   ...                   ...     ...
wuu              wuu-Latn       Shanghainese Wu     yes
xho              xho-Latn                 Xhosa     yes
yor              yor-Latn                Yoruba     yes
zha              zha-Latn                Zhuang     yes
zul              zul-Latn                  Zulu     yes

[117 rows x 3 columns]

Common Voice:
                   name code   clips  validClips  totalHrs  validHrs  \
code_update                                                           
abk             Abkhaz   ab   59210       41992     84.41     59.86   
afr      

In [14]:
# Merge them
vxc_info = cv_update.join(xpf_res, how = 'left')
vxc_info.rename(columns = {'code': 'code_cv'}, inplace=True)
vxc_info = vxc_info.join(epi_list, how = 'left')
vxc_info.reset_index(inplace=True)
vxc_info.drop(['code_cv_suffix'], axis = 1, inplace=True)
vxc_info = vxc_info[['code_cv', 'code_update', 'name', 'name_iso639-3', 'name_xpf', 'name_epi', 'clips', 'validClips', 'totalHrs', 'validHrs', 'xpf', 'code_xpf', 'epitran', 'code_epi']]
vxc_info.rename(columns={'name': 'name_cv', 'code_update' : 'code_iso639_3'}, inplace=True)
vxc_info['spkr_file'] = ''
vxc_info['lexicon'] = ''
vxc_info['acoustic_model'] = ''
print(vxc_info)

vxc_info.drop_duplicates(inplace=True)
vxc_info.to_csv('/Users/miaozhang/switchdrive/vxc_scripts/vxc_lang_code_processing/VoxCommunis_Info.csv')

del cv_update, epi_list, iso639, iso639_2, iso639_3, LANGUAGES, STATS, xpf_list, xpf_res, vxc_info


    code_cv code_iso639_3    name_cv             name_iso639-3 name_xpf  \
0        ab           abk     Abkhaz                 Abkhazian   Abkhaz   
1        af           afr  Afrikaans                 Afrikaans      NaN   
2        am           amh    Amharic                   Amharic      NaN   
3        am           amh    Amharic                   Amharic      NaN   
4        am           amh    Amharic                   Amharic      NaN   
..      ...           ...        ...                       ...      ...   
147      zu           zul       Zulu                      Zulu      NaN   
148     zza           zza       Zaza                      Zaza     Zaza   
149     luo           luo     Dholuo  Luo (Kenya and Tanzania)      NaN   
150     kln           kln   Kalenjin                  Kalenjin      NaN   
151     dav           dav   Kidawida                     Taita      NaN   

    name_epi  clips  validClips  totalHrs  validHrs          xpf code_xpf  \
0        NaN  59210   