In [None]:
import pandas as pd
import numpy as np
from cv_languages import LANGUAGES
from cv_release_stats import STATS

## Get the iso639-3 language code table

In [None]:
# Import the table of ISO 639 codes.
# pandas' default NA parsing treats the literal string "nan" as a missing
# value, but "nan" is itself an ISO 639-3 identifier (Min Nan Chinese).
# Only empty fields are therefore treated as missing here, so that every
# identifier survives as a string and stays available for the joins below.
iso639 = (
    pd.read_table('iso-639-3.tab', sep='\t', keep_default_na=False, na_values=[''])
    # Keep only the 639-3 id, the 639-1 (two-letter) id and the reference name.
    .drop(['Part2b', 'Part2t', 'Scope', 'Language_Type', 'Comment'], axis=1)
    .rename(columns={
        'Id': 'code_iso639-3',
        'Part1': 'code_iso639-1',
        'Ref_Name': 'name',
    })
)
print(iso639)

## Get information about the Common Voice corpus

In [None]:
# Load info from Common Voice corpus
# LANGUAGES maps locale code -> language name; STATS['locales'] maps locale
# code -> per-locale statistics.
cv_lang = pd.DataFrame.from_dict(LANGUAGES, orient='index')
cv_log = pd.DataFrame.from_dict(STATS['locales'], orient='index')
# Each `buckets` entry is expanded into columns; only `validated` is kept.
cv_log['validClips'] = cv_log.buckets.apply(pd.Series)['validated']
cv_log = cv_log[['clips', 'validClips', 'totalHrs', 'validHrs']]
# Inner join on the locale code: keep locales present in both tables.
cv_track = (
    pd.concat([cv_lang, cv_log], join='inner', axis=1)
    .rename(columns={0: 'name'})
)
cv_track.index.name = 'code'

# Split each locale code at its first hyphen into a base code and a suffix.
# `str.partition` always yields three columns (before, separator, after), so
# this works whether a code has zero, one or several hyphens; a code without
# a hyphen gets an empty suffix.
cv_lang_code = cv_track['name'].reset_index()
cv_lang_code['code_cv_suffix'] = cv_lang_code['code'].str.partition('-')[2].fillna('')
cv_lang_code['code'] = cv_lang_code['code'].str.partition('-')[0]

# Separate the base codes that are three letters long from all the others.
cv_lang2 = cv_lang_code[cv_lang_code['code'].str.len() != 3].set_index('code')
cv_lang3 = cv_lang_code[cv_lang_code['code'].str.len() == 3].set_index('code')
print('Two-letter coded languages in Common Voice:')
print(cv_lang2)
print('\n')
print('Three-letter coded languages in Common Voice:')
print(cv_lang3)

## Find the iso639-3 3-letter codes for languages in Common Voice

In [None]:
# Common Voice languages whose base code is not three letters long:
# look them up through the ISO 639-1 column of the ISO table.
iso639_2 = iso639.rename(columns={'code_iso639-1': 'code'}).set_index('code')

# Right join keeps every Common Voice row, matched or not. The clashing
# `name` columns become `name_iso639-3` and `name_cv`.
cv_res2 = (
    iso639_2
    .join(cv_lang2, how='right', lsuffix='_iso639-3', rsuffix='_cv')
    .reset_index()
    # The matched ISO 639-3 code becomes the new key; the old code is dropped.
    .rename(columns={'code_iso639-3': 'code_update'})
    .drop(columns='code')
    .set_index('code_update')
)
print('The updated language codes:')
print(cv_res2)

In [None]:
# Common Voice languages whose base code is three letters long:
# look them up through the ISO 639-3 column of the ISO table.
iso639_3 = iso639.rename(columns={'code_iso639-3': 'code'}).set_index('code')

# Right join keeps every Common Voice row, matched or not. The clashing
# `name` columns become `name_iso639-3` and `name_cv`.
cv_res3 = (
    iso639_3
    .join(cv_lang3, how='right', lsuffix='_iso639-3', rsuffix='_cv')
    .reset_index()
    # The Common Voice base code is used unchanged as the updated code.
    .rename(columns={'code': 'code_update'})
    .drop(columns='code_iso639-1')
    .set_index('code_update')
)
print('The updated language codes:')
print(cv_res3)

In [None]:
# Stack both lookup results into one table keyed by the updated code and
# restore the Common Voice language name under its plain column name.
cv_res = pd.concat([cv_res2, cv_res3]).rename(columns={'name_cv': 'name'})
print(cv_res)
# The intermediate tables are not needed any more.
del cv_res2, cv_res3, cv_lang2, cv_lang3

In [None]:
# Attach the updated codes to the Common Voice statistics.
# Both tables are re-keyed by language name, which is the join key here.
cv_track = cv_track.reset_index().set_index('name')
print(cv_track)

cv_res = cv_res.reset_index().set_index('name')
print(cv_res)

# Left join: every Common Voice locale is kept.
cv_update = cv_track.join(cv_res, how='left')
print(cv_update)

del cv_track, cv_res, cv_lang, cv_lang_code, cv_log

## Find the iso639-3 codes for the languages in XPF

In [None]:
# Load info from XPF corpus
# Built as one chain so that no column is assigned on a column-subset slice
# (which triggers pandas' chained-assignment warning).
xpf_list = (
    pd.read_table('xpf_langs-list.tsv')[['code', 'name', 'compromised']]
    # `xpf` is 'yes' when the `compromised` field is empty, else 'compromised'.
    .assign(xpf=lambda frame: np.where(frame['compromised'].isna(), 'yes', 'compromised'))
    .rename(columns={'name': 'name_xpf'})
    .drop(columns='compromised')
)

# Separate the codes that are three letters long from all the others.
xpf_list2 = xpf_list[xpf_list['code'].str.len() != 3].set_index('code')
xpf_list3 = xpf_list[xpf_list['code'].str.len() == 3].set_index('code')

In [None]:
# XPF languages whose code is not three letters long:
# look them up through the ISO 639-1 column of the ISO table.
# ISO rows without an ISO 639-1 code are removed before they become index keys.
iso639_2 = (
    iso639
    .rename(columns={'code_iso639-1': 'code'})
    .dropna(subset=['code'])
    .set_index('code')
)

# Right join keeps every XPF row, matched or not.
xpf_res2 = (
    iso639_2
    .join(xpf_list2, how='right', lsuffix='_iso639-3', rsuffix='_xpf')
    .reset_index()
    # The matched ISO 639-3 code becomes the key; the original XPF code is kept.
    .rename(columns={'code_iso639-3': 'code_update', 'code': 'code_xpf'})
    .set_index('code_update')
)
print('The updated language codes:')
print(xpf_res2)

In [None]:
# XPF languages whose code is three letters long:
# look them up through the ISO 639-3 column of the ISO table.
iso639_3 = iso639.rename(columns={'code_iso639-3': 'code'}).set_index('code')

# Right join keeps every XPF row, matched or not.
xpf_res3 = (
    iso639_3
    .join(xpf_list3, how='right', lsuffix='_iso639-3', rsuffix='_xpf')
    .reset_index()
    # The XPF code is used unchanged as the updated code and is also kept
    # under `code_xpf`.
    .assign(code_update=lambda frame: frame['code'])
    .rename(columns={'code': 'code_xpf'})
    .drop(columns='code_iso639-1')
    .set_index('code_update')
)
print('The updated language codes:')
print(xpf_res3)

In [None]:
# Stack both XPF lookup results and discard the ISO reference name column.
xpf_res = pd.concat([xpf_res2, xpf_res3]).drop(columns='name')
print(xpf_res)
# The intermediate tables are not needed any more.
del xpf_res2, xpf_res3, xpf_list2, xpf_list3

In [None]:
# Load info from Epitran
epi_list = pd.read_csv('epi_langs-list.csv')
# The language code is the part of `Code` before its first hyphen. Taking the
# first split element directly also works when no code contains a hyphen.
epi_list['code'] = epi_list['Code'].str.split('-', n=1).str[0]
epi_list = (
    epi_list
    .rename(columns={'Code': 'code_epi', 'Language (Script)': 'name_epi'})
    .set_index('code')
)
# Strip the parenthesised part from the language name. The pattern is a raw
# string so the backslashes are not read as (invalid) string escapes.
epi_list['name_epi'] = epi_list['name_epi'].str.replace(r' \(.+\)', '', regex=True)
epi_list['epitran'] = 'yes'
epi_list = epi_list.drop_duplicates()

# Same index name as the other tables, so they can be joined on it.
epi_list.index.name = 'code_update'
print(epi_list)

In [None]:
# Re-key the Common Voice table by the updated code. Its current index is the
# language name, which becomes a regular `name` column in the process.
# xpf_res and epi_list are already indexed by `code_update`; neither they nor
# cv_update have a `name` column to rename at this point.
cv_update = cv_update.reset_index().set_index('code_update')

print("Epitran:\n", epi_list)
print("\nCommon Voice:\n", cv_update)
print("\nXPF:\n", xpf_res)

In [None]:
# Merge them
# Joins match missing keys to each other, so lookup rows without an updated
# code are left out; otherwise they would attach to every Common Voice row
# whose own updated code is missing.
vxc_info = (
    cv_update
    .join(xpf_res[xpf_res.index.notna()], how='left')
    .rename(columns={'code': 'code_cv'})
    .join(epi_list[epi_list.index.notna()], how='left')
    .reset_index(drop=True)
)
# Select and order the output columns; everything else is discarded.
vxc_info = vxc_info[[
    'code_cv', 'name', 'name_iso639-3', 'name_xpf', 'name_epi',
    'clips', 'validClips', 'totalHrs', 'validHrs',
    'xpf', 'code_xpf', 'epitran', 'code_epi',
]].rename(columns={'name': 'name_cv'})
# Empty placeholder columns.
vxc_info['spkr_file'] = ''
vxc_info['lexicon'] = ''
vxc_info['acoustic_model'] = ''
print(vxc_info)

vxc_info = vxc_info.drop_duplicates()
# Written relative to the working directory, like the input files are read,
# so the notebook does not depend on one user's home directory.
vxc_info.to_csv('VoxCommunis_Info.csv')

del cv_update, epi_list, iso639, iso639_2, iso639_3, LANGUAGES, STATS, xpf_list, xpf_res, vxc_info
