In [38]:
import asyncio
from googletrans import Translator
import pandas as pd

# Multi-character word dataset improvement : ensuring data quality

It is clear from tests that the final CSV file used to generate the JSON for multi-character words has a few errors. Many words lack pronunciations, and some lack valid definitions (they contain only pinyin, with no real meaning given). We use the Google Translate API (through `googletrans`) to fix some of these. We look for definitions that contain no usable plain-English text once the Chinese characters and accent-marked pinyin syllables are removed, and replace them with a translation from the API. We also check every pronunciation against its word and replace the invalid ones with the pinyin returned by the same API.

#### Reading in the data and testing translator apis

In [39]:
# CSV that this notebook reads, repairs and later overwrites in place.
path = "test_multi_list.csv" #"final_test_characters.csv"
test_data = pd.read_csv(path)
# Show the row count, then preview the first 20 rows.
print(len(test_data))
test_data.head(n=20)

4758


Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,cat,word/character,pronunciation,definition,code,alt,test_definition
0,0,0,0,2,1,我的,Wǒ de,mine,s,,wǒ_ my
1,1,1,1,3,1,我的,Wǒ de,mine,t,,wǒ_ my
2,2,2,2,4,1,高的,Gāo de,"gāode high, tall",s,,"gāo_ high, tall"
3,3,3,3,5,1,高的,Gāo de,"高的 gāode high, tall",t,,"gāo_ high, tall"
4,4,4,4,6,1,是的,Shì de,"shìde that's it, that's right",s,,"shì_ that's it, that's right"
5,5,5,5,7,1,是的,Shì de,"是的 shìde that's it, that's right",t,,"shì_ that's it, that's right"
6,6,6,6,8,1,是的,Shì de,shìde one who...,s,,shì_ one who...
7,7,7,7,9,1,是的,Shì de,shìde one who...,t,,shì_ one who...
8,8,8,8,10,1,他是说汉语的,Tā shì shuō hànyǔ de,Tā shì shuō Hànyǔde. He is one who speaks Chin...,s,,Tā shì shuō Hànyǔ_. He is one who speaks Chinese.
9,9,9,9,11,1,他是說漢語的,Tā shì shuō hànyǔ de,他是說漢語的. Tā shì shuō Hànyǔde. He is one who spe...,t,,. Tā shì shuō Hànyǔ_. He is one who speaks Chi...


Testing translator API

In [3]:
async def chinese_to_eng(input):
    """Translate Chinese text to English with googletrans.

    Parameters
    ----------
    input : str
        Text to translate; sent with source language "zh" and
        destination language "en".

    Returns
    -------
    The object returned by ``Translator.translate``. The cells below read
    its ``.text`` and ``.extra_data`` attributes.
    """
    # A new Translator is created for every call, as in the original cell.
    return await Translator().translate(input, src="zh", dest="en")


In [4]:
# Smoke test: translate one sample word and show the English text.
result = await chinese_to_eng("我的")
print("definition:", result.text)

definition: mine


This is the full data that can be gained. Note extracting pinyin is not as direct

In [5]:
# extra_data is the raw response dict; the pinyin sits inside its "translation" entry.
print("pronunciation (pinyin):", result.extra_data)

pronunciation (pinyin): {'translation': [['mine', '我的', None, None, 10], [None, None, None, 'Wǒ de']], 'all-translations': [['pronoun', ['mine', 'my'], [['mine', ['我的']], ['my', ['吾', '敝', '鄙', '窃', '我的']]], '我的', 8]], 'original-language': 'zh-CN', 'possible-translations': [['我的', None, [['mine', None, True, False, [10]], ['my', None, True, False, [10]]], [[0, 2]], '我的', 0, 0]], 'confidence': None, 'possible-mistakes': None, 'language': [['zh-CN'], None, [0], ['zh-CN']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


More examples

In [6]:
# Same inspection for a full sentence written in traditional characters.
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

{'translation': [['He speaks Chinese', '他是說漢語的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Tā shì shuō hànyǔ de']], 'all-translations': None, 'original-language': 'zh-CN', 'possible-translations': [['他是說漢語的', None, [['He speaks Chinese', None, True, False, [3], None, [[3]]], ['He spoke Chinese', None, True, False, [8]]], [[0, 6]], '他是說漢語的', 0, 0]], 'confidence': 1, 'possible-mistakes': None, 'language': [['en'], None, [1], ['en']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [7]:
# NOTE(review): this cell repeats the previous cell's call verbatim; it looks like a leftover duplicate.
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

{'translation': [['He speaks Chinese', '他是說漢語的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Tā shì shuō hànyǔ de']], 'all-translations': None, 'original-language': 'zh-CN', 'possible-translations': [['他是說漢語的', None, [['He speaks Chinese', None, True, False, [3], None, [[3]]], ['He spoke Chinese', None, True, False, [8]]], [[0, 6]], '他是說漢語的', 0, 0]], 'confidence': 1, 'possible-mistakes': None, 'language': [['en'], None, [1], ['en']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [8]:


# A word for which the response also carries an 'all-translations' list.
result = await chinese_to_eng("的确")
print(result.extra_data)

{'translation': [['indeed', '的确', None, None, 10], [None, None, None, 'Díquè']], 'all-translations': [['adverb', ['indeed', 'really', 'certainly', 'quite', 'duly', 'iwis', 'in faith', 'certes', 'in troth'], [['indeed', ['确实', '的确', '确', '真', '果真', '诚'], None, 0.14406367], ['really', ['真', '确实', '实在', '的确', '果然', '其实'], None, 0.06595715], ['certainly', ['当然', '一定', '无疑', '的确', '定然', '自'], None, 0.01382537], ['quite', ['相当', '很', '比较', '挺', '颇', '的确'], None, 0.0010828866], ['duly', ['的确', '适时地'], None, 0.000117758456], ['iwis', ['的确', '的确地'], None, 1.5446549e-05], ['in faith', ['的确'], None, 7.646596e-06], ['certes', ['的确'], None, 3.84495e-06], ['in troth', ['的确'], None, 3.84495e-06]], '的确', 4], ['adjective', ['certain'], [['certain', ['某些', '一定', '某', '确定', '肯定', '的确'], None, 0.0006991642]], '的确', 3], ['verb', ['be certain'], [['be certain', ['肯定', '的确'], None, 1.922354e-05]], '的确', 2]], 'original-language': 'zh-CN', 'possible-translations': [['的确', None, [['indeed', None, True, False, [

In [9]:

# Another sample; this result is reused by the next cells to locate the pinyin.
result = await chinese_to_eng("高的")
print(result.extra_data)

{'translation': [['high', '高的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Gāo de']], 'all-translations': [['adjective', ['overhead'], [['overhead', ['上', '高的', '上面的']]], '高的', 3]], 'original-language': 'zh-CN', 'possible-translations': [['高的', None, [['high', None, True, False, [3], None, [[3]]], ['tall', None, True, False, [8]]], [[0, 2]], '高的', 0, 0]], 'confidence': None, 'possible-mistakes': None, 'language': [['zh-CN'], None, [0], ['zh-CN']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [10]:
# Keep the raw response dict of the last lookup for inspection.
curr = result.extra_data

In [11]:
# The pinyin is the last element of the last list under "translation".
print(curr["translation"])

[['high', '高的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Gāo de']]


#### Writing function to extract pinyin

From this, we can extract pinyin using this pattern. Check for none, return error for potential issues

In [12]:
async def getPinyin(input):
    """First draft: look up the pinyin of ``input`` through ``chinese_to_eng``.

    The pinyin is read from the last element of the last entry of
    ``extra_data["translation"]`` in the translator response.

    Parameters
    ----------
    input : str
        Chinese text to look up.

    Returns
    -------
    str
        The pinyin, ``""`` when the response carries no pinyin (``None``),
        or the literal string ``"error"`` when the lookup or the extraction
        raises an exception.
    """
    try:
        result = await chinese_to_eng(input)
        pinyin = result.extra_data["translation"][-1][-1]
    except Exception:
        # Only ordinary failures are mapped to "error". A bare ``except:`` would
        # also swallow asyncio.CancelledError and KeyboardInterrupt, making the
        # cell impossible to interrupt or cancel cleanly.
        return "error"
    return "" if pinyin is None else pinyin


testing

In [13]:
# Check the extraction on a short word.
pinyin = await getPinyin("高的")
print(pinyin)

Gāo de


In [14]:
# Check the extraction on a long paragraph mixing Chinese text, Latin text, digits and punctuation.
pinyin = await getPinyin("台湾作词人，音乐人[1]。現為大國大熊星娛樂总经理。曾擔任台湾科藝百代（EMI）及维京音乐（Virgin Music Chinese）、新力哥倫比亞音樂、點將唱片的总经理。二十年来，打造出林慧萍、張清芳、伍思凱、優客李林、柯以敏、萧亚轩、刘若英、李玟、赵薇、江美琪、余憲忠等歌手[2]。个人发表原创歌词600多首，包括许多脍炙人口的作品。")
print(pinyin)

Táiwān zuòcí rén, yīnyuè rén [1]. Xiàn wéi dàguó dàxióng xīng yúlè zǒng jīnglǐ. Céng dānrèn táiwān kē yì bǎidài (EMI) jí wéi jīng yīnyuè (Virgin Music Chinese), xīnlì gēlúnbǐyǎ yīnyuè, diǎnjiàng chàngpiàn de zǒng jīnglǐ. Èrshí niánlái, dǎzào chū línhuìpíng, zhāngqīngfāng, wǔsīkǎi, yōu kè lǐ lín, kēyǐmǐn, xiāoyàxuān, liúruòyīng, lǐ wén, zhàowēi, jiāngměi qí, yú xiànzhōng děng gēshǒu [2]. Gèrén fà biǎo yuánchuàng gēcí 600 duō shǒu, bāokuò xǔduō kuàizhìrénkǒu de zuòpǐn.


##### pinyin validation

We can help validate Chinese-character-only inputs by checking whether the returned pinyin is plausible. Every Chinese character should be represented by one pinyin syllable, so as a simple check we test whether the number of Chinese characters equals the number of vowel clusters in the pinyin.

In [15]:
import unicodedata

def count_vowel_clusters(s):
    """Count the runs of consecutive vowels in ``s``, ignoring accent marks.

    Used as a rough syllable count for pinyin: each run of adjacent vowels
    (``a e i o u`` in either case, with or without diacritics) is one cluster.

    Parameters
    ----------
    s : str
        Text to scan, typically a pinyin string.

    Returns
    -------
    int
        Number of vowel clusters. Non-string input (for example the float NaN
        that pandas uses for a missing cell) yields 0.
    """
    if not isinstance(s, str):
        return 0

    vowels = "aeiouAEIOU"
    cluster_count = 0
    in_cluster = False

    # NFD splits an accented letter into its base letter followed by combining marks.
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.combining(char):
            # An accent mark belongs to the preceding letter: it neither starts nor ends a cluster.
            continue
        if char in vowels:
            if not in_cluster:
                cluster_count += 1  # Start a new cluster
            in_cluster = True
        else:
            # Any other character ends the cluster. This includes Chinese characters and
            # full-width punctuation, which previously stripped down to "" and were
            # counted as vowels because "" is a substring of every string.
            in_cluster = False

    return cluster_count

In [16]:
import re
import string 

def contains_chinese(text):
    """Return True when ``text`` holds at least one character in U+4E00-U+9FFF.

    That range is the main CJK Unified Ideographs block; characters outside
    it are not recognised by this check.
    """
    return re.search(r'[\u4e00-\u9fff]', text) is not None
#loop through a word (unicode parts to see if all are chinese or punctuation)

def allChinese(text):
    """Return True when every character of ``text`` is Chinese or ASCII punctuation.

    "Chinese" is decided by ``contains_chinese``; punctuation means the
    characters of ``string.punctuation``. An empty string yields True.
    """
    allowed_marks = set(string.punctuation)
    return all(contains_chinese(ch) or ch in allowed_marks for ch in text)
# count, using similar logic, the number of chinese characters in text 
def numChinese(text):
    """Return how many characters of ``text`` are Chinese according to ``contains_chinese``."""
    return sum(1 for ch in text if contains_chinese(ch))

Testing the functions (by default Python does not treat an accented vowel as equal to its plain counterpart, so we need to remove the accent marks before comparing).

In [17]:
# Each sample should give one cluster per syllable: 3, 3 and 2.
print(count_vowel_clusters("xiāoyàxuān"))

print(count_vowel_clusters("liúruòyīng"))

print(count_vowel_clusters("gēshǒu"))

3
3
2


In [18]:
# Latin letters make the text non-Chinese; ASCII punctuation is tolerated.
print( allChinese("da國大熊星") )

print( allChinese("國大熊星") )

print( allChinese("國大-=/熊星") )

False
True
True


In [19]:
# Only the Chinese characters are counted; Latin letters and "-" are ignored.
print( numChinese("da國大熊星") )
print( numChinese("國大熊星-大熊星-") )

4
7


##### Final version of the function

We complete the function by adding the case if the text is all chinese, the pinyin condition must match

In [20]:
async def getPinyin(input):
    """Look up the pinyin of ``input`` and sanity-check it.

    This definition replaces the earlier draft of the same name. The pinyin
    is read from the last element of the last entry of
    ``extra_data["translation"]`` in the ``chinese_to_eng`` response. When
    ``input`` consists only of Chinese characters and punctuation
    (``allChinese``), the pinyin is accepted only if its number of vowel
    clusters equals the number of Chinese characters.

    Parameters
    ----------
    input : str
        Chinese text to look up.

    Returns
    -------
    str
        The pinyin on success. ``""`` when no pinyin was returned, when the
        validation fails, or when any exception is raised; a message is
        printed in each of those cases.
    """
    try:
        result = await chinese_to_eng(input)
        pinyin = result.extra_data["translation"][-1][-1]
        if pinyin is None:
            print("no pinyin was able to be returned for", input)
            return ""
        if allChinese(input) and numChinese(input) != count_vowel_clusters(pinyin):
            print("error: output pinyin: ", pinyin, "not valid for", input)
            return ""
        return pinyin
    except Exception as e:
        print("error: ",e , "for", input)
        # Must be a plain string. A trailing comma here would return the tuple ("",),
        # which is != "" and would be stored by callers as a repaired pronunciation.
        return ""

Verifying

In [21]:
# All-Chinese input, so the character-count validation applies.
result = await getPinyin("國大熊星")


In [22]:
# Show the validated pinyin.
print(result)

Guó dàxióng xīng


In [23]:
# Same check with ASCII punctuation inside the word.
result = await getPinyin("國大熊星-大熊星")

print(result)

Guó dàxióng xīng-dàxióng xīng


#### Creating function to detect pinyin and chinese characters, to see what the text without it looks like, so we can determine if the definition is valid

Create a function to check if a word is a pure english word (no diacritics or nonlatin)

In [24]:
def is_plain_latin(char):
  """Return True for an unaccented ASCII letter or an ASCII punctuation mark.

  Digits, whitespace, accented letters and non-Latin characters give False.
  """
  if char in string.ascii_letters:
    return True
  return char in frozenset(string.punctuation)
def word_plain_latin(word):
    """Return True when every character of ``word`` passes ``is_plain_latin``.

    An empty ``word`` gives True because there is nothing to reject.
    """
    return all(is_plain_latin(letter) for letter in word)
#isolate words of a sentence that only have plain latin characters into a string
def plainLatinCharsWord(input):
    """Keep only the space-separated words of ``input`` that are plain Latin.

    The text is split on single spaces, each piece is tested with
    ``word_plain_latin``, and the surviving pieces are joined back with
    single spaces. Leading and trailing whitespace is stripped from the result.
    """
    kept = [piece for piece in input.split(" ") if word_plain_latin(piece)]
    return " ".join(kept).strip()

Testing.

As you can see, it reduces the complicated definitions to the plain English words only

In [25]:
# The Chinese words and the tone-marked pinyin are dropped; only the English gloss remains.
plainLatinCharsWord("是的 shìde that's it, that's right 是的")

"that's it, that's right"

### Data modification

After running all of this, ideally no errors should remain when running again

Check whether all the words in the list are actually purely Chinese

In [26]:
# Plain Python list of the words; later cells index it in step with the other columns.
chars_list = list(test_data["word/character"])
print(len(chars_list))

4758


In [27]:
# Report every entry that is not made up solely of Chinese characters and ASCII punctuation.
for i, word in enumerate(chars_list):
    if not allChinese(word):
        print(word, "at index", i, "is not all chinese")

As we can see there are no such errors

Modify the definitions list to make sure that all are valid definitions. If not, we use google translate to help

In [28]:
# Working copy of the definition column; it is edited in place by the next cell.
definitions = list(test_data["definition"])

In [29]:
i = 0 #track index
for word in definitions:
    word = str(word)
    #isolate pure latin characters 
    latin = plainLatinCharsWord(word)
    if len(latin)<4: #if the length is less than 4, the definition is likely not valid
        print("definition:", word, "at index", i, "is not a valid definition, changing using google translator")
        currchar = chars_list[i] #get current character using previous list
        newdef = await chinese_to_eng(currchar)
        newdef = newdef.text
        definitions[i] = newdef
        print("changed definition of",currchar, "to ", newdef)
        
    i += 1

definition: Now at index 50 is not a valid definition, changing using google translator
changed definition of 现在 to  Now
definition: now at index 51 is not a valid definition, changing using google translator
changed definition of 現在 to  now
definition: USA at index 82 is not a valid definition, changing using google translator
changed definition of 美国 to  USA
definition: us at index 94 is not a valid definition, changing using google translator
changed definition of 我们 to  us
definition: we at index 95 is not a valid definition, changing using google translator
changed definition of 我們 to  we
definition: son at index 104 is not a valid definition, changing using google translator
changed definition of 儿子 to  son
definition: son at index 105 is not a valid definition, changing using google translator
changed definition of 兒子 to  son
definition: all at index 312 is not a valid definition, changing using google translator
changed definition of 所有 to  all
definition: all at index 313 is n

Save updated data

In [30]:
# Write the repaired definitions back into the DataFrame.
test_data["definition"] = definitions

Finally, we fix invalid pronunciations

In [31]:
# Working copy of the pronunciation column; it is edited in place by the next cell.
pronunciations = list(test_data["pronunciation"])

In [32]:
i = 0 #track index
for pron in pronunciations:
    currchar = chars_list[i] #get current character using previous list
    #check if it is valid by counting vowel clusters, and ensuring there are no chinese characters 
    if numChinese(currchar) != count_vowel_clusters(pron):
        print("fix index ", i)
        #print(f"{i} invalid pronunciation", pron, "found for word", currchar)
        #attempt repair 
        result = await getPinyin(currchar)
        if (result != ""):
            #assign if correct
            pronunciations[i] = result 
            #print("new pronunciation: ", result)
        else:
            print("failed repair at index", i)

    i+=1

After these few errors, we manually add pronunciations for:
国民党, 乱麻麻, 屎壳郎, 伸懶腰 for test_multi_list.csv

save changes 


In [33]:
# Write the repaired pronunciations back into the DataFrame.
test_data["pronunciation"] = pronunciations

#### Finally, save this modified data

In [34]:
# index=False: the load cell uses pd.read_csv(path) without index_col, so a written
# index would come back as one more "Unnamed: 0" column on every run.
test_data.to_csv(path, index=False)


### Last part : changing column name to full pronunciation, and creating a column with the pronunciation without tones (full_pronunciation_wo), similar to that in final_test_characters.csv

In [40]:
# Rename the column and preview one row to confirm the new name.
test_data = test_data.rename(columns={'pronunciation': 'full_pronunciation'})
test_data.head(n=1)

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,cat,word/character,full_pronunciation,definition,code,alt,test_definition
0,0,0,0,2,1,我的,Wǒ de,mine,s,,wǒ_ my


Stripping diacritics

In [44]:
# Build the tone-less version of every pronunciation.
full_pronunciation_wo = list(test_data["full_pronunciation"])
for i, pro in enumerate(full_pronunciation_wo):
    # NFD splits each accented letter into base letter + combining mark; the ASCII
    # round-trip then discards everything that is not plain ASCII.
    modified_pro = "".join(
        str(unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode("utf-8"))
        for char in pro
    )
    print("modified, ", modified_pro, "current", pro)
    full_pronunciation_wo[i] = modified_pro

modified,  Wo de current Wǒ de
modified,  Wo de current Wǒ de
modified,  Gao de current Gāo de
modified,  Gao de current Gāo de
modified,  Shi de current Shì de
modified,  Shi de current Shì de
modified,  Shi de current Shì de
modified,  Shi de current Shì de
modified,  Ta shi shuo hanyu de current Tā shì shuō hànyǔ de
modified,  Ta shi shuo hanyu de current Tā shì shuō hànyǔ de
modified,  Mudi current Mùdì
modified,  Mudi current Mùdì
modified,  Dique current Díquè
modified,  Dique current Díquè
modified,  Di yi current Dì yī
modified,  Di yi current Dì yī
modified,  Kan yi kan current Kàn yī kàn
modified,  Kan yi kan current Kàn yī kàn
modified,  Yige ren current Yīgè rén
modified,  Yigeren current Yīgèrén
modified,  Yiding current Yīdìng
modified,  Yiding current Yīdìng
modified,  Yiyang current Yīyàng
modified,  Yiyang current Yīyàng
modified,  Yi yue current Yī yuè
modified,  Yi yue current Yī yuè
modified,  Yidian er current Yīdiǎn er
modified,  Yidian er current Yīdiǎn er
modifi

In [45]:
# Store the tone-less pronunciations as a new column and preview the result.
test_data["full_pronunciation_wo"] = full_pronunciation_wo
test_data.head(n=5)

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,cat,word/character,full_pronunciation,definition,code,alt,test_definition,full_pronunciation_wo
0,0,0,0,2,1,我的,Wǒ de,mine,s,,wǒ_ my,Wo de
1,1,1,1,3,1,我的,Wǒ de,mine,t,,wǒ_ my,Wo de
2,2,2,2,4,1,高的,Gāo de,"gāode high, tall",s,,"gāo_ high, tall",Gao de
3,3,3,3,5,1,高的,Gāo de,"高的 gāode high, tall",t,,"gāo_ high, tall",Gao de
4,4,4,4,6,1,是的,Shì de,"shìde that's it, that's right",s,,"shì_ that's it, that's right",Shi de


### Final part: saving data to export JSON. This is the same from before

In [None]:
# Drop the stray "Unnamed: ..." columns left over from earlier saves that wrote the index.
test_data = test_data.loc[:, ~test_data.columns.str.startswith('Unnamed:')] #no unamed columns
# Save without the index so that a plain pd.read_csv(path), as used by the load cell at the
# top of the notebook, does not bring it back as a new "Unnamed: 0" column.
test_data.to_csv(path, index=False)
# Reload from disk so the export below uses exactly what was saved.
test_data = pd.read_csv(path)
test_data.head()

In [None]:
#exporting traditional characters data
our_data = test_data[test_data["code"] == "t"]

json_index = our_data.to_json(orient='index')

# Print the JSON string
print(json_index)

In [None]:
#exporting simplified characters data

# Export the simplified-character rows (code == "s") as JSON keyed by row index.
our_data = test_data.loc[test_data["code"] == "s"]
json_index = our_data.to_json(orient='index')

# Print the JSON string
print(json_index)