In [52]:
import asyncio
from googletrans import Translator
import pandas as pd

# Multi-character word dataset improvement

Tests show that the final CSV file used to generate the JSON for multi-character words contains errors. Many words lack pronunciations, and some lack valid definitions (they contain only pinyin, with no meaning given). We use the Google Translate API (via `googletrans`) to repair some of them. First, we replace all pronunciations with the ones returned by Google Translate. Then we look for syllables with accent marks in the definitions and, for definitions that contain only pinyin, fetch a real definition from the same API.

#### Reading in the data and testing the translator API

In [53]:
# CSV file to repair; the trailing comment names an alternative input file.
path = "main_final_list.csv" #"final_test_characters.csv"
test_data = pd.read_csv(path)
# Show the row count, then preview the first rows of the frame.
print(len(test_data))
test_data.head(n=20)

10505


Unnamed: 0.1,Unnamed: 0,cat,word/character,pronunciation,definition,code,alt
0,0,1,的,de,<grammatical particle marking genitive as well...,s,
1,1,1,的,de,<grammatical particle marking genitive as well...,t,
2,2,1,我的,de,wǒde my,s,
3,3,1,我的,de,我的 wǒde my,t,
4,4,1,高的,de,"gāode high, tall",s,
5,5,1,高的,de,"高的 gāode high, tall",t,
6,6,1,是的,de,"shìde that's it, that's right",s,
7,7,1,是的,de,"是的 shìde that's it, that's right",t,
8,8,1,是的,de,shìde one who...,s,
9,9,1,是的,de,shìde one who...,t,


Testing translator API

In [54]:
async def chinese_to_eng(input):
    """Translate Chinese text to English through googletrans.

    Args:
        input: The text to translate. The name shadows the builtin
            ``input``; it is kept so the signature stays unchanged.

    Returns:
        Whatever ``Translator.translate`` resolves to. Later cells read its
        ``text`` and ``extra_data`` attributes.
    """
    # A fresh Translator is created for every call.
    return await Translator().translate(input, src="zh", dest="en")


In [55]:
result = await chinese_to_eng("我的")
print("definition:", result.text)

definition: mine


This is the full data returned in `extra_data`. Note that the pinyin is not exposed as directly as the translation: it is nested inside the `translation` entry.

In [56]:
print("pronunciation (pinyin):", result.extra_data)

pronunciation (pinyin): {'translation': [['mine', '我的', None, None, 10], [None, None, None, 'Wǒ de']], 'all-translations': [['pronoun', ['mine', 'my'], [['mine', ['我的']], ['my', ['吾', '敝', '鄙', '窃', '我的']]], '我的', 8]], 'original-language': 'zh-CN', 'possible-translations': [['我的', None, [['mine', None, True, False, [10]], ['my', None, True, False, [10]]], [[0, 2]], '我的', 0, 0]], 'confidence': None, 'possible-mistakes': None, 'language': [['zh-CN'], None, [0], ['zh-CN']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


More examples

In [57]:
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

{'translation': [['He speaks Chinese', '他是說漢語的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Tā shì shuō hànyǔ de']], 'all-translations': None, 'original-language': 'zh-CN', 'possible-translations': [['他是說漢語的', None, [['He speaks Chinese', None, True, False, [3], None, [[3]]], ['He spoke Chinese', None, True, False, [8]]], [[0, 6]], '他是說漢語的', 0, 0]], 'confidence': 1, 'possible-mistakes': None, 'language': [['en'], None, [1], ['en']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [58]:
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

{'translation': [['He speaks Chinese', '他是說漢語的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Tā shì shuō hànyǔ de']], 'all-translations': None, 'original-language': 'zh-CN', 'possible-translations': [['他是說漢語的', None, [['He speaks Chinese', None, True, False, [3], None, [[3]]], ['He spoke Chinese', None, True, False, [8]]], [[0, 6]], '他是說漢語的', 0, 0]], 'confidence': 1, 'possible-mistakes': None, 'language': [['en'], None, [1], ['en']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [59]:


result = await chinese_to_eng("的确")
print(result.extra_data)

{'translation': [['indeed', '的确', None, None, 10], [None, None, None, 'Díquè']], 'all-translations': [['adverb', ['indeed', 'really', 'certainly', 'quite', 'duly', 'iwis', 'in faith', 'certes', 'in troth'], [['indeed', ['确实', '的确', '确', '真', '果真', '诚'], None, 0.14406367], ['really', ['真', '确实', '实在', '的确', '果然', '其实'], None, 0.06595715], ['certainly', ['当然', '一定', '无疑', '的确', '定然', '自'], None, 0.01382537], ['quite', ['相当', '很', '比较', '挺', '颇', '的确'], None, 0.0010828866], ['duly', ['的确', '适时地'], None, 0.000117758456], ['iwis', ['的确', '的确地'], None, 1.5446549e-05], ['in faith', ['的确'], None, 7.646596e-06], ['certes', ['的确'], None, 3.84495e-06], ['in troth', ['的确'], None, 3.84495e-06]], '的确', 4], ['adjective', ['certain'], [['certain', ['某些', '一定', '某', '确定', '肯定', '的确'], None, 0.0006991642]], '的确', 3], ['verb', ['be certain'], [['be certain', ['肯定', '的确'], None, 1.922354e-05]], '的确', 2]], 'original-language': 'zh-CN', 'possible-translations': [['的确', None, [['indeed', None, True, False, [

In [60]:

result = await chinese_to_eng("高的")
print(result.extra_data)

{'translation': [['high', '高的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Gāo de']], 'all-translations': [['adjective', ['overhead'], [['overhead', ['上', '高的', '上面的']]], '高的', 3]], 'original-language': 'zh-CN', 'possible-translations': [['高的', None, [['high', None, True, False, [3], None, [[3]]], ['tall', None, True, False, [8]]], [[0, 2]], '高的', 0, 0]], 'confidence': None, 'possible-mistakes': None, 'language': [['zh-CN'], None, [0], ['zh-CN']], 'synonyms': None, 'definitions': None, 'examples': None, 'see-also': None}


In [61]:
curr = result.extra_data

In [62]:
print(curr["translation"])

[['high', '高的', None, None, 3, None, None, [[]], [[['af64405095a399ceb1e05c7abb7cda66', 'zh_en_2023q1.md']]]], [None, None, None, 'Gāo de']]


#### Writing function to extract pinyin

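From this output we can extract the pinyin: it is the last element of the last entry in `translation`. If that value is `None` we return an empty string, and if anything goes wrong we return the string `"error"`.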

In [63]:
async def getPinyin(input):
    """Return the pinyin that googletrans reports for ``input``.

    The pinyin is read from the last element of the last entry of
    ``extra_data["translation"]``.

    Args:
        input: The text to romanise. The name shadows the builtin ``input``;
            it is kept so the signature stays unchanged.

    Returns:
        The pinyin string, ``""`` when the pinyin slot is ``None``, or the
        sentinel string ``"error"`` when the lookup or the parsing raises.
    """
    try:
        response = await chinese_to_eng(input)
        translation = response.extra_data["translation"]
        pinyin = translation[-1][-1]
    except Exception:
        # Deliberate best effort: any ordinary failure becomes the sentinel.
        # `Exception` (not a bare `except`) lets task cancellation and
        # KeyboardInterrupt propagate instead of being reported as "error".
        return "error"
    return "" if pinyin is None else pinyin


testing

In [64]:
pinyin = await getPinyin("高的")
print(pinyin)

Gāo de


In [65]:
pinyin = await getPinyin("台湾作词人，音乐人[1]。現為大國大熊星娛樂总经理。曾擔任台湾科藝百代（EMI）及维京音乐（Virgin Music Chinese）、新力哥倫比亞音樂、點將唱片的总经理。二十年来，打造出林慧萍、張清芳、伍思凱、優客李林、柯以敏、萧亚轩、刘若英、李玟、赵薇、江美琪、余憲忠等歌手[2]。个人发表原创歌词600多首，包括许多脍炙人口的作品。")
print(pinyin)

Táiwān zuòcí rén, yīnyuè rén [1]. Xiàn wéi dàguó dàxióng xīng yúlè zǒng jīnglǐ. Céng dānrèn táiwān kē yì bǎidài (EMI) jí wéi jīng yīnyuè (Virgin Music Chinese), xīnlì gēlúnbǐyǎ yīnyuè, diǎnjiàng chàngpiàn de zǒng jīnglǐ. Èrshí niánlái, dǎzào chū línhuìpíng, zhāngqīngfāng, wǔsīkǎi, yōu kè lǐ lín, kēyǐmǐn, xiāoyàxuān, liúruòyīng, lǐ wén, zhàowēi, jiāngměi qí, yú xiànzhōng děng gēshǒu [2]. Gèrén fà biǎo yuánchuàng gēcí 600 duō shǒu, bāokuò xǔduō kuàizhìrénkǒu de zuòpǐn.


##### pinyin validation

For inputs made up only of Chinese characters, we can sanity-check the returned pinyin: every character should correspond to exactly one pinyin syllable. As a simple check, we test whether the number of Chinese characters equals the number of vowel clusters in the pinyin.

In [66]:
import unicodedata

def count_vowel_clusters(s):
    """Count the runs of consecutive vowels in ``s``, ignoring tone marks.

    The string is decomposed with NFD so that an accented vowel such as
    ``ā`` becomes the base letter ``a`` followed by a combining mark. The
    combining marks are skipped, so they neither count as vowels nor break
    a cluster.

    Args:
        s: The text to scan (for example a pinyin string).

    Returns:
        The number of maximal runs of the letters ``aeiouAEIOU``. Any other
        character, including Chinese characters and punctuation, ends the
        current run.
    """
    vowels = "aeiouAEIOU"
    cluster_count = 0
    in_cluster = False

    for char in unicodedata.normalize("NFD", s):
        if unicodedata.combining(char):
            # A tone mark belongs to the preceding letter.
            continue
        # `char` is always exactly one character here, so the membership
        # test cannot match the empty string (which is "in" every string).
        if char in vowels:
            if not in_cluster:
                cluster_count += 1  # a new cluster starts here
                in_cluster = True
        else:
            in_cluster = False  # any non-vowel closes the current cluster

    return cluster_count

In [67]:
import re

def contains_chinese(text):
    """Report whether ``text`` holds at least one character in U+4E00-U+9FFF.

    Only the main CJK Unified Ideographs range is matched; characters
    outside that range are not recognised.
    """
    return re.search(r'[\u4e00-\u9fff]', text) is not None
#loop through a word (unicode parts to see if all are chinese)

def allChinese(text):
    """Return True when every character of ``text`` passes ``contains_chinese``.

    An empty ``text`` gives True, because no character fails the check.
    """
    return all(contains_chinese(symbol) for symbol in text)
# count, using similar logic, the number of chinese characters in text 
def numChinese(text):
    """Return how many characters of ``text`` pass ``contains_chinese``."""
    return sum(1 for symbol in text if contains_chinese(symbol))

Testing the functions. (Python treats an accented vowel such as `ā` as a different character from plain `a`, so the accent marks must be removed before checking for vowels.)

In [68]:
# Print the vowel-cluster count of each sample pinyin word, one per line.
for sample in ("xiāoyàxuān", "liúruòyīng", "gēshǒu"):
    print(count_vowel_clusters(sample))

3
3
2


In [69]:
# A mixed Latin/Chinese string followed by a Chinese-only string.
for sample in ("da國大熊星", "國大熊星"):
    print(allChinese(sample))

False
True


In [70]:
# Latin letters and hyphens must not be counted.
for sample in ("da國大熊星", "國大熊星-大熊星-"):
    print(numChinese(sample))

4
7


##### Final version of the function

We complete the function by adding a validation step: if the text is made up entirely of Chinese characters, the number of characters must match the number of vowel clusters in the pinyin.

In [78]:
async def getPinyin(input):
    """Return validated pinyin for ``input``, or ``""`` when none is usable.

    The pinyin is read from the last element of the last entry of
    ``extra_data["translation"]``. When ``input`` consists only of Chinese
    characters, the pinyin is accepted only if its number of vowel clusters
    equals the number of characters.

    Args:
        input: The text to romanise. The name shadows the builtin ``input``;
            it is kept so the signature stays unchanged.

    Returns:
        The pinyin string on success. The empty string ``""`` when no pinyin
        is present, when the validation fails, or when the lookup raises;
        each of these cases prints a diagnostic message.
    """
    try:
        response = await chinese_to_eng(input)
        translation = response.extra_data["translation"]
        pinyin = translation[-1][-1]
        if pinyin is None:
            print("no pinyin was able to be returned for", input)
            return ""
        if allChinese(input) and numChinese(input) != count_vowel_clusters(pinyin):
            print("error: output pinyin: ", pinyin, "not valid for", input)
            return ""
        return pinyin
    except Exception as e:
        print("error: ",e , "for", input)
        # Return a plain string like every other failure path; the earlier
        # trailing comma made this a one-element tuple.
        return ""

Verifying

In [79]:
result = await getPinyin("國大熊星")


In [80]:
print(result)

Guó dàxióng xīng


In [81]:
result = await getPinyin("國大熊星-大熊星")

print(result)

Guó dàxióng xīng-dàxióng xīng


#### Creating a function that strips pinyin and Chinese characters from a definition, so we can see what remains and decide whether the definition is valid

Create a function to check whether a word is plain English text (only unaccented Latin letters and punctuation, with no diacritics or non-Latin characters).

In [74]:
import string 
def is_plain_latin(char):
    """Tell whether ``char`` is an unaccented ASCII letter or ASCII punctuation.

    Digits and whitespace are not accepted.
    """
    if char in string.ascii_letters:
        return True
    return char in set(string.punctuation)
def word_plain_latin(word):
    """Return True when every character of ``word`` passes ``is_plain_latin``.

    An empty ``word`` gives True, because no character fails the check.
    """
    return all(is_plain_latin(letter) for letter in word)
#isolate words of a sentence that only have plain latin characters into a string
def plainLatinCharsWord(input):
    """Keep only the space-separated words of ``input`` that are plain Latin.

    The text is split on single spaces, words rejected by
    ``word_plain_latin`` are dropped, and the survivors are joined back with
    single spaces. Leading and trailing whitespace is stripped from the
    result.
    """
    kept = [token for token in input.split(" ") if word_plain_latin(token)]
    return " ".join(kept).strip()

Testing.

As you can see, it reduces the complicated definitions to their plain English words only.

In [75]:
plainLatinCharsWord("是的 shìde that's it, that's right")

"that's it, that's right"

### Data modification

Check whether all the words in the list are actually purely Chinese.

In [83]:
test_data.head(n=10)

Unnamed: 0.1,Unnamed: 0,cat,word/character,pronunciation,definition,code,alt
0,0,1,的,de,<grammatical particle marking genitive as well...,s,
1,1,1,的,de,<grammatical particle marking genitive as well...,t,
2,2,1,我的,de,wǒde my,s,
3,3,1,我的,de,我的 wǒde my,t,
4,4,1,高的,de,"gāode high, tall",s,
5,5,1,高的,de,"高的 gāode high, tall",t,
6,6,1,是的,de,"shìde that's it, that's right",s,
7,7,1,是的,de,"是的 shìde that's it, that's right",t,
8,8,1,是的,de,shìde one who...,s,
9,9,1,是的,de,shìde one who...,t,


### Final part: saving data to export JSON. This is the same as before

In [76]:
# Keep only the rows whose "code" column equals "t".
# NOTE(review): "t" presumably marks the traditional-script rows; confirm against the dataset.
our_data = test_data[test_data["code"] == "t"]

# Serialise the selection as one JSON object keyed by row index.
json_index = our_data.to_json(orient='index')

# Print the whole JSON string so it can be copied out of the notebook.
print(json_index)



In [77]:
# Keep only the rows whose "code" column equals "s".
# NOTE(review): "s" presumably marks the simplified-script rows; confirm against the dataset.
our_data = test_data[test_data["code"] == "s"]

# Serialise the selection as one JSON object keyed by row index.
json_index = our_data.to_json(orient='index')

# Print the whole JSON string so it can be copied out of the notebook.
print(json_index)

