In [None]:
import asyncio
from googletrans import Translator
import pandas as pd

# Multi-character word dataset improvement : ensuring data quality

It is clear from tests that the final CSV file used to generate the JSON for multi-character words has a few errors. Many words lack pronunciations, and some lack valid definitions (they contain only pinyin, without context). We use the Google Translate API to fix some of these. For pronunciations, we replace the ones that fail validation with pinyin from Google Translate. For definitions, we check for syllables with accent marks, and for definitions that contain only pinyin (no real meaning given) we fetch a definition from the same API.

#### Reading in the data and testing translator apis

In [None]:
# CSV holding the multi-character word list; the same path is written back to later in the notebook.
path = "test_multi_list.csv" #"final_test_characters.csv"
test_data = pd.read_csv(path)
# Row count, then a preview of the first 20 rows.
print(len(test_data))
test_data.head(n=20)

Testing translator API

In [None]:
async def chinese_to_eng(input):
    """Translate Chinese text to English with googletrans.

    Parameters
    ----------
    input : str
        Text to translate; sent with source language code "zh" and
        destination "en".

    Returns
    -------
    The object returned by ``Translator.translate``. The notebook reads its
    ``.text`` and ``.extra_data`` attributes.
    """
    # A Translator is created per call. The async context manager closes it
    # once the request is done, instead of leaving one open client behind for
    # every word that gets translated.
    async with Translator() as translator:
        return await translator.translate(input, dest="en", src="zh")


In [None]:
# Smoke test: translate one short word and print the English text of the result.
result = await chinese_to_eng("我的")
print("definition:", result.text)

This is the full data that can be obtained. Note that extracting the pinyin is not as direct.

In [None]:
# Despite the label, this prints the whole extra_data payload rather than an isolated pinyin string.
print("pronunciation (pinyin):", result.extra_data)

More examples

In [None]:
# A full sentence written in traditional characters; inspect its extra_data.
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

In [None]:
# NOTE(review): same input as the previous cell, so this repeats that request.
result = await chinese_to_eng("他是說漢語的")
print(result.extra_data)

In [None]:


# Two-character word example; inspect its extra_data.
result = await chinese_to_eng("的确")
print(result.extra_data)

In [None]:

# Another short example; its result is reused by the next cells.
result = await chinese_to_eng("高的")
print(result.extra_data)

In [None]:
# Keep the extra_data of the most recent request for a closer look.
curr = result.extra_data

In [None]:
# The "translation" entry is what getPinyin indexes into to find the pinyin.
print(curr["translation"])

#### Writing function to extract pinyin

From this, we can extract the pinyin using the pattern below. We check for `None`, and return an error marker if anything goes wrong.

In [None]:
async def getPinyin(input):
    """First draft: return the pinyin Google Translate reports for ``input``.

    The pinyin is read from the last item of the last entry of
    ``extra_data["translation"]`` on the result of ``chinese_to_eng``.

    Parameters
    ----------
    input : str
        Chinese text to look up.

    Returns
    -------
    str
        The pinyin, ``""`` when that slot is ``None``, or the marker
        ``"error"`` when the lookup or the extraction raises.
    """
    try:
        response = await chinese_to_eng(input)
        last_entry = response.extra_data["translation"][-1]
        pinyin = last_entry[-1]
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that task
        # cancellation and KeyboardInterrupt are not swallowed and turned
        # into the "error" marker.
        return "error"
    return "" if pinyin is None else pinyin


testing

In [None]:
# Try the draft helper on a short phrase.
pinyin = await getPinyin("高的")
print(pinyin)

In [None]:
# Stress test with a long passage that mixes Chinese with digits, brackets and Latin text.
pinyin = await getPinyin("台湾作词人，音乐人[1]。現為大國大熊星娛樂总经理。曾擔任台湾科藝百代（EMI）及维京音乐（Virgin Music Chinese）、新力哥倫比亞音樂、點將唱片的总经理。二十年来，打造出林慧萍、張清芳、伍思凱、優客李林、柯以敏、萧亚轩、刘若英、李玟、赵薇、江美琪、余憲忠等歌手[2]。个人发表原创歌词600多首，包括许多脍炙人口的作品。")
print(pinyin)

##### pinyin validation

We can help validate Chinese-character-only inputs by checking whether the pinyin is valid, i.e. whether every character is represented by one pinyin syllable. As a simple check, we test whether the number of Chinese characters equals the number of vowel clusters in the pinyin.

In [None]:
import unicodedata

def count_vowel_clusters(s):
    """Count runs of consecutive vowels in ``s``, ignoring diacritics.

    Used as a rough syllable count for pinyin: "gēshǒu" has the clusters
    "ē" and "ǒu", so the result is 2.

    Parameters
    ----------
    s : str
        Text to scan. Any non-string value (for example the float NaN that
        pandas uses for an empty cell) is treated as having no clusters.

    Returns
    -------
    int
        Number of maximal runs of the letters a, e, i, o, u (either case).
    """
    if not isinstance(s, str):
        return 0

    vowels = "aeiouAEIOU"
    cluster_count = 0
    in_cluster = False

    # NFD splits an accented letter into its base letter followed by combining
    # marks, so "ā" is seen as "a" plus a macron.
    for char in unicodedata.normalize('NFD', s):
        if unicodedata.combining(char):
            # A tone mark belongs to the preceding letter; it neither starts
            # nor ends a cluster.
            continue
        # char is always exactly one character here, so the membership test
        # cannot succeed on an empty string. Characters with no ASCII base
        # (e.g. Chinese characters) are simply non-vowels.
        is_vowel = char in vowels
        if is_vowel and not in_cluster:
            cluster_count += 1  # start of a new cluster
        in_cluster = is_vowel

    return cluster_count

In [None]:
import re
import string 

def contains_chinese(text):
    """Tell whether ``text`` holds at least one Chinese character.

    Only the main CJK Unified Ideographs range (U+4E00 to U+9FFF) is checked.
    """
    return re.search(r'[\u4e00-\u9fff]', text) is not None
#loop through a word (unicode parts to see if all are chinese or punctuation)

def allChinese(text):
    """Return True when every character of ``text`` is Chinese or ASCII punctuation.

    An empty string yields True, since no character violates the rule.
    """
    ascii_punctuation = frozenset(string.punctuation)
    return all(
        contains_chinese(symbol) or symbol in ascii_punctuation
        for symbol in text
    )
# count, using similar logic, the number of chinese characters in text 
def numChinese(text):
    """Return how many characters of ``text`` are Chinese characters."""
    return sum(1 for symbol in text if contains_chinese(symbol))

Testing the functions (by default Python does not treat accented and unaccented letters as the same character, so we need to remove the accent marks).

In [None]:
# Each call prints a vowel-cluster (syllable) count; expected 3, 3 and 2.
print(count_vowel_clusters("xiāoyàxuān"))

print(count_vowel_clusters("liúruòyīng"))

print(count_vowel_clusters("gēshǒu"))

In [None]:
# Latin letters make the first input fail; expected False.
print( allChinese("da國大熊星") )

# Chinese characters only; expected True.
print( allChinese("國大熊星") )

# ASCII punctuation is allowed alongside Chinese characters; expected True.
print( allChinese("國大-=/熊星") )

In [None]:
# Latin letters and punctuation are not counted; expected 4 and 7.
print( numChinese("da國大熊星") )
print( numChinese("國大熊星-大熊星-") )

##### Final version of the function

We complete the function by adding one more case: if the text is all Chinese, the number of vowel clusters in the pinyin must match the number of Chinese characters.

In [None]:
async def getPinyin(input):
    """Return validated pinyin for ``input``, or ``""`` when none is usable.

    The pinyin is read from the last item of the last entry of
    ``extra_data["translation"]`` on the result of ``chinese_to_eng``. When
    ``input`` consists only of Chinese characters and ASCII punctuation, the
    pinyin is accepted only if its vowel-cluster count equals the number of
    Chinese characters.

    Parameters
    ----------
    input : str
        Chinese text to look up.

    Returns
    -------
    str
        The pinyin on success. An empty string, with the reason printed, when
        no pinyin came back, validation failed, or an exception was raised.
    """
    try:
        response = await chinese_to_eng(input)
        pinyin = response.extra_data["translation"][-1][-1]
        if pinyin is None:
            print("no pinyin was able to be returned for", input)
            return ""
        if allChinese(input) and numChinese(input) != count_vowel_clusters(pinyin):
            print("error: output pinyin: ", pinyin, "not valid for", input)
            return ""
        return pinyin
    except Exception as e:
        print("error: ",e , "for", input)
        # Must be a plain string. A trailing comma here would return the
        # one-element tuple ("",), which callers comparing against "" would
        # mistake for a successful result.
        return ""

Verifying

In [None]:
# The input is all Chinese characters, so the vowel-cluster validation applies.
result = await getPinyin("國大熊星")


In [None]:
# Show what the previous getPinyin call returned.
print(result)

In [None]:
# The hyphen is ASCII punctuation, so this input still counts as all Chinese and is validated.
result = await getPinyin("國大熊星-大熊星")

print(result)

#### Creating function to detect pinyin and chinese characters, to see what the text without it looks like, so we can determine if the definition is valid

Create a function to check if a word is a pure english word (no diacritics or nonlatin)

In [None]:
def is_plain_latin(char):
    """Return True if ``char`` is an unaccented ASCII letter or an ASCII punctuation mark."""
    if char in string.ascii_letters:
        return True
    return char in set(string.punctuation)
def word_plain_latin(word):
    """Return True when every character of ``word`` passes ``is_plain_latin``.

    An empty word yields True.
    """
    return all(is_plain_latin(letter) for letter in word)
#isolate words of a sentence that only have plain latin characters into a string
def plainLatinCharsWord(input):
    """Keep only the space-separated words of ``input`` made of plain Latin characters.

    The surviving words are joined with single spaces in their original order,
    and surrounding whitespace is stripped from the result.
    """
    kept_words = [token for token in input.split(" ") if word_plain_latin(token)]
    return " ".join(kept_words).strip()

Testing.

As you can see, it reduces the complicated definitions to their plain Latin (English) words only, dropping the Chinese characters and the accented pinyin.

In [None]:
# The Chinese words and the accented "shìde" are dropped; expected "that's it, that's right".
plainLatinCharsWord("是的 shìde that's it, that's right 是的")

### Data modification

After running all of this, ideally no errors should remain when running again

Check whether all the words in the list are actually purely Chinese.

In [None]:
# The word/character column as a plain list; later loops index into it by position.
chars_list = list(test_data["word/character"])
print(len(chars_list))

In [None]:
# Report every entry that contains something other than Chinese characters or ASCII punctuation.
for i, word in enumerate(chars_list):
    if not allChinese(word):
        print(word, "at index", i, "is not all chinese")

As we can see there are no such errors

Modify the definitions list to make sure that all are valid definitions. If not, we use google translate to help

In [None]:
# Copy of the definition column as a list so entries can be replaced by position.
definitions = list(test_data["definition"])

In [None]:
i = 0 #track index
for word in definitions:
    word = str(word)
    #isolate pure latin characters 
    latin = plainLatinCharsWord(word)
    if len(latin)<4: #if the length is less than 4, the definition is likely not valid
        print("definition:", word, "at index", i, "is not a valid definition, changing using google translator")
        currchar = chars_list[i] #get current character using previous list
        newdef = await chinese_to_eng(currchar)
        newdef = newdef.text
        definitions[i] = newdef
        print("changed definition of",currchar, "to ", newdef)
        
    i += 1

Save updated data

In [None]:
# Write the repaired definitions back into the DataFrame column.
test_data["definition"] = definitions

Finally, we fix invalid pronunciations.

In [None]:
# Copy of the pronunciation column as a list so entries can be replaced by position.
pronunciations = list(test_data["pronunciation"])

In [None]:
i = 0 #track index
for pron in pronunciations:
    currchar = chars_list[i] #get current character using previous list
    #check if it is valid by counting vowel clusters, and ensuring there are no chinese characters 
    if numChinese(currchar) != count_vowel_clusters(pron):
        print("fix index ", i)
        #print(f"{i} invalid pronunciation", pron, "found for word", currchar)
        #attempt repair 
        result = await getPinyin(currchar)
        if (result != ""):
            #assign if correct
            pronunciations[i] = result 
            #print("new pronunciation: ", result)
        else:
            print("failed repair at index", i)

    i+=1

After these few errors, we manually add pronunciations for:
国民党, 乱麻麻, 屎壳郎, 伸懶腰 (for test_multi_list.csv)

save changes 


In [None]:
# Write the repaired pronunciations back into the DataFrame column.
test_data["pronunciation"] = pronunciations

#### Finally, save this modified data

In [None]:
# Overwrites the CSV at `path`, the same file that was read at the top of the notebook.
test_data.to_csv(path)


### Last part : changing column name to full pronunciation, and creating a column with the pronunciation without tones (full_pronunciation_wo), similar to that in final_test_characters.csv

In [None]:
# rename returns a new frame in which "pronunciation" is called "full_pronunciation".
test_data = test_data.rename(columns={'pronunciation': 'full_pronunciation'})
test_data.head(n=1)

Stripping diacritics

In [None]:
# Build the toneless version of every pronunciation.
full_pronunciation_wo = []
for pro in test_data["full_pronunciation"]:
    # NFD splits each accented letter into base letter plus combining mark; the
    # ASCII round-trip then drops the marks and any other non-ASCII character.
    modified_pro = "".join(
        unicodedata.normalize('NFD', char).encode('ascii', 'ignore').decode("utf-8")
        for char in pro
    )
    print("modified, ", modified_pro, "current", pro)
    full_pronunciation_wo.append(modified_pro)

In [None]:
# Store the toneless pronunciations as a new column and preview the result.
test_data["full_pronunciation_wo"] = full_pronunciation_wo
test_data.head(n=5)

### Final part: saving the data and exporting JSON. This is the same as before

In [None]:
# Drop every column whose name starts with 'Unnamed:' (presumably index columns left over from earlier CSV round-trips).
test_data = test_data.loc[:, ~test_data.columns.str.startswith('Unnamed:')] #no unnamed columns
# Save, then reload using the first CSV column as the index.
test_data.to_csv(path)
test_data = pd.read_csv(path, index_col=0)
test_data.head()

In [None]:
#exporting traditional characters data
# Export the rows whose "code" column is "t" (traditional characters).
our_data = test_data[test_data["code"] == "t"]

# orient='index' produces one JSON entry per row, keyed by the DataFrame index.
json_index = our_data.to_json(orient='index')

# Print the JSON string
print(json_index)

In [None]:
#exporting simplified characters data

# Export the rows whose "code" column is "s" (simplified characters).
our_data = test_data[test_data["code"] == "s"]

# orient='index' produces one JSON entry per row, keyed by the DataFrame index.
json_index = our_data.to_json(orient='index')

# Print the JSON string
print(json_index)