In [48]:
import numpy as np
import pandas as pd
import urllib.request
import math
import json  

In [49]:
# Read the hanzi data
df = pd.read_json("../data/hanzidictionary.txt", lines=True, encoding='utf-8')


In [50]:
# Display the first 5 rows of the DataFrame
print(df.head())

  character definition pinyin decomposition radical             matches  \
0         ⺀        ice     []             ？       ⺀        [None, None]   
1         ⺈        NaN     []             ？       ⺈        [None, None]   
2         ⺊        NaN     []           ⿰丨？       ⺊         [[0], None]   
3         ⺌        NaN     []             ？       ⺌  [None, None, None]   
4         ⺍        NaN     []             ？       ⺍  [None, None, None]   

                                           etymology  
0                                                NaN  
1                                                NaN  
2  {'type': 'ideographic', 'hint': 'A crack on an...  
3                                                NaN  
4                                                NaN  


In [51]:
# Filter out rows where the "pinyin" column is an empty list
df = df[df['pinyin'].apply(len) > 0]

In [52]:
# Drop the matches and etymology columns
df = df.drop(['matches', 'etymology', 'decomposition', 'radical'], axis=1)

In [53]:
df.head()

Unnamed: 0,character,definition,pinyin
6,⺮,bamboo; flute,[zhú]
9,㐆,old form of 隱,[yǐn]
10,㐌,a tribe of savages in South China,[yí]
11,㐬,"pennant; wild, barren, uncultivated",[liú]
12,㐭,"granary; to stockpile, to supply",[lǐn]


In [54]:
# specify the encoding when opening the file
with open('../data/pinyin-lite.json', 'w', encoding='utf-8') as file:
    # pass the file object to the to_json() method using the path_or_buf parameter
    df.to_json(file, orient='records')

In [55]:
# Find entries with multiple pinyin pronunciations
df = pd.read_json("../data/pinyin-lite.json")
df[df['pinyin'].apply(lambda x: len(x) > 1)]

Unnamed: 0,character,definition,pinyin
123,了,clear; to finish; particle of completed action,"[le, liǎo]"
240,伽,temple; used to transliterations,"[jiā, gā]"
730,匙,spoon; surname,"[chí, shī]"
856,句,"sentence, clause, phrase, paragraph; stanza","[jù, gōu]"
945,呢,wool; interrogative or emphatic final particle,"[né, ne]"
1767,宀,roof; house,"[gài, mián]"
1807,宿,"to stop, to rest, to lodge; constellation","[sù, xiù]"
6238,艾,"mugwort, Artemisia vulgaris; used in translite...","[ài, yì]"


In [56]:
# Explode the pinyin list
exploded_df = df.explode('pinyin')
exploded_df = exploded_df.reset_index(drop=True)

In [57]:
exploded_df.head()

Unnamed: 0,character,definition,pinyin
0,⺮,bamboo; flute,zhú
1,㐆,old form of 隱,yǐn
2,㐌,a tribe of savages in South China,yí
3,㐬,"pennant; wild, barren, uncultivated",liú
4,㐭,"granary; to stockpile, to supply",lǐn


In [58]:
# Group by pinyin pronunciation
pinyin_df = exploded_df.groupby(['pinyin'])

In [59]:
# Convert groups to dictionaries
pinyin_dict = pinyin_df.apply(lambda g: g[['character', 'definition']].apply(tuple, axis=1).tolist()).to_dict()

In [60]:
# Number of syllables that map to a character
print("Syllables with tone that map to a character", len(pinyin_dict.keys()))

Syllables with tone that map to a character 1236


In [61]:
with open('../data/pinyin-groups.json', 'w', encoding='utf-8') as file:
    json.dump(pinyin_dict, file)

In [62]:
# Now find valid tones for every combination.
initials = ['p', 'b', 'm', 'f', 'd', 't', 'n', 'l', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'g', 'k', 'h', 'j', 'x', 'q', 'None']
finals = ['a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'i*', 'ia', 'iao', 'ian', 'iang', 'ie', 'in', 'ing', 'iong', 'iu', 'u', 'ua', 'uai', 'uan', 'uang', 'ueng', 'ui', 'uo', 'un', 'ü', 'üan', 'üe', 'ün', 'o', 'ong', 'ou']
vowels = ["a", "e", "i", "o", "u", "ü"]
toneDict = {
    "a": ["ā", "á", "ǎ", "à", "a"],
    "e": ["ē", "é", "ě", "è", "e"],
    "i": ["ī", "í", "ǐ", "ì", "i"],
    "o": ["ō", "ó", "ǒ", "ò", "o"],
    "u": ["ū", "ú", "ǔ", "ù", "u"],
    "ü": ["ǖ", "ǘ", "ǚ", "ǜ", "ü"]
  }

In [63]:
# Algorithm to map initial + final to pinyin syllable
def pinyin(initial, final):
    
    # 1. Initial

    # Select y, w, or nothing for Null-initial
    valInitial = initial
    if (initial == "None" and final != ""):
        if(final[0]=='u'):
            valInitial = "w"
        elif(final[0]=='ü' or final[0]=='ü' or final[0]=="i"):
            valInitial = "y"
        else:
            valInitial = "" 
    
    # 2. Final

    # Handle "w" and "i" contractions and mutations. 
    valFinal = final
    if (initial == "None"):
        if final in ["ia", "iao", "ian", "iang", "ie", "iong", "ua", "uai", "uan","uang","ueng","uo"]:
            valFinal=valFinal[1:]
        elif final == "iu":
            valFinal = "ou"
        elif final == "ei":
            valFinal = "ei"
        elif final == "un":
            valFinal = "en"
    
    # Clean special short i*
    if final == "i*": 
        valFinal = "i"
    

    # If initial is j, x, q, null turn ü into u
    if (valFinal[0]=="ü"):
        if initial in ["j", "x", "q", "None"]:
            valFinal = "u"+valFinal[1:]
        

    # 3. Tone

    # Find letter index of tone
    long = len(valFinal) >= 2
    twoVowels = 0 if not long else valFinal[0] in vowels and valFinal[1] in vowels
    aeo =  valFinal[0] == "a" or valFinal[0] == "e" or valFinal[0] == "o"
    accentIndex = 1 if twoVowels and not aeo else 0 
    accentIndex += len(valInitial)

    return valInitial + valFinal, accentIndex

In [64]:
# Get map from initial to final to the set of tones that exist for that syllable

# The syllables with tone that exist for at least one Chinese character
pinyinKeys = pinyin_dict.keys()

# To be filled with key-value pairs - initial : [(final1, [1,4,5]), (final2, [1,2]), ...]
realTones = dict([(initial, []) for initial in initials])
for initial in initials:
    for final in finals: 
        s, idx = pinyin(initial, final)

        # Syllable with tones
        modulated = [s[:idx]+v+s[idx+1:] for v in toneDict[s[idx]]]

        # The numbers of those that exist in reality
        realToneNums = [j+1 for j, m in enumerate(modulated) if m in pinyinKeys]

        realTones[initial] += [(final, realToneNums)]

    # Convert to inner dictionary with existing syllables
    realTones[initial] = dict([(k, list(map(str,v))) for k, v in realTones[initial] if len(v)!=0])


In [65]:
with open('../data/tone-dict.json', 'w', encoding='utf-8') as file:
    json.dump(realTones, file)

In [66]:
# Find entries with multiple pinyin pronunciations
toneDictDF = pd.read_json("../data/tone-dict.json")

In [68]:
# Copy the dictionary
toneDictDF_copy = toneDictDF.copy()

# Turn NaN elements to empty lists
def nanToEmpty(x):
    return [] if type(x)==float else x

# Append valid tones to pinyin(row, column)
for i in toneDictDF.columns:
    tones = toneDictDF[i].map(nanToEmpty)
    finalsTones = list(zip(tones.index, tones))
    toneDictDF_copy[i] = list(map(lambda pair: list(map(lambda n: pinyin(i, pair[0])[0]+n, pair[1])), finalsTones))
# Flatten nested list of syllables
allSyllables = []
for i in toneDictDF_copy.columns:
    allSyllables += [item for sublist in list(toneDictDF_copy[i]) for item in sublist]

In [69]:
# Write to file
with open("../data/all-syllables.txt","w") as file:
    for syllable in allSyllables:
        file.write("%s\n" % syllable)

In [70]:
# Read from file
with open("../data/all-syllables.txt","r") as file:
    lines = file.readlines()
    allSyllables = [line.strip() for line in lines]

In [84]:
# Now download .mp3s and save
audioURL = "https://resources.allsetlearning.com/pronwiki/resources/pinyin-audio/"
for syllable in allSyllables[1325:1330]:
    if syllable[-1] != "5" and not syllable=="dia3" and not syllable[1]=="ü":
        print(syllable)
        urllib.request.urlretrieve(audioURL+syllable+".mp3", "../data/audio/"+syllable+".mp3")

er3
er4
weng1
weng3
weng4


In [108]:
# Lower the volume of the mp3s
from pydub import AudioSegment
import os

In [None]:
# Specify folders where audio is located
input_folder = r"c:/Users/oilio/Documents/Blog/orfeasliossatos.github.io/data/audio"
output_folder = r"c:/Users/oilio/Documents/Blog/orfeasliossatos.github.io/data/audio-backup"

# Loop through all files finishing with .mp3
for filename in os.listdir(input_folder):
    if filename.endswith('.mp3'):
        # Process the MP3 file
        input_path = input_folder + "/" + filename
        output_path = output_folder + "/" + filename
        # Load the MP3 file
        audio = AudioSegment.from_file(input_path, format='mp3', ffprobe=False)
        
        # Lower the volume by 10 dB
        audio = audio - 10
        
        # Save the processed MP3 file
        audio.export(output_path, format='mp3')