In [None]:
import pycantonese, re, shutil, pkuseg

In [None]:
# Input file names
file1 = "/Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice/zh-HK_v17/zh-HK_epi_lexicon17.txt"
file2 = "/Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice/yue_v17/yue_epi_lexicon17.txt"
output_file = "/Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice/yue_v17/cantonese_epi_lexicon17.txt"

# Collect every distinct, non-blank entry from both inputs.
# Entries are compared AND stored with surrounding whitespace removed and a
# single "\n" re-attached, so a final line that lacks a trailing newline can
# no longer be glued onto its neighbour once the lines are sorted and written.
lines_seen = set()
all_lines = []

for infile in [file1, file2]:
    # The lexicons hold CJK characters and IPA symbols: read them as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(infile, "r", encoding="utf-8") as file:
        for line in file:
            entry = line.strip()
            # Skip blank lines; they carry no lexicon entry.
            if entry and entry not in lines_seen:
                lines_seen.add(entry)
                all_lines.append(entry + "\n")

# all_lines is already duplicate-free, so it only needs sorting.
# Sort before opening the output so a failure cannot leave a truncated file.
sorted_lines = sorted(all_lines)
with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.writelines(sorted_lines)

In [None]:
# List of Cantonese sentences
sentences = ['你好，我叫張淼。', '下午天氣好熱！', '今天係星期五。', '細仔悟性唔錯。', '呢個係一個好有趣嘅對話。']

# Tokenize each sentence: segment with pycantonese and separate tokens by spaces
tokenized_sentences = [' '.join(pycantonese.segment(sentence)) for sentence in sentences]
# Remove sentence punctuation. Inside a character class '|' is a literal
# character, not alternation, so it is left out of the class here.
tokenized_sentences = [re.sub('[。，！？]', '', item) for item in tokenized_sentences]
# Join the sentences with a space before splitting; joining with '' would fuse
# the last word of one sentence with the first word of the next whenever a
# sentence does not end in (removed) punctuation.
words = ' '.join(tokenized_sentences).split()

# Print the list of tokenized sentences
for i, tokens in enumerate(tokenized_sentences):
    print(f"Sentence {i+1} tokens: {tokens}")

print(words)

In [None]:
words = ["場", "圖", "到", "嘴", "仔", "光", "⻣", "ㄧ", "㗎", "㗎妹", "㩒", "㩒錢", "㩿", "㪐", "㪐㩿", "䁪", "䁪眼", "䒐䒏", "䟴腳", "䰧", "䱽", "一"]
from transformers import T5ForConditionalGeneration, AutoTokenizer
from lingpy import ipa2tokens

# Load the pretrained G2P model and the byte-level tokenizer it is used with.
model = T5ForConditionalGeneration.from_pretrained('charsiu/g2p_multilingual_byT5_tiny_16_layers_100')
tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')

# Every input word is prefixed with the '<yue>: ' tag before tokenization
# (presumably the model's language prompt -- confirm against the model card).
chr_words = ['<yue>: '+i for i in words]
out = tokenizer(chr_words,padding=True,add_special_tokens=False,return_tensors='pt')

preds = model.generate(**out,num_beams=1,max_length=50) # We do not find beam search helpful. Greedy decoding is enough. 
phones = tokenizer.batch_decode(preds.tolist(),skip_special_tokens=True)


# Split each predicted string into IPA tokens, re-join them with single spaces,
# and replace the ASCII colon with the IPA length mark.
phones = [ipa2tokens(phone) for phone in phones]
phones = [' '.join(phone) for phone in phones]
phones = [re.sub(':', 'ː', phone) for phone in phones]


# Regular expression patterns:
#  - pattern_ts: a 't s' sequence at the start of the string or right after a tone letter
#  - pattern_ng_syll: a lone 'ŋ' (string-initial or after tone letters) that is itself
#    followed by tone letters, i.e. 'ŋ' with no other segment in its syllable
pattern_ts = re.compile(r'(^|[˥˦˧˨˩]\s)t s')
pattern_ng_syll = re.compile(r'(^|[˥˦˧˨˩]+\s)ŋ(\s[˥˦˧˨˩]+)')

# Merge 't s' into the tie-barred affricate 't͡s' and mark the lone 'ŋ' as syllabic
phones = [re.sub(pattern_ts, r'\1t͡s', phone) for phone in phones]
phones = [re.sub(pattern_ng_syll, r'\1ŋ̩\2', phone) for phone in phones]

# A final consonant followed by tone letters: used to move the tone in front of it
pattern_tone_pos = re.compile(r'\s([mnŋptk])\s([˥˦˧˨˩]+)')
#pattern_tone_strip = re.compile(r'\s[˥˦˧˨˩]+(^|\s)')

# Toned variant: swap "<coda> <tone>" to "<tone> <coda>", then attach each tone
# directly to the character that precedes it.
phones_with_tone = [re.sub(pattern_tone_pos, r' \2 \1', phone) for phone in phones]
phones_with_tone = [re.sub(r'(.)\s([˥˦˧˨˩]+)', r'\1\2', phone) for phone in phones_with_tone]
# Toneless variant: delete the tone letters and collapse the gaps they leave.
# Deleting a tone at either end of the string leaves a dangling space, so the
# result is stripped as well.
phones_no_tone = [re.sub(r'[˥˦˧˨˩]+', '', phone) for phone in phones]
phones_no_tone = [re.sub(r'\s+', ' ', phone).strip() for phone in phones_no_tone]

# Print "word<TAB>transcription" for the toned and then the toneless variant
for word, phone in zip(words, phones_with_tone):
    print(word + '\t' + phone)
print("\n")
for word, phone in zip(words, phones_no_tone):
    print(word + '\t' + phone)


In [None]:
import re
# Path of the pronunciation dictionary to load. It is named dict_path rather
# than `dict` so the built-in dict type is not shadowed for the rest of the
# kernel session.
dict_path = '/Users/miaozhang/Documents/MFA/pretrained_models/dictionary/mandarin_china_mfa.dict'
# Read the file explicitly as UTF-8 (assumed encoding of the dictionary --
# confirm) and keep one entry per line with its trailing newline removed.
with open(dict_path, 'r', encoding='utf-8') as file:
    data = [line.rstrip('\n') for line in file]

In [None]:
from pypinyin import pinyin, Style
from pinyin_to_ipa import pinyin_to_ipa
import jieba, re

# Example words that are passed to cmn_g2p one at a time
chinese_text = [
    "若男", "丹阳", "大娘", "看我", "卡诺", "南阳", "那年", "瓜分", "挖掘",
    "募捐", "坏账", "连绵", "梳妆", "床照", "银行", "打扮", "牵过", "长江",
    "全国", "队长", "葵花", "大二", "八亿", "蛇口", "是非", "明显", "勉强",
]
# Disabled jieba segmentation experiment (not executed):
#tokenized = ' '.join(jieba.cut(chinese_text))
#tokenized = tokenized.split(' ')
#print(tokenized)

# Accumulator for the finished "word<TAB>IPA" transcript lines
g2p = list()
# Grapheme-to-phoneme conversion via Pinyin with tone numbers
def cmn_g2p(chinese_text):
    """Return ``chinese_text`` followed by a tab and its space-separated IPA.

    The text is converted to tone-numbered Pinyin with ``pinyin`` (only the
    first reading of each item is kept), each syllable is converted with
    ``pinyin_to_ipa`` (only the first candidate is kept), tone letters are
    removed, and a few rewrite rules are applied per syllable.

    Args:
        chinese_text: The string to transcribe.

    Returns:
        A string of the form ``"<chinese_text>\\t<ipa>"``.
    """
    # Rewrite rules, applied to every syllable in this order:
    #   1-3. turn the glides w / j / ɥ after the listed consonants into
    #        superscripts attached to the following segment
    #   4.   back the vowel in 'a ŋ'
    rewrite_rules = (
        (r'(p|m|f|t|n|l|k|x|s|ʂ|ɻ|ʰ) w ', r'\1 ʷ'),
        (r'(p|t|m|n|l|ɕ|ʰ) j ', r'\1 ʲ'),
        (r'(n|l|ɕ|ʰ) ɥ ', r'\1 ᶣ'),
        ('a ŋ', 'ɑ ŋ'),
    )

    # First reading of every item returned by pypinyin
    readings = [candidates[0] for candidates in pinyin(chinese_text, style=Style.TONE3)]

    syllables = []
    for reading in readings:
        # First IPA candidate, with its segments separated by spaces
        segments = pinyin_to_ipa(reading)[0]
        syllable = ' '.join(segments)
        # Drop the tone letters
        syllable = re.sub('[˥˦˧˨˩]', '', syllable)
        for pattern, replacement in rewrite_rules:
            syllable = re.sub(pattern, replacement, syllable)
        syllables.append(syllable)

    return chinese_text + '\t' + ' '.join(syllables)

# Transcribe every example word and collect the resulting lines in g2p
for word in chinese_text:
    transcript = cmn_g2p(word)
    g2p += [transcript]

# Show the collected transcripts, one per line
for item in g2p:
    print(item)

In [None]:
%pip install jyutping

In [None]:
from jyutping import Jyutping
from pycantonese import YALE_TO_IPA

def convert_to_jyutping(text):
    """Return whatever a freshly created ``Jyutping`` converter's ``get`` yields for ``text``.

    NOTE(review): the return type depends on the ``jyutping`` package;
    ``convert_jyutping_to_ipa`` calls ``.split()`` on it, so a string is
    presumably expected -- confirm.
    """
    return Jyutping().get(text)

def convert_jyutping_to_ipa(jyutping_text):
    """Map each whitespace-separated syllable of ``jyutping_text`` through ``YALE_TO_IPA``.

    Syllables found in the table contribute the items of their table entry;
    syllables not found are passed through unchanged. The collected items are
    joined with single spaces.

    NOTE(review): the argument is named for Jyutping but the lookup table is
    called YALE_TO_IPA -- confirm that the romanisation schemes match.
    """
    collected = []
    for syllable in jyutping_text.split():
        if syllable in YALE_TO_IPA:
            # extend() adds the entry item by item
            collected.extend(YALE_TO_IPA[syllable])
        else:
            # Unknown syllable: keep it as-is
            collected.append(syllable)
    return " ".join(collected)

# Cantonese input text for the demo
input_text = "你好"

# Characters -> Jyutping -> IPA
jyutping_result = convert_to_jyutping(input_text)
ipa_result = convert_jyutping_to_ipa(jyutping_result)

# Report the input and both conversion results, one per line
print(
    f"Input Text: {input_text}",
    f"Jyutping: {jyutping_result}",
    f"IPA: {ipa_result}",
    sep="\n",
)

In [None]:
# Print the shell command that runs delete_invalid_clips.sh on the corpus folder
script = '/Users/miaozhang/Research/CorpusPhon/Scripts/vxc_pipeline/delete_invalid_clips.sh'
folder = '/Users/miaozhang/Research/CorpusPhon/CorpusData/CommonVoice'
print('bash', script, folder)