<a href="https://colab.research.google.com/github/npnkhoi/vn-asr/blob/master/wikitionary_IPA_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://cotich.net/truyen-co-tich-tam-cam-a350.html


In [1]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm.notebook import tqdm # print progress bar
import pandas as pd

In [2]:
def extract_IPA_from_html_vi(s):
  return s[s.find('>')+1: s.find('<',1)]

def extract_IPA_from_html_en(s):
  return s[s.find('[')+1: s.find(']')]

In [3]:
def tokenizer(string):
  string = string.lower()
  token_list = string.split()
  return token_list

In [4]:
def IPA_extractor_vi(token):
  # page = requests.get(f"https://en.wiktionary.org/wiki/{token}")
  page = requests.get(f"https://vi.wiktionary.org/wiki/{token}#Tiếng_Việt")
  
  soup = BeautifulSoup(page.content, 'html.parser')
  SG_IPA = soup.find_all(class_='wiktvi-vie-pron-sg')
  if len(SG_IPA) > 0:
    IPA = SG_IPA[0].find_all(class_='IPA')[0]
    IPA_tone = SG_IPA[0].find_all(class_='IPA')[0].find_all(class_='IPA-tone')[0]
    return extract_IPA_from_html_vi(str(IPA)), extract_IPA_from_html_vi(str(IPA_tone))
  else:
    return None

def IPA_extractor_en(token):
  page = requests.get(f"https://en.wiktionary.org/wiki/{token}")
  soup = BeautifulSoup(page.content, 'html.parser')
  IPA = soup.find_all(class_='IPA')
  try:
    return extract_IPA_from_html_en(str(IPA[2]))
  except:
    return token

In [5]:
s = "Theo dự thảo phòng chống Covid-19 với người nhập cảnh đang được lấy ý kiến"
token_list = tokenizer(s)
for i in range(len(token_list)):
  print(IPA_extractor_en(token_list[i]))

tʰɛw˧˧
jɨ˨˩˨
tʰaːw˨˩˦
fawŋ͡m˨˩
cəwŋ͡m˦˥
covid-19
vəːj˦˥
ŋɨj˨˩
ɲəp̚˨˩˨
kan˨˩˦
ʔɗaːŋ˧˧
ʔɗɨək̚˨˩˨
ləj˦˥
<span class="IPA">/iː/</span
kiəŋ˦˥


In [6]:
wikitionary_IPA_tone_list = {
    'ngang': '˧˧', 
    'sắc': '˦˥',
    'huyền': '˨˩',
    'hỏi': '˨˩˦', 
    'ngã': '˨˩˦',
    'nặng': '˨˩˨'
}

wikipedia_IPA_tone_list = {
    'ngang': '˧', 
    'sắc': '˧˥',
    'huyền': '˨˩',
    'hỏi': '˧˩˧', 
    'ngã': '˨˩˦',
    'nặng': '˨˧'
}

In [32]:
phoneme_db = pd.read_csv('https://raw.githubusercontent.com/npnkhoi/vn-asr/master/vn_phonemes.csv')

In [33]:
# clean db
phoneme_db['num_char'] = phoneme_db.phoneme.str.len()

In [34]:
phoneme_db.sample(5)

Unnamed: 0,note,phoneme,examples,note.1,num_char
50,Diphthongs,əj,ây,,2
38,Vowels,ɛ,e,,1
44,Vowels,o,ô,,1
57,Diphthongs,wɛ,oe,,2
5,Initial consonants,h,hàng,,1


In [11]:
def find_tone(IPA_string):
  tone = ''
  for t in wikitionary_IPA_tone_list.values():
    if t in IPA_string:
      tone = t
      break
  return tone

In [12]:
def add_tone(vowel, tone):
  if tone != wikitionary_IPA_tone_list['ngang']:
    return vowel + tone
  return vowel

In [36]:
def segmentize_phoneme(IPA_string, tone):
  phonemes = []
  # segmentize initial consonant, prioritize more chars in the IPA first
  for initial in phoneme_db[phoneme_db.note == 'Initial consonants'].sort_values(by=['num_char'], ascending=False).phoneme:
    if initial == IPA_string[0:2]: # for initial that has 2 characters such as tʰ
      phonemes.append(initial)
      IPA_string = IPA_string[2:]
      break
    # elif f'?{initial}' == IPA_string[0:2]:
    #   phonemes.append(f'?{initial}')
    #   IPA_string = IPA_string[2:]
    #   break
    elif initial == IPA_string[0:1]: # for initial that has 1 character
      phonemes.append(initial)
      IPA_string = IPA_string[1:]
      break

  # segmentize triphthong
  for triphthong in phoneme_db[phoneme_db.note == 'Triphthongs'].sort_values(by=['num_char'], ascending=False).phoneme:
    if triphthong == IPA_string[:4]: # for triphthong that has 4 chars such as waːw
      IPA_string = IPA_string[4:]
      phonemes.append(add_tone(triphthong, tone))
      break
    
    elif triphthong == IPA_string[:3]: # for triphthong that has 3 chars such as wɛw
      IPA_string = IPA_string[3:]
      phonemes.append(add_tone(triphthong, tone))
      break

  # segmentize diphthong
  for diphthong in phoneme_db[phoneme_db.note == 'Diphthongs'].sort_values(by=['num_char'], ascending=False).phoneme:
    if diphthong == IPA_string[:3]: # for diphthong that has 3 chars such as aːj
      IPA_string = IPA_string[3:]
      phonemes.append(add_tone(diphthong, tone))
      break

    if diphthong == IPA_string[:2]: # for diphthong that has 2 chars such as ɛw
      IPA_string = IPA_string[2:]
      phonemes.append(add_tone(diphthong, tone))
      break

  # segmentize vowel
  for vowel in phoneme_db[phoneme_db.note == 'Vowels'].sort_values(by=['num_char'], ascending=False).phoneme:
    if vowel == IPA_string[0:2]:
      IPA_string = IPA_string[2:]
      phonemes.append(add_tone(vowel, tone))
      break
    elif vowel == IPA_string[0:1]:
      IPA_string = IPA_string[1:]
      phonemes.append(add_tone(vowel, tone))
      break
  
  # segmentize final consonant
  for final in phoneme_db[phoneme_db.note == 'Final consonants'].phoneme:
    if final == IPA_string[0:2]:
      phonemes.append(final)
      IPA_string = IPA_string[2:]
      break
    elif final == IPA_string[0:1]:
      phonemes.append(final)
      IPA_string = IPA_string[1:]
      break
  
  sleep(0.1)

  return phonemes

In [42]:
story_url = 'https://raw.githubusercontent.com/npnkhoi/vn-asr/master/story.txt'
page = requests.get(story_url)

text = page.text

punctuation = ['\n', ',', '.', '?', '!', ':', '-']
for p in punctuation:
  text = text.replace(p, ' ')

In [43]:
token_list = tokenizer(text)
phoneme_list = []
for i in tqdm(range(len(token_list[:-1]))):
  token = token_list[i]
  try:
    IPA, tone = IPA_extractor_vi(token)
    phoneme_list += segmentize_phoneme(IPA, tone)
    # print('vi', token, segmentize_phoneme(IPA, tone))
  except:
    try:
      IPA = IPA_extractor_en(token)
      phoneme_list += segmentize_phoneme(IPA, find_tone(IPA))
      # print('en', token, segmentize_phoneme(IPA, find_tone(IPA)))

    except:
      print(token)

  0%|          | 0/599 [00:00<?, ?it/s]

In [44]:
# count occurence of each phoneme in the story
occurence = []
for phoneme in phoneme_db.phoneme:
  occurence.append(phoneme_list.count(phoneme))

phoneme_db['occurence'] = occurence

In [56]:
len(phoneme_db[phoneme_db.occurence == 0])

31

In [41]:
phoneme_db.to_csv('occurence.csv')

In [24]:
string = 'khác'
IPA_en = IPA_extractor_en(string)
IPA_vn, tone = IPA_extractor_vi(string)
print(IPA_vn)
print(IPA_en)

ʨaː
<span class="IPA">-ɑː</span


In [55]:
# find duplicate phoneme character in initial and final
a = phoneme_db[(phoneme_db.note == 'Initial consonants') | (phoneme_db.note=='Final consonants')].phoneme.duplicated()
phoneme_db[(phoneme_db.note == 'Initial consonants') | (phoneme_db.note=='Final consonants')].phoneme[a]

25    j
26    m
27    ŋ
28    n
31    k
32    t
34    w
Name: phoneme, dtype: object