<a href="https://colab.research.google.com/github/npnkhoi/vn-asr/blob/master/wikitionary_IPA_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://cotich.net/truyen-co-tich-tam-cam-a350.html


# Bỏ : ở nguyên âm

In [143]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from tqdm.notebook import tqdm # print progress bar
import pandas as pd

In [144]:
def extract_IPA_from_html_vi(s):
  return s[s.find('>')+1: s.find('<',1)]

def extract_IPA_from_html_en(s):
  return s[s.find('[')+1: s.find(']')]

In [145]:
def tokenizer(string):
  string = string.lower()
  token_list = string.split()
  return token_list

In [146]:
def IPA_extractor_vi(token):
  # page = requests.get(f"https://en.wiktionary.org/wiki/{token}")
  page = requests.get(f"https://vi.wiktionary.org/wiki/{token}#Tiếng_Việt")
  
  soup = BeautifulSoup(page.content, 'html.parser')
  SG_IPA = soup.find_all(class_='wiktvi-vie-pron-sg')
  if len(SG_IPA) > 0:
    IPA = SG_IPA[0].find_all(class_='IPA')[0]
    IPA_tone = SG_IPA[0].find_all(class_='IPA')[0].find_all(class_='IPA-tone')[0]
    return extract_IPA_from_html_vi(str(IPA)), extract_IPA_from_html_vi(str(IPA_tone))
  else:
    return None

def IPA_extractor_en(token):
  page = requests.get(f"https://en.wiktionary.org/wiki/{token}")
  soup = BeautifulSoup(page.content, 'html.parser')
  IPA = soup.find_all(class_='IPA')
  try:
    return extract_IPA_from_html_en(str(IPA[2]))
  except:
    return token

In [147]:
s = "Theo dự thảo phòng chống Covid-19 với người nhập cảnh đang được lấy ý kiến"
token_list = tokenizer(s)
for i in range(len(token_list)):
  print(IPA_extractor(token_list[i]))

('tʰɛw', '˧˧')
('jɨ', '˨˩˨')
('tʰaːw', '˨˩˦')
('fawŋ', '˨˩')
('ʨəwŋ', '˧˥')
None
('jəːj', '˧˥')
('ŋɨəj', '˨˩')
('ɲəp', '˨˩˨')
('kan', '˨˩˦')
('ɗaːŋ', '˧˧')
('ɗɨək', '˨˩˨')
('ləj', '˧˥')
('i', '˧˥')
('kiəŋ', '˧˥')


In [148]:
wikitionary_IPA_tone_list = {
    'ngang': '˧˧', 
    'sắc': '˦˥',
    'huyền': '˨˩',
    'hỏi': '˨˩˦', 
    'ngã': '˨˩˦',
    'nặng': '˨˩˨'
}

wikipedia_IPA_tone_list = {
    'ngang': '˧', 
    'sắc': '˧˥',
    'huyền': '˨˩',
    'hỏi': '˧˩˧', 
    'ngã': '˨˩˦',
    'nặng': '˨˧'
}

In [149]:
phoneme_db = pd.read_csv('phonemes.csv')

In [150]:
# clean db
phoneme_db['num_char'] = phoneme_db.phoneme.str.len()

In [151]:
phoneme_db.sample(5)

Unnamed: 0,note,phoneme,examples,num_char
33,Vowels,ɐ,ăn,1
13,Initial consonants,p,pin,1
29,Final consonants,k,"xuất, ác",1
25,Final consonants,ŋ,"ban, trứng",1
11,Initial consonants,ɲ,nhà,1


In [152]:
def find_tone(IPA_string):
  tone = ''
  for t in wikitionary_IPA_tone_list.values():
    if t in IPA_string:
      tone = t
      break
  return tone

In [153]:
def add_tone(vowel, tone):
  if tone != wikitionary_IPA_tone_list['ngang']:
    return vowel + tone
  return vowel

In [154]:
def segmentize_phoneme(IPA_string, tone):
  phonemes = []
  # segmentize initial consonant, prioritize more chars in the IPA first
  for initial in phoneme_db[phoneme_db.note == 'Initial consonants'].sort_values(by=['num_char'], ascending=False).phoneme:
    if initial == IPA_string[0:2]:
      phonemes.append(initial)
      IPA_string = IPA_string[2:]
      break
    # elif f'?{initial}' == IPA_string[0:2]:
    #   phonemes.append(f'?{initial}')
    #   IPA_string = IPA_string[2:]
    #   break
    elif initial == IPA_string[0]:
      phonemes.append(initial)
      IPA_string = IPA_string[1:]
      break
    
  # segmentize diphthong
  for diphthong in phoneme_db[phoneme_db.note == 'Diphthongs'].sort_values(by=['num_char'], ascending=False).phoneme:
    if diphthong == IPA_string[:2]:
      IPA_string = IPA_string[2:]
      phonemes.append(add_tone(diphthong, tone))
      break

  # segmentize vowel
  for vowel in phoneme_db[phoneme_db.note == 'Vowels'].sort_values(by=['num_char'], ascending=False).phoneme:
    if vowel == IPA_string[0:2]:
      IPA_string = IPA_string[2:]
      phonemes.append(add_tone(vowel, tone))
      break
    elif vowel == IPA_string[0]:
      IPA_string = IPA_string[1:]
      phonemes.append(add_tone(vowel, tone))
      break
  
  # segmentize final consonant
  for final in phoneme_db[phoneme_db.note == 'Final consonants'].phoneme:
    if final == IPA_string[0:2]:
      phonemes.append(final)
      IPA_string = IPA_string[2:]
      break
    elif final == IPA_string[0]:
      phonemes.append(final)
      IPA_string = IPA_string[1:]
      break
  
  sleep(0.1)

  return phonemes

In [173]:
string = 'cha'
IPA_en = IPA_extractor_en(string)
IPA_vn, tone = IPA_extractor_vi(string)
# segmentize_phoneme(IPA, find_tone(IPA))
IPA_en, IPA_vn

('<span class="IPA">-ɑː</span', 'ʨaː')

In [156]:
with open('story.txt') as f:
  text = f.read()

punctuation = ['\n', ',', '.', '?', '!', ':', '-']
for p in punctuation:
  text = text.replace(p, ' ')

In [157]:
segmentize_phoneme('ŋaj', '˨˩')

['ŋ', 'a˨˩', 'j']

In [165]:
token_list = tokenizer(text)
phoneme_list = []
for i in tqdm(range(len(token_list))):
  token = token_list[i]
  try:
    IPA, tone = IPA_extractor_vi(token)
    phoneme_list += segmentize_phoneme(IPA, tone)
  except:
    try:
      IPA = IPA_extractor_en(token)
      phoneme_list += segmentize_phoneme(IPA, find_tone(IPA))
    except:
      print(token)

  0%|          | 0/219 [00:00<?, ?it/s]

In [168]:
# count occurence of each phoneme in the story
occurence = []
for phoneme in phoneme_db.phoneme:
  occurence.append(phoneme_list.count(phoneme))

phoneme_db['occurence'] = occurence

In [169]:
phoneme_db[phoneme_db.occurence == 0]

Unnamed: 0,note,phoneme,examples,num_char,occurence
13,Initial consonants,p,pin,1,0
18,Initial consonants,t͡ɕ,chè,3,0
19,Initial consonants,ʈ,tra,1,0
21,Initial consonants,r,"ra, rồi",1,0
22,Medial glide,ʷ,"oanh; quốc; Nguyễn, tuy",1,0
27,Final consonants,ŋm,"bốn [9], bún (after u, ô), chúng (after u, ô, ...",2,0
28,Final consonants,p̚,tiếp,2,0
31,Final consonants,kp,"một (after u, ô), học (after u, ô, o)[10]",2,0
33,Vowels,ɐ,ăn,1,0
35,Vowels,e,về,1,0
