In [1]:
import re, json, tqdm, importlib, tltk
import pandas as pd
import numpy as np
from collections import Counter
from pythainlp.corpus import thai_stopwords
from pythainlp import word_tokenize
import g2p

# Thai stopword collection from PyThaiNLP; Domain.calc_word_freq uses it to
# build the *_nostop counters.
STOPWORDS = thai_stopwords()

In [46]:
# Re-import the local g2p module so edits made to g2p.py take effect without
# restarting the kernel (IPython's `%autoreload 2` would make this automatic).
importlib.reload(g2p)

<module 'g2p' from '/Users/Nozomi/Library/Mobile Documents/com~apple~CloudDocs/g2p/g2p.py'>

In [47]:
# Smoke test: transcribe a news snippet that mixes Thai, English and a URL.
# NOTE(review): in the recorded output the words "มีปัญหา" and the trailing URL
# appear to be dropped from the transcription - confirm whether that is intended.
text = "ปัจจุบันได้มีการคิดหาทางรักษาอาการอีดี ด้วยวิธีใหม่ ๆ ทางคณะแพทย์จากอิสราเอล ได้ทดลองรักษาชายอีดีที่มีปัญหาหลอดเลือดด้วยคลื่นเสียงความถี่ต่ำ (Low-intensity Extracorporeal shock wave) ในกลุ่มชายอีดี พบว่าได้ผลดีอย่างเหลือเชื่อ... อ่านต่อที่ : https://www.dailynews.co.th/article/809343"

g2p.g2p(text, return_token=0)

'pàt cù ban dâay mii kaan kʰít hǎa tʰaaŋ rák sǎa ʔaa kaan ʔii dii dûay wí tʰii mày ๆ tʰaaŋ kʰá náʔ pʰɛ̂ɛt càak ʔìt sa raa ʔeen dâay tʰót lɔɔŋ rák sǎa cʰaay ʔii dii tʰîi  lɔ̀ɔt lɯ̂at dûay kʰlɯ̂ɯn sǐaŋ kʰwaam tʰìi tàm ( Low-intensity Extracorporeal shock wave ) nay klùm cʰaay ʔii dii pʰóp wâa dâay pʰǒn dii yàaŋ lɯ̌a cʰɯ̂a ... ʔàan tɔ̀ɔ tʰîi :'

In [30]:
# Probe g2p's private number helpers.
# NOTE(review): the recorded result is False for the digit string '27' -
# confirm that is the intended behavior of __is_number.
#g2p.__g2p_number('31')
g2p.__is_number('27')

False

# Validate and repair the pronunciation dictionaries

In [74]:
# Load the Thai -> phone dictionary.
data = pd.read_csv('thai2phone.csv')

# Display only the entries that already carry a transcription.
data.loc[data['phone'].notna()]

Unnamed: 0,thai,phone
0,ก็,kX-3
1,ก.,kX-1
2,ก.ค.,ka-2 ra-4 ka-2 dA-1 Kom1
3,ก.ต.,kX-1 TX-1
4,ก.ป.ส.,kX-1 pX-1 sX-5
...,...,...
62275,95,kAw3 sip2 hA-3
62276,96,kAw3 sip2 hok2
62277,97,kAw3 sip2 cet2
62278,98,kAw3 sip2 pYt2


In [75]:
# Print every dictionary entry whose transcription g2p.validate rejects.
has_phone = data['phone'].notna()
for thai, phone in zip(data.loc[has_phone, 'thai'], data.loc[has_phone, 'phone']):
    if g2p.validate(phone):
        continue
    print(thai, phone)

In [6]:
# Run g2p.decode over every available transcription; results are discarded,
# so this only surfaces entries that make decode raise.
for phone in data['phone'].dropna():
    g2p.decode(phone)

In [113]:
# Pick up the latest edits to g2p.py before re-running the cells below.
importlib.reload(g2p)

<module 'g2p' from '/Users/Nozomi/Library/Mobile Documents/com~apple~CloudDocs/g2p/g2p.py'>

In [66]:
def fix_tone_coda_order(phone):
    """Swap the last two characters of every syllable of an invalid transcription.

    Values that are not strings (missing transcriptions load as NaN) and
    transcriptions that g2p.validate already accepts are returned unchanged.
    For everything else each space-separated syllable ``...AB`` becomes
    ``...BA``; syllables shorter than two characters are left as they are
    instead of raising IndexError.
    """
    if not isinstance(phone, str) or g2p.validate(phone):
        return phone
    return ' '.join(
        syl[:-2] + syl[-1] + syl[-2] if len(syl) >= 2 else syl
        for syl in phone.split(' ')
    )


df = pd.read_csv('thai2phone.csv')
# Assign the whole column at once. The previous `df.iloc[i]['phone'] = ...` was
# chained indexing: it can write to a temporary copy of the row, leaving `df`
# (and therefore the CSV saved afterwards) unchanged.
df['phone'] = df['phone'].map(fix_tone_coda_order)

In [67]:
# Overwrite thai2phone.csv with the current `df`, without writing the index.
df.to_csv('thai2phone.csv', index=False)

In [110]:
# Re-save number2phone.csv with its rows ordered by the `number` column.
df = pd.read_csv('number2phone.csv').sort_values('number')
df.to_csv('number2phone.csv', index=False)

In [32]:
def get_phone(tltk_result):
    """Split a tltk g2p result on ``|`` / ``<tr/>`` and keep the phone chunks.

    A chunk is kept only when it starts with a lowercase ASCII letter, which
    drops empty pieces and anything beginning with another character.
    """
    starts_with_lowercase = re.compile(r'[a-z]').match
    chunks = re.split(r'(?:\||<tr/>)', tltk_result)
    return list(filter(starts_with_lowercase, chunks))

# Try get_phone on tltk's g2p output for a sample Thai phrase.
get_phone(tltk.g2p('ตำรวจกองปราบฯบุกรวบ'))

In [121]:
# Spot-check the number reader on a four-digit string.
g2p.g2p_number('3120')

'sAm5 Pan1 rXj4 jI-3 sip2'

In [42]:
# Tokenize a sample sentence and show each token beside its transcription
# (None marks tokens g2p_oneword could not convert).
sentence = 'โลกยังปราบโควิด-19 ไม่ลง ส่วนใหญ่ยังคงแพร่'
for token in word_tokenize(sentence):
    transcription = g2p.g2p_oneword(token, return_none=True)
    print(f'{token}\t{transcription}')

โลก	lôːk
ยัง	jaŋ
ปราบ	pràːp
โควิด	kʰoː wìt
-	None
19	None
 	None
ไม่	mâj
ลง	loŋ
 	None
ส่วนใหญ่	sùan jàj
ยังคง	jaŋ kʰoŋ
แพร่	pʰrɛ̂ː


In [65]:
# Check how PyThaiNLP tokenizes a number that contains a thousands separator.
word_tokenize("1,230 บาท")

['1,230', ' ', 'บาท']

# Tokenize the tweet corpus

In [33]:
# Read the raw tweets and attach a `tokens` column produced by g2p.tokenize.
df = pd.read_csv('tweet.csv').assign(
    tokens=lambda frame: frame['tweet'].apply(g2p.tokenize)
)

In [34]:
# Persist only the token lists and tweet IDs; Domain('twitter') reads this file.
df.to_csv('twitter.csv', columns=['tokens', 'ID'], index=False)

# Analyze word and phoneme frequencies

In [25]:
def entropy(counter):
    """Return ``(total, H)`` for a mapping of item -> count.

    ``total`` is the sum of all counts and ``H`` is the Shannon entropy in
    bits of the distribution ``count / total``.

    Entries whose count is not positive are skipped: they carry no
    probability mass, and evaluating ``log2(0) * 0`` would turn the whole
    sum into NaN. An empty mapping yields ``(0, 0)``.
    """
    total = sum(counter.values())
    probabilities = (count / total for count in counter.values() if count > 0)
    bits = -sum(np.log2(p) * p for p in probabilities)
    return total, bits

class Freq:
    """Pair of counters: one over all tokens, one with stopwords removed."""

    def __init__(self):
        # Two independent, initially empty counters.
        self.freq, self.freq_nostop = Counter(), Counter()

class Domain:
    """Word- and phoneme-level frequency and entropy statistics for one corpus.

    ``Domain(name)`` reads ``{name}.csv`` (which must have a ``tokens`` column
    holding the string form of a token list) and immediately runs
    :meth:`calc_word_freq`.

    Attributes created by :meth:`calc_word_freq` (each also exists with a
    ``_nostop`` suffix, computed after removing ``STOPWORDS``):

    * ``word_freq`` - Counter of Thai tokens; ``all_token`` / ``entropy`` hold
      its total and entropy.
    * ``{stem}_freq``, ``all_{stem}``, ``{stem}_entropy`` for every stem in
      ``vowel``, ``vowel_tone``, ``coda``, ``onset`` (word level: one key per
      word, the value returned by the g2p extractor) and ``vowel2``,
      ``vowel2_tone``, ``coda2``, ``onset2`` (syllable level: one key per
      element of that value).
    """

    # (word-level stem, syllable-level stem, name of the g2p extractor)
    _FEATURES = (
        ('vowel', 'vowel2', 'get_vowels'),
        ('vowel_tone', 'vowel2_tone', 'get_vowels_tone'),
        ('coda', 'coda2', 'get_codas'),
        ('onset', 'onset2', 'get_onsets'),
    )

    # Fixed row order of the syllable-level onset table.
    _ONSET_ORDER = (
        's', 'tʰ', 'r', 'n', 'k', 'kʰ', 'pʰ', 'm', 't', 'l', 'tɕʰ', 'b', 'tɕ',
        'w', 'ʔ', 'j', 'd', 'p', 'h', 'pr', 'ŋ', 'kr', 'kʰw', 'f', 'tr', 'kʰr',
        'pʰr', 'kl', 'kʰl', 'pl', 'pʰl', 'kw', 'fr', 'br', 'tʰr', 'dr', 'fl', 'bl',
    )

    # Fixed row order of the syllable-level vowel table.
    _VOWEL_ORDER = (
        'a', 'aa', 'ɔɔ', 'ii', 'o', 'i', 'u', 'ua', 'ɛɛ', 'uu', 'ee', 'e', 'oo',
        'ɯa', 'əə', 'ɯ', 'ɛ', 'ia', 'ɯɯ', 'ɔ', 'ə',
    )

    def __init__(self, domain_name):
        """Load ``{domain_name}.csv`` and compute all statistics."""
        self.df = pd.read_csv(f'{domain_name}.csv')
        self.domain = domain_name.upper()
        self.calc_word_freq()

    def _stems(self):
        """Yield every phoneme-feature stem (word level, then syllable level)."""
        for word_stem, syl_stem, _ in self._FEATURES:
            yield word_stem
            yield syl_stem

    def _table(self, stem, nostop):
        """Return ``(counter, total)`` for *stem*, with or without stopwords."""
        suffix = '_nostop' if nostop else ''
        return getattr(self, f'{stem}_freq{suffix}'), getattr(self, f'all_{stem}{suffix}')

    def calc_word_freq(self):
        """Count tokens and phoneme features over ``self.df.tokens``, then
        derive totals and entropies via the module-level ``entropy`` function.

        Each token is converted with ``g2p.get_phone_oneword`` exactly once;
        the stopword-free counters reuse that result instead of converting
        every non-stopword token a second time.
        """
        self.word_freq = Counter()
        self.word_freq_nostop = Counter()
        for stem in self._stems():
            setattr(self, f'{stem}_freq', Counter())
            setattr(self, f'{stem}_freq_nostop', Counter())

        # A token counts as a Thai word when it starts with a Thai character
        # and continues with Thai characters, '.', '-' or spaces only.
        is_thai_word = re.compile(r'[ก-๙][ก-๙\.\- ]*$').match

        for row in tqdm.tqdm(self.df.tokens):  # one row = one text's token list
            # NOTE(security): eval() executes whatever the CSV cell contains.
            # Only run this on files produced by this notebook; consider
            # ast.literal_eval if the column is always a plain list literal.
            tokens = [w for w in eval(row) if is_thai_word(w)]
            is_content = [w not in STOPWORDS for w in tokens]

            self.word_freq.update(tokens)
            self.word_freq_nostop.update(w for w, keep in zip(tokens, is_content) if keep)

            # (phone, is_content) for every token g2p could convert
            phones = [
                (g2p.get_phone_oneword(w, return_none=True), keep)
                for w, keep in zip(tokens, is_content)
            ]
            phones = [(p, keep) for p, keep in phones if p is not None]

            for word_stem, syl_stem, extractor_name in self._FEATURES:
                extract = getattr(g2p, extractor_name)
                values = [extract(p, ipa=True) for p, _ in phones]
                values_nostop = [v for v, (_, keep) in zip(values, phones) if keep]
                # word level: one key per word
                getattr(self, f'{word_stem}_freq').update(values)
                getattr(self, f'{word_stem}_freq_nostop').update(values_nostop)
                # syllable level: one key per element of each word's value
                getattr(self, f'{syl_stem}_freq').update(s for v in values for s in v)
                getattr(self, f'{syl_stem}_freq_nostop').update(
                    s for v in values_nostop for s in v
                )

        # word entropy
        self.all_token, self.entropy = entropy(self.word_freq)
        self.all_token_nostop, self.entropy_nostop = entropy(self.word_freq_nostop)

        # phoneme-feature entropies
        for stem in self._stems():
            for suffix in ('', '_nostop'):
                total, bits = entropy(getattr(self, f'{stem}_freq{suffix}'))
                setattr(self, f'all_{stem}{suffix}', total)
                setattr(self, f'{stem}_entropy{suffix}', bits)

    def show_entropy(self):
        """Print every entropy value, with and without stopwords."""
        print(f'----- {self.domain} -----')
        rows = (
            ('Entropy', self.entropy, self.entropy_nostop),
            ('Vowel2 Entropy', self.vowel2_entropy, self.vowel2_entropy_nostop),
            ('Vowel2-Tone Entropy', self.vowel2_tone_entropy, self.vowel2_tone_entropy_nostop),
            ('Onset2 Entropy', self.onset2_entropy, self.onset2_entropy_nostop),
            ('Coda2 Entropy', self.coda2_entropy, self.coda2_entropy_nostop),
            ('Vowel Entropy', self.vowel_entropy, self.vowel_entropy_nostop),
            ('Vowel-Tone Entropy', self.vowel_tone_entropy, self.vowel_tone_entropy_nostop),
            ('Onset Entropy', self.onset_entropy, self.onset_entropy_nostop),
            ('Coda Entropy', self.coda_entropy, self.coda_entropy_nostop),
        )
        for label, with_stop, without_stop in rows:
            print(f'{label} with stopwords: {with_stop:.3f}')
            print(f'{label} w/o  stopwords: {without_stop:.3f}')

    def show_word_freq(self, n=20, nostop=False):
        """Print the *n* most common tokens with their frequency per million."""
        print(f'----- {self.domain} -----')
        if nostop:
            counter, total = self.word_freq_nostop, self.all_token_nostop
        else:
            counter, total = self.word_freq, self.all_token
        for word, count in counter.most_common(n):
            print(word, round(count/total*1000000, 3), sep='\t')

    def show_coda_freq(self, n=100, nostop=False):
        """Print the *n* most common codas (percent), per syllable and per word.

        A missing coda is shown as ``-`` in both tables.
        """
        syl, syl_total = self._table('coda2', nostop)
        word, word_total = self._table('coda', nostop)
        print('----- syllable -----')
        for coda, count in syl.most_common(n):
            # `or '-'`: an empty coda used to print as a blank label here
            print(' '.join(coda) or '-', round(count/syl_total*100, 3), sep='\t')
        print('\n----- word -----')
        for codas, count in word.most_common(n):
            label = ' '.join([c if c != '' else '-' for c in codas])
            print(label, round(count/word_total*100, 3), sep='\t')

    def show_onset_freq(self, n=100, nostop=False):
        """Print onset percentages: a fixed-order syllable table, then the
        *n* most common word-level onset sequences."""
        syl, syl_total = self._table('onset2', nostop)
        word, word_total = self._table('onset', nostop)
        print('----- syllable -----')
        for onset in self._ONSET_ORDER:
            print(onset, round(syl[onset]/syl_total*100, 3), sep='\t')
        print('\n----- word -----')
        for onsets, count in word.most_common(n):
            print(' '.join(onsets), round(count/word_total*100, 3), sep='\t')

    def show_vowel_freq(self, n=20, nostop=False):
        """Print vowel percentages: a fixed-order syllable table, then the *n*
        most common syllable-tone, word and word-tone entries."""
        print(f'----- {self.domain} -----')
        syl, syl_total = self._table('vowel2', nostop)
        syl_tone, syl_tone_total = self._table('vowel2_tone', nostop)
        word, word_total = self._table('vowel', nostop)
        word_tone, word_tone_total = self._table('vowel_tone', nostop)

        print('\n----- syl -----')
        for vowel in self._VOWEL_ORDER:
            print(vowel, round(syl[vowel]/syl_total*100, 3), sep='\t')

        print('\n----- syl tone -----')
        for vowel, count in syl_tone.most_common(n):
            print(vowel, round(count/syl_tone_total*100, 3), sep='\t')

        print('\n----- word -----')
        for vowels, count in word.most_common(n):
            print(' '.join(vowels), round(count/word_total*100, 3), sep='\t')

        print('\n----- word tone -----')
        for vowels, count in word_tone.most_common(n):
            print(' '.join(vowels), round(count/word_tone_total*100, 3), sep='\t')

In [37]:
# Build the statistics for the Twitter corpus (Domain reads twitter.csv).
# NOTE: this rebinds `data`, which earlier cells used for the thai2phone
# DataFrame; re-running those cells after this one needs `data` reloaded.
data = Domain('twitter')
#pantip = Domain('pantip')
#thairath = Domain('thairath')

100%|██████████| 485000/485000 [03:41<00:00, 2193.82it/s]


In [38]:
# Print all entropy figures for the Twitter corpus.
data.show_entropy()

----- TWITTER -----
Entropy with stopwords: 10.670
Entropy w/o  stopwords: 11.638
Vowel2 Entropy with stopwords: 3.520
Vowel2 Entropy w/o  stopwords: 3.692
Vowel2-Tone Entropy with stopwords: 5.535
Vowel2-Tone Entropy w/o  stopwords: 5.713
Onset2 Entropy with stopwords: 4.459
Onset2 Entropy w/o  stopwords: 4.504
Coda2 Entropy with stopwords: 3.015
Coda2 Entropy w/o  stopwords: 2.989
Vowel Entropy with stopwords: 5.052
Vowel Entropy w/o  stopwords: 5.888
Vowel-Tone Entropy with stopwords: 7.153
Vowel-Tone Entropy w/o  stopwords: 8.021
Onset Entropy with stopwords: 6.057
Onset Entropy w/o  stopwords: 6.802
Coda Entropy with stopwords: 4.466
Coda Entropy w/o  stopwords: 5.058


In [50]:
# Top 100 codas (percent) with stopwords removed, per syllable and per word.
data.show_coda_freq(100, nostop=True)

----- syllable -----
	32.294
n	14.908
ŋ	9.828
j	8.189
k	7.003
t	6.865
m	6.411
ʔ	6.169
p	4.464
w	3.733
f	0.067
d	0.047
l	0.013
s	0.01

----- word -----
-	21.06
n	10.811
ŋ	7.333
j	5.326
ʔ	5.139
k	5.012
m	4.502
t	3.984
p	3.324
w	3.071
- -	2.867
- k	1.448
n -	1.227
- t	1.16
- ŋ	0.97
- j	0.92
- n	0.853
n n	0.656
ŋ n	0.618
- ʔ	0.588
p -	0.569
ŋ -	0.558
n j	0.542
t ŋ	0.466
ŋ j	0.431
k -	0.395
m n	0.394
p n	0.388
w -	0.373
- m	0.358
j n	0.33
ŋ ŋ	0.303
m -	0.296
k j	0.295
- w	0.294
j -	0.28
t -	0.271
j ŋ	0.268
m j	0.267
t j	0.254
n w	0.252
n ŋ	0.25
j j	0.235
k n	0.222
- p	0.217
ʔ t	0.21
t n	0.206
- - -	0.192
k ŋ	0.182
m ŋ	0.176
m t	0.173
ʔ n	0.173
m k	0.163
ʔ ʔ	0.16
k w	0.158
ʔ -	0.157
n k	0.152
p m	0.143
w j	0.143
- t -	0.142
n t	0.133
m m	0.132
t t	0.131
k k	0.131
m p	0.13
n p	0.127
ŋ m	0.125
- - n	0.124
t m	0.116
- - t	0.113
ŋ k	0.112
p p	0.109
ʔ j	0.106
n m	0.1
- m k	0.096
p j	0.094
j m	0.09
ŋ t	0.086
ŋ w	0.086
- t t	0.083
t - -	0.082
m - k	0.081
w ŋ	0.077
ʔ ŋ	0.076
n ʔ	0.076
f	0.076
n - -	

In [32]:
# Reload g2p after editing g2p.py. Objects built earlier (e.g. `data`) keep the
# statistics they already computed; rebuild them to reflect g2p changes.
importlib.reload(g2p)

<module 'g2p' from '/Users/Nozomi/Library/Mobile Documents/com~apple~CloudDocs/g2p/g2p.py'>

In [44]:
# Top 1000 non-stopword tokens with their frequency per million tokens.
data.show_word_freq(1000, nostop=True)

----- TWITTER -----
พี่	11585.171
คน	10273.041
กก	7078.544
ดี	6898.11
ทำ	6418.903
งง	6096.722
อ่ะ	5843.79
ออ	5541.115
ดู	5531.037
น้อง	5486.498
ชอบ	5442.608
ผม	5143.835
นอน	5063.859
อะ	4922.113
รัก	4522.232
รู้	4212.73
แม่	3910.381
น่ารัก	3896.727
เค้า	3778.388
ง	3623.962
กิน	3556.665
นะคะ	3343.72
อย่า	3263.419
หรอ	3089.162
น	3068.03
เรื่อง	2884.345
เหมือน	2847.608
ค	2799.492
ตัวเอง	2659.696
เล่น	2658.396
เพลง	2650.593
สิ	2630.437
เจอ	2549.81
รอ	2548.51
ตื่น	2548.185
หน้า	2543.958
ตอน	2542.333
คิดถึง	2536.806
อ	2385.306
เพื่อน	2310.857
ตอนนี้	2297.203
แบบนี้	2291.351
ล่ะ	2263.717
งาน	2213.325
ไหม	2169.111
เดี๋ยว	2079.706
อี	2063.776
วะ	2059.225
อ่าน	2058.574
ม	2048.821
คุย	2035.492
แฟน	2032.241
ย	2024.438
ตัว	2012.734
เวลา	2006.232
ไทย	1980.874
หา	1951.614
บ้าน	1942.186
รู้สึก	1942.186
ล	1926.581
โดน	1924.305
กุ	1921.054
ใจ	1918.454
รี	1918.128
ปี	1898.947
ค่า	1891.145
หนู	1887.243
รูป	1873.589
สวย	1857.659
หล่อ	1850.181
ใส่	1824.823
เด็ก	1798.814
บอ	1795.563
ขนาด	1790.036
ก	1783.209
ข

In [23]:
# Raw counts of the 50 most common word-level vowel sequences.
data.vowel_freq.most_common(50)

[(('a',), 438733),
 (('aa',), 429229),
 (('ii',), 198109),
 (('ɔɔ',), 176683),
 (('o',), 94940),
 (('a', 'a'), 87712),
 (('a', 'aa'), 84567),
 (('ɛ',), 80055),
 (('ɛɛ',), 68970),
 (('ɯa',), 67859),
 (('aa', 'aa'), 63748),
 (('e',), 60513),
 (('uu',), 56047),
 (('ua',), 55586),
 (('ɯ',), 50178),
 (('oo',), 47923),
 (('i',), 43390),
 (('a', 'ii'), 42045),
 (('u',), 41329),
 (('əə',), 32869),
 (('ɯɯ',), 32426),
 (('ee',), 29128),
 (('a', 'o'), 28504),
 (('aa', 'a'), 27112),
 (('ɔ',), 21430),
 (('ɔɔ', 'a'), 20653),
 (('ia',), 19749),
 (('a', 'a', 'aa'), 19515),
 (('ɔɔ', 'aa'), 18741),
 (('a', 'ua'), 17870),
 (('ɔɔ', 'ɔɔ'), 17735),
 (('a', 'aa', 'o'), 15006),
 (('a', 'ɔɔ'), 14384),
 (('a', 'u'), 14256),
 (('a', 'ee'), 13741),
 (('o', 'aa'), 12198),
 (('a', 'ɛɛ'), 11927),
 (('a', 'aa', 'ii'), 11199),
 (('a', 'i'), 11126),
 (('a', 'əə'), 10909),
 (('ii', 'a'), 10468),
 (('ɔɔ', 'ɔɔ', 'ɔɔ'), 10419),
 (('ua', 'a'), 10380),
 (('ee', 'aa'), 9884),
 (('a', 'a', 'ii'), 9622),
 (('aa', 'u'), 9479),
 

In [35]:
# Turn a newline-separated block of text into a list literal with spaces removed.
# NOTE(review): `t` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel - define `t` here or drop the cell.
print([x.replace(' ', '') for x in t.split('\n')])

['a', 'aa', 'ɔɔ', 'ii', 'o', 'i', 'u', 'ua', 'ɛɛ', 'uu', 'ee', 'e', 'oo', 'ɯa', 'əə', 'ɯ', 'ɛ', 'ia', 'ɯɯ', 'ɔ', 'ə']
