In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv('../data/name_romanization.csv')
df['romanize1'] = df.romanize1.apply(lambda x: x.lower())
print('num of row:', len(df))
df = df.drop_duplicates()
print('drop duplicate:', len(df))
#df_name = pd.read_csv('name1000.csv')

num of row: 3847
drop duplicate: 3847


# make name list from original data

In [7]:
### MAKE dict {thai: [roman1, roman2],...}
### ONE TRANSCRIPTION MAY BE READ IN SEVERAL WAYS e.g. เอกรัฐ
names = {}
for g, p in zip(df.name, df.romanize1):
    names[g] = names.get(g, []) + [p]
print('num of distinct names:', len(names))

### EXCLUDE AMBIGUOUS NAMES
names_not_amb = [(g,p[0]) for g,p in names.items() if len(p)==1]
print('not ambiguous name:', len(names_not_amb))

num of distinct names: 3806
not ambiguous name: 3766


In [9]:
### if use all entries
#df = df[['token',"romanize1"]]

### USE ONLY NOT AMBIGUOUS NAMES
df_filtered = pd.DataFrame(names_not_amb, columns=['name','romanize1']) 
df = pd.merge(left=df_filtered, right=df, on='name') # JOIN
df

Unnamed: 0,name,romanize1_x,gender,ipa,romanize1_y,romanize2,romanize3
0,กุลสตรี,kunlasattri,f,kun1 la-1 sat2 trI-1,kunlasattri,kullasattri,
1,กุสุมา,kusuma,f,ku-2 su-2 mA-1,kusuma,gusuma,
2,กุสุมาลย์,kusuman,f,ku-2 su-2 mAn1,kusuman,kusumal,gusumal
3,กุหลาบ,kulap,f,ku-1 lAp2,kulap,kulab,kularb
4,ก่อเกียรติ,kokiat,m,kX-2 kJt2,kokiat,korkiat,gorkiat
...,...,...,...,...,...,...,...
3761,ไลลา,laila,f,laj1 lA-1,laila,,
3762,ไวพจน์,waiphot,m,waj1 Pot4,waiphot,Waipot,
3763,ไอริณ,airin,f,?aj1 rin1,airin,Irene,Irin
3764,ไอลดา,ailada,f,?aj1 la-1 dA-1,ailada,Irada,


In [17]:
train, test = train_test_split(df, test_size=0.1858)
print('train:', len(train))
print('test:', len(test))
print(len(test), "=", len(set(test.token))) # must equal

train: 3066
test: 700
700 = 700


In [18]:
train.token.to_csv('../data/train_x.txt', index=False, header=False)
train.romanize1.to_csv('../data/train_y.txt', index=False, header=False)
test.token.to_csv('../data/test_x.txt', index=False, header=False)
test.romanize1.to_csv('../data/test_y.txt', index=False, header=False)

# additional dict data

In [None]:
df = pd.read_csv('../data/g2p_dict.csv')
df.g.to_csv('../data/dic_x.txt', index=False, header=None)
df.rtgs.to_csv('../data/dic_y.txt', index=False, header=None)

# convert Prim's Data

In [69]:
SHORT_VOWELS = "aivueyoxz"
LONG_VOWELS =  "AIVUEYOXZ"
DIPHTHONGS = "JWR"
VOWELS = SHORT_VOWELS + LONG_VOWELS + DIPHTHONGS
ONSETS = ["br","bl","pr","pl","Pr","Pl","fr","fl","dr","tr","Tr","kr","kl","kw","Kr","Kl","Kw"] + \
    ["b","p","P","m","f","d","t","T","n","s","r","l","c","C","k","K","N","w","j","h","?"]
CODAS = ["p","m","f","t","d","n","s","l","c","k","N","w","j","?","-"]

df = df[['token', 'transcription']]
df

Unnamed: 0,token,transcription
0,กุลสตรี,kun0.la0.sat1.tri:0
1,กุสุมา,ku1.su1.ma:0
2,กุสุมาลย์,ku1.su1.ma:n0
3,กุหลาบ,ku0.la:p1
4,ก่อเกียรติ,kO:1.kiat1
...,...,...
3842,ไลลา,laj0.la:0
3843,ไวพจน์,waj0.phot3
3844,ไอริณ,?aj0.rin0
3845,ไอลดา,?aj0.la0.da:0


In [70]:
def convert_syl(syl):
    tone = syl[-1]
    syl = syl[:-1]

    if syl[-1] in 'ptkmnNwj':
        coda = syl[-1]
        syl = syl[:-1]
    else:
        coda = '-'

    if syl[-1] == ':':
        vowel = {'a:':'A','i:':'I','u:':'U','I:':'V','e:':'E','{:':'Y','o:':'O','O:':'X','@:':'Z'}[syl[-2:]]
        syl = syl[:-2]
    elif syl[-2:] in ['ia','ua','Ia']:
        vowel = {'ia':'J','ua':'W','Ia':'W'}[syl[-2:]]
        syl = syl[:-2]
    else:
        vowel = {'a':'a','i':'i','u':'u','I':'v','e':'e','{':'y','o':'o','O':'x','@':'z'}[syl[-1]]
        syl = syl[:-1]

    if syl[:2] in ['kh','ph','th','ch']:
        onset = {'kh':'K','ph':'P','th':'T','ch':'C'}[syl[:2]] + syl[2:]
    else:
        onset = syl

    return onset+vowel+coda+str(int(tone)+1)
        

def convert(transcription):
    syls = transcription.split('.')
    return ' '.join([convert_syl(syl) for syl in syls])

In [71]:
df['new'] = df.transcription.apply(convert)

In [74]:
df.to_csv('../data/name_phoneme.csv', index=False)

In [66]:
#### validation ####
for x in df.transcription:
    try:
        convert(x)
    except:
        print(x)