In [1]:
import keras
from keras import layers
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model, to_categorical
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')
from IPython.display import Image, display_png
import pandas as pd
from tensorflow.keras.utils import plot_model

In [61]:
SHORT_VOWELS = "aivueyoxz"
LONG_VOWELS =  "AIVUEYOXZ"
DIPHTHONGS = "JWR"
VOWELS = SHORT_VOWELS + LONG_VOWELS + DIPHTHONGS
ONSETS = ["br","bl","pr","pl","Pr","Pl","fr","fl","dr","tr","Tr","kr","kl","kw","Kr","Kl","Kw"] + \
    ["b","p","P","m","f","d","t","T","n","s","r","l","c","C","k","K","N","w","j","h","?"]
CODAS = ["p","m","f","t","d","n","s","l","c","k","N","w","j","?","-"]

In [68]:
data = pd.read_csv('data/thai2phone.csv', names=['g','p'])
data = data[~data.g.str.contains('\.')] # exclude abbreviation
data = data.dropna().reset_index(drop=True)
data['g'] = data.g.apply(lambda x: '^'+x+'$')
data

Unnamed: 0,g,p
0,^กก$,kok2
1,^กกกอด$,kok2 kXt2
2,^กกช้าง$,kok2 CAN4
3,^กกธูป$,kok2 TUp3
4,^กกหู$,kok2 hU-5
...,...,...
31478,^ไฮโล$,haj1 lo-1
31479,^ไฮไดรด์$,haj1 daj1 ra-4
31480,^ไฮไฟ$,haj1 faj1
31481,^ไฮไลต์$,haj1 laj1


# train data

In [69]:
def syl_split(syl):
    tone = syl[-1] # prA-1 -> 1
    coda = syl[-2] # prA-1 -> -
    vowel = syl[-3] # prA-1 -> A
    onset = syl[:-3] # # prA-1 -> pr
    return onset, vowel, coda, int(tone)

def make_output(phon, n=0): # n: onset, vowel, coda, tone
    syls = phon.split()
    splitted = [syl_split(syl)[n] for syl in syls]
    if n==0:
        return [onset2id[x] for x in splitted]
    elif n==1:
        return [vowel2id[x] for x in splitted]
    elif n==2:
        return [coda2id[x] for x in splitted]
    elif n==3:
        return splitted

thai2id = {'<PAD>':0, '^':1, '$':2, ' ':3}
for i in range(3585, 3674): # ก=3585->4, ๙=3673
    thai2id[chr(i)] = i - 3581
id2thai = {v:k for k,v in thai2id.items()}

onset2id = {'<UNK>':-1,'<PAD>':0,'<SOS>':1,'<EOS>':2}
for c in ONSETS:
    onset2id[c] = max(onset2id.values()) + 1
id2onset = {v:k for k,v in onset2id.items()}

coda2id = {'<UNK>':-1,'<PAD>':0,'<SOS>':1,'<EOS>':2}
for c in CODAS:
    coda2id[c] = max(coda2id.values()) + 1
id2coda = {v:k for k,v in coda2id.items()}

vowel2id = {'<UNK>':-1,'<PAD>':0,'<SOS>':1,'<EOS>':2}
for c in VOWELS:
    vowel2id[c] = max(vowel2id.values()) + 1
id2vowel = {v:k for k,v in vowel2id.items()}


In [70]:
print(data.p[100])
make_output(data.p[100], 2)

ka-2 ra-4 nI-1 cam1 pen1


[17, 17, 17, 4, 8]

In [71]:
# max len of input character length
print(max(map(len, data.g)))

# max len of output syllable length
print(max([len(x.split(" ")) for x in data.p]))

52
20


In [76]:
## validation ##
for phon in data.p:
    try:
        make_output(phon, n=3)
    except:
        print(phon)

In [92]:
input_seq = [[thai2id[c] for c in word] for word in data.g]
input_seq = pad_sequences(input_seq, padding='post', maxlen=55, value=0)

output0 = [[1]+make_output(phon, n=0) for phon in data.p] # onset
output1 = [[1]+make_output(phon, n=1) for phon in data.p] # vowel
output2 = [[1]+make_output(phon, n=2) for phon in data.p] # coda
output3 = [[-1]+make_output(phon, n=3) for phon in data.p] # tone
output4 = [[0]*(len(seq)-1)+[1] for seq in output0] # end condition

In [93]:
i = 100

print(output0[i])
print(output1[i])
print(output2[i])
print(output3[i])
print(output4[i])

[1, 34, 30, 28, 32, 21]
[1, 3, 3, 13, 3, 7]
[1, 17, 17, 17, 4, 8]
[-1, 2, 4, 1, 1, 1]
[0, 0, 0, 0, 0, 1]


In [94]:
### padding ###
output0 = pad_sequences(output0, padding='post', maxlen=20, value=0)
output1 = pad_sequences(output1, padding='post', maxlen=20, value=0)
output2 = pad_sequences(output2, padding='post', maxlen=20, value=0)
output3 = pad_sequences(output3, padding='post', maxlen=20, value=0)
output4 = pad_sequences(output4, padding='post', maxlen=20, value=1)

i = 100

print(output0[i])
print(output1[i])
print(output2[i])
print(output3[i])
print(output4[i])

[ 1 34 30 28 32 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1  3  3 13  3  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1 17 17 17  4  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[-1  2  4  1  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [95]:
output0 = to_categorical(output0, num_classes=len(onset2id))
output1 = to_categorical(output1, num_classes=len(vowel2id))
output2 = to_categorical(output2, num_classes=len(coda2id))
output3 = to_categorical(output3, num_classes=7)
output4 = to_categorical(output4, num_classes=2)

i = 100
print(output3[i])
print(output4[i])

[[0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


# make model

In [None]:
input1 = layers.Input(batch_size=None, shape=(data.input_maxlen,), name='Enc_Input')
embedding1 = layers.Embedding(input_dim = data.input_vocab, # num of vocab = num of ID dict
                                    input_length = data.input_maxlen,
                                    output_dim = emb_dim,
                                    mask_zero = True,
                                    name = 'Enc_Embedding')
lstm = layers.Bidirectional(layers.LSTM(lstm_dim,
                                        return_sequences=return_seq, # use only final output when w/o attention -> return (batch, lstm_dim)
                                        return_state=True,
                                        recurrent_dropout=lstm_dropout,
                                        name='Enc_LSTM'), name='BiDir') # return: output, h1, c1, h2, c2
concat_h = layers.Concatenate(name='Concat_h') # concatnate bidirectional output of state h 
concat_c = layers.Concatenate(name='Concat_c') # concatnate bidirectional output of state c

    def __call__(self):
        x = self.input
        x = self.embedding(x)
        x, h1, c1, h2, c2 = self.lstm(x)
        h = self.concat_h([h1, h2])
        c = self.concat_c([c1, c2])
        return x, h, c

    def build(self):
        x, h, c = self()
        model = keras.models.Model(inputs=self.input, outputs=[x,h,c])
        return model