In [38]:
import keras
from keras import layers
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model, to_categorical
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')
from IPython.display import Image, display_png
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
SHORT_VOWELS = "aivueyoxz"
LONG_VOWELS =  "AIVUEYOXZ"
DIPHTHONGS = "JWR"
VOWELS = SHORT_VOWELS + LONG_VOWELS + DIPHTHONGS
ONSETS = ["br","bl","pr","pl","Pr","Pl","fr","fl","dr","tr","Tr","kr","kl","kw","Kr","Kl","Kw"] + \
    ["b","p","P","m","f","d","t","T","n","s","r","l","c","C","k","K","N","w","j","h","?"]
CODAS = ["p","m","f","t","d","n","s","l","c","k","N","w","j","?","-"]

In [3]:
data = pd.read_csv('data/thai2phone.csv', names=['g','p'])
data = data[~data.g.str.contains('\.')] # exclude abbreviation
data = data.dropna().reset_index(drop=True)
data['g'] = data.g.apply(lambda x: '^'+x+'$')
data

Unnamed: 0,g,p
0,^กก$,kok2
1,^กกกอด$,kok2 kXt2
2,^กกช้าง$,kok2 CAN4
3,^กกธูป$,kok2 TUp3
4,^กกหู$,kok2 hU-5
...,...,...
31478,^ไฮโล$,haj1 lo-1
31479,^ไฮไดรด์$,haj1 daj1 ra-4
31480,^ไฮไฟ$,haj1 faj1
31481,^ไฮไลต์$,haj1 laj1


# train data

In [24]:
def syl_split(syl):
    tone = syl[-1] # prA-1 -> 1
    coda = syl[-2] # prA-1 -> -
    vowel = syl[-3] # prA-1 -> A
    onset = syl[:-3] # # prA-1 -> pr
    return onset, vowel, coda, tone

def make_output(phon, n=0): # n: onset, vowel, coda, tone
    syls = phon.split()
    splitted = [syl_split(syl)[n] for syl in syls]
    if n==0:
        return [onset2id[x] for x in splitted]
    elif n==1:
        return [vowel2id[x] for x in splitted]
    elif n==2:
        return [coda2id[x] for x in splitted]
    elif n==3:
        return [tone2id[x] for x in splitted]

thai2id = {'<PAD>':0, '^':1, '$':2, ' ':3}
for i in range(3585, 3674): # ก=3585->4, ๙=3673
    thai2id[chr(i)] = i - 3581
id2thai = {v:k for k,v in thai2id.items()}

onset2id = {'<PAD>':0,'<SOS>':1}
for c in ONSETS:
    onset2id[c] = max(onset2id.values()) + 1
id2onset = {v:k for k,v in onset2id.items()}

coda2id = {'<PAD>':0,'<SOS>':1}
for c in CODAS:
    coda2id[c] = max(coda2id.values()) + 1
id2coda = {v:k for k,v in coda2id.items()}

vowel2id = {'<PAD>':0,'<SOS>':1}
for c in VOWELS:
    vowel2id[c] = max(vowel2id.values()) + 1
id2vowel = {v:k for k,v in vowel2id.items()}

tone2id = {'<PAD>':0,'<SOS>':1,'1':2,'2':3,'3':4,'4':5,'5':6}
id2tone = {v:k for k,v in tone2id.items()}

In [26]:
print(data.p[100])
make_output(data.p[100], 3)

ka-2 ra-4 nI-1 cam1 pen1


[3, 5, 2, 2, 2]

In [33]:
# max len of input character length
print(max(map(len, data.g)))

# max len of output syllable length
print(max([len(x.split(" ")) for x in data.p]))

52
20


In [28]:
## validation of data ##
for phon in data.p:
    try:
        make_output(phon, n=3)
    except:
        print(phon)

In [29]:
input_seq = [[thai2id[c] for c in word] for word in data.g]
input_seq = pad_sequences(input_seq, padding='post', maxlen=55, value=0)

output0 = [[1]+make_output(phon, n=0) for phon in data.p] # onset
output1 = [[1]+make_output(phon, n=1) for phon in data.p] # vowel
output2 = [[1]+make_output(phon, n=2) for phon in data.p] # coda
output3 = [[1]+make_output(phon, n=3) for phon in data.p] # tone
output4 = [[0]*(len(seq)-1)+[1] for seq in output0] # end condition

### padding ###
output0 = pad_sequences(output0, padding='post', maxlen=20, value=0)
output1 = pad_sequences(output1, padding='post', maxlen=20, value=0)
output2 = pad_sequences(output2, padding='post', maxlen=20, value=0)
output3 = pad_sequences(output3, padding='post', maxlen=20, value=0)
output4 = pad_sequences(output4, padding='post', maxlen=20, value=1)

In [30]:
i = 100

print(output0[i])
print(output1[i])
print(output2[i])
print(output3[i])
print(output4[i])

[ 1 33 29 27 31 20  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1  2  2 12  2  6  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[ 1 16 16 16  3  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
[1 3 5 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [31]:
### one-hot vector ###
output0 = to_categorical(output0, num_classes=len(onset2id)) # onset
output1 = to_categorical(output1, num_classes=len(vowel2id)) # vowel
output2 = to_categorical(output2, num_classes=len(coda2id)) # coda
output3 = to_categorical(output3, num_classes=7) # tone 1-5, padding 0, start -1
output4 = to_categorical(output4, num_classes=2)

i = 100
print(output3[i])
print(output4[i])

[[0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [0. 1.]]


# train - test split

In [36]:
print(input_seq.shape)
print(output0.shape) # onset
print(output1.shape) # vowel
print(output2.shape) # coda
print(output3.shape) # tone
print(output4.shape) # end condition


(31483, 55)
(31483, 20, 40)
(31483, 20, 23)
(31483, 20, 17)
(31483, 20, 7)
(31483, 20, 2)


In [40]:
train_x, test_x = train_test_split(input_seq, test_size=0.2, random_state=1)
train_y0, test_y0 = train_test_split(output0, test_size=0.2, random_state=1)
train_y1, test_y1 = train_test_split(output1, test_size=0.2, random_state=1)
train_y2, test_y2 = train_test_split(output2, test_size=0.2, random_state=1)
train_y3, test_y3 = train_test_split(output3, test_size=0.2, random_state=1)
train_y4, test_y4 = train_test_split(output4, test_size=0.2, random_state=1)

print(train_x.shape, test_x.shape)
print(train_y0.shape, test_y0.shape)

input_len = train_x.shape[1]
output_len = train_y0.shape[1]

(25186, 55) (6297, 55)
(25186, 20, 40) (6297, 20, 40)


# make model

In [None]:
emb_dim = 300
lstm_dim = 200
lstm_dropout = 0.2

### ENCODER ###
input_enc= layers.Input(batch_size=None, shape=(input_len,), name='Enc_Input')
emb_enc = layers.Embedding(input_dim = data.input_vocab, # num of vocab = num of ID dict
                                    input_length = data.input_maxlen,
                                    output_dim = emb_dim,
                                    mask_zero = True,
                                    name = 'Enc_Embedding')
lstm_enc = layers.Bidirectional(layers.LSTM(lstm_dim,
                                        return_sequences=True, # use only final output when w/o attention -> return (batch, lstm_dim)
                                        return_state=True,
                                        recurrent_dropout=lstm_dropout,
                                        name='Enc_LSTM'), name='BiDir') # return: output, h1, c1, h2, c2
concat_h = layers.Concatenate(name='Concat_h') # concatnate bidirectional output of state h 
concat_c = layers.Concatenate(name='Concat_c') # concatnate bidirectional output of state c

### DECODER ###

class Attention:
    def __init__(self, unit=256, emb_dim=256):
        self.dot = layers.Dot(axes=[2,2], name='Attn_Dot') # (batch, dec_seq_length, "lstm_dim") * (batch, enc_seq_length, "lstm_dim") -> weight (batch, dec_seq_length, enc_seq_length) 
        #self.sqrt = layers.Lambda(lambda x: x/emb_dim**0.5, name='Sqrt') # scaling by square root : */√dim
        self.softmax = layers.Activation(activation='softmax', name='Attn_Softmax')
        self.context = layers.Dot(axes=[2,1], name='Attn_Context') # weight (batch, dec_seq_length, "enc_seq_length") * (batch, "enc_seq_length", dim) -> (batch, dec_seq_length, dim)
        self.concat = layers.Concatenate(name='Attn_Concat') # concate context and decoder output
        self.hidden = layers.Dense(unit, activation='tanh', name='Attn_hidden')

    def __call__(self, enc_output, dec_output):
        attention_dot = self.dot([dec_output, enc_output])
        #attention_weight = self.softmax(self.sqrt(attention_dot))
        attention_weight = self.softmax(attention_dot)
        context_vector = self.context([attention_weight, enc_output])
        concat_vector = self.concat([context_vector, dec_output])
        return self.hidden(concat_vector)


input_dec0 = layers.Input(batch_size=None, shape=(output_len,), name='Dec_Input0')
input_dec1 = layers.Input(batch_size=None, shape=(output_len,), name='Dec_Input1')
input_dec2 = layers.Input(batch_size=None, shape=(output_len,), name='Dec_Input2')
input_dec3 = layers.Input(batch_size=None, shape=(output_len,), name='Dec_Input3')
input_dec4 = layers.Input(batch_size=None, shape=(output_len,), name='Dec_Input4')
input_h = layers.Input(batch_size=None, shape=(lstm_dim*2,), name='Dec_Input_h') # for predict
input_c = layers.Input(batch_size=None, shape=(lstm_dim*2,), name='Dec_Input_c') # for predict
emb_dec0 = layers.Embedding(input_dim=len(onset2id), input_length = output_len, output_dim=100, mask_zero = True, name = 'Dec_Emb0')
emb_dec1 = layers.Embedding(input_dim=len(onset2id), input_length = output_len, output_dim=100, mask_zero = True, name = 'Dec_Emb1')
emb_dec2 = layers.Embedding(input_dim=len(onset2id), input_length = output_len, output_dim=100, mask_zero = True, name = 'Dec_Emb2')
emb_dec3 = layers.Embedding(input_dim=len(onset2id), input_length = output_len, output_dim=100, mask_zero = True, name = 'Dec_Emb3')
emb_dec4 = layers.Embedding(input_dim=len(onset2id), input_length = output_len, output_dim=100, mask_zero = True, name = 'Dec_Emb4')
concat_emb = layers.Concatenate(name='Concat_c') # concatnate bidirectional output of state c
lstm_dec = layers.LSTM(lstm_dim, # twice as lstm dim of encoder
                        return_sequences=True,
                        return_state=True,
                        recurrent_dropout=lstm_dropout,
                        name='Dec_LSTM')
dense = layers.Dense(data.output_vocab, activation='softmax', name='Dec_Hidden')
        self.attention = Attention(unit=attention_dim, emb_dim=emb_dim)
        self.enc_output = layers.Input(batch_size=None, shape=(lstm_dim), name='Enc_Output')
        ### for inference - receive previous states
        self.input_h = layers.Input(shape=lstm_dim, name='Dec_Input_h')
        self.input_c = layers.Input(shape=lstm_dim, name='Dec_Input_c')

class AttentionDecoder(Decoder):
    def __init__(self, data:Data, emb_dim=256, lstm_dim=256, lstm_dropout=0.2, attention_dim=256):
        super().__init__(data=data, emb_dim=emb_dim, lstm_dim=lstm_dim, lstm_dropout=lstm_dropout)
        
    
    def __call__(self, state_h, state_c, enc_output=None):
        x = self.input
        x = self.embedding(x)
        dec_output, h, c = self.lstm(x, initial_state=[state_h, state_c])
        x = self.attention(enc_output, dec_output)
        x = self.dense(x)
        return x, h, c

    def build(self):
        output, h, c = self(state_h=self.input_h, state_c=self.input_c, enc_output=self.enc_output)
        model = keras.models.Model(inputs=[self.input, self.enc_output, self.input_h, self.input_c], outputs=[output, h, c])
        return model