## Imports

In [23]:
import numpy as np
import tqdm
from tqdm import tqdm_notebook as tqdm
import math
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Masking, Dense
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import RMSprop

Using TensorFlow backend.


## Loading data

In [2]:
text = open('train.txt', 'r')
word_dict = {}
X_data, Y_data = [], []
for lines in text:
    line = lines.split()
    for words in line[1:]:
        word_dict.update({line[0] : words})
        X_data.append(line[0])
        Y_data.append(words)
X_data, Y_data = np.array(X_data), np.array(Y_data)
dictionary = word_dict
reverse_dictionary = dict([(v, k) for k, v in dictionary.items()])

In [3]:
print(X_data[:10])

['LEMIEUX' 'MINDING' 'STRIPED' 'KEN' 'CONFERENCE' 'CONFERENCE' 'IMMOLATE'
 'TRANSGRESS' 'RABBLE' 'AIRSHARE']


In [4]:
print(Y_data[:10])

['L_AH_M_Y_UW' 'M_AY_N_D_IH_NG' 'S_T_R_AY_P_T' 'K_EH_N'
 'K_AA_N_F_ER_AH_N_S' 'K_AA_N_F_R_AH_N_S' 'IH_M_AH_L_EY_T'
 'T_R_AE_N_Z_G_R_EH_S' 'R_AE_B_AH_L' 'EH_R_SH_EH_R']


### Баловство

In [5]:
simb2phonDict = {}
gr2coinc = {}
from string import ascii_uppercase
for let in ascii_uppercase:
    gr2coinc.update({let: 0})
phoneme_set = set()
phonemes = reverse_dictionary.keys()
ok = 0
sz3 = 0
for phoneme in phonemes:
    grapheme = list(reverse_dictionary[phoneme])
    phoneme = phoneme.split('_')
    if len(phoneme) > sz3:
        sz3 = len(phoneme)
    for p in phoneme:
        phoneme_set.add(p)
    sz = min(len(grapheme), len(phoneme))
    for i in range(sz):
        simb2phonDict.update({grapheme[i]: phoneme[i]})
        if grapheme[i] == phoneme[i]:
            gr2coinc[grapheme[i]] += 1
for g in gr2coinc.keys():
    print(g, "\t", gr2coinc[g], "\t", simb2phonDict[g])
# for g in simb2phonDict.keys():
#     print(g, "\t", simb2phonDict[g])

G 	 6149 	 JH
D 	 11830 	 IY
W 	 2861 	 AH
A 	 0 	 AE
Q 	 0 	 K
T 	 17953 	 T
E 	 0 	 ER
F 	 5558 	 L
Z 	 2059 	 Z
J 	 0 	 JH
K 	 5390 	 S
Y 	 572 	 M
L 	 19248 	 L
H 	 0 	 M
S 	 16704 	 S
M 	 12604 	 AH
X 	 0 	 K
C 	 0 	 AA
V 	 4887 	 L
O 	 0 	 AA
I 	 0 	 ER
N 	 21153 	 Z
P 	 9562 	 P
R 	 20421 	 D
B 	 9753 	 B
U 	 0 	 IY


## Preparing data

In [6]:
X_vocab = {}
for let in ascii_uppercase:
    X_vocab.update({let: 0})
X_vocab.update({"'" : 0, '-' : 0})
sz2 = 0
for gr in dictionary.keys():
    for l in gr:
        if l in X_vocab.keys():
            X_vocab[l] += 1
        else:
            X_vocab.update({'UNK': 0})
            print(gr)
    if len(gr) > sz2:
        sz2 = len(gr)


In [7]:
print(X_vocab.items())

dict_items([('G', 16816), ('D', 21402), ('W', 6829), ('A', 54333), ('Q', 832), ('T', 35700), ('E', 69983), ('F', 8552), ('Z', 3918), ('J', 1662), ("'", 4907), ('-', 649), ('K', 11002), ('Y', 9888), ('L', 35662), ('H', 17889), ('S', 46389), ('M', 19204), ('X', 1424), ('C', 23850), ('V', 6467), ('O', 39591), ('I', 46686), ('N', 44939), ('P', 14006), ('R', 47637), ('B', 13850), ('U', 18607)])


In [8]:
import operator
X_sorted = [k for k, v in sorted(X_vocab.items(), key=operator.itemgetter(1))][::-1]

In [9]:
X_let_to_ix = {'ZERO' : 0}
v = 1
for k in X_sorted:
    X_let_to_ix.update({k : v})
    v += 1
X_let_to_ix.update({"BGN" : v})
grapheme_sz = v + 1
print(v)

29


In [10]:
X_ix_to_let = dict([(v, k) for k, v in X_let_to_ix.items()])

In [11]:
phoneme2int = {}
v = 1
for phoneme in phoneme_set:
    phoneme2int.update({phoneme : v})
    v += 1
print(v)
phoneme2int.update({"BGN" : v})
phoneme_sz = v + 1
int2phoneme = dict([(v, k) for (k, v) in phoneme2int.items()])
print(phoneme_sz)

40
41


In [12]:
print(Y_data[:10])

['L_AH_M_Y_UW' 'M_AY_N_D_IH_NG' 'S_T_R_AY_P_T' 'K_EH_N'
 'K_AA_N_F_ER_AH_N_S' 'K_AA_N_F_R_AH_N_S' 'IH_M_AH_L_EY_T'
 'T_R_AE_N_Z_G_R_EH_S' 'R_AE_B_AH_L' 'EH_R_SH_EH_R']


In [13]:
sz1 = X_data.shape[0]
sz2 = max(map(len, X_data)) + 1
sz3 = sz2
X_train = np.zeros((sz1, sz2), dtype=np.int)
Y_train = np.zeros((sz1, sz3), dtype=np.int)
i, j = 0, 0
for g in X_data:
    X_train[i][0] = grapheme_sz - 1
    Y_train[i][0] = phoneme_sz - 1
    j = 1
    for l in g:
        X_train[i][j] = X_let_to_ix[l]
        j += 1
    j = 1
    for ph in dictionary[g].split('_'):
        Y_train[i][j] = phoneme2int[ph]
        j += 1
    X_train[i] = X_train[i][::-1]
    i += 1

In [14]:
print(X_train.shape)

(89056, 35)


In [15]:
print(X_train[:5])

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0 26 13  1  4 12  1  9 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0 15  6  4 11  6  4 12 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0 11  1 16  4  3  8  5 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  6  1 18 29]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   1 10  6  1  3  1 20  6  7 10 29]]


In [16]:
print(Y_train[:5])

[[40 39 17 34 22 32  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0]
 [40 34  2  9  1  3 31  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0]
 [40 33 25 16  2 15 25  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0]
 [40 29 18  9  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0]
 [40 29 36  9 26 16 17  9 33  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0]]


In [17]:
print(list(map(lambda x: X_ix_to_let[x], [29, 10, 7, 6, 20, 1, 3, 1, 6, 10, 1])))

['BGN', 'C', 'O', 'N', 'F', 'E', 'R', 'E', 'N', 'C', 'E']


In [18]:
print(list(map(lambda x: int2phoneme[x], [40, 22, 33, 39, 14, 15, 2, 39, 26])))

['BGN', 'Y', 'S', 'L', 'W', 'P', 'AY', 'L', 'F']


## Vectorization

In [21]:
X = np.zeros((sz1, sz2, grapheme_sz))
y = np.zeros((sz1, sz3, phoneme_sz))
print(X.shape)
print(y.shape)
max_len, feats = X.shape[1], X.shape[2]

(89056, 35, 30)
(89056, 35, 41)


In [20]:
for i in tqdm(range(sz1)):
    for j in range(sz2):
        X[i, j, X_train[i, j]] = 1
    for k in range(sz3):
        y[i, k, Y_train[i, k]] = 1

Widget Javascript not detected.  It may not be installed or enabled properly.





## Trainig

In [None]:
print(X.shape, y.shape)
outs = y.shape[2]
hidden_l = 64
batch_size = 100
epochs = 1

model = Sequential()
model.add(Masking(mask_value=0., input_shape=(max_len, feats)))
model.add(LSTM(hidden_l, return_sequences=True))
model.add(LSTM(hidden_l, return_sequences=True))
model.add(LSTM(hidden_l, return_sequences=True))
model.add(TimeDistributed(Dense(outs, activation='softmax')))

optimizer = RMSprop(lr=0.001, decay=1e-6)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

(89056, 35, 30) (89056, 35, 41)


In [None]:
hist = model.fit(X, y, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/1
12600/89056 [===>..........................] - ETA: 6:46 - loss: nan

## Trying model

In [31]:
x = np.zeros((1, sz2, grapheme_sz), dtype=np.int)
for j in range(sz2):
    x[0, j, X_train[4, j]] = 1

pred = model.predict(x, verbose=1)



In [32]:
print(pred.shape)

(1, 35, 41)


In [35]:
def sample(preds):
    return int2phoneme[np.argmax(preds)]

In [36]:
pred = pred[0]
int2phoneme.update({0 : ' '})
print('_'.join(map(sample, pred)))

 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
