In [66]:
import os
import re

import numpy as np
from sklearn.model_selection import train_test_split
from keras_tqdm import TQDMNotebookCallback
from keras.layers import LSTM, Embedding, RepeatVector, Dense, TimeDistributed, Input, Bidirectional
from keras.optimizers  import Adam
from keras.models import Model

In [10]:
# Directory that holds the CMU pronouncing dictionary file.
# Set the SPELLBEE_DATA environment variable to point somewhere else instead
# of editing this cell; when it is unset (or empty) the original location is
# used. The value is normalised to end with a path separator because the
# loading cell builds the file name by plain concatenation
# (PATH + 'cmudict-0.7b').
PATH = os.path.join(
    os.environ.get('SPELLBEE_DATA') or '/run/media/backman/yay/spellbee/', '')

In [26]:
# Parse the dictionary file into (WORD, [phoneme, ...]) tuples.
# Each kept line is split on the two-space separator into the word and its
# phoneme string; lines that do not start with an uppercase ASCII letter are
# skipped. The `with` block guarantees the file handle is closed once the
# file has been read.
with open(PATH + 'cmudict-0.7b', encoding='latin1') as cmudict:
    lines = [l.strip().split("  ") for l in cmudict
             if re.match('^[A-Z]', l)]
lines = [(w, ps.split()) for w, ps in lines]
lines[90]

('ABBREVIATE', ['AH0', 'B', 'R', 'IY1', 'V', 'IY0', 'EY2', 'T'])

In [33]:
# Phoneme vocabulary: "_" comes first (index 0), followed by every distinct
# phoneme symbol found in the dictionary, in sorted order.
distinct_phonemes = {phoneme for _, phones in lines for phoneme in phones}
phonemes = ["_"] + sorted(distinct_phonemes)
phonemes[:5]

['_', 'AA0', 'AA1', 'AA2', 'AE0']

In [37]:
# Lookup tables from symbol to integer index, for phonemes and for letters.
# In `letters`, "_" is index 0 and "*" is the last index.
p2i = {phoneme: index for index, phoneme in enumerate(phonemes)}
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = {letter: index for index, letter in enumerate(letters)}

In [40]:
# Keep only purely alphabetic words of 5..maxlen characters and map each
# lower-cased word to its list of phoneme indices.
maxlen = 15
pronounce_dict = {
    word.lower(): [p2i[phoneme] for phoneme in phones]
    for word, phones in lines
    if re.match("^[A-Z]+$", word) and 5 <= len(word) <= maxlen
}
len(pronounce_dict)

108006

In [41]:
# Length of the longest phoneme sequence among the retained words.
maxlen_p = max(len(phoneme_ids) for phoneme_ids in pronounce_dict.values())

In [44]:
# Shuffle the words with a fixed seed so the row order of the arrays built
# below is the same on every run.
rng = np.random.RandomState(42)
pairs = rng.permutation(list(pronounce_dict.keys()))
n = len(pairs)

# Zero-filled arrays: index 0 is the "_" symbol in both vocabularies, so any
# position not overwritten below already holds the padding symbol.
input_ = np.zeros((n, maxlen_p), np.int32)   # phoneme indices per word
labels_ = np.zeros((n, maxlen), np.int32)    # letter indices per word

for i, k in enumerate(pairs):
    phoneme_ids = pronounce_dict[k]
    # Write each sequence into the left part of its row in one slice
    # assignment; the remainder of the row stays 0.
    input_[i, :len(phoneme_ids)] = phoneme_ids
    labels_[i, :len(k)] = [l2i[letter] for letter in k]

In [50]:
# Decoder input: the label sequence shifted right by one position, with the
# "*" symbol in the first column. The first column is created with the same
# integer dtype as `labels_` so the concatenated array keeps integer token
# ids (multiplying np.ones by the token would upcast everything to float64).
go_token = l2i["*"]
dec_input_ = np.concatenate(
    [np.full((n, 1), go_token, dtype=labels_.dtype), labels_[:, :-1]], axis=1)

In [51]:
# Hold out 10% of the rows for testing. The three arrays are split together
# so their rows stay aligned; random_state pins the shuffle performed by
# train_test_split so the same row indices are selected on every run.
(input_train, input_test, labels_train, labels_test, dec_input_train, dec_input_test
    ) = train_test_split(input_, labels_, dec_input_, test_size=0.1, random_state=42)

In [52]:
# Vocabulary sizes: number of phoneme symbols (model input) and number of
# letter symbols (model output).
input_vocab_size = len(phonemes)
output_vocab_size = len(letters)


In [58]:
# Keyword arguments forwarded to model.fit: verbose=0 plus a notebook
# progress-bar callback.
parms = dict(verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True)])
# Number of units used by every LSTM layer created through get_rnn.
dim = 240

In [59]:
def get_rnn(return_sequences=True):
    """Create an LSTM layer with `dim` units and 10% dropout.

    Uses the Keras 2 argument names: `dropout` (formerly `dropout_W`) applies
    to the input connections, `recurrent_dropout` (formerly `dropout_U`) to
    the recurrent connections, and `implementation=2` replaces
    `consume_less='gpu'`.

    Args:
        return_sequences: passed straight to the LSTM; when True the layer
            returns its output for every timestep, otherwise only the last.

    Returns:
        A new, unconnected LSTM layer instance.
    """
    return LSTM(dim, dropout=0.1, recurrent_dropout=0.1,
                implementation=2, return_sequences=return_sequences)

In [64]:
# Encoder: embed the phoneme indices, run a bidirectional LSTM over the
# sequence, then a second LSTM that returns only its final output.
inp = Input(shape=(maxlen_p,))
embedded = Embedding(input_vocab_size, 120)(inp)

encoded_seq = Bidirectional(get_rnn(return_sequences=True))(embedded)
encoded = get_rnn(return_sequences=False)(encoded_seq)

# Decoder: repeat the encoder output once per output position, pass it
# through two sequence-returning LSTMs, and apply a softmax over the letter
# vocabulary at every position.
decoded = RepeatVector(maxlen)(encoded)
decoded = get_rnn(return_sequences=True)(decoded)
decoded = get_rnn(return_sequences=True)(decoded)
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(decoded)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [67]:
# Wrap the graph from the phoneme input tensor to the per-position letter
# distribution into a trainable model.
model = Model(inputs=inp, outputs=x)

In [68]:
# Adam with default settings; the sparse loss takes integer class indices as
# targets. 'acc' is reported as a metric during training.
model.compile(
    optimizer=Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)

In [72]:
# Train for 3 epochs, validating on the held-out split. The label arrays get
# a trailing axis of size 1 via expand_dims before being passed as targets.
# `epochs` is the Keras 2 name for the deprecated `nb_epoch`, and
# validation_data is passed as an (inputs, targets) tuple.
hist = model.fit(input_train, np.expand_dims(labels_train, -1),
                 validation_data=(input_test, np.expand_dims(labels_test, -1)),
                 batch_size=64, epochs=3, **parms)

  This is separate from the ipykernel package so we can avoid doing imports until





In [73]:
def eval_keras(input, labels=None):
    """Predict spellings for `input` and score whole-sequence accuracy.

    Args:
        input: array of phoneme-index sequences to feed to `model`.
        labels: reference letter-index sequences aligned row by row with
            `input`. Defaults to the global `labels_test`, which is only the
            right reference when `input` is `input_test`.

    Returns:
        A tuple `(accuracy, predict)`: `accuracy` is the fraction of rows
        whose predicted sequence equals the reference at every position,
        and `predict` holds the argmax letter index for every position.
    """
    if labels is None:
        labels = labels_test
    preds = model.predict(input, batch_size=128)
    # Most probable letter index at each output position.
    predict = np.argmax(preds, axis=2)
    # A row counts as correct only when every position matches.
    exact = [np.array_equal(real, p) for real, p in zip(labels, predict)]
    return (np.mean(exact), predict)

In [74]:
# Score the held-out split; the bare `acc` on the last line displays it.
acc, preds = eval_keras(input_test)
acc

0.23303397833533931

In [75]:
def print_examples(preds, n_examples=20):
    """Print a table comparing real and predicted spellings for test rows.

    Rows are taken from the start of the global `input_test` / `labels_test`
    arrays and the matching rows of `preds`.

    Args:
        preds: predicted letter indices, one row per test example.
        n_examples: maximum number of rows to print (default 20). Fewer rows
            are printed when fewer are available, instead of raising
            IndexError.
    """
    print("pronunciation".ljust(40), "real spelling".ljust(17),
          "model spelling".ljust(17), "is correct")

    # Never index past the end of any of the three aligned arrays.
    shown = min(n_examples, len(preds), len(input_test), len(labels_test))
    for index in range(shown):
        ps = "-".join([phonemes[p] for p in input_test[index]])
        real = [letters[l] for l in labels_test[index]]
        predict = [letters[l] for l in preds[index]]
        # Each displayed string is cut at the first "_" symbol, but the
        # "is correct" column compares the full, uncut sequences.
        print(ps.split("-_")[0].ljust(40), "".join(real).split("_")[0].ljust(17),
              "".join(predict).split("_")[0].ljust(17), str(real == predict))

In [77]:
print_examples(preds)

pronunciation                            real spelling     model spelling    is correct
B-EY1-R-OY2-TH                           bayreuth          barroth           False
SH-L-EY1-F-ER0                           schlafer          schlafer          True
HH-IH1-P-AH0-K-R-IH2-T-S                 hypocrites        hyppcrrit         False
AE2-P-R-AH0-P-OW1                        apropos           aprrpo            False
T-EH2-L-AH0-K-T-R-AA1-N-IH0-K-S          telectronics      telectronic       False
S-T-AA1-R-W-EY2-V                        starwave          starwave          True
W-IY1-K-L-AO2-NG                         weeklong          weekloog          False
M-AE1-N-D-ER0-S-AH0-N                    manderson         manderson         True
R-UW1-L-IY0                              rulli             rully             False
P-AE1-S-ER0-Z-B-IY0                      passersby         passerby          False
V-EH0-N-T-R-IH1-K-Y-UW0-L-ER0            ventricular       ventriculr        False
AE