In [25]:
import os
import re

import numpy as np
from keras import backend as K
from keras.layers import LSTM, Embedding, RepeatVector, Dense, TimeDistributed, Input, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras_tqdm import TQDMNotebookCallback
from sklearn.model_selection import train_test_split

from attention import Attention

# DATA

In [2]:
# Directory that holds the `cmudict-0.7b` file. Set the CMUDICT_DIR environment
# variable to use another location; without it the original local path is used.
# os.path.join(..., '') guarantees a trailing separator, because the loading
# cell builds the file name with plain string concatenation (PATH + 'cmudict-0.7b').
PATH = os.path.join(os.environ.get('CMUDICT_DIR', '/Users/fulop/Downloads/'), '')

In [3]:
# Read the dictionary file, keeping only lines that start with an uppercase
# letter, and split each kept line on the double space into two fields.
# The `with` block closes the file handle as soon as parsing is done.
with open(PATH + 'cmudict-0.7b', encoding='latin1') as cmudict_file:
    lines = [l.strip().split("  ") for l in cmudict_file
             if re.match('^[A-Z]', l)]
# Turn the second field (space-separated phoneme codes) into a list.
lines = [(w, ps.split()) for w, ps in lines]
lines[90]

('ABBREVIATE', ['AH0', 'B', 'R', 'IY1', 'V', 'IY0', 'EY2', 'T'])

In [4]:
# Phoneme vocabulary: "_" sits at index 0, followed by every distinct phoneme
# code found in `lines`, in sorted order.
phonemes = ["_", *sorted({code for _word, codes in lines for code in codes})]
phonemes[:5]

['_', 'AA0', 'AA1', 'AA2', 'AE0']

In [5]:
# Lookup tables from symbol to integer index (the position in its vocabulary).
p2i = {phoneme: index for index, phoneme in enumerate(phonemes)}
# Output alphabet: "_" at index 0, the 26 lowercase letters, then "*" last.
letters = "_abcdefghijklmnopqrstuvwxyz*"
l2i = {letter: index for index, letter in enumerate(letters)}

In [6]:
# Longest spelling (in letters) that is kept.
maxlen = 15
# Map lowercase word -> list of phoneme indices. Only words of 5..maxlen
# characters made up purely of A-Z are kept; anything else is dropped.
pronounce_dict = {
    word.lower(): [p2i[code] for code in codes]
    for word, codes in lines
    if 5 <= len(word) <= maxlen and re.match("^[A-Z]+$", word)
}
len(pronounce_dict)

108006

In [7]:
# Length of the longest phoneme sequence among the kept words.
maxlen_p = max(len(codes) for codes in pronounce_dict.values())

In [8]:
# Shuffle the words, then encode them into two zero-initialised int32 matrices:
# one row of phoneme indices and one row of letter indices per word. Positions
# past the end of a sequence keep the initial 0.
pairs = np.random.permutation(list(pronounce_dict.keys()))
n = len(pairs)
input_ = np.zeros((n, maxlen_p), np.int32)
labels_ = np.zeros((n, maxlen), np.int32)

for i, k in enumerate(pairs):
    codes = pronounce_dict[k]
    input_[i, :len(codes)] = codes
    labels_[i, :len(k)] = [l2i[letter] for letter in k]

In [9]:
# Decoder input: a leading column holding the "*" index, followed by the label
# matrix with its last column dropped (i.e. the labels shifted right by one).
go_token = l2i["*"]
go_column = np.full((n, 1), float(go_token))
dec_input_ = np.hstack([go_column, labels_[:, :-1]])

In [10]:
# Hold out 10% of the rows; the three arrays are split with the same row assignment.
splits = train_test_split(input_, labels_, dec_input_, test_size=0.1)
input_train, input_test = splits[0], splits[1]
labels_train, labels_test = splits[2], splits[3]
dec_input_train, dec_input_test = splits[4], splits[5]

In [11]:
# Vocabulary sizes used to dimension the embedding and softmax layers.
input_vocab_size = len(phonemes)
output_vocab_size = len(letters)


In [12]:
# Extra keyword arguments shared by every fit call: silence the built-in
# progress output and report progress through the notebook callback instead.
parms = dict(verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True)])
# Number of units in each LSTM layer.
dim = 240

In [13]:
def get_rnn(return_sequences= True):
    """Create a new LSTM layer with `dim` units and 10% dropout.

    Uses the Keras 2 argument names: `dropout` (formerly `dropout_W`, applied
    to the inputs), `recurrent_dropout` (formerly `dropout_U`, applied to the
    recurrent state) and `implementation=2` (formerly `consume_less='gpu'`).

    Args:
        return_sequences: if True the layer outputs one vector per timestep,
            otherwise only the output of the last timestep.

    Returns:
        A new, not yet connected `LSTM` layer instance.
    """
    return LSTM(dim, dropout=0.1, recurrent_dropout=0.1,
                implementation=2, return_sequences=return_sequences)

# Basic RNN

In [14]:
# Encoder: embed the phoneme indices, run a bidirectional LSTM over the whole
# sequence, then reduce it to a single vector with a second LSTM.
inp = Input(shape=(maxlen_p,))
embedded = Embedding(input_vocab_size, 120)(inp)
encoded_seq = Bidirectional(get_rnn(return_sequences=True))(embedded)
encoded_vec = get_rnn(return_sequences=False)(encoded_seq)

# Decoder: repeat that vector once per output position, pass it through two
# stacked LSTMs and predict a distribution over letters at every position.
decoded = RepeatVector(maxlen)(encoded_vec)
decoded = get_rnn(return_sequences=True)(decoded)
decoded = get_rnn(return_sequences=True)(decoded)
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(decoded)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [15]:
# Wrap the graph from the phoneme input `inp` to the per-position softmax `x` in a Model.
model = Model(inp, x)

In [16]:
# Integer-coded targets, so use the sparse variant of categorical cross-entropy.
model.compile(optimizer=Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

In [17]:
# Train for 3 epochs. `epochs` is the Keras 2 name of the former `nb_epoch`
# argument. The label arrays get a trailing axis of size 1 before being passed
# as targets; validation data is passed as an (inputs, targets) tuple.
hist = model.fit(input_train, np.expand_dims(labels_train, -1),
                 validation_data=(input_test, np.expand_dims(labels_test, -1)),
                 batch_size=64, epochs=3, **parms)

  This is separate from the ipykernel package so we can avoid doing imports until





In [18]:
def eval_keras(input):
    """Score the global `model` on `input` against the global `labels_test`.

    Returns an `(accuracy, predict)` tuple. `predict` is the argmax over
    axis 2 of the model output; `accuracy` is the fraction of rows for which
    `predict` equals the matching row of `labels_test` in every position.
    """
    probabilities = model.predict(input, batch_size=128)
    predict = probabilities.argmax(axis=2)
    # A row only counts as correct when all of its positions match.
    exact_match = [all(target == guess) for target, guess in zip(labels_test, predict)]
    return np.mean(exact_match), predict

In [19]:
# Exact-match accuracy of the basic model on the held-out phoneme inputs.
acc, preds = eval_keras(input_test)
acc

0.11193408017776132

In [20]:
def print_examples(preds, n_examples=20):
    """Print a table comparing predicted spellings with the true spellings.

    Reads the globals `input_test`, `labels_test`, `phonemes` and `letters`.
    For each shown row the phoneme and letter sequences are cut at the first
    "_" before printing; the "is correct" column compares the full,
    uncut letter sequences.

    Args:
        preds: 2-D array of predicted letter indices, one row per test example.
        n_examples: maximum number of rows to print (default 20). Fewer rows
            are printed when `preds` or the test arrays are shorter.
    """
    print("pronunciation".ljust(40), "real spelling".ljust(17), 
          "model spelling".ljust(17), "is correct")

    # Never index past the shortest of the arrays involved.
    n_rows = min(n_examples, len(preds), len(input_test), len(labels_test))
    for index in range(n_rows):
        ps = "-".join(phonemes[p] for p in input_test[index])
        real = [letters[l] for l in labels_test[index]]
        predict = [letters[l] for l in preds[index]]
        print(ps.split("-_")[0].ljust(40), "".join(real).split("_")[0].ljust(17),
              "".join(predict).split("_")[0].ljust(17), str(real == predict))

In [21]:
# Show the first 20 held-out examples next to the model's spellings.
print_examples(preds)

pronunciation                            real spelling     model spelling    is correct
F-AA1-N-D-N-AH0-S                        fondness          fondons           False
G-EH1-B-ER0-T                            gebert            gebbrt            False
T-EH1-N-IH0-S-AH0-N                      tennison          tennson           False
R-IH0-F-R-IH1-JH-ER0-AH0-N-T             refrigerant       refrggrant        False
Z-IY1-L-AH0-N-D-ER0-Z                    zealanders        seramndrs         False
HH-AH1-F-S-T-IH0-D-AH0-L-ER0             hufstedler        holtiiller        False
T-AE1-M-ER0-AH0-N                        tamarin           tamern            False
P-R-AA1-D-IH0-G-AH0-L                    prodigal          prodigll          False
AH0-N-M-AE1-N-IH0-JH-AH0-B-AH0-L         unmanageable      unmangaabl        False
EH1-N-T-IH0-N                            entin             entin             True
D-IH0-B-IY0-AA1-S-IY0-OW2                dibiasio          debemsto          False


# Attention

In [26]:
# Two inputs: the phoneme sequence and the decoder-side letter sequence.
inp = Input((maxlen_p,))
inp_dec = Input((maxlen,))
# Embed the decoder input, then project it to `dim` features.
emb_dec =  Embedding(output_vocab_size, 120)(inp_dec)
emb_dec =  Dense(dim)(emb_dec)


# Encoder stack: every LSTM here uses get_rnn()'s default return_sequences=True,
# so the per-timestep outputs are what reaches the Attention layer.
x = Embedding(input_vocab_size, 120)(inp)
x = Bidirectional(get_rnn())(x)
x = get_rnn()(x)
x = get_rnn()(x)
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'LSTM' object has no attribute 'get_output_shape_for'".
# Presumably the local `attention` module targets a different Keras API than
# the installed one - confirm and update that module before relying on this cell.
x = Attention(get_rnn, 3)([x, emb_dec])
x = TimeDistributed(Dense(output_vocab_size, activation='softmax'))(x)

  This is separate from the ipykernel package so we can avoid doing imports until


AttributeError: 'LSTM' object has no attribute 'get_output_shape_for'

In [None]:
# The output `x` depends on both the phoneme input and the decoder input,
# so both must be declared as model inputs (in the order used by the fit calls).
model = Model([inp, inp_dec], x)

In [None]:
# Integer-coded targets, so use the sparse variant of categorical cross-entropy.
model.compile(optimizer=Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

In [None]:
# First training pass of the attention model (3 epochs). The model is built
# on two inputs, so both the phoneme array and the decoder-input array are
# supplied, for training and for validation. `epochs` is the Keras 2 name of
# the former `nb_epoch` argument.
hist = model.fit([input_train, dec_input_train], np.expand_dims(labels_train, -1),
                 validation_data=([input_test, dec_input_test], np.expand_dims(labels_test, -1)),
                 batch_size=64, epochs=3, **parms)

In [None]:
# Set the optimizer's learning rate to 1e-4, then train for 5 more epochs.
# `K` is the Keras backend module; `epochs` is the Keras 2 name of the former
# `nb_epoch` argument.
K.set_value(model.optimizer.lr, 1e-4)
hist = model.fit([input_train, dec_input_train], np.expand_dims(labels_train, -1),
                 validation_data=([input_test, dec_input_test], np.expand_dims(labels_test, -1)),
                 batch_size=64, epochs=5, **parms)

In [None]:
# eval_keras requires the model inputs; the attention model takes both the
# phoneme array and the decoder-input array.
# NOTE(review): dec_input_test is built from the true labels shifted by one, so
# this score is computed with the correct previous letters fed to the decoder.
acc, preds = eval_keras([input_test, dec_input_test])
acc