## Imports

In [187]:
import numpy as np
import tqdm
from tqdm import tqdm_notebook as tqdm
import math
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Masking, Dense
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import RMSprop
from keras.layers import Bidirectional, GRU

## Loading data

$\textit{input_data}$ is a 2-dementional array. $\textit{input_data[0]}$ includes all graphemes and $\textit{input_data[1]}$ includes all phonemes. 

In [5]:
text = open('train.txt', 'r')
input_data = [[], []]
g2p_dict, p2g_dict = {}, {}
for lines in text:
    line = lines.split()
    grapheme = line[0]
    for phoneme in line[1:]:
        g2p_dict.update({grapheme : phoneme})
        p2g_dict.update({phoneme : grapheme})
        input_data[0].append(grapheme)
        input_data[1].append(phoneme)
#input_data = np.array(input_data)

In [6]:
n = 5
print('First %d example data:' %n)
print(' \t'.join(input_data[0][:n]))
print(' \t'.join(input_data[1][:n]))

First 5 example data:
LEMIEUX 	MINDING 	STRIPED 	KEN 	CONFERENCE
L_AH_M_Y_UW 	M_AY_N_D_IH_NG 	S_T_R_AY_P_T 	K_EH_N 	K_AA_N_F_ER_AH_N_S


## Preparing data

In [39]:
def makeVocabularySet(data, sep = False):
    vocab = {}
    sequences = [] 
    max_seq_len = 0
    i = 0
    for rows in data:
        if sep:
            rows = rows.split('_')
        if len(rows) > max_seq_len:
            max_seq_len = len(rows)
        for c in rows:
            if c in vocab.keys():
                continue
            vocab.update({c : i})
            i += 1
    rev_vocab = dict((v, k) for (k, v) in vocab.items())
    return max_seq_len, vocab, rev_vocab
graph_max_seq_len, grapheme_encoder, grapheme_decoder = makeVocabularySet(input_data[0])
phone_max_seq_len, phoneme_encoder, phoneme_decoder = makeVocabularySet(input_data[1], True)
print(graph_max_seq_len, phone_max_seq_len)

34 32


Add $\textit{go}$ and $\textit{end}$ tokens:

In [40]:
# Не заупскай этот код дважды
def add_token(vocab, rev_vocab, token):
    n = len(vocab)
    vocab.update({token : n})
    rev_vocab.update({n : token})
add_token(phoneme_encoder, phoneme_decoder, '<go>')
add_token(phoneme_encoder, phoneme_decoder, '<end>')
add_token(grapheme_encoder, grapheme_decoder, '<end>')
print('ПРОВЕРКА', len(phoneme_decoder), len(phoneme_encoder))


ПРОВЕРКА 41 41


In [41]:
print(phoneme_decoder[40])
print(grapheme_decoder[28])

<end>
<end>


In [42]:
num_grapheme = len(grapheme_encoder)
num_phoneme = len(phoneme_encoder)
graphemes = input_data[0]
phonemes = input_data[1]

In [43]:
print(type(graphemes[0]))

<class 'str'>


In [87]:
print(type(graphemes[0]))

<class 'str'>


In [56]:
def encode_sequence(data, vocab, split = False):
    encoded = []
    for rows in data:
        if split:
            rows = '<go>_' + rows   #  add go-token (for phonemes only)
            rows = rows.split('_')
        tmp = list(map(lambda x: vocab[x], rows))  
        tmp.append(vocab['<end>'])   # add end-token
        encoded.append(tmp)
    return np.array(encoded)
encoded_graphemes = encode_sequence(graphemes, grapheme_encoder)
graph_max_seq_len += 1
encoded_phonemes = encode_sequence(phonemes, phoneme_encoder, True)
phone_max_seq_len += 2
print('Encoded graphemes:')
print(' \t\t'.join(graphemes[:4]))
print(''.join(map(str, encoded_graphemes[:4])))

print('Encoded phonemes:')
print(' \t\t'.join(phonemes[:4]))
print(''.join(map(str, encoded_phonemes[:4])))

Encoded graphemes:
LEMIEUX 		MINDING 		STRIPED 		KEN
[0, 1, 2, 3, 1, 4, 5, 28][2, 3, 6, 7, 3, 6, 8, 28][9, 10, 11, 3, 12, 1, 7, 28][13, 1, 6, 28]
Encoded phonemes:
L_AH_M_Y_UW 		M_AY_N_D_IH_NG 		S_T_R_AY_P_T 		K_EH_N
[39, 0, 1, 2, 3, 4, 40][39, 2, 5, 6, 7, 8, 9, 40][39, 10, 11, 12, 5, 13, 11, 40][39, 14, 15, 6, 40]


In [57]:
print(max(map(len, encoded_graphemes)), graph_max_seq_len)

35 35


In [58]:
print(np.array(encoded_graphemes[0]).shape)

(8,)


In [68]:
def padding(data, vocab):
    padded = []
    max_len = max(graph_max_seq_len, phone_max_seq_len)
    for row in data:
        add = [vocab['<end>'] for i in range(max_len - len(row))]
        padded.append(row + add)
    return padded
padded_graphemes = padding(encoded_graphemes, grapheme_encoder)
padded_phonemes = padding(encoded_phonemes, phoneme_encoder)
print('Padded grapheme:')
print(' \t\t'.join(graphemes[:1]))
print(''.join(map(str, padded_graphemes[:1])))

print('Padded phoneme:')
print(' \t\t'.join(phonemes[:1]))
print(''.join(map(str, padded_phonemes[:1])))

Padded grapheme:
LEMIEUX
[0, 1, 2, 3, 1, 4, 5, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28]
Padded phoneme:
L_AH_M_Y_UW
[39, 0, 1, 2, 3, 4, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40]


In [151]:
def vectorization(data, vocab):
    shp = [len(data), len(data[0]), len(vocab)]
    train_data = np.zeros((shp[0], shp[1], shp[2]), dtype=np.int)
    for i in range(shp[0]):
        j = 0
        for k in data[i]:
            train_data[i][j][k] = 1
            j += 1
    i += 1
    return train_data
X_train = vectorization(padded_graphemes, grapheme_encoder)
y_train = vectorization(padded_phonemes, phoneme_encoder)

In [77]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (89056, 35, 29)
y_train shape: (89056, 35, 41)


In [152]:
print(padded_graphemes[0])

[0, 1, 2, 3, 1, 4, 5, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28]


In [153]:
print(X_train[0][0])

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


## Training (by Keras)

In [188]:
outs = y_train.shape[2]
outs1 = y_train.shape[1]
max_len, feats = X_train.shape[1], X_train.shape[2]
hidden_l = 64
batch_size = 100
epochs = 1

model = Sequential()
model.add(Masking(mask_value=0., input_shape=(max_len, feats)))
model.add(LSTM(hidden_l, return_sequences=True, activation='tanh'))
model.add(LSTM(hidden_l, return_sequences=True, activation='tanh', go_backwards=True))
model.add(GRU(hidden_l, return_sequences=True))
model.add(GRU(hidden_l, return_sequences=True, go_backwards=True))
model.add(TimeDistributed(Dense(256, activation='relu')))
model.add(TimeDistributed(Dense(outs, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer="rmsprop")

In [None]:
model.fit(X_train,y_train, batch_size=batch_size, epochs=epochs, verbose=1)

Epoch 1/1

### Test model

In [180]:
def word_to_vector(word):
    encoded = encode_sequence(word, grapheme_encoder)
    padded = padding(encoded, grapheme_encoder)
    return vectorization(padded, grapheme_encoder)
words = ['HELLO', 'CONFERENCE', 'ELEVEN']
x = word_to_vector(words)
print(x.shape)

(3, 35, 29)


In [181]:
# x = X_train[0]
# n = X_train.shape[1]
# x = x.reshape((1, n, -1))
print(x.shape)
pred = model.predict(x, verbose=1)

(3, 35, 29)


In [184]:
decode = lambda x : phoneme_decoder[np.argmax(x)]
print('_'.join(map(decode, pred[1])))


<go>_K_N_N_AH_L_<end>_AH_N_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>_<end>


In [161]:
print(pred[0].shape)
print()

(35, 41)


### Trying to use seq2seq-model

attempt failed

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
def build_graph(gra_vocab_size, pho_vocab_size, embedding_dim=10, num_hidden_units=50, learning_rate=0.001,
                use_dropout=False, keep_prob=0.8):
    # Placeholders for data input
    with tf.variable_scope("data_input_placeholders") as data_input_placeholders_scope:
        # input in batch-major format: batch_size x g_seq_len
        gra_inputs = tf.placeholder(tf.int32, (None, None), name='grapheme_inputs')
        # variable length grapheme sequences with shape batch_size
        gra_input_lens = tf.placeholder(tf.int32, (None), name='grapheme_seq_lengths')

        # output of decoder will be the phonemes also with shape batch_size x p_seq_len
        dec_pho_inputs = tf.placeholder(tf.int32, (None, None), name='phoneme_decoder_inputs')
        # variable length phoneme sequences with shape batch_size
        dec_pho_inputs_lens = tf.placeholder(tf.int32, (None), name='phoneme_decoder_input_lengths')

        # labels (teacher forcing) with shape batch_size x p_seq_len
        pho_labels = tf.placeholder(tf.int32, (None, None), name='phoneme_labels')

    # Embedding layers
    with tf.variable_scope("embeddings") as embedding_scope:
        gra_embeddings = tf.Variable(tf.random_uniform([gra_vocab_size, embedding_dim], -1.0, 1.0), dtype=tf.float32,
                                     name='grapheme_embedding')
        # gra_inputs_embedded: [batch_size, time_step, embedding_dim] -> batch major format
        gra_inputs_embedded = tf.nn.embedding_lookup(gra_embeddings, gra_inputs)

        pho_embeddings = tf.Variable(tf.random_uniform([pho_vocab_size, embedding_dim], -1.0, 1.0), dtype=tf.float32,
                                     name='phoneme_embedding')
        # pho_output_embedded: [batch_size, time_step, embedding_dim] -> batch major format
        dec_pho_inputs_embedded = tf.nn.embedding_lookup(pho_embeddings, dec_pho_inputs)

    # create encoder and decoder LSTMs
    with tf.variable_scope("encoding") as encoding_scope:
        lstm_enc = tfc.rnn.BasicLSTMCell(num_hidden_units)

        # Dropout (= 1 - keep_prob)
        if use_dropout:
            dropout = 1 - keep_prob
            if dropout < 0.0:
                dropout = .2
                keep_prob = 1.0 - dropout
            lstm_enc = tf.contrib.rnn.DropoutWrapper(cell=lstm_enc, input_keep_prob=keep_prob)

        _, last_state = tf.nn.dynamic_rnn(lstm_enc, inputs=gra_inputs_embedded, sequence_length=gra_input_lens,
                                          dtype=tf.float32)

    with tf.variable_scope("decoding") as decoding_scope:
        # encoder initial state is last_state of encoder
        lstm_dec = tfc.rnn.BasicLSTMCell(num_hidden_units)

        # Dropout (= 1 - keep_prob)
        if use_dropout:
            dropout = 1 - keep_prob
            if dropout < 0.0:
                dropout = .2
                keep_prob = 1.0 - dropout
            lstm_dec = tf.contrib.rnn.DropoutWrapper(cell=lstm_enc, input_keep_prob=keep_prob)

        dec_outputs, _ = tf.nn.dynamic_rnn(lstm_dec, inputs=dec_pho_inputs_embedded,
                                           sequence_length=dec_pho_inputs_lens,
                                           initial_state=last_state, dtype=tf.float32)

    # output projection
    with tf.name_scope("output_projection"):
        logits = tfc.layers.fully_connected(dec_outputs, num_outputs=pho_vocab_size, activation_fn=tf.nn.softmax)
    """
        weights = tf.Variable(tf.random_uniform([num_hidden_units, pho_vocab_size], -0.01, 0.01, dtype=tf.float32))
        b = tf.Variable(tf.random_uniform([pho_vocab_size], -0.01, 0.01, dtype=tf.float32))
        predictions = tf.add(tf.matmul(dec_outputs, weights), b)
    """

    logits_argmax = tf.argmax(logits, axis=-1)

    with tf.name_scope("optimization"):
        # Loss function
        # TODO

        # get dynamic batch_size
        batch_size = tf.shape(gra_inputs)[0]
        # get dynamic output seq len
        pho_output_len = tf.shape(dec_pho_inputs)[0]

        loss = tfc.seq2seq.sequence_loss(logits, pho_labels, tf.ones([batch_size, pho_output_len]),
                                         average_across_batch=True, average_across_timesteps=True)
        tf.summary.scalar('loss', loss)

        optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(loss)

    return gra_inputs, gra_input_lens, dec_pho_inputs, dec_pho_inputs_lens, pho_labels, optimizer, loss, logits, logits_argmax




In [10]:
grapheme_sequences_train, grapheme_sequences_test, phoneme_sequences_train, phoneme_sequences_test = train_test_split(
        encoded_graphemes, encoded_phonemes)

In [11]:
learning_rate = 0.01
num_hidden_units = 128
embedding_dim = 100
epochs = 2
batch_size = 1
use_dropout = True
keep_prob = 0.8


In [12]:
import tensorflow as tf
import tensorflow.contrib as tfc
gra_inputs, gra_input_lens, dec_pho_inputs, dec_pho_input_lens, pho_labels, optimizer, loss, logits, logits_argmax = build_graph(
        num_grapheme,
        num_phoneme,
        embedding_dim=embedding_dim,
        num_hidden_units=num_hidden_units,
        learning_rate=learning_rate,
        use_dropout=use_dropout,
        keep_prob=keep_prob,
    )

In [13]:
saver = tf.train.Saver()
init_op = tf.global_variables_initializer()
summary_merge_op = tf.summary.merge_all()


In [22]:
import logging
import os
import sys
import time
def generate_batch_data(gra_seqs, pho_seqs, batch_size=1):
    start = 0
    shuffle = np.random.permutation(len(pho_seqs))
    gra_seqs = gra_seqs[shuffle]
    pho_seqs = pho_seqs[shuffle]
    while start + batch_size <= len(gra_seqs):
        enc_inputs = []
        enc_input_lens = []
        dec_inputs = []
        dec_input_lens = []
        labels = []
        for g in gra_seqs[start:start + batch_size]:
            enc_inputs.append(g)
            enc_input_lens.append(len(g))
        for p in pho_seqs[start:start + batch_size]:
            # dec_inputs doesn't contain last char -> end_token
            dec_inputs.append(p[:-1])
            # since pho_seqs have an appended <GO> token, the length has to be decreased by 1
            dec_input_lens.append(len(p) - 1)
            # labels doesn't contain first char -> go_token
            labels.append(p[1:])

        enc_inputs = np.array(enc_inputs)
        enc_input_lens = np.array(enc_input_lens)
        dec_inputs = np.array(dec_inputs)
        dec_input_lens = np.array(dec_input_lens)

        labels = np.array(labels)

        yield enc_inputs, enc_input_lens, dec_inputs, dec_input_lens, labels
        start += batch_size
with tf.Session() as sess:
    sess.run(init_op)
    writer = tf.summary.FileWriter('Phoneme', graph=sess.graph)
    for epoch_i in range(epochs):
        accuracies = []
        batch_losses = []
        print("Epoch {:3}: ".format(epoch_i))
        start = time.time()
        for batch_i, (
                input_batch, input_lens_batch, dec_input_batch, dec_input_lens_batch, label_batch) in enumerate(
            generate_batch_data(grapheme_sequences_train, phoneme_sequences_train, batch_size)):
            # build feed dict
            f_dict = {gra_inputs: input_batch,
                      gra_input_lens: input_lens_batch,
                      dec_pho_inputs: dec_input_batch,
                      dec_pho_input_lens: dec_input_lens_batch,
                      pho_labels: label_batch}

            _, batch_loss, batch_logits, batch_logits_argmax = sess.run([optimizer, loss, logits, logits_argmax])
            batch_accuracy = np.mean(batch_logits.argmax(axis=-1) == label_batch)
            accuracies.append(batch_accuracy)
            batch_losses.append(batch_loss)

Epoch   0: 


InvalidArgumentError: Shape [-1,-1] has negative dimensions
	 [[Node: data_input_placeholders/grapheme_inputs = Placeholder[dtype=DT_INT32, shape=[?,?], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]

Caused by op 'data_input_placeholders/grapheme_inputs', defined at:
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/runpy.py", line 170, in _run_module_as_main
    "__main__", mod_spec)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-bfd994f103df>", line 10, in <module>
    keep_prob=keep_prob,
  File "<ipython-input-9-87b3d2b8e28a>", line 6, in build_graph
    gra_inputs = tf.placeholder(tf.int32, (None, None), name='grapheme_inputs')
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tensorflow/python/ops/array_ops.py", line 1530, in placeholder
    return gen_array_ops._placeholder(dtype=dtype, shape=shape, name=name)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tensorflow/python/ops/gen_array_ops.py", line 1954, in _placeholder
    name=name)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Shape [-1,-1] has negative dimensions
	 [[Node: data_input_placeholders/grapheme_inputs = Placeholder[dtype=DT_INT32, shape=[?,?], _device="/job:localhost/replica:0/task:0/cpu:0"]()]]


### LSTM-model (by Keras)