In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [2]:
import re

dimension = 400
vocab = "EOS abcdefghijklmnopqrstuvwxyz'"
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

def text2idx(text):
    text = re.sub(r'[^a-z ]', '', text.lower()).strip()
    converted = [char2idx[char] for char in text]
    return text, converted

In [3]:
GO = 1
PAD = 0
EOS = 2

In [4]:
import tensorflow as tf
import numpy as np

train_X, train_Y = [], []
text_files = [f for f in os.listdir('spectrogram-train') if f.endswith('.npy')]
for fpath in text_files:
    try:
        splitted = fpath.split('-')
        if len(splitted) == 2:
            splitted[1] = splitted[1].split('.')[1]
            fpath = splitted[0] + '.' + splitted[1]
        with open('data/' + fpath.replace('npy', 'txt')) as fopen:
            text, converted = text2idx(fopen.read())
        w = np.load('spectrogram-train/' + fpath)
        if w.shape[1] != dimension:
            continue
        train_X.append(w)
        train_Y.append(converted)
    except:
        pass

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
test_X, test_Y = [], []
text_files = [f for f in os.listdir('spectrogram-test') if f.endswith('.npy')]
for fpath in text_files:
    with open('data/' + fpath.replace('npy', 'txt')) as fopen:
        text, converted = text2idx(fopen.read())
    w = np.load('spectrogram-test/' + fpath)
    if w.shape[1] != dimension:
        continue
    test_X.append(w)
    test_Y.append(converted)

In [6]:
class Model:
    def __init__(
        self,
        num_layers,
        size_layer,
        learning_rate,
        num_features,
        dropout = 1.0,
        beam_width=5, force_teaching_ratio=0.5
    ):
        
        def lstm_cell(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size, initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.X_seq_len = tf.reduce_mean(self.X_seq_len, axis = 1)
        self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
        
        batch_size = tf.shape(self.X)[0]
        decoder_embeddings = tf.Variable(tf.random_uniform([len(char2idx), size_layer], -1, 1))
        main = tf.strided_slice(self.Y, [0, 0], [batch_size, -1], [1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1], GO), main], 1)
        
        self.encoder_out = self.X
        print(self.X_seq_len)

        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = lstm_cell(size_layer // 2),
                cell_bw = lstm_cell(size_layer // 2),
                inputs = self.encoder_out,
                sequence_length = self.X_seq_len,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            self.encoder_out = tf.concat((out_fw, out_bw), 2)
            
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        encoder_state = tuple([bi_lstm_state] * num_layers)
        
        print(self.encoder_out, encoder_state)
        
        with tf.variable_scope('decode'):
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
            num_units = size_layer, 
            memory = self.encoder_out,
            memory_sequence_length = self.X_seq_len)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            training_helper = tf.contrib.seq2seq.ScheduledEmbeddingTrainingHelper(
            inputs = tf.nn.embedding_lookup(decoder_embeddings, decoder_input),
                sequence_length = self.Y_seq_len,
                embedding = decoder_embeddings,
                sampling_probability = 1 - force_teaching_ratio,
                time_major = False)
            training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = tf.layers.Dense(len(char2idx)))
            training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(self.Y_seq_len))
            self.training_logits = training_decoder_output.rnn_output
            
        
        with tf.variable_scope('decode', reuse=True):
            encoder_out_tiled = tf.contrib.seq2seq.tile_batch(self.encoder_out, beam_width)
            encoder_state_tiled = tf.contrib.seq2seq.tile_batch(encoder_state, beam_width)
            X_seq_len_tiled = tf.contrib.seq2seq.tile_batch(self.X_seq_len, beam_width)
            attention_mechanism = tf.contrib.seq2seq.LuongAttention(
                num_units = size_layer, 
                memory = encoder_out_tiled,
                memory_sequence_length = X_seq_len_tiled)
            decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer, reuse=True) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
            predicting_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
                cell = decoder_cell,
                embedding = decoder_embeddings,
                start_tokens = tf.tile(tf.constant([GO], dtype=tf.int32), [batch_size]),
                end_token = EOS,
                initial_state = decoder_cell.zero_state(batch_size * beam_width, tf.float32).clone(cell_state = encoder_state_tiled),
                beam_width = beam_width,
                output_layer = tf.layers.Dense(len(char2idx), _reuse=True),
                length_penalty_weight = 0.0)
            predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = predicting_decoder,
                impute_finished = False,
                maximum_iterations = tf.reduce_max(self.X_seq_len))
            self.predicting_ids = predicting_decoder_output.predicted_ids[:, :, 0]
        
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        self.cost = tf.contrib.seq2seq.sequence_loss(logits = self.training_logits,
                                                     targets = self.Y,
                                                     weights = masks)
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost)
        y_t = tf.argmax(self.training_logits,axis=2)
        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.Y, masks)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [7]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 512
learning_rate = 1e-3
num_layers = 2
batch_size = 64
epoch = 20

model = Model(num_layers, size_layers, learning_rate, dimension)
sess.run(tf.global_variables_initializer())

W0830 20:30:26.191031 139661149812544 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0830 20:30:26.227196 139661149812544 deprecation.py:323] From <ipython-input-6-88e88c0e2693>:13: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0830 20:30:26.229180 139661149812544 deprecation.py:323] From <ipython-input-6-88e88c0e2693>:36: bidirectional_dynamic_rnn (from tensorflow.python.ops.rnn) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))

Tensor("Mean:0", shape=(?,), dtype=int32)


W0830 20:30:27.024843 139661149812544 deprecation.py:323] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/rnn.py:244: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Tensor("concat_2:0", shape=(?, ?, 512), dtype=float32) (LSTMStateTuple(c=<tf.Tensor 'concat_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'concat_4:0' shape=(?, 512) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'concat_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'concat_4:0' shape=(?, 512) dtype=float32>))


W0830 20:30:28.160248 139661149812544 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0830 20:30:28.180953 139661149812544 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0830 20:30:28.457204 139661149812544 deprecation.py:323] From <ipython-input-6-88e88c0e2693>:52: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprec

In [8]:
train_X = tf.keras.preprocessing.sequence.pad_sequences(
    train_X, dtype = 'float32', padding = 'post'
)

In [9]:
test_X = tf.keras.preprocessing.sequence.pad_sequences(
    test_X, dtype = 'float32', padding = 'post'
)

In [10]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

In [11]:
from tqdm import tqdm

for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_cost, train_accuracy, test_cost, test_accuracy = [], [], [], []
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y, _ = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y},
        )
        train_cost.append(cost)
        train_accuracy.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y, _ = pad_sentence_batch(y, 0)
        cost, accuracy = sess.run(
            [model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y},
        )
        
        test_cost.append(cost)
        test_accuracy.append(accuracy)
        
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, np.mean(train_cost), 
                                                                      np.mean(train_accuracy)))
    
    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, np.mean(test_cost), 
                                                                    np.mean(test_accuracy)))

minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.94it/s, accuracy=0.871, cost=0.398]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.93it/s, accuracy=0.845, cost=0.485]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 1, training avg cost 0.655583, training avg accuracy 0.794257
epoch 1, testing avg cost 0.488879, testing avg accuracy 0.839991


minibatch loop: 100%|██████████| 206/206 [00:52<00:00,  3.90it/s, accuracy=0.935, cost=0.167]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.66it/s, accuracy=0.843, cost=0.57] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 2, training avg cost 0.281657, training avg accuracy 0.907501
epoch 2, testing avg cost 0.496763, testing avg accuracy 0.856464


minibatch loop: 100%|██████████| 206/206 [00:52<00:00,  3.94it/s, accuracy=0.978, cost=0.0699]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.69it/s, accuracy=0.864, cost=0.625]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 3, training avg cost 0.118196, training avg accuracy 0.962703
epoch 3, testing avg cost 0.531638, testing avg accuracy 0.871761


minibatch loop: 100%|██████████| 206/206 [00:51<00:00,  4.14it/s, accuracy=1, cost=0.0255]    
testing minibatch loop: 100%|██████████| 9/9 [00:00<00:00,  9.25it/s, accuracy=0.859, cost=0.73] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 4, training avg cost 0.052165, training avg accuracy 0.984415
epoch 4, testing avg cost 0.601152, testing avg accuracy 0.872022


minibatch loop: 100%|██████████| 206/206 [00:52<00:00,  3.98it/s, accuracy=0.964, cost=0.139] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.70it/s, accuracy=0.854, cost=0.597]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 5, training avg cost 0.328712, training avg accuracy 0.920891
epoch 5, testing avg cost 0.511296, testing avg accuracy 0.870854


minibatch loop: 100%|██████████| 206/206 [00:51<00:00,  4.17it/s, accuracy=1, cost=0.00741]   
testing minibatch loop: 100%|██████████| 9/9 [00:00<00:00,  9.28it/s, accuracy=0.859, cost=0.723]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 6, training avg cost 0.039436, training avg accuracy 0.988837
epoch 6, testing avg cost 0.580661, testing avg accuracy 0.875055


minibatch loop: 100%|██████████| 206/206 [00:51<00:00,  3.77it/s, accuracy=1, cost=0.00124]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.18it/s, accuracy=0.855, cost=0.823]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 7, training avg cost 0.015785, training avg accuracy 0.995936
epoch 7, testing avg cost 0.639060, testing avg accuracy 0.876876


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.88it/s, accuracy=1, cost=0.00652]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.64it/s, accuracy=0.86, cost=0.831] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 8, training avg cost 0.007906, training avg accuracy 0.998070
epoch 8, testing avg cost 0.692831, testing avg accuracy 0.877201


minibatch loop: 100%|██████████| 206/206 [00:52<00:00,  4.19it/s, accuracy=1, cost=0.00841]    
testing minibatch loop: 100%|██████████| 9/9 [00:00<00:00,  9.08it/s, accuracy=0.859, cost=0.898]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 9, training avg cost 0.010532, training avg accuracy 0.997386
epoch 9, testing avg cost 0.697859, testing avg accuracy 0.876059


minibatch loop: 100%|██████████| 206/206 [00:52<00:00,  3.95it/s, accuracy=0.993, cost=0.0653] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.55it/s, accuracy=0.855, cost=0.854]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 10, training avg cost 0.010709, training avg accuracy 0.997237
epoch 10, testing avg cost 0.696859, testing avg accuracy 0.876878


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.88it/s, accuracy=1, cost=0.00593]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.57it/s, accuracy=0.862, cost=0.889]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 11, training avg cost 0.010798, training avg accuracy 0.997151
epoch 11, testing avg cost 0.727074, testing avg accuracy 0.875874


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.89it/s, accuracy=0.993, cost=0.0095] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.57it/s, accuracy=0.865, cost=0.912]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 12, training avg cost 0.008230, training avg accuracy 0.997800
epoch 12, testing avg cost 0.763999, testing avg accuracy 0.877770


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.88it/s, accuracy=1, cost=0.00225]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.81it/s, accuracy=0.867, cost=0.843]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 13, training avg cost 0.016961, training avg accuracy 0.995333
epoch 13, testing avg cost 0.714068, testing avg accuracy 0.880861


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.88it/s, accuracy=1, cost=0.00109]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.55it/s, accuracy=0.856, cost=0.914]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 14, training avg cost 0.008995, training avg accuracy 0.997587
epoch 14, testing avg cost 0.754256, testing avg accuracy 0.875979


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.94it/s, accuracy=1, cost=0.00035]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.52it/s, accuracy=0.856, cost=1.1]  
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 15, training avg cost 0.013078, training avg accuracy 0.996375
epoch 15, testing avg cost 0.793212, testing avg accuracy 0.880082


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.90it/s, accuracy=1, cost=9.37e-5]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  9.17it/s, accuracy=0.859, cost=1.02] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 16, training avg cost 0.007403, training avg accuracy 0.998159
epoch 16, testing avg cost 0.758150, testing avg accuracy 0.878791


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.88it/s, accuracy=1, cost=8.76e-5]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.64it/s, accuracy=0.867, cost=0.957]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 17, training avg cost 0.002010, training avg accuracy 0.999534
epoch 17, testing avg cost 0.766934, testing avg accuracy 0.883904


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.94it/s, accuracy=1, cost=0.00124]     
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.44it/s, accuracy=0.864, cost=0.985]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 18, training avg cost 0.001095, training avg accuracy 0.999765
epoch 18, testing avg cost 0.770777, testing avg accuracy 0.882391


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  4.12it/s, accuracy=1, cost=0.00445]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  8.69it/s, accuracy=0.862, cost=0.888]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 19, training avg cost 0.012810, training avg accuracy 0.996553
epoch 19, testing avg cost 0.795411, testing avg accuracy 0.875523


minibatch loop: 100%|██████████| 206/206 [00:53<00:00,  3.93it/s, accuracy=1, cost=0.0049]     
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  9.00it/s, accuracy=0.854, cost=0.997]

epoch 20, training avg cost 0.018642, training avg accuracy 0.994988
epoch 20, testing avg cost 0.745505, testing avg accuracy 0.878676





In [12]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
pred = sess.run(model.predicting_ids, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: say the word book
predicted: say the word hotelthtltelth
