In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import re

dimension = 400
vocab = "EOS abcdefghijklmnopqrstuvwxyz'"
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

def text2idx(text):
    text = re.sub(r'[^a-z ]', '', text.lower()).strip()
    converted = [char2idx[char] for char in text]
    return text, converted

In [3]:
GO = 1
PAD = 0
EOS = 2

In [4]:
import tensorflow as tf
import numpy as np

train_X, train_Y = [], []
text_files = [f for f in os.listdir('spectrogram-train') if f.endswith('.npy')]
for fpath in text_files:
    try:
        splitted = fpath.split('-')
        if len(splitted) == 2:
            splitted[1] = splitted[1].split('.')[1]
            fpath = splitted[0] + '.' + splitted[1]
        with open('data/' + fpath.replace('npy', 'txt')) as fopen:
            text, converted = text2idx(fopen.read())
        w = np.load('spectrogram-train/' + fpath)
        if w.shape[1] != dimension:
            continue
        train_X.append(w)
        train_Y.append(converted)
    except:
        pass

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
test_X, test_Y = [], []
text_files = [f for f in os.listdir('spectrogram-test') if f.endswith('.npy')]
for fpath in text_files:
    with open('data/' + fpath.replace('npy', 'txt')) as fopen:
        text, converted = text2idx(fopen.read())
    w = np.load('spectrogram-test/' + fpath)
    if w.shape[1] != dimension:
        continue
    test_X.append(w)
    test_Y.append(converted)

In [6]:
def encoder_block(inp, n_hidden, filter_size):
    inp = tf.expand_dims(inp, 2)
    inp = tf.pad(inp, [[0, 0], [(filter_size[0]-1)//2, (filter_size[0]-1)//2], [0, 0], [0, 0]])
    conv = tf.layers.conv2d(inp, n_hidden, filter_size, padding="VALID", activation=None)
    conv = tf.squeeze(conv, 2)
    return conv

def glu(x):
    return tf.multiply(x[:, :, :tf.shape(x)[2]//2], tf.sigmoid(x[:, :, tf.shape(x)[2]//2:]))

def layer(inp, conv_block, kernel_width, n_hidden, residual=None):
    z = conv_block(inp, n_hidden, (kernel_width, 1))
    return glu(z) + (residual if residual is not None else 0)

def pad_second_dim(x, desired_size):
    padding = tf.tile([[0]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1]], 0))
    return tf.concat([x, padding], 1)

class Model:
    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate,
        num_features,
        dropout = 1.0,
    ):
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        self.Y = tf.sparse_placeholder(tf.int32)
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.X, -1), 1, dtype = tf.int32
        )
        batch_size = tf.shape(self.X)[0]
        
        def cells(reuse = False):
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size_layers,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )
        def attention(encoder_out, seq_len, reuse=False):
            attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(num_units = size_layers, 
                                                                    memory = encoder_out,
                                                                    memory_sequence_length = seq_len)
            return tf.contrib.seq2seq.AttentionWrapper(
            cell = tf.nn.rnn_cell.MultiRNNCell([cells(reuse) for _ in range(num_layers)]), 
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layers)
        
        encoder_embedded = self.X
        encoder_embedded = tf.layers.conv1d(encoder_embedded, size_layers, 1)
        e = tf.identity(encoder_embedded)
        for i in range(num_layers * 2):
            z = layer(encoder_embedded, encoder_block, 3, size_layers * 2, encoder_embedded)
            encoder_embedded = z
        
        encoder_output, output_memory = z, z + e
        print(encoder_output, output_memory)
        
        init_state = tf.reduce_mean(output_memory,axis=1)
        encoder_state = tuple(tf.nn.rnn_cell.LSTMStateTuple(c=init_state, h=init_state) for _ in range(num_layers))
        main = tf.strided_slice(self.X, [0, 0, 0], [batch_size, -1, num_features], [1, 1, 1])
        decoder_input = tf.concat([tf.fill([batch_size, 1, num_features], 0.0), main], 1)
        decoder_cell = attention(encoder_output, seq_lens)
        dense_layer = tf.layers.Dense(len(vocab))
        
        training_helper = tf.contrib.seq2seq.TrainingHelper(
                inputs = decoder_input,
                sequence_length = seq_lens,
                time_major = False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(
                cell = decoder_cell,
                helper = training_helper,
                initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state),
                output_layer = dense_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(
                decoder = training_decoder,
                impute_finished = True,
                maximum_iterations = tf.reduce_max(seq_lens))
        self.seq_lens = seq_lens
        
        logits = training_decoder_output.rnn_output
        time_major = tf.transpose(logits, [1, 0, 2])
        self.time_major = time_major
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse.to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [7]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 512
learning_rate = 1e-4
num_layers = 2
batch_size = 64
epoch = 20

model = Model(num_layers, size_layers, learning_rate, dimension)
sess.run(tf.global_variables_initializer())

W0830 22:01:10.105256 140353921439552 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead
W0830 22:01:10.115298 140353921439552 deprecation.py:323] From <ipython-input-6-bf2828824d5e>:57: conv1d (from tensorflow.python.layers.convolutional) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
W0830 22:01:10.120007 140353921439552 deprecation.py:506] From /home/husein/.local/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead 

Tensor("add_3:0", shape=(?, ?, 512), dtype=float32) Tensor("add_4:0", shape=(?, ?, 512), dtype=float32)


W0830 22:01:11.372579 140353921439552 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

W0830 22:01:11.675654 140353921439552 deprecation.py:323] From <ipython-input-6-bf2828824d5e>:42: LSTMCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
W0830 22:01:11.681477 140353921439552 deprecation.py:323] From <ipython-input-6-bf2828824d5e>:52: MultiRNNCell.__init__ (from tensorflow.python.ops.rnn_cell_impl) is deprecated and will be removed in a future version.
Instructions for 

In [8]:
train_X = tf.keras.preprocessing.sequence.pad_sequences(
    train_X, dtype = 'float32', padding = 'post'
)

In [9]:
test_X = tf.keras.preprocessing.sequence.pad_sequences(
    test_X, dtype = 'float32', padding = 'post'
)

In [10]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [11]:
from tqdm import tqdm

for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_cost, train_accuracy, test_cost, test_accuracy = [], [], [], []
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        train_cost.append(cost)
        train_accuracy.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        cost, accuracy = sess.run(
            [model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        
        test_cost.append(cost)
        test_accuracy.append(accuracy)
        
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, np.mean(train_cost), 
                                                                      np.mean(train_accuracy)))
    
    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, np.mean(test_cost), 
                                                                    np.mean(test_accuracy)))

minibatch loop: 100%|██████████| 206/206 [02:28<00:00,  1.79it/s, accuracy=0.784, cost=12]  
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.45it/s, accuracy=0.773, cost=13.4]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 1, training avg cost 22.843679, training avg accuracy 0.551524
epoch 1, testing avg cost 13.325702, testing avg accuracy 0.772444


minibatch loop: 100%|██████████| 206/206 [02:26<00:00,  1.80it/s, accuracy=0.784, cost=11.2]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.52it/s, accuracy=0.777, cost=11.8]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 2, training avg cost 11.787023, training avg accuracy 0.776832
epoch 2, testing avg cost 11.816154, testing avg accuracy 0.779041


minibatch loop: 100%|██████████| 206/206 [02:27<00:00,  1.73it/s, accuracy=0.806, cost=9.78]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.46it/s, accuracy=0.771, cost=11.3]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 3, training avg cost 10.717728, training avg accuracy 0.787439
epoch 3, testing avg cost 11.396130, testing avg accuracy 0.776252


minibatch loop: 100%|██████████| 206/206 [02:32<00:00,  1.73it/s, accuracy=0.82, cost=8.86] 
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.49it/s, accuracy=0.775, cost=11.7]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 4, training avg cost 9.914606, training avg accuracy 0.799418
epoch 4, testing avg cost 11.572537, testing avg accuracy 0.781861


minibatch loop: 100%|██████████| 206/206 [02:33<00:00,  1.75it/s, accuracy=0.82, cost=8.25] 
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.51it/s, accuracy=0.773, cost=12.1]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 5, training avg cost 9.083913, training avg accuracy 0.812834
epoch 5, testing avg cost 11.790721, testing avg accuracy 0.781523


minibatch loop: 100%|██████████| 206/206 [02:28<00:00,  1.78it/s, accuracy=0.863, cost=7.26]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.51it/s, accuracy=0.773, cost=12.2]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 6, training avg cost 8.280313, training avg accuracy 0.828048
epoch 6, testing avg cost 12.523357, testing avg accuracy 0.786584


minibatch loop: 100%|██████████| 206/206 [02:29<00:00,  1.75it/s, accuracy=0.827, cost=6.91]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.51it/s, accuracy=0.772, cost=12.2]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 7, training avg cost 7.425717, training avg accuracy 0.842378
epoch 7, testing avg cost inf, testing avg accuracy 0.787262


minibatch loop: 100%|██████████| 206/206 [02:32<00:00,  1.72it/s, accuracy=0.892, cost=6.41]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.49it/s, accuracy=0.779, cost=12.3]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 8, training avg cost 6.640522, training avg accuracy 0.857383
epoch 8, testing avg cost 13.313684, testing avg accuracy 0.791818


minibatch loop: 100%|██████████| 206/206 [02:31<00:00,  1.73it/s, accuracy=0.906, cost=5.14]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.44it/s, accuracy=0.787, cost=12.7]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 9, training avg cost 5.950582, training avg accuracy 0.869927
epoch 9, testing avg cost 13.820772, testing avg accuracy 0.792435


minibatch loop: 100%|██████████| 206/206 [02:34<00:00,  1.72it/s, accuracy=0.871, cost=5.63]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.47it/s, accuracy=0.775, cost=14.4]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 10, training avg cost 5.191116, training avg accuracy 0.884032
epoch 10, testing avg cost 14.773592, testing avg accuracy 0.791521


minibatch loop: 100%|██████████| 206/206 [02:35<00:00,  1.74it/s, accuracy=0.921, cost=4.37]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.49it/s, accuracy=0.793, cost=14.7]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 11, training avg cost 4.592854, training avg accuracy 0.894829
epoch 11, testing avg cost inf, testing avg accuracy 0.792228


minibatch loop: 100%|██████████| 206/206 [02:36<00:00,  1.66it/s, accuracy=0.914, cost=4.35]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.45it/s, accuracy=0.786, cost=16.2]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 12, training avg cost 4.044317, training avg accuracy 0.905290
epoch 12, testing avg cost 16.948906, testing avg accuracy 0.792237


minibatch loop: 100%|██████████| 206/206 [02:37<00:00,  1.65it/s, accuracy=0.921, cost=3.74]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.43it/s, accuracy=0.792, cost=16.8]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 13, training avg cost 3.496344, training avg accuracy 0.916858
epoch 13, testing avg cost 17.670719, testing avg accuracy 0.792578


minibatch loop: 100%|██████████| 206/206 [02:39<00:00,  1.64it/s, accuracy=0.935, cost=3.06]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.40it/s, accuracy=0.796, cost=18.4]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 14, training avg cost 3.089757, training avg accuracy 0.924015
epoch 14, testing avg cost 18.522146, testing avg accuracy 0.799595


minibatch loop: 100%|██████████| 206/206 [02:41<00:00,  1.63it/s, accuracy=0.935, cost=2.92]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.42it/s, accuracy=0.785, cost=18.8]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 15, training avg cost 2.671993, training avg accuracy 0.932817
epoch 15, testing avg cost 19.291397, testing avg accuracy 0.799696


minibatch loop: 100%|██████████| 206/206 [02:43<00:00,  1.62it/s, accuracy=0.942, cost=2.44]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.41it/s, accuracy=0.792, cost=20.6]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 16, training avg cost 2.382796, training avg accuracy 0.938367
epoch 16, testing avg cost 19.683201, testing avg accuracy 0.798279


minibatch loop: 100%|██████████| 206/206 [02:43<00:00,  1.62it/s, accuracy=0.95, cost=2.18] 
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.38it/s, accuracy=0.786, cost=22.3]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 17, training avg cost 2.003694, training avg accuracy 0.947037
epoch 17, testing avg cost 21.430529, testing avg accuracy 0.798072


minibatch loop: 100%|██████████| 206/206 [02:43<00:00,  1.61it/s, accuracy=0.942, cost=2.02]
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.41it/s, accuracy=0.802, cost=21.9]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 18, training avg cost 1.906986, training avg accuracy 0.948546
epoch 18, testing avg cost 21.528790, testing avg accuracy 0.801692


minibatch loop: 100%|██████████| 206/206 [02:44<00:00,  1.63it/s, accuracy=0.942, cost=1.6]  
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.38it/s, accuracy=0.779, cost=23.1]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 19, training avg cost 1.680656, training avg accuracy 0.953059
epoch 19, testing avg cost 24.193068, testing avg accuracy 0.801057


minibatch loop: 100%|██████████| 206/206 [02:44<00:00,  1.63it/s, accuracy=0.935, cost=1.98] 
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.40it/s, accuracy=0.802, cost=26.1]

epoch 20, training avg cost 1.497375, training avg accuracy 0.957022
epoch 20, testing avg cost 25.008738, testing avg accuracy 0.802387





In [12]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
batch_y = sparse_tuple_from(test_Y[random_index : random_index + 1])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: say the word bath
predicted: say the word chash
