In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import re

dimension = 400
vocab = "EOS abcdefghijklmnopqrstuvwxyz'"
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

def text2idx(text):
    text = re.sub(r'[^a-z ]', '', text.lower()).strip()
    converted = [char2idx[char] for char in text]
    return text, converted

In [3]:
GO = 1
PAD = 0
EOS = 2

In [4]:
import tensorflow as tf
import numpy as np

train_X, train_Y = [], []
text_files = [f for f in os.listdir('spectrogram-train') if f.endswith('.npy')]
for fpath in text_files:
    try:
        splitted = fpath.split('-')
        if len(splitted) == 2:
            splitted[1] = splitted[1].split('.')[1]
            fpath = splitted[0] + '.' + splitted[1]
        with open('data/' + fpath.replace('npy', 'txt')) as fopen:
            text, converted = text2idx(fopen.read())
        w = np.load('spectrogram-train/' + fpath)
        if w.shape[1] != dimension:
            continue
        train_X.append(w)
        train_Y.append(converted)
    except:
        pass

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
test_X, test_Y = [], []
text_files = [f for f in os.listdir('spectrogram-test') if f.endswith('.npy')]
for fpath in text_files:
    with open('data/' + fpath.replace('npy', 'txt')) as fopen:
        text, converted = text2idx(fopen.read())
    w = np.load('spectrogram-test/' + fpath)
    if w.shape[1] != dimension:
        continue
    test_X.append(w)
    test_Y.append(converted)

In [14]:
def pad_second_dim(x, desired_size):
    padding = tf.tile([[0]], tf.stack([tf.shape(x)[0], desired_size - tf.shape(x)[1]], 0))
    return tf.concat([x, padding], 1)

class Model:
    def __init__(
        self,
        num_layers,
        size_layer,
        learning_rate,
        num_features,
        dropout = 1.0,
        beam_width=5, force_teaching_ratio=0.5
    ):
        
        def lstm_cell(size, reuse=False):
            return tf.nn.rnn_cell.LSTMCell(size, initializer=tf.orthogonal_initializer(),reuse=reuse)
        
        self.X = tf.placeholder(tf.float32, [None, None, num_features])
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        self.Y = tf.sparse_placeholder(tf.int32)
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.X, -1), 1, dtype = tf.int32
        )
        
        batch_size = tf.shape(self.X)[0]
        self.encoder_out = self.X

        for n in range(num_layers):
            (out_fw, out_bw), (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = lstm_cell(size_layer // 2),
                cell_bw = lstm_cell(size_layer // 2),
                inputs = self.encoder_out,
                sequence_length = seq_lens,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d'%(n))
            self.encoder_out = tf.concat((out_fw, out_bw), 2)
            
        bi_state_c = tf.concat((state_fw.c, state_bw.c), -1)
        bi_state_h = tf.concat((state_fw.h, state_bw.h), -1)
        bi_lstm_state = tf.nn.rnn_cell.LSTMStateTuple(c=bi_state_c, h=bi_state_h)
        encoder_state = tuple([bi_lstm_state] * num_layers)
        
        attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
            num_units = size_layer, 
            memory = self.encoder_out,
            memory_sequence_length = seq_lens)
        
        decoder_cell = tf.contrib.seq2seq.AttentionWrapper(
                cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell(size_layer) for _ in range(num_layers)]),
                attention_mechanism = attention_mechanism,
                attention_layer_size = size_layer)
        
        initial_state = decoder_cell.zero_state(batch_size, tf.float32).clone(cell_state=encoder_state)
        
        f, _ = tf.nn.dynamic_rnn(
            decoder_cell,
            self.X,
            sequence_length=seq_lens,
            initial_state=initial_state,
            dtype=tf.float32)
        
        logits = tf.layers.dense(f, len(vocab))
        time_major = tf.transpose(logits, [1, 0, 2])
        decoded, log_prob = tf.nn.ctc_greedy_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse.to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [15]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

size_layers = 512
learning_rate = 1e-3
num_layers = 2
batch_size = 64
epoch = 20

model = Model(num_layers, size_layers, learning_rate, dimension)
sess.run(tf.global_variables_initializer())

W0830 21:39:23.183917 140344873588544 deprecation.py:323] From <ipython-input-14-880cfd5c541b>:64: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0830 21:39:23.221365 140344873588544 deprecation.py:323] From <ipython-input-14-880cfd5c541b>:67: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `tf.cast` instead.


In [16]:
train_X = tf.keras.preprocessing.sequence.pad_sequences(
    train_X, dtype = 'float32', padding = 'post'
)

In [17]:
test_X = tf.keras.preprocessing.sequence.pad_sequences(
    test_X, dtype = 'float32', padding = 'post'
)

In [18]:
def pad_sentence_batch(sentence_batch, pad_int):
    padded_seqs = []
    seq_lens = []
    max_sentence_len = max([len(sentence) for sentence in sentence_batch])
    for sentence in sentence_batch:
        padded_seqs.append(sentence + [pad_int] * (max_sentence_len - len(sentence)))
        seq_lens.append(len(sentence))
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64)

    return indices, values, shape

In [19]:
from tqdm import tqdm

for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    train_cost, train_accuracy, test_cost, test_accuracy = [], [], [], []
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        y = train_Y[i : min(i + batch_size, len(train_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        train_cost.append(cost)
        train_accuracy.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        cost, accuracy = sess.run(
            [model.cost, model.accuracy],
            feed_dict = {model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        
        test_cost.append(cost)
        test_accuracy.append(accuracy)
        
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, np.mean(train_cost), 
                                                                      np.mean(train_accuracy)))
    
    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, np.mean(test_cost), 
                                                                    np.mean(test_accuracy)))

minibatch loop: 100%|██████████| 206/206 [01:06<00:00,  3.19it/s, accuracy=0.77, cost=11.7] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  6.69it/s, accuracy=0.773, cost=11.7]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 1, training avg cost 16.363361, training avg accuracy 0.698916
epoch 1, testing avg cost 11.629887, testing avg accuracy 0.774970


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  3.19it/s, accuracy=0.799, cost=9.03]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.21it/s, accuracy=0.792, cost=10.5]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 2, training avg cost 10.210957, training avg accuracy 0.798453
epoch 2, testing avg cost 10.023038, testing avg accuracy 0.813071


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  3.18it/s, accuracy=0.871, cost=6.03]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.22it/s, accuracy=0.815, cost=10.1]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 3, training avg cost 7.720115, training avg accuracy 0.838868
epoch 3, testing avg cost 9.819579, testing avg accuracy 0.821520


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  3.15it/s, accuracy=0.935, cost=3.6] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.17it/s, accuracy=0.815, cost=11.4]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 4, training avg cost 4.740903, training avg accuracy 0.892283
epoch 4, testing avg cost 10.699003, testing avg accuracy 0.830354


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  3.19it/s, accuracy=0.957, cost=2.17]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.36it/s, accuracy=0.82, cost=13.9] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 5, training avg cost 3.061362, training avg accuracy 0.929911
epoch 5, testing avg cost 11.718001, testing avg accuracy 0.841515


minibatch loop: 100%|██████████| 206/206 [01:04<00:00,  3.29it/s, accuracy=0.964, cost=2.04] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.34it/s, accuracy=0.82, cost=17.1] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 6, training avg cost 1.437098, training avg accuracy 0.967344
epoch 6, testing avg cost 13.232852, testing avg accuracy 0.846709


minibatch loop: 100%|██████████| 206/206 [01:02<00:00,  3.40it/s, accuracy=0.986, cost=0.744]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.82it/s, accuracy=0.83, cost=18.9] 
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 7, training avg cost 0.810221, training avg accuracy 0.982241
epoch 7, testing avg cost 15.204712, testing avg accuracy 0.848272


minibatch loop: 100%|██████████| 206/206 [01:14<00:00,  2.73it/s, accuracy=0.971, cost=0.898]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  5.38it/s, accuracy=0.831, cost=17]  
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 8, training avg cost 0.599719, training avg accuracy 0.986627
epoch 8, testing avg cost 16.227919, testing avg accuracy 0.846889


minibatch loop: 100%|██████████| 206/206 [01:15<00:00,  2.81it/s, accuracy=1, cost=0.139]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  5.27it/s, accuracy=0.845, cost=18.6]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 9, training avg cost 0.447315, training avg accuracy 0.990221
epoch 9, testing avg cost 15.750239, testing avg accuracy 0.854385


minibatch loop: 100%|██████████| 206/206 [01:15<00:00,  2.78it/s, accuracy=0.993, cost=1.6]   
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  5.28it/s, accuracy=0.845, cost=22.4]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 10, training avg cost 0.277357, training avg accuracy 0.994148
epoch 10, testing avg cost 17.481514, testing avg accuracy 0.856499


minibatch loop: 100%|██████████| 206/206 [01:15<00:00,  2.80it/s, accuracy=0.993, cost=0.221] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  5.25it/s, accuracy=0.822, cost=21.2]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 11, training avg cost 0.383343, training avg accuracy 0.991403
epoch 11, testing avg cost 17.036907, testing avg accuracy 0.852053


minibatch loop: 100%|██████████| 206/206 [01:06<00:00,  3.47it/s, accuracy=1, cost=0.255]     
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.30it/s, accuracy=0.826, cost=25]  
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 12, training avg cost 0.282495, training avg accuracy 0.994451
epoch 12, testing avg cost 18.421242, testing avg accuracy 0.851324


minibatch loop: 100%|██████████| 206/206 [01:02<00:00,  3.46it/s, accuracy=1, cost=0.0514]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.64it/s, accuracy=0.847, cost=22.9]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 13, training avg cost 0.292280, training avg accuracy 0.993977
epoch 13, testing avg cost 18.684343, testing avg accuracy 0.855421


minibatch loop: 100%|██████████| 206/206 [01:02<00:00,  3.25it/s, accuracy=1, cost=0.0432]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.40it/s, accuracy=0.827, cost=22.6]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 14, training avg cost 0.110310, training avg accuracy 0.998080
epoch 14, testing avg cost 18.044865, testing avg accuracy 0.853792


minibatch loop: 100%|██████████| 206/206 [01:02<00:00,  3.43it/s, accuracy=1, cost=0.163]     
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.65it/s, accuracy=0.833, cost=22.3]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 15, training avg cost 0.061842, training avg accuracy 0.998794
epoch 15, testing avg cost 19.036623, testing avg accuracy 0.854991


minibatch loop: 100%|██████████| 206/206 [01:03<00:00,  3.19it/s, accuracy=1, cost=0.0191]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.45it/s, accuracy=0.837, cost=23.1]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 16, training avg cost 0.192031, training avg accuracy 0.995593
epoch 16, testing avg cost 18.769970, testing avg accuracy 0.854253


minibatch loop: 100%|██████████| 206/206 [01:04<00:00,  3.31it/s, accuracy=0.978, cost=0.347] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.42it/s, accuracy=0.833, cost=25.5]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 17, training avg cost 0.276063, training avg accuracy 0.994664
epoch 17, testing avg cost 21.280766, testing avg accuracy 0.845839


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  3.33it/s, accuracy=1, cost=0.213]     
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  6.91it/s, accuracy=0.842, cost=18.1]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 18, training avg cost 0.727623, training avg accuracy 0.984439
epoch 18, testing avg cost 16.612070, testing avg accuracy 0.859568


minibatch loop: 100%|██████████| 206/206 [01:05<00:00,  2.95it/s, accuracy=0.986, cost=0.299] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.27it/s, accuracy=0.853, cost=19.8]
minibatch loop:   0%|          | 0/206 [00:00<?, ?it/s]

epoch 19, training avg cost 0.112676, training avg accuracy 0.997748
epoch 19, testing avg cost 17.484631, testing avg accuracy 0.858974


minibatch loop: 100%|██████████| 206/206 [01:02<00:00,  3.46it/s, accuracy=1, cost=0.0359]    
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  7.38it/s, accuracy=0.857, cost=20.2]

epoch 20, training avg cost 0.038571, training avg accuracy 0.999130
epoch 20, testing avg cost 17.538042, testing avg accuracy 0.863558





In [20]:
import random

random_index = random.randint(0, len(test_X) - 1)
batch_x = test_X[random_index : random_index + 1]
print(
    'real:',
    ''.join(
        [idx2char[no] for no in test_Y[random_index : random_index + 1][0]]
    ),
)
batch_y = sparse_tuple_from(test_Y[random_index : random_index + 1])
pred = sess.run(model.preds, feed_dict = {model.X: batch_x})[0]
print('predicted:', ''.join([idx2char[no] for no in pred]))

real: say the word five
predicted: say the word luve
