## Make sure you have already run

1. [wav2vec-preprocessing.ipynb](wav2vec-preprocessing.ipynb)
2. [wav2vec.ipynb](wav2vec.ipynb)

In [1]:
import os
# Restrict CUDA to the device with index 1; this is set before TensorFlow is imported in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import librosa
import tensorflow as tf
import glob
import numpy as np

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code, so only load pickles you produced yourself
# (presumably these come from wav2vec-preprocessing.ipynb — confirm).
# Despite the names, X holds the training split and Y the test split (not inputs vs. labels).
with open('train-wav.pkl', 'rb') as fopen:
    X = pickle.load(fopen)
    
with open('test-wav.pkl', 'rb') as fopen:
    Y = pickle.load(fopen)

In [4]:
# Peek at the first training wav path to see the file-naming scheme.
X['x'][0]

'augment/OAF_boat_happy-3.wav'

In [5]:
import re

# Output alphabet; a symbol's position in this string is its class index.
# text2idx below only ever emits the indices of the space and of a-z.
vocab = "ES abcdefghijklmnopqrstuvwxyz'"
idx2char = dict(enumerate(vocab))
char2idx = {symbol: position for position, symbol in idx2char.items()}


def text2idx(text):
    """Normalise a transcript and encode it as vocabulary indices.

    The text is lowercased, every character other than a-z and the space is
    dropped, and leading/trailing whitespace is stripped.

    Returns:
        A ``(cleaned_text, indices)`` tuple where ``indices[i]`` is the
        position of ``cleaned_text[i]`` in ``vocab``.
    """
    cleaned = re.sub(r'[^a-z ]', '', text.lower()).strip()
    return cleaned, [char2idx[symbol] for symbol in cleaned]

In [6]:
# Build the label sequences for the training split: each wav path in X['x'] is
# mapped to a transcript file under data/ that shares its base name.
train_Y = []
for fpath in X['x']:
    # Keep only the file name, e.g. 'augment/OAF_boat_happy-3.wav' -> 'OAF_boat_happy-3.wav'.
    fname = os.path.basename(fpath)
    splitted = fname.split('-')
    if len(splitted) == 2:
        # Drop the '-<n>' suffix (presumably an augmentation index — confirm) so the
        # file resolves to the transcript of the un-suffixed recording.
        fname = splitted[0] + '.' + splitted[1].split('.')[1]
    # Swap only the extension; str.replace('wav', 'txt') would also rewrite a
    # 'wav' substring occurring elsewhere in the file name.
    with open('data/' + os.path.splitext(fname)[0] + '.txt') as fopen:
        text, converted = text2idx(fopen.read())
    train_Y.append(converted)

train_X = X['X']

In [7]:
# Sanity check: there must be exactly one label sequence per training example.
len(train_X), len(train_Y)

(16341, 16341)

In [8]:
# Build the label sequences for the test split: each wav path in Y['y'] is
# mapped to a transcript file under data/ that shares its base name.
test_Y = []
for fpath in Y['y']:
    # Keep only the file name, discarding any directory components.
    fname = os.path.basename(fpath)
    splitted = fname.split('-')
    if len(splitted) == 2:
        # Drop the '-<n>' suffix (presumably an augmentation index — confirm) so the
        # file resolves to the transcript of the un-suffixed recording.
        fname = splitted[0] + '.' + splitted[1].split('.')[1]
    # Swap only the extension; str.replace('wav', 'txt') would also rewrite a
    # 'wav' substring occurring elsewhere in the file name.
    with open('data/' + os.path.splitext(fname)[0] + '.txt') as fopen:
        text, converted = text2idx(fopen.read())
    test_Y.append(converted)

test_X = Y['Y']

In [9]:
# Sanity check: there must be exactly one label sequence per test example.
len(test_X), len(test_Y)

(560, 560)

In [10]:
# Feature-encoder conv stack consumed by Model; each tuple is (filters, kernel_size, stride).
features = [(512, 10, 5), (512, 8, 4), (512, 8, 4), (512, 4, 2), 
            (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)]
# Aggregator conv stack consumed by Model; same tuple layout, every layer has stride 1.
aggs = [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), 
 (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]
# NOTE(review): num_negatives and prediction_steps are not referenced by any code visible
# in this notebook — presumably left over from the wav2vec pre-training setup; confirm.
num_negatives = 10
prediction_steps = 12
# Reassigned to 1e-5 in the session-setup cell before the model is built.
learning_rate = 1e-6

In [11]:
import math

def pad_second_dim(x, desired_size):
    """Right-pad a 2-D integer tensor with zeros along axis 1 up to `desired_size`.

    `desired_size` must be at least the current size of axis 1; the padding is
    built by tiling an integer zero, so `x` has to be an integer tensor that can
    be concatenated with it.
    """
    rows = tf.shape(x)[0]
    missing_cols = desired_size - tf.shape(x)[1]
    zeros = tf.tile([[0]], tf.stack([rows, missing_cols], 0))
    return tf.concat([x, zeros], 1)

def layer_norm(inputs, epsilon=1e-8):
    """Normalise `inputs` over the last axis, then apply a learned scale and shift.

    Two variables, 'gamma' (initialised to ones) and 'beta' (initialised to
    zeros), are obtained through tf.get_variable in the current variable scope;
    both are sized to the last dimension of `inputs`. `epsilon` is added to the
    variance before the square root for numerical stability.
    """
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    centred = inputs - mean
    normalized = centred / tf.sqrt(variance + epsilon)
    last_dim = inputs.get_shape()[-1:]
    scale = tf.get_variable('gamma', last_dim, tf.float32, tf.ones_initializer())
    shift = tf.get_variable('beta', last_dim, tf.float32, tf.zeros_initializer())
    return scale * normalized + shift


def cnn_block(x, hidden_dim, kernel_size, strides):
    """One encoder layer: 1-D convolution, then layer norm, then ReLU.

    `hidden_dim` is the number of convolution filters; `kernel_size` and
    `strides` are forwarded to tf.layers.conv1d unchanged.
    """
    convolved = tf.layers.conv1d(
        inputs = x,
        filters = hidden_dim,
        kernel_size = kernel_size,
        strides = strides,
    )
    return tf.nn.relu(layer_norm(convolved))

def cnn_aggregator(x, hidden_dim, kernel_size, strides):
    """One aggregator layer: zero-pad the front of axis 1, then conv1d -> layer norm -> ReLU.

    The padding is `kernel_size - 1` steps long and `hidden_dim` channels wide,
    so the last dimension of `x` must already equal `hidden_dim` for the concat
    to succeed.
    """
    # kernel_size // 2 plus (that value, minus one for even kernels) always sums to kernel_size - 1.
    front_pad = tf.zeros([tf.shape(x)[0], kernel_size - 1, hidden_dim])
    convolved = tf.layers.conv1d(
        inputs = tf.concat([front_pad, x], 1),
        filters = hidden_dim,
        kernel_size = kernel_size,
        strides = strides,
    )
    return tf.nn.relu(layer_norm(convolved))

class Model:
    """Convolutional encoder plus aggregator stack built from the module-level `features` and `aggs` lists.

    Exposes `X` (the input placeholder), `targets` (output of the encoder stack)
    and `logits` (output of the aggregator stack).
    """

    def __init__(self):
        # 2-D float input; the training loop feeds zero-padded batches here.
        self.X = tf.placeholder(tf.float32, (None, None))
        # Append a trailing axis of size 1 so conv1d receives a 3-D tensor.
        feature = tf.expand_dims(self.X, axis = 2)
        
        # Encoder: one cnn_block per entry, each inside its own 'feature_<i>' variable scope.
        for no, f in enumerate(features):
            size_layers = f[0]
            kernel_size = f[1]
            strides = f[2]
            with tf.variable_scope('feature_%d'%no):
                feature = cnn_block(feature, size_layers, kernel_size, strides)
        
        # Aggregator: one cnn_aggregator per entry, each inside its own 'agg_<i>' variable scope.
        x = tf.identity(feature)
        for no, f in enumerate(aggs):
            size_layers = f[0]
            kernel_size = f[1]
            strides = f[2]
            with tf.variable_scope('agg_%d'%no):
                x = cnn_aggregator(x, size_layers, kernel_size, strides)
        
        self.logits = x # X
        self.targets = feature # Y
        
class RNN:
    """Stacked bidirectional LSTM with a CTC head on top of `Model`'s aggregator output.

    Args:
        num_layers: number of stacked bidirectional LSTM layers.
        size_layers: hidden size of each LSTM cell (per direction).
        learning_rate: learning rate passed to the Adam optimizer.
        dropout: keep probability given to DropoutWrapper for both state and
            output; the default 1.0 keeps everything.

    Attributes set here: `label`, `Y_seq_len`, `Y`, `model`, `preds`, `cost`,
    `optimizer`, `prediction`, `mask_label`, `accuracy`.
    """

    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate,
        dropout = 1.0):
        
        # Dense (padded) labels and their true lengths; used only for the accuracy metric below.
        self.label = tf.placeholder(tf.int32, [None, None])
        self.Y_seq_len = tf.placeholder(tf.int32, [None])
        # Sparse labels, consumed by the CTC loss.
        self.Y = tf.sparse_placeholder(tf.int32)
        self.model = Model()
        
        def cells(size, reuse = False):
            # LSTM cell wrapped with dropout on both the recurrent state and the output.
            return tf.contrib.rnn.DropoutWrapper(
                tf.nn.rnn_cell.LSTMCell(
                    size,
                    initializer = tf.orthogonal_initializer(),
                    reuse = reuse,
                ),
                state_keep_prob = dropout,
                output_keep_prob = dropout,
            )

        # Local name; shadows the module-level `features` list inside this method only.
        features = self.model.logits
        # Every example is given the full time length of the batch, so frames that come
        # from input padding are not masked out.
        seq_lens = tf.fill([tf.shape(features)[0]], tf.shape(features)[1])
        
        for n in range(num_layers):
            (out_fw, out_bw), (
                state_fw,
                state_bw,
            ) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw = cells(size_layers),
                cell_bw = cells(size_layers),
                inputs = features,
                sequence_length = seq_lens,
                dtype = tf.float32,
                scope = 'bidirectional_rnn_%d' % (n),
            )
            # Forward and backward outputs are joined on the channel axis and fed to the next layer.
            features = tf.concat((out_fw, out_bw), 2)
            
        # One output class per vocab symbol.
        # NOTE(review): tf.nn.ctc_loss reserves the last class index for the blank label; here that
        # is the apostrophe slot, which text2idx never emits — confirm this is the intended setup.
        logits = tf.layers.dense(features, len(vocab))
        # Swap the first two axes; the CTC ops below are given the time-major layout.
        time_major = tf.transpose(logits, [1, 0, 2])
        decoded, log_prob = tf.nn.ctc_greedy_decoder(time_major, seq_lens)
        decoded = tf.to_int32(decoded[0])
        self.preds = tf.sparse.to_dense(decoded)
        self.cost = tf.reduce_mean(
            tf.nn.ctc_loss(
                self.Y,
                time_major,
                seq_lens
            )
        )
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        
        # Accuracy: crop/zero-pad the decoded ids to the longest label in the batch, then
        # compare position by position, ignoring positions beyond each label's length.
        preds = self.preds[:, :tf.reduce_max(self.Y_seq_len)]
        masks = tf.sequence_mask(self.Y_seq_len, tf.reduce_max(self.Y_seq_len), dtype=tf.float32)
        preds = pad_second_dim(preds, tf.reduce_max(self.Y_seq_len))
        y_t = tf.cast(preds, tf.int32)
        self.prediction = tf.boolean_mask(y_t, masks)
        mask_label = tf.boolean_mask(self.label, masks)
        self.mask_label = mask_label
        correct_pred = tf.equal(self.prediction, mask_label)
        # NOTE(review): correct_index is never used afterwards.
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
# Clear the default graph and open a session before building the model.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Training configuration for the CTC model.
size_layers = 512
# Overrides the 1e-6 assigned in the hyper-parameter cell above.
learning_rate = 1e-5
num_layers = 2
batch_size = 64
# Number of passes over the training set made by the training loop.
epoch = 20

model = RNN(num_layers, size_layers, learning_rate)
# Initialise every variable; the ones also present in the checkpoint are overwritten by the restore cell below.
sess.run(tf.global_variables_initializer())

Instructions for updating:
Use `tf.keras.layers.Conv1D` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to th

In [13]:
import collections

def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match graph variables to checkpoint entries by name.

    Args:
        tvars: iterable of variables; each needs a ``name`` attribute. A
            trailing ``:<digits>`` suffix on the name is ignored for matching.
        init_checkpoint: checkpoint path handed to tf.train.list_variables.

    Returns:
        A tuple ``(assignment_map, initialized_variable_names)``.
        ``assignment_map`` is an OrderedDict, in checkpoint order, from
        checkpoint variable name to the graph variable of the same name.
        ``initialized_variable_names`` is a dict holding every matched name
        both with and without the ``:0`` suffix, each mapped to 1.
    """
    suffix_pattern = re.compile('^(.*):\\d+$')

    # Index the graph variables by name with the output-slot suffix removed.
    graph_vars = collections.OrderedDict()
    for variable in tvars:
        matched = suffix_pattern.match(variable.name)
        key = variable.name if matched is None else matched.group(1)
        graph_vars[key] = variable

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for entry in tf.train.list_variables(init_checkpoint):
        # Each entry is indexed as a pair; only the name (first item) is used.
        ckpt_name, _unused = entry[0], entry[1]
        if ckpt_name in graph_vars:
            assignment_map[ckpt_name] = graph_vars[ckpt_name]
            initialized_variable_names[ckpt_name] = 1
            initialized_variable_names[ckpt_name + ':0'] = 1

    return (assignment_map, initialized_variable_names)

In [14]:
# Only trainable variables are considered for restoring.
tvars = tf.trainable_variables()

# Checkpoint prefix; presumably written by wav2vec.ipynb, listed as a prerequisite at the top — confirm.
checkpoint = 'wav2vec/model.ckpt'
# Keep just the variables whose names also appear in the checkpoint.
assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, 
                                                                                checkpoint)

In [15]:
# Restore only the variables listed in assignment_map; all other variables keep
# the values they received from global_variables_initializer.
saver = tf.train.Saver(var_list = assignment_map)
saver.restore(sess, checkpoint)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from wav2vec/model.ckpt


In [16]:
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every sequence in a batch to the length of the longest one.

    Args:
        sentence_batch: list of sequences (lists or tuples) of label ids.
        pad_int: value appended to the shorter sequences.

    Returns:
        A tuple ``(padded_seqs, seq_lens)``: ``padded_seqs`` is a list of new
        lists that all share the same length, and ``seq_lens`` holds the
        original, unpadded length of each sequence. An empty batch yields
        ``([], [])`` instead of raising.
    """
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default = 0)
    padded_seqs = [
        # list(...) copies the input and also lets tuples be padded.
        list(sentence) + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

def sparse_tuple_from(sequences, dtype=np.int32):
    """Convert a batch of sequences into an (indices, values, dense_shape) triple.

    Args:
        sequences: list of sequences of label ids; lengths may differ.
        dtype: numpy dtype of the returned values array.

    Returns:
        ``indices``: int64 array of shape (N, 2) holding the (row, column)
            position of every element, in row-major order.
        ``values``: array of the N elements, cast to ``dtype``.
        ``shape``: int64 array ``[len(sequences), longest sequence length]``.

    A batch with no elements at all (no sequences, or only empty ones) returns
    an empty (0, 2) index array and a zero-width shape instead of raising.
    """
    indices = []
    values = []

    for row, seq in enumerate(sequences):
        indices.extend((row, col) for col in range(len(seq)))
        values.extend(seq)

    # reshape keeps the (N, 2) layout even when there are no entries.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=dtype)
    # Same value as "largest column index + 1", but well-defined for empty input.
    max_len = max((len(seq) for seq in sequences), default=0)
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape

In [17]:
from tqdm import tqdm

# Train for `epoch` passes; after each pass, evaluate on the test split and print the averages.
for e in range(epoch):
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'minibatch loop')
    # Per-batch metrics, collected afresh for every epoch.
    train_cost, train_accuracy, test_cost, test_accuracy = [], [], [], []
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, len(train_X))]
        # Zero-pad at the end so every example in the batch has the same length.
        batch_x = tf.keras.preprocessing.sequence.pad_sequences(
            batch_x, dtype = 'float32', padding = 'post'
        )
        y = train_Y[i : min(i + batch_size, len(train_X))]
        # The same labels in two forms: sparse for the CTC loss, dense + lengths for the accuracy metric.
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        _, cost, accuracy = sess.run(
            [model.optimizer, model.cost, model.accuracy],
            feed_dict = {model.model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        train_cost.append(cost)
        train_accuracy.append(accuracy)
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    
    # Evaluation pass: same feeds as above, but the optimizer op is not run.
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'testing minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, len(test_X))]
        batch_x = tf.keras.preprocessing.sequence.pad_sequences(
            batch_x, dtype = 'float32', padding = 'post'
        )
        y = test_Y[i : min(i + batch_size, len(test_X))]
        batch_y = sparse_tuple_from(y)
        batch_label, batch_len = pad_sentence_batch(y, 0)
        cost, accuracy = sess.run(
            [model.cost, model.accuracy],
            feed_dict = {model.model.X: batch_x, model.Y: batch_y, 
                         model.label: batch_label, model.Y_seq_len: batch_len},
        )
        
        test_cost.append(cost)
        test_accuracy.append(accuracy)
        
        pbar.set_postfix(cost = cost, accuracy = accuracy)
    # Unweighted means over batches; a smaller final batch counts as much as a full one.
    print('epoch %d, training avg cost %f, training avg accuracy %f'%(e + 1, np.mean(train_cost), 
                                                                      np.mean(train_accuracy)))
    
    print('epoch %d, testing avg cost %f, testing avg accuracy %f'%(e + 1, np.mean(test_cost), 
                                                                    np.mean(test_accuracy)))

minibatch loop: 100%|██████████| 256/256 [14:50<00:00,  3.48s/it, accuracy=0.175, cost=25]   
testing minibatch loop: 100%|██████████| 9/9 [00:06<00:00,  1.33it/s, accuracy=0.174, cost=25.2]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 1, training avg cost 43.598000, training avg accuracy 0.064888
epoch 1, testing avg cost 24.874825, testing avg accuracy 0.175300


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.758, cost=14]  
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.59it/s, accuracy=0.754, cost=14]  
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 2, training avg cost 18.001740, training avg accuracy 0.536167
epoch 2, testing avg cost 13.956101, testing avg accuracy 0.759634


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.758, cost=13.3]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.69it/s, accuracy=0.754, cost=13.1]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 3, training avg cost 13.324810, training avg accuracy 0.760206
epoch 3, testing avg cost 13.051960, testing avg accuracy 0.759634


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.758, cost=12.5]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.69it/s, accuracy=0.757, cost=12.3]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 4, training avg cost 12.380519, training avg accuracy 0.762767
epoch 4, testing avg cost 12.263859, testing avg accuracy 0.761555


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.783, cost=11.7]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s, accuracy=0.777, cost=11.4]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 5, training avg cost 11.345423, training avg accuracy 0.771282
epoch 5, testing avg cost 11.406837, testing avg accuracy 0.778740


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.786, cost=11]  
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.56it/s, accuracy=0.781, cost=10.7]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 6, training avg cost 10.564371, training avg accuracy 0.782923
epoch 6, testing avg cost 10.812013, testing avg accuracy 0.784442


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.789, cost=10.3]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.71it/s, accuracy=0.789, cost=10.3]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 7, training avg cost 9.970288, training avg accuracy 0.786440
epoch 7, testing avg cost 10.414001, testing avg accuracy 0.788496


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.8, cost=9.85]  
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.66it/s, accuracy=0.792, cost=9.8] 
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 8, training avg cost 9.366413, training avg accuracy 0.789188
epoch 8, testing avg cost 9.973436, testing avg accuracy 0.790625


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.8, cost=9.31]  
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.59it/s, accuracy=0.79, cost=9.2]  
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 9, training avg cost 8.785997, training avg accuracy 0.792536
epoch 9, testing avg cost 9.549655, testing avg accuracy 0.792472


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.803, cost=8.87]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.67it/s, accuracy=0.793, cost=8.94]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 10, training avg cost 8.247080, training avg accuracy 0.795484
epoch 10, testing avg cost 9.344234, testing avg accuracy 0.792895


minibatch loop: 100%|██████████| 256/256 [04:07<00:00,  1.03it/s, accuracy=0.811, cost=8.79]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.64it/s, accuracy=0.792, cost=8.83]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 11, training avg cost 7.768482, training avg accuracy 0.798755
epoch 11, testing avg cost 9.094260, testing avg accuracy 0.793565


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.803, cost=8.01]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.75it/s, accuracy=0.79, cost=8.84] 
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 12, training avg cost 7.222462, training avg accuracy 0.802877
epoch 12, testing avg cost 8.959049, testing avg accuracy 0.793602


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.811, cost=7.49]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.74it/s, accuracy=0.802, cost=8.23]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 13, training avg cost 6.774594, training avg accuracy 0.807660
epoch 13, testing avg cost 8.574215, testing avg accuracy 0.801739


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.825, cost=6.63]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.73it/s, accuracy=0.808, cost=7.48]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 14, training avg cost 6.286732, training avg accuracy 0.814884
epoch 14, testing avg cost 7.930902, testing avg accuracy 0.807989


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.828, cost=6.35]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.72it/s, accuracy=0.814, cost=7.12]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 15, training avg cost 5.716238, training avg accuracy 0.823812
epoch 15, testing avg cost 7.493068, testing avg accuracy 0.815648


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.842, cost=5.94]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.70it/s, accuracy=0.813, cost=6.96]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 16, training avg cost 5.235321, training avg accuracy 0.834013
epoch 16, testing avg cost 7.386287, testing avg accuracy 0.816727


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.853, cost=5.49]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.69it/s, accuracy=0.812, cost=6.65]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 17, training avg cost 4.880253, training avg accuracy 0.842953
epoch 17, testing avg cost 6.975424, testing avg accuracy 0.821068


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.861, cost=5.08]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.72it/s, accuracy=0.833, cost=6.5] 
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 18, training avg cost 4.658150, training avg accuracy 0.848625
epoch 18, testing avg cost 6.765450, testing avg accuracy 0.836382


minibatch loop: 100%|██████████| 256/256 [03:55<00:00,  1.09it/s, accuracy=0.856, cost=4.9] 
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.69it/s, accuracy=0.825, cost=6.65]
minibatch loop:   0%|          | 0/256 [00:00<?, ?it/s]

epoch 19, training avg cost 4.275014, training avg accuracy 0.856806
epoch 19, testing avg cost 6.725954, testing avg accuracy 0.832389


minibatch loop: 100%|██████████| 256/256 [03:56<00:00,  1.08it/s, accuracy=0.869, cost=4.54]
testing minibatch loop: 100%|██████████| 9/9 [00:01<00:00,  4.68it/s, accuracy=0.821, cost=6.73]

epoch 20, training avg cost 3.837261, training avg accuracy 0.868425
epoch 20, testing avg cost 6.744668, testing avg accuracy 0.832394





In [19]:
import random

# Decode one randomly chosen test utterance and print it next to its reference transcript.
random_index = random.randint(0, len(test_X) - 1)
# Keep the slice so the model still receives a batch of size one.
batch_x = test_X[random_index : random_index + 1]
print('real:', ''.join(idx2char[no] for no in test_Y[random_index]))
# Only the input is fed: model.preds does not depend on the label placeholders,
# so no sparse label tuple is built here.
pred = sess.run(model.preds, feed_dict = {model.model.X: batch_x})[0]
print('predicted:', ''.join(idx2char[no] for no in pred))

real: say the word goose
predicted: say the wor
