In [1]:
import tensorflow as tf
import os
from utils import *
from tqdm import tqdm

In [3]:
def encoder_block(inp, n_hidden, filter_size):
    """Non-causal convolution along axis 1 that keeps the sequence length.

    The 3-D input gets a dummy axis so `tf.layers.conv2d` can be used, is
    zero-padded along axis 1, convolved with VALID padding and squeezed back
    to 3-D.

    Args:
        inp: 3-D tensor; axis 1 is the axis that is padded and convolved.
        n_hidden: number of convolution filters (output channels).
        filter_size: `(kernel_width, 1)` pair handed to `conv2d`.

    Returns:
        3-D tensor with `n_hidden` channels and the same length on axis 1.
    """
    # A VALID convolution drops `kernel_width - 1` steps, so exactly that many
    # zeros have to be added. Using (k - 1) // 2 on both sides only adds up for
    # odd widths; give the remainder to the right so even widths keep the
    # length as well (otherwise the residual add in `layer` cannot line up).
    total_pad = filter_size[0] - 1
    pad_left = total_pad // 2
    pad_right = total_pad - pad_left
    inp = tf.expand_dims(inp, 2)
    inp = tf.pad(inp, [[0, 0], [pad_left, pad_right], [0, 0], [0, 0]])
    conv = tf.layers.conv2d(inp, n_hidden, filter_size, padding="VALID", activation=None)
    return tf.squeeze(conv, 2)

def decoder_block(inp, n_hidden, filter_size):
    """Causal convolution along axis 1, built on `tf.layers.conv2d`.

    All `kernel_width - 1` zeros go in front of the sequence, so after the
    VALID convolution each output step only covers the current and earlier
    input steps, and the length on axis 1 is unchanged.

    Args:
        inp: 3-D tensor; axis 1 is the axis that is padded and convolved.
        n_hidden: number of convolution filters (output channels).
        filter_size: `(kernel_width, 1)` pair handed to `conv2d`.

    Returns:
        3-D tensor with `n_hidden` channels.
    """
    causal_pad = filter_size[0] - 1
    paddings = [[0, 0], [causal_pad, 0], [0, 0], [0, 0]]
    expanded = tf.expand_dims(inp, 2)
    padded = tf.pad(expanded, paddings)
    convolved = tf.layers.conv2d(
        padded, n_hidden, filter_size, padding="VALID", activation=None
    )
    return tf.squeeze(convolved, 2)

def glu(x):
    """Gated linear unit over axis 2 of a 3-D tensor.

    Axis 2 is cut at its midpoint: the first half carries the values and the
    second half, passed through a sigmoid, gates them element-wise.
    """
    half = tf.shape(x)[2] // 2
    values = x[:, :, :half]
    gates = tf.sigmoid(x[:, :, half:])
    return tf.multiply(values, gates)

def layer(inp, conv_block, kernel_width, n_hidden, residual=None):
    """Run one convolution block, gate it with `glu`, and add a shortcut.

    Args:
        inp: tensor handed to `conv_block`.
        conv_block: callable taking `(inp, n_hidden, (kernel_width, 1))`.
        kernel_width: width of the convolution kernel.
        n_hidden: filter count for `conv_block`; `glu` halves it again.
        residual: optional tensor added to the gated output; 0 is added
            when it is None.

    Returns:
        The gated convolution output plus the shortcut.
    """
    shortcut = 0 if residual is None else residual
    conv_out = conv_block(inp, n_hidden, (kernel_width, 1))
    gated = glu(conv_out)
    return gated + shortcut

def layer_norm(inputs, epsilon=1e-8):
    """Normalize over the last axis, then apply a learned scale and shift.

    Creates two variables through `tf.get_variable`: `gamma` (initialized to
    ones) and `beta` (initialized to zeros), both shaped like the last axis of
    `inputs`. The names are fixed; the caller in this file wraps each use in
    its own `tf.variable_scope`.

    Args:
        inputs: tensor whose last axis is normalized.
        epsilon: small constant added to the variance before the square root.

    Returns:
        `gamma * normalized + beta`.
    """
    mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
    stddev = tf.sqrt(variance + epsilon)
    normalized = (inputs - mean) / stddev

    channel_shape = inputs.get_shape()[-1:]
    gamma = tf.get_variable('gamma', channel_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', channel_shape, tf.float32, tf.zeros_initializer())
    return gamma * normalized + beta


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    """Layer-normalized, left-padded dilated 1-D convolution followed by ReLU.

    Args:
        x: 3-D tensor; axis 1 is the axis that is padded and convolved.
        dilation_rate: dilation of the convolution.
        pad_sz: number of zero steps placed in front of the sequence. The
            caller in this file passes `(kernel_size - 1) * dilation_rate`,
            which makes the output as long as the input on axis 1.
        hidden_dim: number of convolution filters (output channels).
        kernel_size: width of the convolution kernel.

    Returns:
        The activated convolution output.
    """
    x = layer_norm(x)
    # Pad on the left only. The earlier version padded both ends and then cut
    # the last `pad_sz` outputs with `x[:, :-pad_sz, :]`; the outputs it kept
    # never touched the right-hand padding, so the values are the same with
    # less work. It also no longer collapses to an empty tensor when
    # `pad_sz == 0` (kernel_size 1), where `:-0` selects nothing.
    x = tf.pad(x, [[0, 0], [pad_sz, 0], [0, 0]])
    x = tf.layers.conv1d(inputs = x,
                         filters = hidden_dim,
                         kernel_size = kernel_size,
                         dilation_rate = dilation_rate)
    return tf.nn.relu(x)

class Model:
    """TF1 graph with two sub-networks and a joint Adam training op.

    The first sub-network embeds the integer ids in `X`, runs them through a
    stack of gated convolutions, and decodes `Y_hat` with causal gated
    convolutions plus multi-head dot-product attention over the encoder
    outputs. The second sub-network maps the fed `Y` (not `Y_hat`) through
    dilated convolution blocks to `Z_hat`.

    Constructing an instance adds placeholders, variables, losses and the
    optimizer op to the current default graph.
    """
    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate = 1e-4,
        n_attn_heads = 16,
        kernel_size = 3
    ):
        """Build the whole graph.

        Args:
            num_layers: depth used for the encoder stack, the decoder stack
                and the dilated block stack alike.
            size_layers: embedding width and hidden width of every layer.
            learning_rate: learning rate handed to `tf.train.AdamOptimizer`.
            n_attn_heads: number of attention heads; each head is
                `size_layers // n_attn_heads` wide, so the concatenated heads
                only match `size_layers` when it divides evenly.
            kernel_size: kernel width of every convolution.

        `vocab`, `guided_attention`, `n_mels`, `resampled` and
        `fourier_window_size` are not defined in this notebook; presumably
        they come from `utils` -- confirm.
        """
        # Integer ids, one row per example.
        self.X = tf.placeholder(tf.int32, (None, None))
        # NOTE(review): `self.training` is not read anywhere in this constructor.
        self.training = tf.placeholder(tf.bool, None)
        lookup_table = tf.get_variable(
            'lookup_table',
            dtype = tf.float32,
            shape = [len(vocab), size_layers],
            initializer = tf.truncated_normal_initializer(
                mean = 0.0, stddev = 0.01
            ),
        )
        # Swap row 0 for a constant zero vector so id 0 always embeds to zeros
        # (presumably id 0 is the padding id -- confirm against `vocab`).
        lookup_table = tf.concat(
            (tf.zeros(shape = [1, size_layers]), lookup_table[1:, :]), 0
        )
        # NOTE(review): `X_seq_len` and `gts` are stored but not used in any
        # loss or mask below.
        self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
        self.gts = tf.convert_to_tensor(guided_attention())
        forward = tf.nn.embedding_lookup(lookup_table, self.X)
        # Target frames, `n_mels * resampled` values per step.
        self.Y = tf.placeholder(tf.float32, (None, None, n_mels * resampled))
        # NOTE(review): this local `batch_size` is never used afterwards.
        batch_size = tf.shape(self.Y)[0]
        # Shift the targets one step to the right, starting with a frame of ones.
        self.decoder_inputs = tf.concat(
            (tf.ones_like(self.Y[:, :1, :]), self.Y[:, :-1, :]), 1
        )
        # Keep only the last `n_mels` values of every shifted frame.
        self.decoder_inputs = self.decoder_inputs[:, :, -n_mels:]
        
        # Targets of the second sub-network.
        self.Z = tf.placeholder(
            tf.float32, (None, None, fourier_window_size // 2 + 1)
        )
        
        # NOTE(review): `seq_lens` is computed but never used afterwards.
        seq_lens = tf.count_nonzero(
            tf.reduce_sum(self.decoder_inputs, -1), 1, dtype = tf.int32
        )
        
        # Encoder: every layer's residual is that layer's own input.
        e = tf.identity(forward)
        for i in range(num_layers):
            z = layer(forward, encoder_block, kernel_size, size_layers * 2, forward)
            forward = z
        
        # Attention keys come from the encoder output, values from the encoder
        # output plus the raw embeddings.
        encoder_output, output_memory = z, z + e
        decoder_inputs = tf.layers.dense(self.decoder_inputs, size_layers)
        # `g` keeps the projected decoder input and is added to every query below.
        g = tf.identity(decoder_inputs)
        
        for i in range(num_layers):
            # The zero residual contributes nothing numerically.
            attn_res = h = layer(decoder_inputs, decoder_block, kernel_size, size_layers * 2, 
                                         residual=tf.zeros_like(decoder_inputs))
            C = []
            for j in range(n_attn_heads):
                # Each head gets its own dense projections.
                h_ = tf.layers.dense(h, size_layers//n_attn_heads)
                g_ = tf.layers.dense(g, size_layers//n_attn_heads)
                zu_ = tf.layers.dense(encoder_output, size_layers//n_attn_heads)
                ze_ = tf.layers.dense(output_memory, size_layers//n_attn_heads)

                d = tf.layers.dense(h_, size_layers//n_attn_heads) + g_
                # Scores of every decoder step against every encoder step; the
                # softmax runs over the last axis with no mask applied.
                dz = tf.matmul(d, tf.transpose(zu_, [0, 2, 1]))
                a = tf.nn.softmax(dz)
                c_ = tf.matmul(a, ze_)
                C.append(c_)
            # Concatenate the heads along the channel axis.
            c = tf.concat(C, 2)
            h = tf.layers.dense(attn_res + c, size_layers)
            decoder_inputs = h
        
        decoder_output = tf.sigmoid(h)
        self.Y_hat = tf.layers.dense(decoder_output, n_mels * resampled)
        # Both losses average over every element of the fed arrays.
        self.loss1 = tf.reduce_mean(tf.abs(self.Y_hat - self.Y))
        # NOTE(review): `Y_hat` is the L1 prediction above and the logits here
        # -- confirm that using the same tensor for both is intended.
        self.loss_bd1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Y_hat, 
                                                                               labels=self.Y))
        
        # Second sub-network: starts from the fed `Y`, regrouped into frames of
        # `n_mels` values.
        out_decoder = tf.reshape(
                self.Y, [tf.shape(self.Y)[0], -1, n_mels]
        )
        out_decoder = tf.layers.dense(out_decoder, size_layers)
        
        for i in range(num_layers): 
            # Dilation doubles with every block; each block is added residually
            # and gets its own variable scope for `layer_norm`'s variables.
            dilation_rate = 2 ** i
            pad_sz = (kernel_size - 1) * dilation_rate 
            with tf.variable_scope('block_%d'%i):
                out_decoder += cnn_block(out_decoder, dilation_rate, pad_sz, size_layers, kernel_size)
        
        self.Z_hat = tf.layers.dense(out_decoder, 1 + fourier_window_size // 2)
        self.loss2 = tf.reduce_mean(tf.abs(self.Z_hat - self.Z))
        # NOTE(review): same double use as above, here for `Z_hat`.
        self.loss_bd2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_hat, 
                                                                          labels=self.Z))
        # One optimizer step minimizes the unweighted sum of all four losses.
        self.loss = self.loss1 + self.loss2 + self.loss_bd1 + self.loss_bd2
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.loss)

In [4]:
# Clear the default graph first so this cell can be re-run on a live kernel.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Hyper-parameters of the network built below.
size_layers, learning_rate, num_layers = 256, 1e-4, 4

model = Model(
    num_layers = num_layers,
    size_layers = size_layers,
    learning_rate = learning_rate,
)
sess.run(tf.global_variables_initializer())

In [5]:
# Pair every `.npy` file listed in `mel/` with its transcript and encode the text.
# `path`, `text_normalize`, `char2idx` and `np` are not defined in this notebook;
# presumably they come from `utils` -- confirm.
paths, lengths, texts, raw_texts = [], [], [], []
text_files = [f for f in os.listdir('mel') if f.endswith('.npy')]
for fpath in text_files:
    # Strip only the trailing extension. `fpath.replace('npy', 'txt')` rewrote
    # every occurrence of "npy" in the name, so a file such as
    # "canpy_01.npy" was looked up under the wrong transcript name.
    stem = os.path.splitext(fpath)[0]
    with open('%s/%s' % (path, stem + '.txt')) as fopen:
        text = fopen.read()
    paths.append(stem)
    text = text_normalize(text)
    raw_texts.append(text)
    # 'E' is appended to every transcript (presumably an end-of-text marker).
    text = text + 'E'
    texts.append(np.array([char2idx[char] for char in text], np.int32))
    lengths.append(len(text))

In [6]:
def dynamic_batching(paths):
    """Load the cached entry for every path and report the longest of each kind.

    Args:
        paths: identifiers passed one by one to `get_cached`.

    Returns:
        `(files, max_y, max_z)`: `files[i]` is what `get_cached(paths[i])`
        returned; `max_y` / `max_z` are the largest `shape[0]` among element 0 /
        element 1 of those entries, or 0 when `paths` is empty.
    """
    files = [get_cached(p) for p in paths]
    max_y = max((entry[0].shape[0] for entry in files), default = 0)
    max_z = max((entry[1].shape[0] for entry in files), default = 0)
    return files, max_y, max_z

In [7]:
# Train for a fixed number of passes over `paths`, one optimizer step per batch.
# `batch_size`, `np`, `n_mels`, `resampled` and `fourier_window_size` are not
# defined in this notebook; presumably they come from `utils` -- confirm.
EPOCH = 50
for i in range(EPOCH):
    pbar = tqdm(range(0, len(paths), batch_size), desc = 'minibatch loop')
    for k in pbar:
        index = min(k + batch_size, len(paths))
        files, max_y, max_z = dynamic_batching(paths[k:index])
        max_x = max(lengths[k:index])
        # Size the arrays by the number of examples actually loaded. They used
        # to have `batch_size` rows always, so a short final batch was filled
        # up with all-zero examples that the model was then trained on.
        n_examples = len(files)
        # int32 matches the dtype of the `model.X` placeholder.
        batch_x = np.zeros((n_examples, max_x), dtype = np.int32)
        batch_y = np.zeros((n_examples, max_y, n_mels * resampled))
        batch_z = np.zeros((n_examples, max_z, fourier_window_size // 2 + 1))
        for n in range(n_examples):
            # Zero-pad every example at the end up to the longest one in the batch.
            batch_x[n, :] = np.pad(
                texts[k + n],
                ((0, max_x - texts[k + n].shape[0])),
                mode = 'constant',
            )
            batch_y[n, :, :] = np.pad(
                files[n][0],
                ((0, max_y - files[n][0].shape[0]), (0, 0)),
                mode = 'constant',
            )
            batch_z[n, :, :] = np.pad(
                files[n][1],
                ((0, max_z - files[n][1].shape[0]), (0, 0)),
                mode = 'constant',
            )
        _, cost, loss1, loss2, loss_bd1, loss_bd2 = sess.run(
            [model.optimizer, model.loss,
            model.loss1, model.loss2, model.loss_bd1,
            model.loss_bd2],
            feed_dict = {model.X: batch_x, model.Y: batch_y, model.Z: batch_z},
        )
        pbar.set_postfix(cost = cost, loss1 = loss1, loss2 = loss2, loss_bd1 = loss_bd1,
                        loss_bd2 = loss_bd2)

minibatch loop: 100%|██████████| 88/88 [00:26<00:00,  3.20it/s, cost=1.68, loss1=0.126, loss2=0.129, loss_bd1=0.71, loss_bd2=0.71]  
minibatch loop: 100%|██████████| 88/88 [00:22<00:00,  3.97it/s, cost=1.6, loss1=0.0953, loss2=0.089, loss_bd1=0.715, loss_bd2=0.698] 
minibatch loop: 100%|██████████| 88/88 [00:23<00:00,  3.89it/s, cost=1.57, loss1=0.0806, loss2=0.069, loss_bd1=0.719, loss_bd2=0.7]   
minibatch loop: 100%|██████████| 88/88 [00:22<00:00,  4.02it/s, cost=1.55, loss1=0.0704, loss2=0.0585, loss_bd1=0.717, loss_bd2=0.702]
minibatch loop: 100%|██████████| 88/88 [00:22<00:00,  3.98it/s, cost=1.53, loss1=0.0606, loss2=0.0526, loss_bd1=0.715, loss_bd2=0.704]
minibatch loop: 100%|██████████| 88/88 [00:22<00:00,  4.02it/s, cost=1.52, loss1=0.0532, loss2=0.0486, loss_bd1=0.713, loss_bd2=0.704]
minibatch loop: 100%|██████████| 88/88 [00:22<00:00,  3.90it/s, cost=1.51, loss1=0.049, loss2=0.0502, loss_bd1=0.711, loss_bd2=0.701] 
minibatch loop: 100%|██████████| 88/88 [00:23<00:00,  3.95

In [8]:
# Step-by-step synthesis for the first text: run the whole graph once per
# output step and copy only step `j` of the prediction back into the buffer
# that is fed on the next run. The number of steps is borrowed from the last
# training batch.
n_frames = batch_y.shape[1]
y_hat = np.ones((1, n_frames, n_mels * resampled), np.float32)
for j in tqdm(range(n_frames)):
    feed = {model.X: [texts[0]], model.Y: y_hat}
    _y_hat = sess.run(model.Y_hat, feed)
    y_hat[:, j, :] = _y_hat[:, j, :]

100%|██████████| 38/38 [00:03<00:00, 11.92it/s]


In [9]:
# Run the second sub-network on the synthesized frames.
mags = sess.run(model.Z_hat, feed_dict = {model.Y: y_hat})

In [10]:
# Turn the first (only) predicted example into a waveform.
# `spectrogram2wav` is not defined in this notebook; presumably from `utils`.
audio = spectrogram2wav(mags[0])

In [11]:
import IPython.display as ipd
# Last expression of the cell, so the notebook renders an audio player.
ipd.Audio(data = audio, rate = sample_rate)

In [12]:
from scipy.io.wavfile import write

# Write the synthesized waveform next to the notebook and log which text it was.
print('saving: %s' % raw_texts[0])
write('test.wav', sample_rate, audio)

saving: say the word burn


In [13]:
# Check the second sub-network alone: feed the first example of the last
# training batch instead of synthesized frames.
mags = sess.run(model.Z_hat, feed_dict = {model.Y: batch_y[:1]})
audio = spectrogram2wav(mags[0])
ipd.Audio(data = audio, rate = sample_rate)

In [14]:
# Reference: vocode the target array of the same example directly,
# with no network involved.
real_audio = spectrogram2wav(batch_z[0])
ipd.Audio(data = real_audio, rate = sample_rate)

In [15]:
# Teacher-forced check: predict all frames in a single run while feeding the
# target frames as decoder input, then vocode that prediction.
# NOTE(review): `texts[0]` is the first utterance of the dataset while
# `batch_y[0]` is the first example of the *last* training batch -- confirm
# this text/target pairing is intended.
_y_hat = sess.run(model.Y_hat, {model.X: [texts[0]], model.Y: [batch_y[0]]})
# Feed the frames just predicted. The stale `y_hat` from the step-by-step
# synthesis cell was fed here before, so `_y_hat` was computed and thrown away
# and this cell merely replayed the earlier result.
mags = sess.run(model.Z_hat, {model.Y: _y_hat})
audio = spectrogram2wav(mags[0])
ipd.Audio(audio, rate = sample_rate)