In [1]:
import tensorflow as tf
import os
from utils import *
from tqdm import tqdm

In [2]:
def layer_norm(inputs, epsilon=1e-8):
    """Normalize `inputs` over its last axis, then apply a learned scale and shift.

    Two float32 variables are requested through ``tf.get_variable`` (so they
    live in whatever variable scope is active at call time): 'gamma',
    initialized to ones, and 'beta', initialized to zeros. Both take the shape
    of the last dimension of `inputs`.

    Args:
        inputs: tensor to normalize; statistics are taken over axis -1.
        epsilon: constant added to the variance before the square root so the
            division stays finite.

    Returns:
        ``gamma * (inputs - mean) / sqrt(variance + epsilon) + beta``.
    """
    # Per-position statistics over the feature axis; kept as size-1 dims so
    # they broadcast against `inputs`.
    mu, sigma_sq = tf.nn.moments(inputs, [-1], keep_dims=True)
    standardized = (inputs - mu) / tf.sqrt(sigma_sq + epsilon)

    feature_shape = inputs.get_shape()[-1:]
    scale = tf.get_variable('gamma', feature_shape, tf.float32, tf.ones_initializer())
    shift = tf.get_variable('beta', feature_shape, tf.float32, tf.zeros_initializer())
    return scale * standardized + shift


def cnn_block(x, dilation_rate, pad_sz, hidden_dim, kernel_size):
    """Layer norm -> left-padded dilated 1-D convolution -> ReLU.

    The input is zero-padded with `pad_sz` steps at the START of the time axis
    only and then convolved with the default ('valid') padding of
    ``tf.layers.conv1d``. The output therefore has
    ``T + pad_sz - (kernel_size - 1) * dilation_rate`` time steps for an input
    of length ``T``. With ``pad_sz == (kernel_size - 1) * dilation_rate`` (what
    ``Model`` passes) the length is preserved and output step ``t`` only sees
    input steps ``<= t``.

    This is numerically the same as padding both ends and dropping the last
    `pad_sz` outputs, but it does not compute outputs that are thrown away,
    and it stays correct for ``pad_sz == 0`` (``kernel_size == 1``), where a
    ``[:, :-pad_sz, :]`` slice would turn into ``[:, :0, :]`` and return an
    empty tensor.

    Args:
        x: input tensor, indexed as (batch, time, channels) by the padding below.
        dilation_rate: dilation of the convolution.
        pad_sz: number of zero steps prepended on the time axis.
        hidden_dim: number of convolution filters (output channels).
        kernel_size: width of the convolution kernel.

    Returns:
        The ReLU-activated convolution output.
    """
    x = layer_norm(x)
    # Pad only the front of axis 1 (time); batch and channel axes are untouched.
    x = tf.pad(x, [[0, 0], [pad_sz, 0], [0, 0]])
    x = tf.layers.conv1d(inputs = x,
                         filters = hidden_dim,
                         kernel_size = kernel_size,
                         dilation_rate = dilation_rate)
    return tf.nn.relu(x)

class Model:
    """Stack of residual dilated convolution blocks mapping mel frames to magnitudes.

    Graph inputs:
        Y: float32 placeholder of shape (batch, time, n_mels * resampled).
        Z: float32 placeholder of shape (batch, time, fourier_window_size // 2 + 1),
           the regression target.

    Graph outputs:
        Z_hat: logits with ``fourier_window_size // 2 + 1`` channels.
        loss1: mean squared error between ``sigmoid(Z_hat)`` and ``Z``.
        loss_bd: mean sigmoid cross-entropy between ``Z_hat`` (logits) and ``Z``.
        loss: ``loss1 + loss_bd``.
        optimizer: Adam training op minimizing ``loss``.

    ``n_mels``, ``resampled`` and ``fourier_window_size`` are module-level
    names that must already be in scope (presumably provided by
    ``from utils import *`` -- confirm against utils).

    Args:
        num_layers: number of residual blocks; block ``i`` uses dilation ``2 ** i``.
        size_layers: channel width of the hidden representation.
        learning_rate: Adam learning rate.
        kernel_size: convolution kernel width used in every block.
    """

    def __init__(
        self,
        num_layers,
        size_layers,
        learning_rate = 1e-4,
        kernel_size = 3
    ):
        self.Y = tf.placeholder(tf.float32, (None, None, n_mels * resampled))
        self.Z = tf.placeholder(
            tf.float32, (None, None, fourier_window_size // 2 + 1)
        )

        # Dynamic batch dimension, computed once and reused. Deliberately NOT
        # called `batch_size`, which is the name of the minibatch-size constant
        # the training cell reads from module scope.
        num_examples = tf.shape(self.Y)[0]

        # Unfold each stacked frame of n_mels * resampled values into
        # consecutive frames of n_mels values along the time axis.
        out_decoder = tf.reshape(self.Y, [num_examples, -1, n_mels])
        out_decoder = tf.layers.dense(out_decoder, size_layers)

        for i in range(num_layers):
            dilation_rate = 2 ** i
            # Padding that keeps the time length unchanged through the block.
            pad_sz = (kernel_size - 1) * dilation_rate
            # Separate scope per block so each gets its own gamma/beta/conv weights.
            with tf.variable_scope('block_%d'%i):
                # Residual connection around the block.
                out_decoder += cnn_block(out_decoder, dilation_rate, pad_sz, size_layers, kernel_size)

        self.Z_hat = tf.layers.dense(out_decoder, 1 + fourier_window_size // 2)
        self.loss1 = tf.reduce_mean(tf.square(tf.sigmoid(self.Z_hat) - self.Z))
        self.loss_bd = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Z_hat,
                                                                          labels=self.Z))
        self.loss = self.loss1 + self.loss_bd
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.loss)

In [3]:
# Clear the default graph before building the model, then open an
# interactive session on it.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Hyperparameters for this run.
num_layers = 4
size_layers = 128
learning_rate = 1e-3

model = Model(
    num_layers = num_layers,
    size_layers = size_layers,
    learning_rate = learning_rate,
)
sess.run(tf.global_variables_initializer())

In [4]:
# Collect, for every cached mel file in 'mel_old':
#   paths     - file stem (name without '.npy'), used later to locate the arrays
#   raw_texts - normalized transcript read from 'old/<stem>.txt'
#   texts     - transcript plus a trailing 'E' marker, encoded through char2idx
#   lengths   - length of the encoded transcript (including the 'E')
paths, lengths, texts, raw_texts = [], [], [], []

# os.listdir returns entries in arbitrary order; sort so that `paths` (and the
# train/test split derived from it) is the same on every run.
text_files = sorted(f for f in os.listdir('mel_old') if f.endswith('.npy'))
for fpath in text_files:
    # Strip only the trailing extension. str.replace would also rewrite any
    # 'npy' that happens to occur inside the file name itself.
    stem = fpath[:-len('.npy')]
    with open('%s/%s' % ('old', stem + '.txt')) as fopen:
        text = fopen.read()
    paths.append(stem)
    text = text_normalize(text)
    raw_texts.append(text)
    text = text + 'E'
    texts.append(np.array([char2idx[char] for char in text], np.int32))
    lengths.append(len(text))

In [5]:
def get_cached(path):
    """Load the three cached arrays that belong to one file stem.

    Args:
        path: file stem without directory or '.npy' extension.

    Returns:
        A 3-tuple ``(mel_old, mag_young, mag_old)`` loaded from
        'mel_old/<path>.npy', 'mag_young/<path with OAF -> YAF>.npy' and
        'mag_old/<path>.npy' respectively.
    """
    # The 'young' counterpart lives under the same stem with OAF swapped for YAF.
    young_stem = path.replace('OAF','YAF')
    mel_old = np.load('mel_old/{}.npy'.format(path))
    mag_young = np.load('mag_young/{}.npy'.format(young_stem))
    mag_old = np.load('mag_old/{}.npy'.format(path))
    return mel_old, mag_young, mag_old

In [6]:
def dynamic_batching(paths):
    """Load every entry of `paths` and report the longest first-axis lengths.

    Args:
        paths: iterable of file stems accepted by ``get_cached``.

    Returns:
        ``(files, max_y, max_z)`` where ``files`` is the list of tuples
        returned by ``get_cached``, ``max_y`` is the largest ``shape[0]`` among
        the first arrays and ``max_z`` the largest ``shape[0]`` among the
        second arrays. Both maxima are 0 when `paths` is empty.
    """
    files = [get_cached(stem) for stem in paths]
    max_y = max((loaded[0].shape[0] for loaded in files), default=0)
    max_z = max((loaded[1].shape[0] for loaded in files), default=0)
    return files, max_y, max_z

In [7]:
# Hold out the final entry of `paths` as the single test example and train on
# everything before it.
train_paths, test_paths = paths[:-1], paths[-1:]

In [8]:
EPOCH = 300
num_train = len(train_paths)
for epoch in range(1, EPOCH + 1):
    pbar = tqdm(range(0, num_train, batch_size), desc = 'minibatch loop')
    for start in pbar:
        # Slicing clamps at the end of the list, so the last minibatch is
        # simply shorter.
        files, max_y, _ = dynamic_batching(train_paths[start:start + batch_size])
        # Ten extra all-zero frames beyond the longest example in the batch.
        max_y += 10
        batch = len(files)
        batch_y = np.zeros((batch, max_y, n_mels * resampled))
        # Works out to max_y * resampled: the time length the model produces
        # after reshaping Y to n_mels channels, so targets line up with Z_hat.
        max_z = np.prod(batch_y.shape) // batch // n_mels
        batch_z = np.zeros((batch, max_z, fourier_window_size // 2 + 1))
        batch_ori = []
        for row, (mel_old, mag_young, mag_old) in enumerate(files):
            # Zero-pad each example at the end of the time axis to the batch length.
            batch_y[row] = np.pad(
                mel_old,
                ((0, max_y - mel_old.shape[0]), (0, 0)),
                mode = 'constant',
            )
            batch_z[row] = np.pad(
                mag_young,
                ((0, max_z - mag_young.shape[0]), (0, 0)),
                mode = 'constant',
            )
            # Kept unpadded; only used for listening to the source audio later.
            batch_ori.append(mag_old)
        fetches = [model.optimizer, model.loss, model.loss1, model.loss_bd]
        _, cost, loss1, loss_bd = sess.run(
            fetches,
            feed_dict = {model.Y: batch_y, model.Z: batch_z},
        )
        pbar.set_postfix(cost = cost, loss1 = loss1, loss_bd = loss_bd, epoch = epoch)

minibatch loop: 100%|██████████| 7/7 [00:05<00:00,  1.44it/s, cost=0.674, epoch=1, loss1=0.0599, loss_bd=0.615]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.54it/s, cost=0.545, epoch=2, loss1=0.0285, loss_bd=0.517]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.33it/s, cost=0.525, epoch=3, loss1=0.0233, loss_bd=0.501]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.73it/s, cost=0.517, epoch=4, loss1=0.0208, loss_bd=0.496]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.51it/s, cost=0.51, epoch=5, loss1=0.0187, loss_bd=0.492] 
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.51it/s, cost=0.508, epoch=6, loss1=0.018, loss_bd=0.49]  
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.42it/s, cost=0.503, epoch=7, loss1=0.0165, loss_bd=0.486]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.90it/s, cost=0.502, epoch=8, loss1=0.0164, loss_bd=0.486]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.50it/s, cost=0.499, epoch=9, loss1=0.0153, loss_bd

minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s, cost=0.469, epoch=73, loss1=0.00731, loss_bd=0.461]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.76it/s, cost=0.47, epoch=74, loss1=0.00767, loss_bd=0.462] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  7.00it/s, cost=0.469, epoch=75, loss1=0.00747, loss_bd=0.462]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.79it/s, cost=0.468, epoch=76, loss1=0.00724, loss_bd=0.461]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.90it/s, cost=0.468, epoch=77, loss1=0.00718, loss_bd=0.461]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.15it/s, cost=0.468, epoch=78, loss1=0.007, loss_bd=0.461]  
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.07it/s, cost=0.468, epoch=79, loss1=0.007, loss_bd=0.461]  
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.19it/s, cost=0.467, epoch=80, loss1=0.00695, loss_bd=0.461]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.86it/s, cost=0.467, epoch=81, loss

minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.19it/s, cost=0.465, epoch=144, loss1=0.0061, loss_bd=0.459] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.91it/s, cost=0.465, epoch=145, loss1=0.00618, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.91it/s, cost=0.465, epoch=146, loss1=0.00624, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.86it/s, cost=0.465, epoch=147, loss1=0.00618, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.89it/s, cost=0.465, epoch=148, loss1=0.00623, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.12it/s, cost=0.465, epoch=149, loss1=0.00606, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.74it/s, cost=0.465, epoch=150, loss1=0.00609, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.77it/s, cost=0.465, epoch=151, loss1=0.00609, loss_bd=0.459]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.89it/s, cost=0.464, epoch=

minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.93it/s, cost=0.461, epoch=215, loss1=0.00508, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.95it/s, cost=0.461, epoch=216, loss1=0.00504, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.11it/s, cost=0.461, epoch=217, loss1=0.00506, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.83it/s, cost=0.461, epoch=218, loss1=0.00508, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.95it/s, cost=0.461, epoch=219, loss1=0.00508, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.94it/s, cost=0.461, epoch=220, loss1=0.00506, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.97it/s, cost=0.462, epoch=221, loss1=0.00512, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.96it/s, cost=0.461, epoch=222, loss1=0.00509, loss_bd=0.456]
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.91it/s, cost=0.461, epoch=

minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.03it/s, cost=0.46, epoch=286, loss1=0.00457, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.89it/s, cost=0.46, epoch=287, loss1=0.00456, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:00<00:00,  7.20it/s, cost=0.46, epoch=288, loss1=0.00459, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.90it/s, cost=0.46, epoch=289, loss1=0.00457, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.76it/s, cost=0.46, epoch=290, loss1=0.00456, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.80it/s, cost=0.46, epoch=291, loss1=0.00458, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.76it/s, cost=0.46, epoch=292, loss1=0.00459, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.92it/s, cost=0.46, epoch=293, loss1=0.00465, loss_bd=0.455] 
minibatch loop: 100%|██████████| 7/7 [00:01<00:00,  6.85it/s, cost=0.46, epoch=2

In [9]:
import IPython.display as ipd

## Original young

In [10]:
# batch_z is left over from the last training minibatch; take its first
# (zero-padded) target array and render it as audio.
young_mag = batch_z[0]
audio = spectrogram2wav(young_mag)
ipd.Audio(audio, rate = sample_rate)

## Original old

In [11]:
# batch_ori holds the unpadded 'mag_old' arrays of the last training
# minibatch; render the first one as audio.
old_mag = batch_ori[0]
audio = spectrogram2wav(old_mag)
ipd.Audio(audio, rate = sample_rate)

## Changing from old to young

In [12]:
# Run the first example of the last training minibatch through the model as a
# batch of one, squashing the logits with a sigmoid exactly as loss1 does.
single_input = batch_y[:1]
mags = sess.run(tf.sigmoid(model.Z_hat), {model.Y: single_input})
audio = spectrogram2wav(mags[0])
ipd.Audio(audio, rate = sample_rate)

## Test

In [13]:
# Build a one-example batch from the held-out path, padded the same way as
# the training minibatches, then listen to its target array.
files, max_y, _ = dynamic_batching(test_paths)
max_y += 10
batch = len(files)
test_mel, test_mag = files[0][0], files[0][1]

batch_y = np.zeros((batch, max_y, n_mels * resampled))
# Works out to max_y * resampled, the time length of the model output.
max_z = np.prod(batch_y.shape) // batch // n_mels
batch_z = np.zeros((batch, max_z, fourier_window_size // 2 + 1))

batch_y[0] = np.pad(
    test_mel,
    ((0, max_y - test_mel.shape[0]), (0, 0)),
    mode = 'constant',
)
batch_z[0] = np.pad(
    test_mag,
    ((0, max_z - test_mag.shape[0]), (0, 0)),
    mode = 'constant',
)

audio = spectrogram2wav(batch_z[0])
ipd.Audio(audio, rate = sample_rate)

In [14]:
# Run the first row of the current batch_y through the model as a batch of
# one and render the sigmoid of its output as audio.
single_input = batch_y[:1]
mags = sess.run(tf.sigmoid(model.Z_hat), {model.Y: single_input})
audio = spectrogram2wav(mags[0])
ipd.Audio(audio, rate = sample_rate)