In [1]:
import os
import time

import IPython.display as ipd
import librosa
import numpy as np

import keras.backend as K
from keras import losses
from keras.models import Model, load_model
from keras.layers import Input, Embedding, Cropping1D, Conv1D, Multiply, Dense, Lambda
from keras.layers import Reshape, RepeatVector, Concatenate
from keras.optimizers import Adam
from keras.callbacks import Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# id for this session's saved model files: Unix time in whole seconds when the cell runs
model_id = int(time.time())
# sampling rate (Hz) passed to librosa.load and to audio playback/writing below
sample_rate = 16000

In [3]:
# single channel
# NOTE: machine-specific dataset location; edit to match where the dataset lives
dataset_path = 'C:/Datasets/LJSpeech-1.1'
with open('{0}/metadata.csv'.format(dataset_path), encoding='utf-8') as f:
    metadata_str = f.read()
# One '|'-separated record per line; keep the first two fields (clip id, transcript).
# Empty lines are skipped rather than unconditionally dropping the last element, so
# the final record is kept even when the file does not end with a newline.
metadata = [line.split('|')[:2] for line in metadata_str.split('\n') if line]
print('first example:')
ipd.display(ipd.Audio('{0}/wavs/{1}.wav'.format(
    dataset_path, metadata[0][0])))
print(metadata[0][1])

first example:


Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


In [4]:
# number of examples, then the clip ids of the first and last entries
n_examples = len(metadata)
first_id, last_id = metadata[0][0], metadata[-1][0]
print(n_examples)
print(first_id, last_id)

13100
LJ001-0001 LJ050-0278


In [5]:
# load the first clip at the working sample rate, report its length/rate and play it
first_wav_path = '{0}/wavs/{1}.wav'.format(dataset_path, metadata[0][0])
y, sr = librosa.load(first_wav_path, sr=sample_rate)
print(len(y), sr)
ipd.display(ipd.Audio(y, rate=sr))

154481 16000


In [6]:
def mu_law(x, mu=255):
    """Apply mu-law companding: sign(x) * log(1 + mu*|x|) / log(1 + mu).

    # Arguments
        x: scalar or numpy array of amplitudes
        mu: companding parameter

    # Returns
        companded value(s), same shape as `x`
    """
    log_magnitude = np.log(1 + mu * np.abs(x))
    signed = np.sign(x) * log_magnitude
    return signed / np.log(1 + mu)

def mu_law_inverse(x, mu=255):
    """Invert mu-law companding: sign(x) * (1/mu) * ((1 + mu)**|x| - 1).

    # Arguments
        x: scalar or numpy array of companded values
        mu: companding parameter used for the forward transform

    # Returns
        expanded value(s), same shape as `x`
    """
    signed_scale = np.sign(x) * (1 / mu)
    expanded = np.power(1 + mu, np.abs(x)) - 1
    return signed_scale * expanded

def pred_sample(prob_vec):
    """Draw one index at random, with probability proportional to `prob_vec`.

    # Arguments
        prob_vec: 1-D sequence of non-negative weights (need not sum to 1)

    # Returns
        int, the sampled index

    # Raises
        ValueError: if `prob_vec` is empty or its sum is not positive
    """
    probs = np.asarray(prob_vec, dtype='float64').ravel()
    total = probs.sum()
    if probs.size == 0 or not total > 0:
        raise ValueError('prob_vec must be non-empty and have a positive sum')
    cdf = np.cumsum(probs / total)
    choice = np.random.rand()
    # first index whose cumulative probability reaches the draw
    idx = int(np.searchsorted(cdf, choice, side='left'))
    # rounding can leave cdf[-1] slightly below the draw; fall back to the last
    # index instead of index 0 so no probability mass is handed to class 0
    return min(idx, probs.size - 1)

In [7]:
def process(x):
    """Quantize audio into 256 integer levels after mu-law companding.

    # Arguments
        x: numpy array (or array-like) of amplitudes, expected in [-1, 1]

    # Returns
        int32 array of level indices in [0, 255], same shape as `x`
    """
    companded = mu_law(np.asarray(x))
    levels = ((companded + 1) * 256 / 2).astype('int32')
    # an input of exactly 1.0 maps to 256; clip so every index stays inside the
    # 256-entry range (also guards inputs slightly outside [-1, 1])
    return np.clip(levels, 0, 255)

def deprocess(x):
    """Map quantized level indices back to amplitudes.

    The index is shifted by 0.5 (bin centre), rescaled from [0, 256) to [-1, 1)
    and passed through the inverse mu-law transform.

    # Arguments
        x: scalar or numpy array of level indices

    # Returns
        amplitude value(s), same shape as `x`
    """
    half_levels = 256 / 2.
    centered = (x + 0.5) / half_levels - 1
    return mu_law_inverse(centered)

In [8]:
# quantize the loaded clip and reconstruct it, then play the reconstruction
yp = process(y)
ydp = deprocess(yp)
roundtrip_audio = ipd.Audio(ydp, rate=sample_rate)
ipd.display(roundtrip_audio)

In [9]:
class WaveNetVAE(object):
    """Variational autoencoder for audio using WaveNet-style encoder and decoder
    
    # Arguments
        num_layers: int, number of layers in encoder and decoder with increasing dilation rates in each group
        num_groups: int, number of layer groups with dilation rates reset between them
        num_filters: int, number of filters in each filter and gate convolution, and in embedding
        latent_size: int, size of encoded latent representation
        epsilon_std: float, standard deviation for vae sampling
    """
    
    def __init__(self, num_layers=10, num_groups=1, num_filters=64, latent_size=64, epsilon_std=1.0):
        """Build the embedding, encoder, decoder and training models"""
        self.num_layers = num_layers
        self.num_groups = num_groups
        self.num_filters = num_filters
        self.latent_size = latent_size
        self.epsilon_std = epsilon_std
        # A kernel-size-2 'valid' convolution with dilation d shortens the time axis by d,
        # so each group removes 2**num_layers - 1 steps. The encoder must reduce its
        # input to exactly one step, hence the dependence on num_groups
        # (equals 2**num_layers when num_groups == 1).
        self.receptive_field = num_groups * (2**num_layers - 1) + 1
        
        rf = self.receptive_field
        
        
        # embed
        l_embed_in = Input(shape=(None,), name='embed_in')
        l_embed_out = Embedding(input_dim=256, output_dim=num_filters, name='embed')(l_embed_in)
        self.embed_model = Model(inputs=l_embed_in, outputs=l_embed_out)
        
        
        # encode
        l_enc_in = Input(shape=(rf,), name='enc_in')
        h = self.embed_model(l_enc_in)
        h = self._gated_conv_stack(h, 'enc')
        # time dim should be 1
        h = Reshape((num_filters,), name='enc_reshape')(h)
        z_mean = Dense(latent_size, name='enc_z_mean')(h)
        z_log_var = Dense(latent_size, name='enc_z_log_var')(h)
        
        enc_mlv_model = Model(inputs=l_enc_in, outputs=[z_mean, z_log_var], name='enc_mlv_model')
        
        def sampling(args):
            # reparameterization: z = mean + std * epsilon
            z_mean, z_log_var = args
            epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_size),
                mean=0., stddev=epsilon_std)
            return z_mean + K.exp(z_log_var / 2) * epsilon
        
        l_enc_sample = Lambda(sampling, output_shape=(latent_size,), name='enc_sample')
        self.encode_model = Model(inputs=l_enc_in, outputs=l_enc_sample([z_mean, z_log_var]), name='enc_model')
        
        
        # decode
        # shapes are (rf * 2, ...) for training, use (rf, ...) for inference
        l_dec_in_main = Input(shape=(None,))
        l_dec_embed = self.embed_model(l_dec_in_main)
        l_dec_in_latent = Input(shape=(None, latent_size), name='dec_in_latent')
        h = Concatenate(axis=-1, name='dec_concat')([l_dec_embed, l_dec_in_latent])
        h = self._gated_conv_stack(h, 'dec')
        l_dec_out = Conv1D(filters=256, kernel_size=1, activation='softmax', name='dec_out')(h)
        self.decode_model = Model(inputs=[l_dec_in_main, l_dec_in_latent], outputs=l_dec_out, name='dec_model')
        
        
        # train
        # the 2*rf input is split into two rf-long halves; each half is encoded separately
        l_in = Input(shape=(rf * 2,), dtype='int32', name='train_in')
        l_in_reshape = Reshape((rf * 2, 1), name='train_in_pre_crop_reshape')(l_in)
        h = Cropping1D((0, rf), name='train_crop_1')(l_in_reshape)
        l_in_enc_1 = Reshape((rf,), name='train_in_post_crop_reshape_1')(h)
        h = Cropping1D((rf, 0), name='train_crop_2')(l_in_reshape)
        l_in_enc_2 = Reshape((rf,), name='train_in_post_crop_reshape_2')(h)
        
        l_enc_mlv_1 = enc_mlv_model(l_in_enc_1)
        l_enc_1 = l_enc_sample(l_enc_mlv_1)
        l_enc_mlv_2 = enc_mlv_model(l_in_enc_2)
        l_enc_2 = l_enc_sample(l_enc_mlv_2)
        # each latent is repeated over its half and the halves are joined along time
        l_enc_1_rep = RepeatVector(rf, name='enc_1_rep')(l_enc_1)
        l_enc_2_rep = RepeatVector(rf, name='enc_2_rep')(l_enc_2)
        l_enc_concat = Concatenate(axis=-2, name='enc_concat')([l_enc_1_rep, l_enc_2_rep])
        
        l_out = self.decode_model([l_in, l_enc_concat])
        
        def vae_loss(y_true, y_pred):
            # the decoder emits a 256-way softmax per time step and the targets are
            # one-hot, so the reconstruction term is the categorical (not the
            # element-wise binary) cross-entropy, averaged over time
            xent_loss = losses.categorical_crossentropy(y_true, y_pred)
            xent_loss = K.mean(xent_loss, axis=-1)
            # KL divergence of each half's posterior from the unit Gaussian prior
            zm1, zlv1 = l_enc_mlv_1
            kl_loss_1 = -0.5 * K.sum(1 + zlv1 - K.square(zm1) - K.exp(zlv1), axis=-1)
            zm2, zlv2 = l_enc_mlv_2
            kl_loss_2 = -0.5 * K.sum(1 + zlv2 - K.square(zm2) - K.exp(zlv2), axis=-1)
            return xent_loss + kl_loss_1 + kl_loss_2
        
        self.train_model = Model(inputs=l_in, outputs=l_out, name='train_model')
        
        adam = Adam(0.001)
        self.train_model.compile(optimizer=adam, loss=vae_loss)
    
    def _gated_conv_stack(self, h, prefix):
        """Apply num_groups x num_layers gated dilated convolutions to `h`.

        Layers are named '<prefix>_filter_<group>_<layer>', '<prefix>_gate_<group>_<layer>'
        and '<prefix>_multiply_<group>_<layer>' (1-based indices).
        """
        for i in range(self.num_groups):
            for j in range(self.num_layers):
                suffix = '{0}_{1}'.format(i+1, j+1)
                l_filter = Conv1D(filters=self.num_filters, kernel_size=2,
                    dilation_rate=2**j, padding='valid', activation='tanh',
                    name='{0}_filter_{1}'.format(prefix, suffix))(h)
                l_gate = Conv1D(filters=self.num_filters, kernel_size=2,
                    dilation_rate=2**j, padding='valid', activation='sigmoid',
                    name='{0}_gate_{1}'.format(prefix, suffix))(h)
                h = Multiply(name='{0}_multiply_{1}'.format(prefix, suffix))([l_filter, l_gate])
        return h
    
    def generate_data(self, batch_size=8, fpath=None):
        """Endlessly yield random training batches.

        # Arguments
            batch_size: int, number of examples per batch
            fpath: optional path of a single audio file to sample every example from;
                when falsy, a random clip listed in `metadata` is loaded per example

        # Yields
            (batch_x, batch_y): int32 arrays of shape (batch_size, 2*rf) and
            (batch_size, rf + 1, 256); batch_y is the one-hot encoding of the
            samples that follow the last rf + 1 input windows
        """
        rf = self.receptive_field
        window = rf * 2 + 1
        # a fixed source file is decoded once instead of once per example
        fixed_audio = None
        if fpath:
            fixed_audio, _ = librosa.load(fpath, sr=sample_rate)
        while True:
            batch_x = np.zeros((batch_size, rf * 2), dtype='int32')
            # decoder output length for a 2*rf input: 2*rf - (rf - 1) = rf + 1
            batch_y = np.zeros((batch_size, rf + 1, 256), dtype='int32')
            for batch_idx in range(batch_size):
                if fixed_audio is not None:
                    x_aud = fixed_audio
                else:
                    dataset_idx = np.random.randint(len(metadata))
                    x_aud, _ = librosa.load('{0}/wavs/{1}.wav'.format(
                        dataset_path, metadata[dataset_idx][0]), sr=sample_rate)
                x_aud = np.clip(x_aud, -1, 1)
                data = np.zeros((window,))
                # randint's upper bound is exclusive: +1 makes the last valid start reachable
                max_start = max(0, len(x_aud) - window)
                slice_s = np.random.randint(0, max_start + 1)
                slice_e = min(slice_s + window, len(x_aud))
                slice_len = slice_e - slice_s
                # short clips are right-aligned and left-padded with zeros; an empty
                # clip leaves the window all zeros (data[-0:] would address everything)
                if slice_len > 0:
                    data[-slice_len:] = x_aud[slice_s:slice_e]
                data = process(data)
                batch_x[batch_idx] = data[:-1]
                y_slice = data[rf:]
                batch_y[batch_idx, np.arange(len(y_slice)), y_slice] = 1
            yield (batch_x, batch_y)
    
    def generate_audio(self, samples=sample_rate*3, samp_id=0, save=False):
        """Generate audio sample by sample and display it in the notebook.

        The decoder is seeded with the last rf samples of a random training example;
        the first latent is that seed's encoding, later latents are drawn from the prior.

        # Arguments
            samples: int, number of samples to generate
            samp_id: value embedded in the output file name when saving
            save: bool, also write the result to 'outputs/wnvae_<samp_id>_<time>.wav'
        """
        rf = self.receptive_field
        bx, _ = next(self.generate_data(batch_size=1))
        x = bx[0][-rf:]
        audio = []
        # use this instance's encoder rather than a notebook-level global
        enc_1 = self.encode_model.predict(np.array([x]))[0]
        enc_2 = np.random.randn(self.latent_size) * self.epsilon_std
        enc_pos = 0
        # todo: more efficient sampling
        for _ in range(samples):
            enc_in = np.zeros((rf, self.latent_size))
            if enc_pos == rf:
                # the window has fully moved onto enc_2: shift latents, draw a new one
                enc_pos = 0
                enc_1 = enc_2
                enc_2 = np.random.randn(self.latent_size) * self.epsilon_std
            enc_in[:rf - enc_pos] = enc_1
            enc_in[rf - enc_pos:] = enc_2
            y = self.decode_model.predict([np.array([x]), np.array([enc_in])])
            y = pred_sample(y[0, -1])
            # slide the context window: drop the oldest sample, append the new one
            x = np.concatenate((x[1:], [y]), axis=-1)
            enc_pos += 1
            audio.append(deprocess(y))
        audio = np.array(audio)
        if save:
            os.makedirs('outputs', exist_ok=True)
            fpath = 'outputs/wnvae_{0}_{1}.wav'.format(
                samp_id, int(time.time()))
            # NOTE(review): librosa.output was removed in newer librosa releases (0.8+, to
            # confirm); this call needs replacing if librosa is upgraded
            librosa.output.write_wav(fpath, audio, sample_rate)
            print(fpath)
        ipd.display(ipd.Audio(audio, rate=sample_rate))

In [10]:
# notebook-wide model instance (referenced by the callbacks and trainfor below),
# built with the constructor's default hyperparameters spelled out
vae = WaveNetVAE(num_layers=10, num_groups=1, num_filters=64,
    latent_size=64, epsilon_std=1.0)

In [11]:
# sanity check on one example: the last inputs, the class ids of the last
# targets (targets should be the inputs shifted by one), and both lengths
bx, by = next(vae.generate_data(batch_size=1))
inputs, targets = bx[0], by[0]
print(inputs[-5:])
print(np.argmax(targets[-5:], axis=-1))
print(len(inputs), len(targets))

[ 56 170 161 181 101]
[170 161 181 101  61]
2048 1025


In [12]:
# time the generation of 100 samples with the untrained model
time_s = time.time()
vae.generate_audio(100)
elapsed = time.time() - time_s
print('generated 100 samples in {0:.2f} seconds'.format(elapsed))

generated 100 samples in 4.84 seconds


In [13]:
class GenAudCB(Callback):
    """Keras callback that generates and saves an audio sample every `period` epochs.

    # Arguments
        wnvae: object providing `generate_audio`; when None, the notebook-level
            `vae` is looked up each time the callback fires
        period: int, fire on epochs whose (0-based) index is a multiple of this
    """
    def __init__(self, wnvae=None, period=10):
        super(GenAudCB, self).__init__()
        self.wnvae = wnvae
        self.period = period
    
    def on_epoch_end(self, epoch, logs=None):
        """Generate `sample_rate` samples of audio, tagged with the epoch index."""
        if epoch % self.period == 0:
            print()
            source = vae if self.wnvae is None else self.wnvae
            source.generate_audio(samples=sample_rate, samp_id=epoch, save=True)

In [14]:
class SaveCB(Callback):
    """Keras callback that saves the encoder and decoder models every `period` epochs.

    # Arguments
        wnvae: object providing `encode_model` and `decode_model`; when None, the
            notebook-level `vae` is looked up each time the callback fires
        period: int, fire on epochs whose (0-based) index is a multiple of this
    """
    def __init__(self, wnvae=None, period=10):
        super(SaveCB, self).__init__()
        self.wnvae = wnvae
        self.period = period
    
    def on_epoch_end(self, epoch, logs=None):
        """Write both models under 'models/', overwriting this session's previous save."""
        if epoch % self.period == 0:
            print()
            source = vae if self.wnvae is None else self.wnvae
            # make sure the target directory exists so a long run does not die on save
            os.makedirs('models', exist_ok=True)
            source.encode_model.save('models/wnvae_{0}_encode.h5'.format(model_id))
            source.decode_model.save('models/wnvae_{0}_decode.h5'.format(model_id))
            print('saved models')

In [15]:
def trainfor(epochs, save=True, gen_aud=True, steps_per_epoch=200):
    """Train the notebook-level `vae` on freshly generated batches.

    # Arguments
        epochs: number of epochs to run (converted to int)
        save: bool, attach SaveCB to save the models periodically
        gen_aud: bool, attach GenAudCB to generate audio periodically
        steps_per_epoch: number of generator batches per epoch (converted to int)
    """
    callbacks = []
    if save:
        callbacks.append(SaveCB())
    if gen_aud:
        callbacks.append(GenAudCB())
    # Keras documents both counts as integers; cast so float literals such as 1e9 are accepted
    vae.train_model.fit_generator(vae.generate_data(),
        steps_per_epoch=int(steps_per_epoch), epochs=int(epochs), callbacks=callbacks)

In [None]:
# effectively "train until interrupted"; the epoch count is passed as an integer
trainfor(10**9)

Epoch 1/1000000000
saved models

outputs/wnvae_0_1525580995.wav


Epoch 2/1000000000
Epoch 3/1000000000
Epoch 4/1000000000
Epoch 5/1000000000
Epoch 6/1000000000
Epoch 7/1000000000
Epoch 8/1000000000
Epoch 9/1000000000
Epoch 10/1000000000
Epoch 11/1000000000
saved models

outputs/wnvae_10_1525582899.wav


Epoch 12/1000000000
Epoch 13/1000000000
Epoch 14/1000000000
Epoch 15/1000000000
Epoch 16/1000000000
Epoch 17/1000000000
Epoch 18/1000000000
Epoch 19/1000000000
Epoch 20/1000000000
Epoch 21/1000000000
saved models

outputs/wnvae_20_1525584678.wav


Epoch 22/1000000000
Epoch 23/1000000000
Epoch 24/1000000000
Epoch 25/1000000000
Epoch 26/1000000000
Epoch 27/1000000000
Epoch 28/1000000000
Epoch 29/1000000000
Epoch 30/1000000000
Epoch 31/1000000000
saved models

outputs/wnvae_30_1525586427.wav


Epoch 32/1000000000
Epoch 33/1000000000
Epoch 34/1000000000
Epoch 35/1000000000
Epoch 36/1000000000
Epoch 37/1000000000
Epoch 38/1000000000
Epoch 39/1000000000
Epoch 40/1000000000
Epoch 41/1000000000
saved models

outputs/wnvae_40_1525588189.wav


Epoch 42/1000000000
Epoch 43/1000000000
Epoch 44/1000000000
Epoch 45/1000000000
Epoch 46/1000000000
Epoch 47/1000000000
Epoch 48/1000000000
Epoch 49/1000000000
Epoch 50/1000000000
Epoch 51/1000000000
saved models

outputs/wnvae_50_1525589944.wav


Epoch 52/1000000000
Epoch 53/1000000000
Epoch 54/1000000000
Epoch 55/1000000000
Epoch 56/1000000000
Epoch 57/1000000000
Epoch 58/1000000000
Epoch 59/1000000000
Epoch 60/1000000000
Epoch 61/1000000000
saved models

outputs/wnvae_60_1525591711.wav


Epoch 62/1000000000
Epoch 63/1000000000
Epoch 64/1000000000
Epoch 65/1000000000
Epoch 66/1000000000
Epoch 67/1000000000
Epoch 68/1000000000
Epoch 69/1000000000
Epoch 70/1000000000
Epoch 71/1000000000
saved models

outputs/wnvae_70_1525593478.wav


Epoch 72/1000000000
Epoch 73/1000000000
Epoch 74/1000000000
Epoch 75/1000000000
Epoch 76/1000000000
Epoch 77/1000000000
Epoch 78/1000000000
Epoch 79/1000000000
Epoch 80/1000000000
Epoch 81/1000000000
saved models

outputs/wnvae_80_1525595245.wav


Epoch 82/1000000000
Epoch 83/1000000000
Epoch 84/1000000000
Epoch 85/1000000000
Epoch 86/1000000000
Epoch 87/1000000000
Epoch 88/1000000000
Epoch 89/1000000000
Epoch 90/1000000000
Epoch 91/1000000000
saved models

outputs/wnvae_90_1525597023.wav


Epoch 92/1000000000
Epoch 93/1000000000
Epoch 94/1000000000
Epoch 95/1000000000
Epoch 96/1000000000
Epoch 97/1000000000
Epoch 98/1000000000
Epoch 99/1000000000
Epoch 100/1000000000
Epoch 101/1000000000
saved models

outputs/wnvae_100_1525598782.wav


Epoch 102/1000000000
Epoch 103/1000000000
Epoch 104/1000000000
Epoch 105/1000000000
Epoch 106/1000000000
Epoch 107/1000000000
Epoch 108/1000000000
Epoch 109/1000000000
Epoch 110/1000000000
Epoch 111/1000000000
saved models

outputs/wnvae_110_1525600998.wav


Epoch 112/1000000000
Epoch 113/1000000000
Epoch 114/1000000000
Epoch 115/1000000000
Epoch 116/1000000000
Epoch 117/1000000000
Epoch 118/1000000000
Epoch 119/1000000000
Epoch 120/1000000000
Epoch 121/1000000000
saved models

outputs/wnvae_120_1525602756.wav


Epoch 122/1000000000
Epoch 123/1000000000
Epoch 124/1000000000
Epoch 125/1000000000
Epoch 126/1000000000
Epoch 127/1000000000
Epoch 128/1000000000
Epoch 129/1000000000
Epoch 130/1000000000
Epoch 131/1000000000
saved models

outputs/wnvae_130_1525604504.wav


Epoch 132/1000000000
Epoch 133/1000000000
Epoch 134/1000000000
Epoch 135/1000000000
Epoch 136/1000000000
Epoch 137/1000000000
Epoch 138/1000000000
Epoch 139/1000000000
Epoch 140/1000000000
Epoch 141/1000000000
saved models

outputs/wnvae_140_1525606265.wav


Epoch 142/1000000000
Epoch 143/1000000000
Epoch 144/1000000000
Epoch 145/1000000000
Epoch 146/1000000000
Epoch 147/1000000000
Epoch 148/1000000000
Epoch 149/1000000000
Epoch 150/1000000000
Epoch 151/1000000000
saved models

outputs/wnvae_150_1525608026.wav


Epoch 152/1000000000
Epoch 153/1000000000
Epoch 154/1000000000
Epoch 155/1000000000
Epoch 156/1000000000
Epoch 157/1000000000
Epoch 158/1000000000
Epoch 159/1000000000
Epoch 160/1000000000
Epoch 161/1000000000
saved models

outputs/wnvae_160_1525609779.wav


Epoch 162/1000000000
Epoch 163/1000000000
Epoch 164/1000000000
Epoch 165/1000000000
Epoch 166/1000000000
Epoch 167/1000000000
Epoch 168/1000000000
Epoch 169/1000000000
Epoch 170/1000000000
Epoch 171/1000000000
saved models

outputs/wnvae_170_1525611630.wav


Epoch 172/1000000000
Epoch 173/1000000000
Epoch 174/1000000000
Epoch 175/1000000000
Epoch 176/1000000000
Epoch 177/1000000000
Epoch 178/1000000000
Epoch 179/1000000000
Epoch 180/1000000000
Epoch 181/1000000000
saved models

outputs/wnvae_180_1525613513.wav


Epoch 182/1000000000
Epoch 183/1000000000
Epoch 184/1000000000
Epoch 185/1000000000
Epoch 186/1000000000
Epoch 187/1000000000
Epoch 188/1000000000
Epoch 189/1000000000
Epoch 190/1000000000
Epoch 191/1000000000
saved models

outputs/wnvae_190_1525615345.wav


Epoch 192/1000000000
Epoch 193/1000000000
Epoch 194/1000000000
Epoch 195/1000000000
Epoch 196/1000000000
Epoch 197/1000000000
Epoch 198/1000000000
Epoch 199/1000000000
Epoch 200/1000000000
Epoch 201/1000000000
saved models

outputs/wnvae_200_1525617331.wav


Epoch 202/1000000000
Epoch 203/1000000000
Epoch 204/1000000000
Epoch 205/1000000000
Epoch 206/1000000000
Epoch 207/1000000000
Epoch 208/1000000000
Epoch 209/1000000000
Epoch 210/1000000000
Epoch 211/1000000000
saved models

