In [1]:
import os
import time

import IPython.display as ipd
import librosa
import numpy as np

from keras.callbacks import Callback
from keras.layers import Embedding, Dense, Conv1D
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential, load_model
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# single channel
dataset_path = 'C:/Datasets/LJSpeech-1.1'
with open('{0}/metadata.csv'.format(dataset_path), encoding='utf-8') as f:
    metadata_str = f.read()
# Rows are '|'-separated; only the first two fields are kept: the clip id
# (used below to build the wav filename) and the text that gets printed.
# Blank lines are skipped explicitly rather than blindly dropping the last
# entry, so a file that lacks a trailing newline keeps its final record.
metadata = [line.split('|')[:2]
            for line in metadata_str.split('\n') if line.strip()]
print('first example:')
ipd.display(ipd.Audio('{0}/wavs/{1}.wav'.format(
    dataset_path, metadata[0][0])))
print(metadata[0][1])

first example:


Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


In [3]:
# Number of records, followed by the ids of the first and last clip.
print(len(metadata))
print(*(metadata[pos][0] for pos in (0, -1)))

13100
LJ001-0001 LJ050-0278


In [4]:
# Load the first clip with a requested sample rate of 8000, report its
# length and rate, then play it back.
y, sr = librosa.load(
    '/'.join((dataset_path, 'wavs', metadata[0][0] + '.wav')),
    sr=8000,
)
print(len(y), sr)
ipd.display(ipd.Audio(y, rate=sr))

77241 8000


In [5]:
#librosa.output.write_wav('test.wav', y, sr)

In [6]:
def mu_law(x, mu=255):
    """Apply mu-law companding: sign(x) * ln(1 + mu*|x|) / ln(1 + mu).

    Works element-wise on scalars and NumPy arrays alike.
    """
    magnitude = np.abs(x)
    compressed = np.sign(x) * np.log(1 + mu * magnitude)
    return compressed / np.log(1 + mu)

def mu_law_inverse(x, mu=255):
    """Undo mu-law companding: sign(x) * (1/mu) * ((1 + mu)**|x| - 1).

    Works element-wise on scalars and NumPy arrays alike.
    """
    signed_scale = np.sign(x) * (1 / mu)
    growth = np.power(1 + mu, np.abs(x)) - 1
    return signed_scale * growth

def pred_sample(prob_vec):
    """Draw a random index from ``prob_vec``, treated as unnormalised weights.

    Args:
        prob_vec: 1-D sequence of non-negative weights; it is normalised by
            its own sum, so it does not have to add up to exactly 1.

    Returns:
        int: the sampled index. Entries with zero weight are never returned
        as long as at least one weight is positive.
    """
    weights = np.asarray(prob_vec, dtype='float64')
    cdf = np.cumsum(weights) / np.sum(weights)
    choice = np.random.rand()
    # side='right' selects the first index whose cumulative mass is strictly
    # greater than the draw, so a draw of exactly 0.0 cannot land on a
    # leading zero-weight entry.
    idx = int(np.searchsorted(cdf, choice, side='right'))
    if idx < len(cdf):
        return idx
    # Floating-point rounding can leave the last cumulative value a hair below
    # the draw. Fall back to the last entry that carries any mass instead of
    # index 0, which may have no probability at all.
    with_mass = np.flatnonzero(weights)
    return int(with_mass[-1]) if with_mass.size else 0

In [7]:
def process(x):
    """Quantise audio samples into 256 mu-law classes.

    The input is companded with ``mu_law``, shifted and scaled by 128, and
    truncated to ``int32``.

    Args:
        x: scalar or array of amplitudes, nominally within [-1, 1].

    Returns:
        int32 class ids in the range 0..255, same shape as the input.
    """
    x = mu_law(x)
    x = ((x + 1) * 128).astype('int32')
    # An amplitude of exactly +1.0 maps to 256, one past the last class, and
    # anything outside [-1, 1] would land even further out of range. Clamp in
    # a single vectorised step instead of patching only the value 256 with a
    # per-sample Python loop.
    return np.clip(x, 0, 255)

def deprocess(x):
    """Turn class ids back into amplitudes.

    Each id is offset by 0.5, divided by 128 and shifted by -1, then passed
    through ``mu_law_inverse``.
    """
    centred = (x + 0.5) / 128. - 1
    return mu_law_inverse(centred)

In [8]:
# Round trip: quantise the loaded clip, reconstruct it, and listen to the
# result to hear what the 256-class encoding does to the audio.
yp = process(y)
ydp = deprocess(yp)
ipd.display(ipd.Audio(data=ydp, rate=sr))

In [31]:
# Model and batching hyper-parameters.
num_groups, num_layers = 1, 9
num_filters = batch_size = 32
# Window length in samples: 2**num_layers.
batch_len = 1 << num_layers

In [32]:
def generate_data():
    """Endlessly yield random ``(batch_x, batch_y)`` training batches.

    Each example is a randomly positioned window of ``batch_len + 1``
    quantised samples from a randomly chosen clip in ``metadata``, loaded with
    a requested sample rate of 8000. ``batch_x`` holds the first ``batch_len``
    class ids; ``batch_y`` is the one-hot encoding of the same window shifted
    one sample ahead, i.e. the next-sample targets.

    Yields:
        tuple: ``batch_x``, an int32 array of shape ``(batch_size, batch_len)``,
        and ``batch_y``, an int32 array of shape
        ``(batch_size, batch_len, 256)``. For clips shorter than the window,
        the unused tail of ``batch_x`` stays 0 and the matching rows of
        ``batch_y`` stay all-zero.
    """
    while True:
        window = batch_len + 1
        batch_x = np.zeros((batch_size, batch_len), dtype='int32')
        batch_y = np.zeros((batch_size, batch_len, 256), dtype='int32')
        for batch_idx in range(batch_size):
            dataset_idx = np.random.randint(len(metadata))
            x_aud, _ = librosa.load('{0}/wavs/{1}.wav'.format(
                dataset_path, metadata[dataset_idx][0]), sr=8000)
            # Largest valid start offset. randint's upper bound is exclusive,
            # hence the +1 so the final window of the clip can be drawn too.
            max_start = max(0, len(x_aud) - window)
            slice_s = np.random.randint(0, max_start + 1)
            slice_e = min(slice_s + window, len(x_aud))
            slice_len = slice_e - slice_s
            data = np.zeros((window,))
            data[:slice_len] = x_aud[slice_s:slice_e]
            data = process(data)
            # Number of (input, target) pairs; never negative, so an empty
            # clip cannot turn into a ``[:-1]`` slice.
            n_pairs = max(slice_len - 1, 0)
            batch_x[batch_idx, :n_pairs] = data[:n_pairs]
            targets = data[1:n_pairs + 1]
            # Vectorised one-hot: one assignment instead of a Python loop.
            batch_y[batch_idx, np.arange(n_pairs), targets] = 1
        yield (batch_x, batch_y)

In [33]:
# Pull a single batch from a fresh generator to sanity-check its output.
first_batch = next(generate_data())
bx, by = first_batch

In [34]:
# The targets should be the inputs shifted one sample ahead.
print(bx[0, :5])
print(by[0, :5].argmax(axis=-1))

[ 26  31  48 138 195]
[ 31  48 138 195 194]


In [35]:
# Embedding -> stack of dilated causal convolutions -> per-timestep softmax
# over the 256 sample classes.
model = Sequential()
model.add(Embedding(input_dim=256, output_dim=num_filters,
    input_length=batch_len))
# Dilation doubles with every layer and restarts at 1 for each group.
dilation_rates = [2**layer_idx
                  for _ in range(num_groups)
                  for layer_idx in range(num_layers)]
for rate in dilation_rates:
    model.add(Conv1D(filters=num_filters, kernel_size=2,
        dilation_rate=rate, padding='causal', activation='tanh'))
model.add(TimeDistributed(Dense(256, activation='softmax')))

adam = Adam(0.002)
model.compile(optimizer=adam, loss='categorical_crossentropy')

In [41]:
def generate_audio(samples=24000, samp_id=0):
    """Sample audio from ``model`` one step at a time, save it, and play it.

    The seed window is the first example of a fresh training batch. For every
    generated sample the model is run on the current window, a class id is
    drawn from the distribution predicted for the last timestep, and the
    window is advanced by one position with the new id appended.

    Args:
        samples: number of samples to generate (written at a rate of 8000).
        samp_id: tag embedded in the output filename.

    Side effects:
        Writes ``outputs/wavenet_<samp_id>_<unix time>.wav``, prints its path
        and displays an audio player.
    """
    out_dir = 'outputs'
    # Make sure the target directory exists before anything is written to it.
    os.makedirs(out_dir, exist_ok=True)

    seed_x, _ = next(generate_data())
    x = seed_x[0]
    audio = []
    # todo: fix inefficient sampling
    for _ in range(samples):
        probs = model.predict(np.array([x]))
        sample_id = pred_sample(probs[0, -1])
        # Slide the window: drop the oldest id, append the new one.
        x = np.concatenate((x[1:], [sample_id]), axis=-1)
        audio.append(deprocess(sample_id))
    audio = np.array(audio)
    fpath = '{0}/wavenet_{1}_{2}.wav'.format(
        out_dir, samp_id, int(time.time()))
    # NOTE(review): librosa.output was removed in librosa 0.8; confirm the
    # installed version still provides write_wav.
    librosa.output.write_wav(fpath, audio, 8000)
    print(fpath)
    ipd.display(ipd.Audio(audio, rate=8000))

In [37]:
# Time a single generation run with the default settings.
time_s = time.time()
generate_audio()
elapsed = time.time() - time_s
print('generated 3 seconds in {0:.2f} seconds'.format(elapsed))

outputs/output_0_1523334835.wav


generated 3 seconds in 106.73 seconds


In [38]:
class DoGenCB(Callback):
    """Keras callback that calls ``generate_audio`` at the end of selected epochs.

    Args:
        every: generation runs whenever ``epoch % every == 0``. Defaults to
            10, the interval that used to be hard-coded.

    Raises:
        ValueError: if ``every`` is smaller than 1.
    """

    def __init__(self, every=10):
        super(DoGenCB, self).__init__()
        if every < 1:
            # Fail here rather than with a ZeroDivisionError mid-training.
            raise ValueError('every must be >= 1, got {0}'.format(every))
        self.every = every

    def on_epoch_end(self, epoch, logs=None):
        """Generate a sample tagged with ``epoch`` when the interval is hit."""
        if epoch % self.every == 0:
            generate_audio(samp_id=epoch)

In [43]:
def trainfor(epochs, do_gen=True, steps_per_epoch=100):
    """Train ``model`` on batches from ``generate_data``.

    Args:
        epochs: number of epochs to run; coerced to ``int`` so values such as
            ``1e9`` are accepted.
        do_gen: when true, attach ``DoGenCB`` so audio samples are generated
            during training.
        steps_per_epoch: batches drawn per epoch (default 100, the value that
            used to be hard-coded as ``1e2``); coerced to ``int``.
    """
    callbacks = []
    if do_gen:
        callbacks.append(DoGenCB())
    # Keras documents both counts as integers; pass them as such and by
    # keyword so the call does not rely on positional order.
    model.fit_generator(
        generate_data(),
        steps_per_epoch=int(steps_per_epoch),
        epochs=int(epochs),
        callbacks=callbacks,
    )

In [None]:
# Effectively "train until interrupted". The epoch count is passed as an
# integer because that is the type Keras documents for it.
trainfor(10**9)

Epoch 1/1000000000


Epoch 2/1000000000
 21/100 [=====>........................] - ETA: 287s - loss: 4.3466