In [1]:
import os
import time

import IPython.display as ipd
import librosa
import numpy as np

from keras.models import Model, load_model
from keras.layers import Input, Embedding, Conv1D, Multiply, Dense
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam
from keras.callbacks import Callback

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- audio representation ---
sample_rate = 16000  # Hz; passed as `sr` to every librosa.load call
bit_depth = 8
# number of quantisation classes (256)
# todo: change to mixture of logistics to model 16 bit
bd_cats = 1 << bit_depth

# --- network / batching ---
num_groups = 1    # how many times the stack of dilated layers is repeated
num_layers = 10   # dilated layers per group (dilation 2**0 .. 2**(num_layers-1))
num_filters = 64
batch_size = 16
# todo: masking
batch_len = 2 ** (num_layers + 1)

# integer Unix timestamp, used in the checkpoint file name
model_id = int(time.time())

In [3]:
# single channel
dataset_path = 'C:/Datasets/LJSpeech-1.1'
with open('{0}/metadata.csv'.format(dataset_path), encoding='utf-8') as f:
    metadata_str = f.read()
# Each line is '|'-separated; only the first two fields (clip id and
# transcription) are kept. Blank lines are filtered out instead of blindly
# dropping the last element, so the final record survives even when the file
# does not end with a newline.
metadata = [line.split('|')[:2]
            for line in metadata_str.split('\n') if line.strip()]
print('first example:')
ipd.display(ipd.Audio('{0}/wavs/{1}.wav'.format(
    dataset_path, metadata[0][0])))
print(metadata[0][1])

first example:


Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition


In [4]:
# number of examples, then the ids of the first and last clips
num_examples = len(metadata)
print(num_examples)
first_id, last_id = metadata[0][0], metadata[-1][0]
print(first_id, last_id)

13100
LJ001-0001 LJ050-0278


In [5]:
# load the first clip at the working sample rate and play it back
first_wav_path = '{0}/wavs/{1}.wav'.format(dataset_path, metadata[0][0])
y, sr = librosa.load(first_wav_path, sr=sample_rate)
print(len(y), sr)
first_clip_player = ipd.Audio(y, rate=sr)
ipd.display(first_clip_player)

154481 16000


In [6]:
#librosa.output.write_wav('test.wav', y, sr)

In [7]:
def mu_law(x, mu=bd_cats-1):
    """Apply mu-law companding to `x`.

    The magnitude of `x` is compressed logarithmically and the sign is
    re-applied afterwards: sign(x) * log(1 + mu*|x|) / log(1 + mu).

    x:  scalar or array of amplitudes.
    mu: companding constant; defaults to bd_cats-1.
    """
    compressed = np.log(1 + mu*np.abs(x)) / np.log(1 + mu)
    return np.sign(x) * compressed

def mu_law_inverse(x, mu=bd_cats-1):
    """Invert mu-law companding.

    Computes sign(x) * (1/mu) * ((1 + mu)**|x| - 1).

    x:  scalar or array of companded values.
    mu: companding constant; defaults to bd_cats-1.
    """
    signed_scale = np.sign(x) * (1 / mu)
    expanded = np.power(1 + mu, np.abs(x)) - 1
    return signed_scale * expanded

def pred_sample(prob_vec):
    """Draw one class index from a (possibly unnormalised) weight vector.

    Inverse-CDF sampling: a uniform draw in [0, 1) is scaled by the total
    weight and located in the cumulative sum of `prob_vec`.

    prob_vec: 1-D sequence of non-negative weights.
    Returns:  a Python int index into `prob_vec`. For an empty vector, or one
              whose total is zero or not finite, 0 is returned.
    """
    cum = np.cumsum(np.asarray(prob_vec, dtype='float64'))
    choice = np.random.rand()
    if cum.size == 0:
        return 0
    total = cum[-1]
    if not np.isfinite(total) or total <= 0:
        return 0
    # side='right' skips entries whose cumulative weight equals the draw, so a
    # leading zero-weight class is never returned when the draw is exactly 0.
    idx = np.searchsorted(cum, choice * total, side='right')
    # Guard against rounding pushing the draw up to `total`: cap at the last
    # index that actually contributes weight instead of falling back to 0.
    last = np.searchsorted(cum, total, side='left')
    return int(min(idx, last))

In [8]:
def process(x):
    """Quantise audio amplitudes into integer class ids.

    The input is mu-law companded, shifted and scaled by bd_cats/2, and
    truncated to int32. The result is then clamped to 0..bd_cats-1 in a
    single vectorised step, so an input of exactly 1.0 maps to the top class
    and slightly out-of-range inputs cannot produce an invalid class id.

    x:       scalar or array of amplitudes, nominally in [-1, 1].
    Returns: int32 class ids with the same shape as `x`.
    """
    companded = mu_law(x)
    bins = ((companded+1)*bd_cats/2).astype('int32')
    return np.clip(bins, 0, bd_cats-1)

def deprocess(x):
    """Turn class ids back into amplitudes.

    Each id is moved to the centre of its bin (+0.5), rescaled by bd_cats/2
    and shifted by -1, then passed through the inverse mu-law transform.
    """
    half_range = bd_cats/2.
    centred = (x + 0.5)/half_range - 1
    return mu_law_inverse(centred)

In [9]:
# quantise the first clip, reconstruct it, and listen to the result
yp = process(y)
ydp = deprocess(yp)
roundtrip_player = ipd.Audio(ydp, rate=sample_rate)
ipd.display(roundtrip_player)

In [10]:
def generate_data():
    """Endlessly yield (batch_x, batch_y) training batches.

    For every batch row a random clip is loaded, clipped to [-1, 1], and a
    random window of up to batch_len+1 samples is quantised with `process`.
    batch_x holds the first batch_len class ids of the window and batch_y the
    one-hot encoding of the same window shifted by one sample.

    Yields: (batch_x, batch_y) with shapes (batch_size, batch_len) and
            (batch_size, batch_len, bd_cats), both int32.

    For clips shorter than batch_len+1 the unused tail of a row stays at
    class 0 in batch_x and all-zero in batch_y (todo: masking).
    """
    window = batch_len + 1  # one extra sample so targets are inputs shifted by 1
    while True:
        batch_x = np.zeros((batch_size, batch_len), dtype='int32')
        batch_y = np.zeros((batch_size, batch_len, bd_cats), dtype='int32')
        for batch_idx in range(batch_size):
            dataset_idx = np.random.randint(len(metadata))
            x_aud, _ = librosa.load('{0}/wavs/{1}.wav'.format(
                dataset_path, metadata[dataset_idx][0]), sr=sample_rate)
            x_aud = np.clip(x_aud, -1, 1)
            # randint's upper bound is exclusive: +1 lets the last valid
            # window start be drawn, and also covers max_start == 0.
            max_start = max(0, len(x_aud) - window)
            slice_s = np.random.randint(0, max_start + 1)
            slice_e = min(slice_s + window, len(x_aud))
            slice_len = slice_e - slice_s
            padded = np.zeros((window,))
            padded[:slice_len] = x_aud[slice_s:slice_e]
            encoded = process(padded)
            # never negative, so an empty clip cannot turn into a [:-1] slice
            n_steps = max(slice_len - 1, 0)
            batch_x[batch_idx, :n_steps] = encoded[:n_steps]
            targets = encoded[1:n_steps + 1]
            # one-hot encode all targets of this row in one indexed assignment
            batch_y[batch_idx, np.arange(n_steps), targets] = 1
        yield (batch_x, batch_y)

In [11]:
# the targets should be the inputs shifted left by one sample
bx, by = next(generate_data())
preview = slice(0, 5)
print(bx[0][preview])
print(by[0][preview].argmax(axis=-1))

[177 179 181 183 183]
[179 181 183 183 182]


In [12]:
# Input: sequences of integer class ids of arbitrary length.
l_in = Input(shape=(None,), dtype='int32')
h = Embedding(input_dim=bd_cats, output_dim=num_filters)(l_in)
for group_idx in range(num_groups):
    for j in range(num_layers):
        # gated unit: a tanh "filter" conv multiplied by a sigmoid "gate" conv,
        # both causal, with the dilation doubling at every layer
        l_filter = Conv1D(filters=num_filters, kernel_size=2,
            dilation_rate=2**j, padding='causal', activation='tanh')(h)
        l_gate = Conv1D(filters=num_filters, kernel_size=2,
            dilation_rate=2**j, padding='causal', activation='sigmoid')(h)
        h = Multiply()([l_filter, l_gate])
# One softmax over the bd_cats classes per time step. The width is tied to
# bd_cats (instead of a literal 256) so it always matches the embedding and
# the one-hot targets if bit_depth is changed.
l_out = TimeDistributed(Dense(bd_cats, activation='softmax'))(h)

model = Model(inputs=l_in, outputs=l_out)

adam = Adam(0.001)
model.compile(optimizer=adam, loss='categorical_crossentropy')

In [13]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, None)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, None, 64)      16384       input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, None, 64)      8256        embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, None, 64)      8256        embedding_1[0][0]                
___________________________________________________________________________________________

In [14]:
def generate_audio(samples=sample_rate*3, samp_id=0):
    """Sample audio from the model one step at a time, save it, and play it.

    The last 2**num_layers class ids of the first row of a fresh training
    batch seed the context. Each step predicts a distribution for the next
    sample, draws from it with `pred_sample`, and slides the context by one.

    samples: number of samples to generate (default: 3 seconds' worth).
    samp_id: tag embedded in the output file name.

    Writes 'outputs/wavenet_<samp_id>_<unix time>.wav', prints the path and
    displays an audio player.
    """
    # Create the output directory before the slow sampling loop so a missing
    # folder cannot make the final write fail after all the work is done.
    os.makedirs('outputs', exist_ok=True)
    bx, _ = next(generate_data())
    x = bx[0][-(2**num_layers):]
    audio = []
    # todo: fix inefficient sampling
    for i in range(samples):
        probs = model.predict(np.array([x]))
        y = pred_sample(probs[0, -1])
        # drop the oldest sample and append the new one
        x = np.concatenate((x[1:], [y]), axis=-1)
        audio.append(deprocess(y))
    audio = np.array(audio)
    fpath = 'outputs/wavenet_{0}_{1}.wav'.format(
        samp_id, int(time.time()))
    librosa.output.write_wav(fpath, audio, sample_rate)
    print(fpath)
    ipd.display(ipd.Audio(audio, rate=sample_rate))

In [15]:
# wall-clock cost of generating with the default settings
time_s = time.time()
generate_audio()
elapsed = time.time() - time_s
print('generated 3 seconds in {0:.2f} seconds'.format(elapsed))

outputs/wavenet_0_1524071472.wav


generated 3 seconds in 381.86 seconds


In [16]:
class GenAudCB(Callback):
    """Callback that generates an audio sample when the epoch index is a
    multiple of 10 (index 0, i.e. the first epoch, included)."""

    def __init__(self):
        super().__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Generate audio tagged with `epoch` on every tenth epoch index."""
        if epoch % 10 != 0:
            return
        print()
        generate_audio(samp_id=epoch)

In [17]:
class SaveCB(Callback):
    """Callback that saves the global `model` when the epoch index is a
    multiple of 10 (index 0, i.e. the first epoch, included).

    The file 'models/wn_<model_id>.h5' is overwritten on every save.
    """

    def __init__(self):
        super(SaveCB, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """Save the model on every tenth epoch index."""
        if epoch % 10 == 0:
            print()
            # make sure the target directory exists so the save cannot fail
            # on a fresh checkout
            os.makedirs('models', exist_ok=True)
            model.save('models/wn_{0}.h5'.format(model_id))
            print('saved model')

In [18]:
def trainfor(epochs, save=True, gen_aud=True):
    """Train the global `model` on batches from `generate_data`.

    epochs:  number of epochs; coerced to int so values such as 1e9 are
             accepted.
    save:    attach SaveCB (periodic checkpointing) when true.
    gen_aud: attach GenAudCB (periodic audio generation) when true.
    """
    steps_per_epoch = 200  # an integer count rather than the float 2e2
    callbacks = []
    if save:
        callbacks.append(SaveCB())
    if gen_aud:
        callbacks.append(GenAudCB())
    model.fit_generator(generate_data(), steps_per_epoch, int(epochs),
                        callbacks=callbacks)

In [19]:
# short run at the initial learning rate
trainfor(10)
# recompile with a lower learning rate, then train until interrupted
adam = Adam(0.0002)
model.compile(optimizer=adam, loss='categorical_crossentropy')
# integer epoch count (the float 1e9 relied on the library tolerating floats)
trainfor(10**9)

Epoch 1/10
saved model

outputs/wavenet_0_1524072220.wav


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/1000000000
saved model

outputs/wavenet_0_1524076251.wav


Epoch 2/1000000000
Epoch 3/1000000000
Epoch 4/1000000000
Epoch 5/1000000000
Epoch 6/1000000000
Epoch 7/1000000000
Epoch 8/1000000000
Epoch 9/1000000000
Epoch 10/1000000000
Epoch 11/1000000000
saved model

outputs/wavenet_10_1524080162.wav


Epoch 12/1000000000
Epoch 13/1000000000
Epoch 14/1000000000
Epoch 15/1000000000
Epoch 16/1000000000
Epoch 17/1000000000
Epoch 18/1000000000
Epoch 19/1000000000
Epoch 20/1000000000
Epoch 21/1000000000
saved model

outputs/wavenet_20_1524084094.wav


Epoch 22/1000000000
Epoch 23/1000000000
Epoch 24/1000000000
Epoch 25/1000000000
Epoch 26/1000000000
Epoch 27/1000000000
Epoch 28/1000000000
Epoch 29/1000000000
Epoch 30/1000000000
Epoch 31/1000000000
saved model

outputs/wavenet_30_1524088121.wav


Epoch 32/1000000000
Epoch 33/1000000000
Epoch 34/1000000000
Epoch 35/1000000000
Epoch 36/1000000000
Epoch 37/1000000000
Epoch 38/1000000000
Epoch 39/1000000000
Epoch 40/1000000000
Epoch 41/1000000000
saved model

outputs/wavenet_40_1524092380.wav


Epoch 42/1000000000
Epoch 43/1000000000
Epoch 44/1000000000
Epoch 45/1000000000
Epoch 46/1000000000
Epoch 47/1000000000
Epoch 48/1000000000
Epoch 49/1000000000
Epoch 50/1000000000
Epoch 51/1000000000
saved model

outputs/wavenet_50_1524096858.wav


Epoch 52/1000000000
Epoch 53/1000000000

KeyboardInterrupt: 