In [1]:
import os

# Set before TensorFlow is imported in the next cell: an empty
# CUDA_VISIBLE_DEVICES hides every GPU from CUDA, so the graph runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
import utils

In [3]:
class Config(object):
  """Configuration object that helps manage the graph.

  Holds the training hyper-parameters and builds a WaveNet autoencoder graph:
  a non-causal temporal encoder followed by a WaveNet decoder that is
  conditioned on the pooled encoding.
  """

  def __init__(self, train_path=None):
    """Store the hyper-parameters.

    Args:
      train_path: Path handed to `reader.NSynthDataset` by `get_batch`. May be
        None when only `build` is used.
    """
    self.num_iters = 200000
    # Keys are step numbers, values are learning rates. The code that consumes
    # this schedule is not part of this notebook.
    self.learning_rate_schedule = {
        0: 2e-4,
        90000: 4e-4 / 3,
        120000: 6e-5,
        150000: 4e-5,
        180000: 2e-5,
        210000: 6e-6,
        240000: 2e-6,
    }
    # Pooling window (and stride) applied to the encoder output in `build`.
    self.ae_hop_length = 512
    # Number of channels of the encoding produced by the bottleneck conv.
    self.ae_bottleneck_width = 16
    self.train_path = train_path

  def get_batch(self, batch_size):
    """Return a training batch read from `self.train_path`.

    Args:
      batch_size: Forwarded to `get_wavenet_batch`.

    Returns:
      Whatever `NSynthDataset.get_wavenet_batch(batch_size, length=6144)`
      returns.

    Raises:
      AssertionError: If no `train_path` was given to the constructor.
    """
    assert self.train_path is not None
    # NOTE(review): `reader` is not imported by any cell visible in this
    # notebook; it must be brought into scope before this method is called.
    data_train = reader.NSynthDataset(self.train_path, is_training=True)
    return data_train.get_wavenet_batch(batch_size, length=6144)

  @staticmethod
  def _condition(x, encoding):
    """Condition the input on the encoding.

    Args:
      x: The [mb, length, channels] float tensor input. Only `channels` has to
        be statically known; `mb` and `length` are read at run time.
      encoding: The [mb, encoding_length, channels] float tensor encoding.
        `length` must be a multiple of `encoding_length`, otherwise the
        reshape below fails at run time.

    Returns:
      The output after broadcasting the encoding to x's shape and adding them.
    """
    channels = x.get_shape().as_list()[2]
    mb = tf.shape(x)[0]
    enc_length = tf.shape(encoding)[1]
    length = tf.shape(x)[1]

    # Group the time axis of x into `enc_length` chunks so that each encoding
    # step broadcasts over every sample of its chunk.
    encoding = tf.reshape(encoding, [mb, enc_length, 1, channels])
    x = tf.reshape(x, [mb, enc_length, -1, channels])
    x += encoding
    return tf.reshape(x, [mb, length, channels])

  def build(self, inputs, is_training, rescale_inputs=True,
            include_decoder=True, use_reduce_mean_to_pool=False):
    """Build the graph for this configuration.

    Args:
      inputs: A dict of inputs. For training, should contain 'wav'.
      is_training: Forwarded unchanged to every `conv1d` call.
      rescale_inputs: Whether to convert inputs to mu-law and back to unit
        scaling before passing through the model (loses gradients).
      include_decoder: bool, whether to include the decoder in the build().
      use_reduce_mean_to_pool: whether to use reduce_mean (instead of pool1d)
        for pooling.

    Returns:
      A dict of outputs that includes the 'predictions', 'loss', the 'encoding',
      the 'quantized_input', and whatever metrics we want to track for eval.
      When `include_decoder` is False the dict only contains 'encoding'.
    """
    num_stages = 10
    num_layers = 30
    filter_length = 3
    width = 512
    skip_width = 256
    ae_num_stages = 10
    ae_num_layers = 30
    ae_filter_length = 3
    ae_width = 128

    # Encode the source with 8-bit Mu-Law.
    x = inputs['wav']
    x_quantized = utils.mu_law(x)
    x_scaled = tf.cast(x_quantized, tf.float32) / 128.0
    x_scaled = tf.expand_dims(x_scaled, 2)
    x = tf.expand_dims(x, 2)

    ###
    # The Non-Causal Temporal Encoder.
    ###
    en = conv1d(
        x_scaled if rescale_inputs else x,
        causal=False,
        num_filters=ae_width,
        filter_length=ae_filter_length,
        name='ae_startconv',
        is_training=is_training)

    for num_layer in range(ae_num_layers):
      # The dilation doubles per layer and restarts every `ae_num_stages`.
      dilation = 2**(num_layer % ae_num_stages)
      d = tf.nn.relu(en)
      d = conv1d(
          d,
          causal=False,
          num_filters=ae_width,
          filter_length=ae_filter_length,
          dilation=dilation,
          name='ae_dilatedconv_%d' % (num_layer + 1),
          is_training=is_training)
      d = tf.nn.relu(d)
      en += conv1d(
          d,
          num_filters=ae_width,
          filter_length=1,
          name='ae_res_%d' % (num_layer + 1),
          is_training=is_training)

    en = conv1d(
        en,
        num_filters=self.ae_bottleneck_width,
        filter_length=1,
        name='ae_bottleneck',
        is_training=is_training)

    if use_reduce_mean_to_pool:
      # Depending on the accelerator used for training, masked.pool1d may
      # lead to out of memory error.
      # reduce_mean is equivalent to masked.pool1d when the stride is the same
      # as the window length (which is the case here).
      # The batch size is read at run time so that inputs with an unknown
      # static batch dimension (e.g. a [None, None] placeholder) still work.
      depth = en.shape.as_list()[2]
      en = tf.reshape(en, [tf.shape(en)[0], -1, self.ae_hop_length, depth])
      en = tf.reduce_mean(en, axis=2)
    else:
      en = pool1d(en, self.ae_hop_length, name='ae_pool', mode='avg')
    encoding = en

    if not include_decoder:
      return {'encoding': encoding}

    ###
    # The WaveNet Decoder.
    ###
    print('WaveNet Decoder')
    l = shift_right(x_scaled if rescale_inputs else x)
    l = conv1d(
        l,
        num_filters=width,
        filter_length=filter_length,
        name='startconv',
        is_training=is_training)

    # Set up skip connections.
    s = conv1d(
        l,
        num_filters=skip_width,
        filter_length=1,
        name='skip_start',
        is_training=is_training)

    # Residual blocks with skip connections.
    for i in range(num_layers):
      dilation = 2**(i % num_stages)
      d = conv1d(
          l,
          num_filters=2 * width,
          filter_length=filter_length,
          dilation=dilation,
          name='dilatedconv_%d' % (i + 1),
          is_training=is_training)
      d = self._condition(d,
                          conv1d(
                              en,
                              num_filters=2 * width,
                              filter_length=1,
                              name='cond_map_%d' % (i + 1),
                              is_training=is_training))

      # Gated activation: the first half of the channels feeds the sigmoid
      # gate, the second half feeds the tanh filter.
      assert d.get_shape().as_list()[2] % 2 == 0
      m = d.get_shape().as_list()[2] // 2
      d_sigmoid = tf.sigmoid(d[:, :, :m])
      d_tanh = tf.tanh(d[:, :, m:])
      d = d_sigmoid * d_tanh

      l += conv1d(
          d,
          num_filters=width,
          filter_length=1,
          name='res_%d' % (i + 1),
          is_training=is_training)
      s += conv1d(
          d,
          num_filters=skip_width,
          filter_length=1,
          name='skip_%d' % (i + 1),
          is_training=is_training)

    s = tf.nn.relu(s)
    s = conv1d(
        s,
        num_filters=skip_width,
        filter_length=1,
        name='out1',
        is_training=is_training)
    s = self._condition(s,
                        conv1d(
                            en,
                            num_filters=skip_width,
                            filter_length=1,
                            name='cond_map_out1',
                            is_training=is_training))
    s = tf.nn.relu(s)

    ###
    # Compute the logits and get the loss.
    ###
    logits = conv1d(
        s,
        num_filters=256,
        filter_length=1,
        name='logits',
        is_training=is_training)
    # Batch and time are flattened together: one row of 256 logits per sample.
    logits = tf.reshape(logits, [-1, 256])
    probs = tf.nn.softmax(logits, name='softmax')
    # Shift the quantized values by 128 to obtain non-negative class labels.
    x_indices = tf.cast(tf.reshape(x_quantized, [-1]), tf.int32) + 128
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=x_indices, name='nll'),
        0,
        name='loss')

    return {
        'predictions': probs,
        'loss': loss,
        'eval': {
            'nll': loss
        },
        'quantized_input': x_quantized,
        'encoding': encoding,
    }


In [4]:
def shift_right(x):
    """Delay the input by one time step, inserting a zero at the front.

    The last time step of `x` is dropped so the length is unchanged.

    Args:
      x: The [mb, time, channels] tensor input.

    Returns:
      The [mb, time, channels] tensor output, carrying the same static shape
      as `x`.
    """
    static_shape = x.get_shape().as_list()
    num_steps = tf.shape(x)[1]
    # One leading zero on the time axis, then keep only the first `time` steps.
    padded = tf.pad(x, [[0, 0], [1, 0], [0, 0]])
    shifted = tf.slice(
        padded, begin=[0, 0, 0], size=tf.stack([-1, num_steps, -1])
    )
    shifted.set_shape(static_shape)
    return shifted


def mul_or_none(a, b):
    """Return the integer product of two static sizes, or None if unknown.

    A size counts as unknown when it is None or a `tf.Tensor` (i.e. a value
    that is only available at graph run time). Both arguments are checked, so
    a Tensor passed as `b` yields None instead of failing inside `int(...)`.

    Args:
      a: A Python number, None, or a tf.Tensor.
      b: A Python number, None, or a tf.Tensor.

    Returns:
      None if either input is None or a tf.Tensor. Otherwise `int(a * b)`.
    """
    for value in (a, b):
        if value is None or isinstance(value, tf.Tensor):
            return None
    return int(a * b)


def time_to_batch(x, block_size):
    """Fold the time axis (dimension 1) of `x` into the batch axis.

    The time axis is viewed as `k` groups of `block_size` consecutive steps.
    After the transpose, every output batch element gathers the steps that sit
    at the same position inside their group, i.e. steps that are `block_size`
    apart in the input.

    The number of input time steps must be a multiple of `block_size`.

    Args:
      x: Tensor of shape [nb, k*block_size, n] for some natural number k. The
        channel count `n` must be statically known.
      block_size: Factor by which the batch grows and the time axis shrinks.

    Returns:
      Tensor of shape [nb*block_size, k, n]
    """
    num_channels = x.get_shape().as_list()[2]
    dynamic_shape = tf.shape(x)
    nb = dynamic_shape[0]
    steps_per_row = dynamic_shape[1] // block_size

    # [nb, k, block_size, n] -> [nb, block_size, k, n] -> [nb*block_size, k, n]
    grouped = tf.reshape(x, [nb, steps_per_row, block_size, num_channels])
    swapped = tf.transpose(grouped, [0, 2, 1, 3])
    return tf.reshape(swapped, [nb * block_size, steps_per_row, num_channels])


def batch_to_time(x, block_size):
    """Undo `time_to_batch(x, block_size)`.

    Args:
      x: Tensor of shape [nb*block_size, k, n] for some natural number k. The
        channel count `n` must be statically known.
      block_size: The factor that was used when folding time into batch.

    Returns:
      Tensor of shape [nb, k*block_size, n].
    """
    num_channels = x.get_shape().as_list()[2]
    dynamic_shape = tf.shape(x)
    nb = dynamic_shape[0] // block_size
    k = dynamic_shape[1]

    # [nb, block_size, k, n] -> [nb, k, block_size, n] -> [nb, k*block_size, n]
    unfolded = tf.reshape(x, [nb, block_size, k, num_channels])
    swapped = tf.transpose(unfolded, [0, 2, 1, 3])
    return tf.reshape(swapped, [nb, k * block_size, num_channels])


def conv1d(
    x,
    num_filters,
    filter_length,
    name,
    dilation = 1,
    causal = True,
    kernel_initializer = tf.uniform_unit_scaling_initializer(1.0),
    biases_initializer = tf.constant_initializer(0.0),
    is_training = True,
):
    """Fast 1D convolution that supports causal padding and dilation.

    Dilation is implemented by folding the time axis into the batch axis with
    `time_to_batch`, running an ordinary convolution, and unfolding again with
    `batch_to_time`. Batch and time sizes are read at run time, so only the
    channel dimension of `x` has to be statically known.

    Args:
      x: The [mb, time, channels] float tensor that we convolve. `time` must
        be a multiple of `dilation`.
      num_filters: The number of filter maps in the convolution.
      filter_length: The integer length of the filter.
      name: The name of the scope for the variables. It is passed to the
        underlying layer, so it must be unique within the graph unless
        variable reuse is set up by the caller.
      dilation: The amount of dilation.
      causal: Whether or not this is a causal convolution.
      kernel_initializer: The kernel initialization function.
      biases_initializer: The biases initialization function.
      is_training: Whether or not to use trainable variables.

    Returns:
      y: The [mb, time, num_filters] output of the 1D convolution.
    """
    num_input_channels = x.get_shape().as_list()[2]
    padding = 'VALID' if causal else 'SAME'

    x_ttb = time_to_batch(x, dilation)
    if filter_length > 1 and causal:
        # Left-pad so that a VALID convolution keeps the length and never
        # looks at future time steps.
        x_ttb = tf.pad(x_ttb, [[0, 0], [filter_length - 1, 0], [0, 0]])

    # conv2d expects a 4-D input: add a height axis of size 1.
    ttb_shape = tf.shape(x_ttb)
    x_4d = tf.reshape(
        x_ttb, [ttb_shape[0], 1, ttb_shape[1], num_input_channels]
    )
    y = tf.layers.conv2d(
        x_4d,
        num_filters,
        [1, filter_length],
        strides = [1, 1],
        padding = padding,
        kernel_initializer = kernel_initializer,
        bias_initializer = biases_initializer,
        trainable = is_training,
        name = name,
    )
    # Drop the height axis again before unfolding the batch back into time.
    y_shape = tf.shape(y)
    y = tf.reshape(y, [y_shape[0], y_shape[2], num_filters])
    return batch_to_time(y, dilation)


def pool1d(x, window_length, name, mode = 'avg', stride = None):
    """1D pooling function that supports multiple different modes.

    Args:
      x: The [mb, time, channels] float tensor that we are going to pool over.
        Only `channels` has to be statically known.
      window_length: The amount of samples we pool over.
      name: The name given to the pooling op.
      mode: The type of pooling, either 'avg' or 'max'.
      stride: The stride length. Defaults to `window_length`.

    Returns:
      pooled: The [mb, pooled_time, channels] float tensor result of pooling.
        Because 'SAME' padding is used, pooled_time is ceil(time / stride),
        which equals time // stride whenever time is a multiple of stride.

    Raises:
      ValueError: If `mode` is neither 'avg' nor 'max'.
    """
    if mode == 'avg':
        pool_fn = tf.nn.avg_pool
    elif mode == 'max':
        pool_fn = tf.nn.max_pool
    else:
        raise ValueError("mode must be 'avg' or 'max', got %r" % (mode,))

    stride = stride or window_length
    num_channels = x.get_shape().as_list()[2]
    batch_size = tf.shape(x)[0]
    length = tf.shape(x)[1]

    # The pooling ops work on 4-D tensors: add a height axis of size 1.
    window_shape = [1, 1, window_length, 1]
    strides = [1, 1, stride, 1]
    x_4d = tf.reshape(x, [batch_size, 1, length, num_channels])
    pooled = pool_fn(x_4d, window_shape, strides, padding = 'SAME', name = name)
    # Let reshape infer the pooled length so that it always agrees with what
    # the 'SAME'-padded pooling op actually produced.
    return tf.reshape(pooled, [batch_size, -1, num_channels])

Instructions for updating:
Use tf.initializers.variance_scaling instead with distribution=uniform to get equivalent behavior.


In [5]:
# No train_path is needed here: only build() is used, never get_batch().
config = Config()
# Waveform input of shape [batch, time]; both dimensions are left dynamic.
x = tf.placeholder(tf.float32, shape=[None, None])
# Dict of output tensors (predictions, loss, eval, quantized_input, encoding).
graph = config.build({"wav": x}, is_training=True)

Instructions for updating:
Use `tf.keras.layers.Conv2D` instead.
Instructions for updating:
Please use `layer.__call__` method instead.
WaveNet Decoder


In [6]:
# Inspect the output tensors and their static shapes.
graph

{'predictions': <tf.Tensor 'softmax:0' shape=(?, 256) dtype=float32>,
 'loss': <tf.Tensor 'loss:0' shape=() dtype=float32>,
 'eval': {'nll': <tf.Tensor 'loss:0' shape=() dtype=float32>},
 'quantized_input': <tf.Tensor 'Floor:0' shape=(?, ?) dtype=float32>,
 'encoding': <tf.Tensor 'Reshape_373:0' shape=(?, ?, 16) dtype=float32>}

In [7]:
# Variables are only initialised here; no checkpoint is restored in the cells
# shown, so the outputs below come from untrained weights.
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [8]:
import librosa

In [9]:
# Load the clip at a 16 kHz sample rate, then show its length in samples.
y, sr = librosa.load('aac-id04265-X8H8gVas7zE-00235.wav', sr = 16000)
len(y), sr

(135168, 16000)

In [10]:
def frames(
    audio,
    frame_duration_ms: int = 30,
    sample_rate: int = 16000,
    append_ending_trail: bool = False,
):
    """Split `audio` into consecutive, non-overlapping fixed-length frames.

    Args:
      audio: A sliceable sequence of samples (e.g. a list or a 1-D array).
      frame_duration_ms: Duration of one frame in milliseconds.
      sample_rate: Number of samples per second.
      append_ending_trail: If True, the samples left over after the last full
        frame are appended as a final, shorter frame. Nothing is appended when
        there are no leftover samples.

    Returns:
      A list of slices of `audio`. Every full frame holds
      `int(sample_rate * frame_duration_ms / 1000)` samples, including a last
      frame that ends exactly at the end of `audio`.

    Raises:
      ValueError: If the frame length works out to less than one sample.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0))
    if n <= 0:
        raise ValueError(
            'frame_duration_ms=%r at sample_rate=%r gives a frame of %d samples'
            % (frame_duration_ms, sample_rate, n)
        )

    total = len(audio)
    # Every start offset whose frame fits completely: start + n <= total.
    results = [audio[start : start + n] for start in range(0, total - n + 1, n)]

    tail_start = len(results) * n
    if append_ending_trail and tail_start < total:
        results.append(audio[tail_start:])
    return results

In [11]:
# 4000 ms at the default 16 kHz gives 64000 samples per frame, a multiple of
# the encoder hop length (512). The 7168 samples left after the second frame
# are dropped because append_ending_trail defaults to False.
fs = frames(y, 4000)
[len(i) for i in fs]

[64000, 64000]

In [12]:
# Evaluate every tensor in `graph`, feeding the frames as one [2, 64000] batch.
out = sess.run(graph, feed_dict = {x: fs})

In [13]:
# build() flattens batch and time before the softmax, so there is one row of
# 256 class probabilities per input sample (2 * 64000 rows).
out['predictions'].shape

(128000, 256)

In [14]:
# Quantized codes of the first frame, as returned by utils.mu_law.
out['quantized_input'][0]

array([ 36.,  50.,  60., ..., -85., -87., -82.], dtype=float32)

In [16]:
import numpy as np

def inv_mu_law_numpy(x, mu = 255.0):
    """Decode Mu-Law samples back to amplitudes with numpy.

    Args:
      x: The Mu-Law samples to decode (anything `np.array` accepts).
      mu: The Mu that was used when the samples were encoded.

    Returns:
      A float32 array with the decoded data. Samples equal to 0 decode to 0.
    """
    codes = np.array(x).astype(np.float32)
    # Re-centre each code and scale it by 2 / (mu + 1).
    scaled = (codes + 0.5) * 2.0 / (mu + 1)
    # Invert the logarithmic companding.
    expanded = np.power(1 + mu, np.abs(scaled)) - 1
    decoded = np.sign(scaled) / mu * expanded
    # A code of exactly 0 is passed through unchanged.
    is_zero = codes == 0
    return np.where(is_zero, codes, decoded)

In [17]:
# Decode the first frame's quantized codes back to amplitudes.
inv_mu_law_numpy(out['quantized_input'][0])

array([ 0.01514114,  0.03103959,  0.0499959 , ..., -0.14858007,
       -0.16238265, -0.12999411], dtype=float32)

In [18]:
import IPython.display as ipd
# Play back the first frame after the mu-law encode/decode round trip.
ipd.Audio(inv_mu_law_numpy(out['quantized_input'][0]), rate = sr)

In [19]:
# Most likely class (0..255) for each row of the flattened predictions.
argmax = np.argmax(out['predictions'], axis = 1)

In [22]:
# `argmax` holds class indices in [0, 255]. build() forms its labels as
# quantized value + 128, so shift back to [-128, 127] before decoding;
# without the shift the decoder is fed values outside its expected range.
ipd.Audio(inv_mu_law_numpy(argmax - 128), rate = sr)