
##### Copyright 2019 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [0]:
# Copyright 2019 Google LLC. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Training

This notebook demonstrates the process of training a model through the simple example of overfitting a single sample. It gives examples of how to instantiate a model both in Python and with gin.

In [0]:
import time

import ddsp
import ddsp.training
from ddsp.colab.colab_utils import play, specplot, DEFAULT_SAMPLE_RATE
import gin
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Turn off TF2 behavior; the rest of the notebook uses graphs and sessions.
tf.disable_v2_behavior()

# Short alias for ddsp.core.f32 (applied to the batch values further down).
f32 = ddsp.core.f32
# Sample rate shared by the synthesizers in this notebook.
sample_rate = DEFAULT_SAMPLE_RATE  # 16000

# Get a Batch of Data

In [0]:
# Pull one example out of the NSynth test split.
# Loading from GCS takes a few seconds.
data_provider = ddsp.training.data.NSynthTfds(split='test')
batch = next(tfds.as_numpy(
    data_provider.get_batch(batch_size=1, shuffle=False)))

# Keep the audio and its length (second axis); the synthesizer below is built
# with the same n_samples.
audio = batch['audio']
n_samples = audio.shape[1]

# Plot the spectrogram of the example, then render a player for it.
specplot(audio[0])
play(audio[0])

# Train

### Model in python 

In [0]:
# Start from an empty graph so re-running this cell does not pile up ops.
tf.reset_default_graph()

# Short aliases for the ddsp.training submodules used below.
preprocessing = ddsp.training.preprocessing
encoders = ddsp.training.encoders
decoders = ddsp.training.decoders
models = ddsp.training.models
TIME_STEPS = 1000

# ---- Neural networks ----
preprocessor = preprocessing.DefaultPreprocessor(time_steps=TIME_STEPS)

# The decoder emits one named output per entry in `output_splits`; those names
# are what the processor DAG below consumes.
decoder = decoders.RnnFcDecoder(
    rnn_channels=256,
    rnn_type='gru',
    ch=256,
    layers_per_stack=1,
    output_splits=(
        ('amps', 1),
        ('harmonic_distribution', 20),
        ('noise_magnitudes', 20),
    ),
)

# ---- Processors ----
additive = ddsp.synths.Additive(
    n_samples=n_samples, sample_rate=sample_rate, name='additive')


def noise_fade_fn():
  """Fade schedule for the noise synthesizer (iterations 0 to 100).

  Gradually fades in noise during training for this example. Not required
  when training on a whole dataset, as batch variations help avoid local
  minima (only noise and no harmonic components).
  """
  return ddsp.training.nn.exp_fade(
      iter_start=0, iter_end=100, start_value=1e-5)


noise = ddsp.synths.FilteredNoise(
    window_size=0, noise_fade_fn=noise_fade_fn, name='noise')
add = ddsp.processors.Add(name='add')

# ---- ProcessorGroup ----
# Each entry pairs a processor with the names of its inputs; 'noise/signal'
# and 'additive/signal' refer to the outputs of the two synthesizers.
dag = [
    (additive, ['amps', 'harmonic_distribution', 'f0_hz']),
    (noise, ['noise_magnitudes']),
    (add, ['noise/signal', 'additive/signal']),
]
processor_group = ddsp.processors.ProcessorGroup(
    dag=dag, name='processor_group')

# ---- Loss ----
spectral_loss = ddsp.losses.SpectralLoss(
    loss_type='L1', mag_weight=1.0, logmag_weight=1.0)

# ---- Full model ----
# No encoder is used: the decoder works from the preprocessor outputs.
model = models.Autoencoder(
    preprocessor=preprocessor,
    encoder=None,
    decoder=decoder,
    processor_group=processor_group,
    losses=[spectral_loss],
)

#### Or model in gin...

In [0]:
# Alternative to the pure-Python construction: describe the same kind of
# Autoencoder (preprocessor, RnnFcDecoder, additive + filtered-noise
# synthesizers summed by Add, spectral loss) as gin bindings.
# Start from an empty default graph.
tf.reset_default_graph()
# The gin config is kept inline as a string so the notebook is self-contained.
# NOTE(review): the noise fade here ends at iteration 300, whereas the Python
# cell uses iter_end=100 -- confirm whether the difference is intentional.
gin_string = """
import ddsp
import ddsp.training

# =======
# Network
# =======

# Preprocessor
models.Autoencoder.preprocessor = @preprocessing.DefaultPreprocessor()
preprocessing.DefaultPreprocessor.time_steps = 1000


# Encoder
models.Autoencoder.encoder = None

# Decoder
models.Autoencoder.decoder = @decoders.RnnFcDecoder()
decoders.RnnFcDecoder.rnn_channels = 256
decoders.RnnFcDecoder.rnn_type = 'gru'
decoders.RnnFcDecoder.ch = 256
decoders.RnnFcDecoder.layers_per_stack = 1
decoders.RnnFcDecoder.output_splits = (('amps', 1),
                                       ('harmonic_distribution', 20),
                                       ('noise_magnitudes', 20))


# =================
# Signal Processors
# =================

# ProcessorGroup
models.Autoencoder.processor_group = @processors.ProcessorGroup()

processors.ProcessorGroup.dag = [
  (@additive/synths.Additive(),
    ['amps', 'harmonic_distribution', 'f0_hz']),
  (@noise/synths.FilteredNoise(),
    ['noise_magnitudes']),
  (@add/processors.Add(),
    ['noise/signal', 'additive/signal']),
]

# Additive Synthesizer
additive/synths.Additive.name = 'additive'
additive/synths.Additive.n_samples = 64000
additive/synths.Additive.sample_rate = 16000
additive/synths.Additive.normalize_below_nyquist = True
additive/synths.Additive.amp_scale_fn = @core.exp_sigmoid

# Filtered Noise Synthesizer
noise/synths.FilteredNoise.name = 'noise'
noise/synths.FilteredNoise.n_samples = 64000
noise/synths.FilteredNoise.window_size = 0
noise/synths.FilteredNoise.amp_scale_fn = @core.exp_sigmoid
noise/synths.FilteredNoise.noise_fade_fn = @noise_fade/nn.exp_fade

noise_fade/nn.exp_fade.iter_start = 0
noise_fade/nn.exp_fade.iter_end = 300
noise_fade/nn.exp_fade.start_value = 1e-5

# Add
add/processors.Add.name = 'add'


# ======
# Losses
# ======

models.Autoencoder.losses = [
    @losses.SpectralLoss(),
]
losses.SpectralLoss.loss_type = 'L1'
losses.SpectralLoss.mag_weight = 1.0
losses.SpectralLoss.logmag_weight = 1.0
"""

# Parse the bindings with the gin config temporarily unlocked -- presumably so
# this cell can be re-run after the config has been locked; confirm.
with gin.unlock_config():
  gin.parse_config(gin_string)

# All constructor arguments are supplied by the gin bindings parsed above.
# This rebinds `model`, replacing the one built in the Python cell.
model = ddsp.training.models.Autoencoder()

## Get training op

In [0]:
learning_rate = 1e-3

# Single batch: pass every array of the example batch through f32 so the model
# is fed the same fixed example at every training step.
batch_tf = {k: f32(v) for k, v in batch.items()}

# Build the graph: model outputs for the batch, the loss, and the train op.
outputs = model(batch_tf)
loss = outputs['total_loss']
train_op = ddsp.training.train_util.get_train_op(loss,
                                                 learning_rate=learning_rate)

# Setup the session.
# An empty target uses the local in-process runtime, so the notebook runs on
# whatever machine hosts the kernel. Set `target` to a gRPC address only if you
# want to run against a remote TensorFlow server.
target = ''
sess = tf.Session(target)
# tf.initialize_all_variables() is deprecated; this is its replacement.
sess.run(tf.global_variables_initializer())

## Train Loop

In [0]:
# Overfit the single batch: 300 optimizer steps, logging the loss at each one.
for i in range(300):
  # Running the train op applies one update; only the loss value is kept.
  _, loss_ = sess.run((train_op, loss))
  print('i: {}\tLoss: {}'.format(i, loss_))

# Make Predictions

# Analyze results

In [0]:
# Evaluate every model output for the batch once, timing the call.
start_time = time.time()
predictions = sess.run(outputs)
end_time = time.time()

# Wall-clock duration of the single sess.run call above.
infer_time = end_time - start_time
print('Prediction took %.1f seconds' % (infer_time,))

In [0]:
# Predictions
# This cell only reads the numpy arrays already fetched into `predictions`.
# It deliberately leaves the default graph and `sess` untouched, so the
# training and prediction cells above can still be re-run afterwards.

# Index of the example within the batch to inspect.
k = 0

# Original audio, resynthesized audio, and the controls that produced it.
audio = predictions['audio'][k]
audio_gen = predictions['audio_gen'][k]
amps = predictions['additive']['controls']['amplitudes'][k]
harmonic_distribution = predictions['additive']['controls']['harmonic_distribution'][k]
f0_hz = predictions['f0_hz'][k]
loudness = predictions['loudness'][k]

# Listen: original first, then the model's resynthesis.
play(audio)
play(audio_gen)

# Spectrograms of the original and the resynthesis.
specplot(audio)
plt.title('Audio')
specplot(audio_gen)
plt.title('Audio Synth')

# Amplitude envelope (log scale) next to the loudness feature.
f, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].semilogy(amps)
ax[0].set_xlabel('Amps')
ax[0].set_ylim(1e-5, 2)
ax[1].plot(loudness)
ax[1].set_xlabel('loudness')

# Harmonic distribution over time next to the fundamental frequency.
f, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].plot(harmonic_distribution)
ax[0].set_title('Harmonic Distribution')
ax[1].plot(f0_hz)
ax[1].set_title('F0_Hz')
