
##### Copyright 2019 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [0]:
# Copyright 2019 Google LLC. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Timbre Transfer Demo

This notebook is a demo of timbre transfer using DDSP (Differentiable Digital Signal Processing). 
The model here is trained to generate audio conditioned on a time series of fundamental frequency and loudness. 

* [DDSP ICLR paper](https://openreview.net/forum?id=B1x1ma4tDr)
* [Audio Examples](http://goo.gl/magenta/ddsp-examples) 

<img src="https://storage.googleapis.com/ddsp/additive_diagram/ddsp_autoencoder.png" alt="DDSP Autoencoder figure" width="700">


# Environment Setup


This notebook extracts these features (fundamental frequency and loudness) from input audio (either an uploaded file or a recording from the microphone) and resynthesizes the audio with the model.

Have fun! And please feel free to hack this notebook to make your own creative interactions.

### Instructions for running:

* Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
* Press the ▶️ button on the left of each of the cells
* View the code: Double-click any of the cells
* Hide the code: Double-click the right side of the cell


In [0]:
#@title #Install

#@markdown Install ddsp, define some helper functions, and download the model. This transfers a lot of data and _should take a minute or two_.

!gcloud auth login
!mkdir /content/repos
!gcloud source repos clone --project=brain-magenta ddsp /content/repos/ddsp
!pip install -Ue /content/repos/ddsp

print('Copying checkpoint from GCS...')
!mkdir /content/ckpts
!mkdir /content/recordings
!mkdir /content/samples
!gsutil -q -m cp gs://magentadata/models/ddsp/solo_violin_ckpt.zip /content/ckpts/
!unzip -o /content/ckpts/solo_violin_ckpt.zip -d /content/ckpts &>/dev/null
print('Checkpoint copied!')

CKPT_DIR = '/content/ckpts/solo_violin_ckpt'
SAMPLES_DIR = '/content/samples'
RECORDINGS_DIR = '/content/recordings'
DEFAULT_SAMPLE_RATE = 16000

In [0]:
#@title #Import

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from google.colab import files
import librosa

import gin
import os
import time
import crepe

import ddsp
import ddsp.training
from ddsp.colab.colab_utils import (download, play, record, specplot, upload,
                                    DEFAULT_SAMPLE_RATE)

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")

# Helper Functions
# Turn off TF2 behavior: the cells below use the TF1-style API
# (tf.Session, tf.reset_default_graph).
tf.disable_v2_behavior()
sample_rate = DEFAULT_SAMPLE_RATE  # 16000
# Short alias for ddsp.core.f32, used when building the model inputs.
f32 = ddsp.core.f32

def reset_crepe():
  """Clear CREPE's cached models to force model re-building.

  Every entry of the module-level `crepe.core.models` mapping is overwritten
  with `None`; the keys themselves are kept.
  """
  cached_models = crepe.core.models
  # dict.fromkeys() maps every existing key to None.
  cached_models.update(dict.fromkeys(cached_models))

In [0]:
#@title #Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)
#@markdown * Extracts fundamental frequency (f0) and loudness features. 

record_or_upload = "Upload (.mp3 or .wav)" #@param ["Record", "Upload (.mp3 or .wav)"]

# Only used when recording from the microphone.
record_seconds =  5.5 #@param {type:"number", min:1, max:10, step:1}

if record_or_upload == "Record":
  audio = record(seconds=record_seconds)
else:
  # Load audio sample here (.mp3 or .wav file)
  # Just use the first file; any other uploaded files are ignored.
  filenames, audios = upload()
  audio = audios[0]

# Plot the spectrogram of the input and make it playable.
specplot(audio)
play(audio)


# Setup the session.
# Start from an empty default graph, create a new session, register it with
# Keras, and clear CREPE's cached models (see reset_crepe) before features
# are extracted.
tf.reset_default_graph()
target = ''
sess = tf.Session(target)
tf.keras.backend.set_session(sess)
reset_crepe()

# Compute features.
# The result is indexed below by 'loudness', 'f0' and 'f0_confidence'.
print('\nExtracting audio features...')
start_time = time.time()
audio_features = ddsp.training.eval_util.compute_audio_features(audio)
print('Audio features took %.1f seconds' % (time.time() - start_time))

# Plot Features.
# Three stacked panels sharing the frame axis.
fig, ax = plt.subplots(nrows=3, 
                       ncols=1, 
                       sharex=True,
                       figsize=(6, 8))
ax[0].plot(audio_features['loudness'])
ax[0].set_ylabel('loudness')

# f0 is scaled by 127 for display as a MIDI note number
# (presumably it is stored normalized to [0, 1] -- TODO confirm).
ax[1].plot(127.0 * audio_features['f0'])
ax[1].set_ylabel('f0 [midi note]')

ax[2].plot(audio_features['f0_confidence'])
ax[2].set_ylabel('f0 confidence')
# Assign to `_` so the returned Text object is not echoed as cell output.
_ = ax[2].set_xlabel('Time step [frame]')

In [0]:
#@title Modify conditioning

def mask_by_confidence(audio_features, confidence_level=0.1,
                       masked_f0=0.0, masked_loudness=-1.5):
  """Overwrite f0 and loudness wherever the pitch confidence is low.

  Frames whose `f0_confidence` is below `confidence_level` get their `f0` set
  to `masked_f0` and their `loudness` set to `masked_loudness`.

  For the violin model, the masking causes fast dips in loudness.
  This quick transient is interpreted by the model as the "plunk" sound.

  Args:
    audio_features: Dict with array entries 'f0', 'f0_confidence' and
      'loudness' that support boolean-mask assignment (e.g. numpy arrays).
    confidence_level: Frames with confidence strictly below this are masked.
    masked_f0: Value written into 'f0' for masked frames.
    masked_loudness: Value written into 'loudness' for masked frames.

  Returns:
    The same `audio_features` dict; its 'f0' and 'loudness' arrays are
    modified in place.
  """
  low_confidence = audio_features['f0_confidence'] < confidence_level
  audio_features['f0'][low_confidence] = masked_f0
  audio_features['loudness'][low_confidence] = masked_loudness
  return audio_features


def shift_f0(audio_features, shift_octaves=1.0):
  """Transpose the f0 track up or down by `shift_octaves` octaves.

  One octave is 12 semitones, expressed here as a fraction of the 127-step
  MIDI range. The offset is added to `audio_features['f0']` with `+=`, so an
  array stored under that key is updated in place.

  Returns:
    The same `audio_features` dict that was passed in.
  """
  semitones_per_octave = 12.0
  midi_range = 127.0
  one_octave = semitones_per_octave / midi_range
  audio_features['f0'] += shift_octaves * one_octave
  return audio_features


def smooth_loudness(audio_features, filter_size=3):
  """Apply a moving-average (box) filter to the loudness curve.

  The 'loudness' entry is replaced by its convolution with a uniform kernel
  of `filter_size` taps, using `mode='same'` so the length is unchanged.

  Returns:
    The same `audio_features` dict, with 'loudness' rebound to the result.
  """
  box_kernel = np.ones((filter_size,)) / float(filter_size)
  raw_loudness = audio_features['loudness']
  audio_features['loudness'] = np.convolve(raw_loudness, box_kernel, mode='same')
  return audio_features


# audio_features = shift_f0(audio_features, shift_octaves=1.0)
# audio_features = mask_by_confidence(audio_features, 0.4)

### Plot
# One figure per conditioning signal, titled with its feature key. The
# explicit fig/ax interface keeps each figure self-contained, and no trailing
# expression is left to echo a Text(...) repr as cell output.
for feature_name in ('f0', 'loudness'):
  feature_fig, feature_ax = plt.subplots()
  feature_ax.plot(audio_features[feature_name])
  feature_ax.set_title(feature_name)
  feature_ax.set_xlabel('Time step [frame]')

In [0]:
#@title #Choose a model

model = 'Violin' #@param ['Violin', 'Upload your own (checkpoint folder as .zip)']


# Parse gin config
with gin.unlock_config():
  gin_file = os.path.join(model_dir, 'operative_config-0.gin')
  gin.parse_config_file(gin_file, skip_unknown=False)

# Ensure dimensions and sampling rates are equal
time_steps_train = gin.query_parameter('DefaultPreprocessor.time_steps')
n_samples_train = gin.query_parameter('additive/Additive.n_samples')
hop_size = int(n_samples_train / time_steps_train)

time_steps = int(audio.shape[0] / hop_size)
n_samples = time_steps * hop_size

print("TIME_STEPS_TRAIN", time_steps_train)
print("N_SAMPLES_TRAIN", n_samples_train)
print("HOP_SIZE", hop_size)
print("TIME_STEPS", time_steps)
print("N_SAMPLES_TRANSFER", n_samples)


# Trim all input vectors to correct lengths 
for key in ['f0', 'f0_confidence', 'loudness']:
  audio_features[key] = audio_features[key][:time_steps]
audio_features['audio'] = audio_features['audio'][:n_samples]


gin_params = [
    'additive/Additive.n_samples = {}'.format(n_samples),
    'noise/FilteredNoise.n_samples = {}'.format(n_samples),
    'DefaultPreprocessor.time_steps = {}'.format(time_steps),
]

with gin.unlock_config():
  gin.parse_config(gin_params)


# Set up the model just to predict audio given new conditioning
tf.reset_default_graph()

features_tf = {k:f32(v)[tf.newaxis, :] for k, v in audio_features.items()}

model = ddsp.training.models.Autoencoder()
predictions = model.get_outputs(features_tf, training=False)

target = ''
sess = tf.Session(target)

start_time = time.time()
model.restore(sess, model_dir)
print('Loading model took %.1f seconds' % (time.time() - start_time))


In [0]:
#@title #Resynthesize Audio

# Run a batch of predictions.
# `sess` and `predictions` are created by the "Choose a model" cell, which
# must be run first.
start_time = time.time()
# [0] drops the batch dimension that was added when the model inputs were built.
audio_gen = sess.run(predictions['audio_gen'])[0]
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Plot
# Play the original input and the resynthesis, then show their spectrograms.
print('Original')
play(audio)

print('Resynthesis')
play(audio_gen)

specplot(audio)
plt.title("Original")

specplot(audio_gen)
plt.title("Resynthesis")

# Extras

In [0]:
# Adding some reverb after the fact...
gain = 0.0
decay = 3.0
# `reverb_length` was referenced without being defined anywhere in the
# notebook (NameError on a fresh kernel). Use a 3 second impulse response;
# adjust to taste.
reverb_length = 3 * sample_rate
reverb = ddsp.effects.ExpDecayReverb(reverb_length=reverb_length)
audio2 = sess.run(reverb(audio[np.newaxis, :], gain, decay))[0]
# Peak-normalize by the largest absolute sample: dividing by the signed max
# leaves the signal outside [-1, 1] whenever the largest peak is negative.
play(audio2 / np.abs(audio2).max())

In [0]:
# Get violin stats
# Pull a single, un-shuffled batch from the SoloViolin data provider and
# convert it to numpy; its 'audio' entry is the RMS reference used below.
# NOTE(review): this resets the default graph, so tensors built in earlier
# cells (e.g. `predictions`) no longer belong to the current default graph.
tf.reset_default_graph()
data_provider = ddsp.training.data.SoloViolin()
batch = data_provider.get_batch(batch_size=1, shuffle=False)
batch_np = next(tfds.as_numpy(batch))

from functools import partial

def rms(audio):
  """Return the root-mean-square level of `audio` over all of its elements."""
  mean_power = np.mean(audio ** 2.0)
  return mean_power ** 0.5
  
def normalize_by_rms(audio, norm_rms_value):
  """Rescale `audio` so that its RMS level equals `norm_rms_value`.

  Args:
    audio: Array of audio samples.
    norm_rms_value: Target RMS level of the returned signal.

  Returns:
    A new array, `audio` scaled by `norm_rms_value / rms(audio)`. Silent
    input (RMS of exactly 0) cannot be rescaled and is returned as zeros
    rather than as the NaNs a 0/0 division would produce.
  """
  audio_rms = rms(audio)
  if audio_rms == 0:
    # All samples are zero; there is nothing to scale.
    return np.zeros_like(audio)
  return audio / audio_rms * norm_rms_value

# Reference level: RMS of the violin batch loaded above.
VIOLIN_RMS = rms(batch_np['audio'])
# Normalizer with the target RMS pre-bound. The value is captured here, so the
# reassignment of VIOLIN_RMS below does not change what `norm_rms` uses.
norm_rms = partial(normalize_by_rms, norm_rms_value=VIOLIN_RMS)

# NOTE(review): this overrides the measured value after `norm_rms` was built.
# Presumably 0.09 is a hard-coded copy of the measurement -- confirm which
# value is intended.
VIOLIN_RMS = 0.09

## Add reverb (impulse response extracted from graph above)

In [0]:
def add_reverb(audio, impulse_response=None, dry_rms=None):
  """Add convolution reverb to `audio`.

  The wet signal is an RMS-normalized copy of `audio` convolved with the
  impulse response. It is added onto the original, un-normalized `audio`.

  The convolution runs in its own graph and session, so the notebook's
  default graph (and any model built in it) is left untouched and the session
  is closed on exit.

  Args:
    audio: Numpy array of audio samples (presumably 1-D; a batch axis is
      added before convolving).
    impulse_response: Impulse response to convolve with. Defaults to the
      notebook-level `ir`.
    dry_rms: RMS level the dry signal is normalized to before convolution.
      Defaults to the notebook-level `AUDIO_RMS_DRY`.

  Returns:
    `audio` plus the reverberated (wet) signal.
  """
  # Fall back to the notebook globals the original implementation relied on.
  if impulse_response is None:
    impulse_response = ir
  if dry_rms is None:
    dry_rms = AUDIO_RMS_DRY

  # Normalize input audio volume before it is fed to the convolution.
  dry_audio = norm_rms(audio, norm_rms_value=dry_rms)

  reverb_graph = tf.Graph()
  with reverb_graph.as_default():
    wet_tensor = ddsp.core.fft_convolve(f32(dry_audio)[tf.newaxis, :],
                                        f32(impulse_response)[tf.newaxis, :],
                                        delay_compensation=0)
  with tf.Session(graph=reverb_graph) as reverb_sess:
    # [0] drops the batch axis again.
    wet_audio = reverb_sess.run(wet_tensor)[0]

  return audio + wet_audio

# AUDIO_RMS = rms(audio)
# play(audio)  