Timbre Transfer using DDSP

Here you can play any instrument with your voice: sing the melody (or mimic the sound you would like to play) and upload a sample of the real instrument. Both recordings should be monophonic, meaning there is no background noise and only one note sounds at a time.

In [None]:
# Melody Input: URL or local path of the recording whose melody is played.
# Values starting with "http" are downloaded by maybe_download.
audio_melody = 'https://pandasdb-ddsp-demo.s3.eu-central-1.amazonaws.com/.pandas_db_files/samples/vozes_trimmed.mp3' #@param {type: "string"}

# Timbre Input: URL or local path of the recording the model is fine-tuned on.
audio_timbre = 'https://pandasdb-ddsp-demo.s3.eu-central-1.amazonaws.com/.pandas_db_files/samples/urmp_test/AuSep_4_sax_27_King.wav' #@param {type: "string"}

# Number of train steps run during fine-tuning; used as the default `steps`
# argument of TunedAEFileApi.
fine_tune_steps = 20 #@param {type: "integer"}
# Directory that is created if missing and receives audio_synth.wav.
output_path = "/content/output"

In [None]:
from uuid import uuid4
# Maybe download data
def maybe_download(filepath):
  """Return a local path for `filepath`, downloading it first when it is a URL.

  Args:
    filepath: An http(s) URL or a local file path.

  Returns:
    For a URL, the name of the downloaded file in the current working
    directory (the last component of the URL path). Otherwise `filepath`
    is returned unchanged.

  Raises:
    ValueError: If the URL path has no final component to use as a file name.
  """
  # Imported locally so the helper does not depend on other notebook cells.
  import posixpath
  import urllib.parse
  import urllib.request

  # Match the scheme exactly so that local names such as "http_take1.wav"
  # are not mistaken for URLs.
  if not filepath.startswith(("http://", "https://")):
    return filepath

  # Take the name from the URL path only, so a query string or fragment
  # does not end up in the local file name.
  new_name = posixpath.basename(urllib.parse.urlsplit(filepath).path)
  if not new_name:
    raise ValueError(f"Cannot derive a file name from URL: {filepath}")

  # Always fetch a fresh copy and overwrite any existing file of that name.
  urllib.request.urlretrieve(filepath, new_name)
  return new_name
# Resolve both inputs to local paths; inputs given as URLs are downloaded
# into the current working directory first.
melody = maybe_download(audio_melody)
timbre = maybe_download(audio_timbre)

In [None]:
#@title #Install and Import
!pip install git+git://github.com/nielsrolf/pandas_db &> /dev/null
!pip install tensorflow==2.4 &> /dev/null
!pip install apache-beam avro-python3==1.9.0 &> /dev/null
!pip install --upgrade git+git://github.com/nielsrolf/ddsp &> /dev/null
%load_ext autoreload
%autoreload 2

import os
os.makedirs(output_path, exist_ok=True)
#@markdown Install ddsp, define some helper functions, and download the model. This transfers a lot of data and _should take a minute or two_.
%tensorflow_version 2.x



# Ignore a bunch of deprecation warnings
import warnings
warnings.filterwarnings("ignore")

from ddsp.training.file_api import TrainedModelFileAPI
from ddsp.training.tuned import TunedAEFileApi
from ddsp.colab.colab_utils import play

# Helper Functions
sample_rate = 16000


from ddsp.training.data_preparation.prepare_tfrecord_lib import _load_audio


def load_audio_file(file):
  """Print `file` and return the 'audio' entry loaded from it.

  The file is read with `_load_audio` at the module-level `sample_rate`.
  """
  print(str(file))
  return _load_audio(file, sample_rate)['audio']


def play_file(file):
  """Load `file` via `load_audio_file` and hand the audio to `play`."""
  play(load_audio_file(file))



print('Done!')

In [None]:
#@title #Download model and sample wav files
!wget -N https://pandasdb-ddsp-demo.s3.eu-central-1.amazonaws.com/dashboard/improved_baseline_ae_combined_train.zip
!unzip improved_baseline_ae_combined_train.zip
model_dir = "improved_baseline_ae_combined_train"

## Timbre Transfer with fine-tuning

In [None]:
"""
Wrapper for an autoencoder that fine tunes on a batch
whenever it needs to extract z
"""
import functools
import os
import shutil
import tempfile

import gin
import numpy as np
import tensorflow as tf
from ddsp.training import train_util
from ddsp.training.data_preparation.prepare_tfrecord_lib import (
    _add_f0_estimate, _load_audio, add_loudness, add_loudness_new,
    split_example)
from ddsp.training.file_api import TrainedModelFileAPI
from ddsp.training.models import get_model
from ddsp.training.trainers import Trainer


@functools.lru_cache(maxsize=4)
def preprocess_file(file):
  """Load `file` and turn it into a list of feature examples.

  The audio is loaded at 16000 Hz, passed through `_add_f0_estimate`,
  `add_loudness` and `add_loudness_new` (each called with a rate of 250),
  and finally cut up by `split_example`.

  Results are memoized for up to four distinct `file` arguments, so
  repeated calls return the same list object; callers should not mutate it.

  Args:
    file: Path of the audio file; must be hashable because of the cache.

  Returns:
    A list with everything `split_example` yields for this file.
  """
  audio_rate = 16000
  feature_rate = 250
  example = _load_audio(file, audio_rate)
  for add_feature in (_add_f0_estimate, add_loudness, add_loudness_new):
    example = add_feature(example, audio_rate, feature_rate)
  # split_example arguments: sample_rate, frame_rate, window_secs, hop_secs
  pieces = split_example(example, audio_rate, feature_rate, 4, 4)
  return list(pieces)


def preprocess_files(files, batch_size=2):
  """Preprocess `files` and stack their examples into shuffled batches.

  Every example produced by `preprocess_file` is collected per feature key.
  The first `(n // batch_size) * batch_size` examples are then shuffled with
  one shared random permutation and reshaped into batches; any remaining
  examples are dropped.

  Args:
    files: Iterable of audio file paths.
    batch_size: Number of examples per batch.

  Returns:
    A dict mapping each feature key to an array of shape
    `[num_batches, batch_size, *example_shape]`.

  Raises:
    ValueError: If fewer than `batch_size` examples are available.
    Exception: Any other preprocessing error is re-raised after a hint
      about too-short audio has been printed.
  """
  try:
    full_batch = {}
    for file in files:
      for example in preprocess_file(file):
        for k, v in example.items():
          # Append in place; rebuilding the list for every example is
          # quadratic in the number of examples.
          full_batch.setdefault(k, []).append(v)

    num_examples = len(full_batch.get('audio', []))
    num_usable = (num_examples // batch_size) * batch_size
    if num_usable == 0:
      # Fail with a clear message instead of an opaque reshape/KeyError.
      raise ValueError(
          f"Found {num_examples} example(s), need at least batch_size={batch_size}")

    # One permutation for all keys keeps the features of an example aligned.
    permutation = np.random.permutation(num_usable)
    for k, values in full_batch.items():
      example_shape = list(values[0].shape)
      full_batch[k] = np.array(values)[permutation].reshape(
          [-1, batch_size] + example_shape)
  except Exception:
    print("Is your audio too short for the required batch_size? E.g. 16s for batch_size=2")
    raise
  return full_batch


def get_trained_model(model_dir):
  """Build the model configured in `model_dir` and restore its latest checkpoint.

  The gin file `operative_config-0.gin` in `model_dir` is parsed first; the
  checkpoint with the highest step number among the files whose name
  contains "ckpt" is then restored into a model created by `get_model()`.

  Args:
    model_dir: Directory holding the gin config and the checkpoint files.

  Returns:
    A `(model, ckpt)` tuple, where `ckpt` is the checkpoint path prefix that
    was restored.
  """
  def checkpoint_step(filename):
    # A name of the form "ckpt-1000.index" yields 1000.
    stem = filename.split(".")[0]
    return int(stem.split("-")[1])

  config_path = os.path.join(model_dir, "operative_config-0.gin")
  # Parse gin config,
  with gin.unlock_config():
    gin.parse_config_file(config_path, skip_unknown=True)

  candidates = [name for name in tf.io.gfile.listdir(model_dir)
                if "ckpt" in name]
  newest = max(candidates, key=checkpoint_step)
  # Drop the extension: restore expects the checkpoint prefix.
  ckpt = os.path.join(model_dir, newest.split(".")[0])

  model = get_model()
  model.restore(ckpt)
  return model, ckpt


def split_batches(batches):
  """Split batched features into a training part and a flattened test part.

  The first half of the batches (rounded down) is held out: for every key
  its two leading axes are merged into one with `tf.reshape`. The remaining
  batches are returned unchanged for training.

  Args:
    batches: Dict of batched arrays that contains an 'audio' entry.

  Returns:
    A `(batches_train, batches_test, test_sec)` tuple. `test_sec` is the
    number of held-out examples times 4 (presumably seconds per example;
    confirm against the window length used for splitting).
  """
  audio_batches = batches['audio']
  num_test_batches = len(audio_batches) // 2
  test_sec = num_test_batches * len(audio_batches[0]) * 4
  print(f"The first {test_sec} seconds are not used for fine-tuning")

  batches_train = {}
  for key, value in batches.items():
    batches_train[key] = value[num_test_batches:]

  batches_test = {}
  for key, value in batches.items():
    flat_shape = [-1] + list(value.shape[2:])
    batches_test[key] = tf.reshape(value[:num_test_batches], flat_shape)

  return batches_train, batches_test, test_sec


class TunedAEFileApi():
  """Similar to TrainedModelFileAPI, but does fine tuning on the
  target timbre before prediction.

  Each public operation fine-tunes a fresh copy of the model stored in
  `model_dir`, saves it to a temporary directory, and delegates to a
  `TrainedModelFileAPI` built on that directory. The temporary directory is
  removed when the operation returns, so the delegate call is always made
  inside the `with` block.
  """

  def __init__(self, model_dir, steps=fine_tune_steps):
    """Store the base model directory and the number of fine-tuning steps.

    Args:
      model_dir: Directory with the pretrained checkpoint and gin config.
      steps: Number of train steps per fine-tuning run; 0 disables
        fine-tuning.
    """
    self.model_dir = model_dir
    self.steps = steps

  def fine_tune(self, files, tmpdir):
    """Fine-tune the pretrained model on `files` and save it to `tmpdir`.

    Args:
      files: Audio files to fine-tune on.
      tmpdir: Directory that receives the checkpoint and a copy of the
        gin config.

    Returns:
      The fine-tuned model.
    """
    batches = preprocess_files(files)
    model, ckpt = get_trained_model(self.model_dir)
    strategy = train_util.get_strategy()
    trainer = Trainer(model, strategy, checkpoints_to_keep=1)
    trainer.restore(ckpt)
    for i in range(self.steps):
      # Cycle through the available batches when steps exceeds their number.
      batch = {k: v[i % len(v)] for k, v in batches.items()}
      trainer.train_step(batch)
    trainer.save(tmpdir)
    # The file API reads the gin config from the same directory as the
    # checkpoint, so it has to travel with the saved model.
    shutil.copy(os.path.join(self.model_dir, "operative_config-0.gin"),
                os.path.join(tmpdir, "operative_config-0.gin"))
    return model

  def get_finetuned_file_api(self, files, tmpdir):
    """Return a TrainedModelFileAPI for a model fine-tuned on `files`.

    Args:
      files: Audio files to fine-tune on.
      tmpdir: Directory the fine-tuned checkpoint is written to.

    Returns:
      A `TrainedModelFileAPI` on `tmpdir`, or on the pretrained `model_dir`
      when `steps` is 0.
    """
    if self.steps == 0:
      # Nothing to tune: skip preprocessing, restoring and saving entirely
      # and serve the pretrained model directly.
      return TrainedModelFileAPI(self.model_dir)
    self.fine_tune(files, tmpdir)
    return TrainedModelFileAPI(tmpdir)

  def reconstruct(self, audio):
    """Fine-tune on `audio`, then delegate to `reconstruct(audio)`."""
    with tempfile.TemporaryDirectory() as tmpdir:
      return self.get_finetuned_file_api([audio], tmpdir).reconstruct(audio)

  def transfer(self, melody, timbre):
    """Fine-tune on `timbre`, then delegate to `transfer(melody, timbre)`."""
    with tempfile.TemporaryDirectory() as tmpdir:
      return self.get_finetuned_file_api([timbre], tmpdir).transfer(melody, timbre)

  def continuous_interpolation(self, melody, final_timbre):
    """Fine-tune on both files, then delegate to `continuous_interpolation`."""
    # A temporary directory is required here as well: get_finetuned_file_api
    # takes `tmpdir` as a mandatory argument.
    with tempfile.TemporaryDirectory() as tmpdir:
      return self.get_finetuned_file_api([melody, final_timbre], tmpdir)\
                 .continuous_interpolation(melody, final_timbre)

  def cycle_reconstruct(self, audio, intermediate_melody):
    """Fine-tune on `audio`, then delegate to `cycle_reconstruct`."""
    with tempfile.TemporaryDirectory() as tmpdir:
      return self.get_finetuned_file_api([audio], tmpdir)\
                 .cycle_reconstruct(audio, intermediate_melody)




# Fine-tune on the timbre recording, transfer the melody onto it, and play
# the result in the notebook.
ae = TunedAEFileApi(model_dir)
audio_synth = ae.transfer(melody, timbre)
play(audio_synth)


In [None]:
from scipy.io import wavfile
# No-op when the directory already exists (the install cell creates it too).
os.makedirs(output_path, exist_ok=True)
# Written at 16000 Hz, the rate used throughout this notebook. [0] selects
# the first entry of the result, presumably the batch axis (TODO confirm).
wavfile.write(f"{output_path}/audio_synth.wav", 16000, audio_synth.numpy()[0])