<a href="https://colab.research.google.com/github/magenta/ddsp/blob/master/ddsp/colab/demos/pitch_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");





In [None]:
# Copyright 2020 Google LLC. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# DDSP Pitch Detection Demo

This notebook is a demo of pitch detection using inverse audio synthesis. 


* [ICML Workshop paper](https://openreview.net/forum?id=RlVTYWhsky7)
* [Audio Examples](http://goo.gl/magenta/ddsp-inv) 

This notebook extracts pitch (fundamental frequency) together with sinusoidal and harmonic synthesizer controls from input audio (either uploaded files, or recorded from the microphone) and resynthesizes the audio from the model. The DDSP-INV model is hierarchical, and provides resynthesis from both the sinusoidal model and the harmonic model.

<img src="https://storage.googleapis.com/ddsp-inv/full_stack/diagram.png" alt="DDSP Pitch Detection" width="700">


### Instructions for running:

* Make sure to use a GPU runtime, click:  __Runtime >> Change Runtime Type >> GPU__
* Press ▶️ on the left of each of the cells
* View the code: Double-click any of the cells
* Hide the code: Double-click the right side of the cell





In [None]:
#@title #Install and Import

#@markdown Install ddsp, define some helper functions, and download the model. This transfers a lot of data and _should take a minute or two_.
# Colab-only line magic requesting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
print('Installing from pip package...')
# -q: quiet output, -U: upgrade to the newest release. The version is not
# pinned, so the installed ddsp API may differ between runs of this notebook.
!pip install -qU ddsp

# Silence all Python warnings (not only deprecations). The filter is installed
# before the imports below so warnings raised at import time are hidden too.
import warnings
warnings.filterwarnings("ignore")

import copy
import os
import time

import ddsp
import ddsp.training
from ddsp.colab import colab_utils
from ddsp.colab.colab_utils import (play, record, 
    specplot, upload, DEFAULT_SAMPLE_RATE)
import gin
from google.colab import files
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds

# Shared constants (no helper functions are defined in this cell).
sample_rate = DEFAULT_SAMPLE_RATE  # 16000


print('Done!')

In [None]:
#@title Record or Upload Audio
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) 
#@markdown * Audio should be monophonic (single instrument / voice)

record_or_upload = "Upload (.mp3 or .wav)"  #@param ["Record", "Upload (.mp3 or .wav)"]

record_seconds =     5#@param {type:"number", min:1, max:10, step:1}

if record_or_upload == "Record":
  audio = record(seconds=record_seconds)
else:
  # Load audio sample here (.mp3 or .wav file).
  filenames, audios = upload()
  # Presumably upload() returns one entry per selected file, so a cancelled
  # dialog leaves nothing to index. Fail with a readable message instead of a
  # bare IndexError.
  if len(audios) == 0:
    raise ValueError(
        'No audio file was uploaded. Re-run this cell and choose a '
        '.mp3 or .wav file.')
  # Just use the first file.
  audio = audios[0]

# Prepend a length-1 axis: later cells read the clip back as audio[0] and use
# audio.shape[1] as the number of samples.
audio = audio[np.newaxis, :]


# Plot.
specplot(audio)
play(audio)


In [None]:
#@title Load a model
#@markdown Run for every new audio input. Models separately trained on the [URMP](http://www2.ece.rochester.edu/projects/air/projects/URMP/annotations_5P.html), [MDB-stem-synth](https://zenodo.org/record/1481172#.Xzouy5NKhTY), and [MIR1k](https://sites.google.com/site/unvoicedsoundseparation/mir-1k) datasets.
model = 'urmp' #@param ['urmp', 'mdb_stem_synth', 'mir1k']
MODEL = model

# Pretrained models.
PRETRAINED_DIR = '/content/pretrained'
# Copy over from gs:// for faster loading.
!rm -r $PRETRAINED_DIR &> /dev/null
!mkdir $PRETRAINED_DIR &> /dev/null
GCS_CKPT_DIR = 'gs://ddsp-inv/ckpts'
model_dir = os.path.join(GCS_CKPT_DIR, '%s_ckpt' % model.lower())

!gsutil cp $model_dir/* $PRETRAINED_DIR &> /dev/null
model_dir = PRETRAINED_DIR
gin_file_pattern = os.path.join(model_dir, 'operative_config*.gin')
gin_file = tf.io.gfile.glob(gin_file_pattern)[0]


# Parse gin config,
with gin.unlock_config():
  gin.parse_config_file(gin_file, skip_unknown=True)

# Assumes only one checkpoint in the folder, 'ckpt-[iter]`.
ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
ckpt_name = ckpt_files[0].split('.')[0]
ckpt = os.path.join(model_dir, ckpt_name)

# Ensure dimensions and sampling rates are equal
time_steps_train = 125
n_samples_train = 64000
hop_size = int(n_samples_train / time_steps_train)

time_steps = int(audio.shape[1] / hop_size)
n_samples = time_steps * hop_size
audio = audio[:, :n_samples]

gin_params = [
    'TranscribingAutoencoder.n_samples = {}'.format(n_samples),
    'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
]

with gin.unlock_config():
  gin.parse_config(gin_params)



# Set up the model just to predict audio given new conditioning
model = ddsp.training.models.TranscribingAutoencoder()
model.restore(ckpt)

# Build model by running a batch through it.
start_time = time.time()
_ = model({'audio': audio}, training=False)
print('Restoring model took %.1f seconds' % (time.time() - start_time))

In [None]:
#@title #Predict Pitch

#@markdown Compare DDSP-INV (self-supervised) and [CREPE](https://github.com/marl/crepe) (supervised) models

# DDSP-INV.
start_time = time.time()
print('\nExtracting f0 with DDSP-INV...')
controls = model.get_controls({'audio': audio}, training=False)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# CREPE.
# Derive the frame rate from the hop size used when the audio was trimmed for
# the model, instead of repeating the literals: 16000 / 512 = 31.25 with the
# values set in the earlier cells.
crepe_frame_rate = sample_rate / hop_size
start_time = time.time()
print('\nExtracting f0 with CREPE...')
ddsp.spectral_ops.reset_crepe()
f0_crepe, f0_confidence = ddsp.spectral_ops.compute_f0(
    audio[0],
    sample_rate=sample_rate,
    frame_rate=crepe_frame_rate,
    viterbi=False)
print('Prediction took %.1f seconds' % (time.time() - start_time))

# Synthesize the CREPE audio
synth = ddsp.synths.Wavetable(n_samples=n_samples, scale_fn=None)
# 2048 samples of sin() over [0, 2*pi], shaped [1, 1, 2048].
wavetable = np.sin(np.linspace(0, 2.0 * np.pi, 2048))[np.newaxis, np.newaxis, :]
# Constant amplitude of 0.1 for every time step, shaped [1, time_steps, 1].
amps = np.ones([1, time_steps, 1]) * 0.1
audio_crepe = synth(amps, wavetable, f0_crepe[np.newaxis, :, np.newaxis])

# Synthesize the DDSP-INV audio
audio_ddsp_inv = synth(controls['harm_amp'], wavetable, controls['f0_hz'])

In [None]:
#@title #Plot Resynthesis


def _plot_sinusoids(amps, freqs, title):
  """Scatter-plots each sinusoid's frequency over time in a new 6x6 figure.

  Marker area is the amplitude multiplied by 200.

  Args:
    amps: Amplitudes for one example; assumed [time, n_sinusoids] -- TODO
      confirm against the model's controls.
    freqs: Frequencies in Hz, same layout as `amps`.
    title: Title placed on the figure.
  """
  # Transpose so that iterating yields one sinusoid's full track at a time.
  amps_by_sinusoid = np.transpose(amps)
  freqs_by_sinusoid = np.transpose(freqs)
  t = np.arange(freqs_by_sinusoid.shape[1])

  plt.figure(figsize=(6, 6))
  for a, f in zip(amps_by_sinusoid, freqs_by_sinusoid):
    plt.scatter(t, f, s=a*200, linewidths=1)
  # Axis limit is loop-invariant: set it once after all tracks are drawn.
  plt.ylim(0, 8000)
  plt.title(title)
  plt.ylabel('Frequency (Hz)')
  plt.xlabel('Time')
  plt.xticks([])


# Index of the example (first axis of each control) to plot and play.
k = 0

# Plot Pitch.
plt.figure(figsize=(6, 4))
f0_crepe_midi = ddsp.core.hz_to_midi(f0_crepe)
f0_harm_midi = ddsp.core.hz_to_midi(controls['f0_hz'])
plt.plot(np.ravel(f0_crepe_midi), label='crepe')
plt.plot(np.ravel(f0_harm_midi[k]), label='ddsp-inv')
plt.ylabel('Pitch (MIDI)')
plt.xlabel('Time')
plt.xticks([])
plt.legend(loc='upper right')

# Play Audio.
clips_to_play = [
    ('Original', audio),
    ('Sinusoidal Resynthesis', controls['sin_audio'][k]),
    ('Harmonic Resynthesis', controls['harm_audio'][k]),
    ('DDSP-INV Pitch', audio_ddsp_inv[k]),
    ('CREPE Pitch', audio_crepe[k]),
]
for label, clip in clips_to_play:
  print(label)
  play(clip)

# Plot spectrograms.
clips_to_plot = [
    ("Original", audio),
    ("Sinusoidal Resynthesis", controls['sin_audio'][k]),
    ("Harmonic Resynthesis", controls['harm_audio'][k]),
]
for label, clip in clips_to_plot:
  specplot(clip)
  plt.title(label)


# Plot sinusoids.
_plot_sinusoids(controls['sin_amps'][k], controls['sin_freqs'][k],
                'Sinusoids (Sinusoidal)')
_plot_sinusoids(controls['harm_amps'][k], controls['harm_freqs'][k],
                'Sinusoids (Harmonic)')
