Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Onsets and Frames Transcription

Onsets and Frames is an automatic music transcription framework with piano and drums models. This notebook demonstrates running the model on user-supplied recordings. For more details on the architecture of the model and training datasets, see our papers:

* [Onsets and Frames: Dual-Objective Piano Transcription](https://goo.gl/magenta/onsets-frames-paper)
* [Enabling Factorized Piano Music Modeling and Generation with the MAESTRO Dataset](https://goo.gl/magenta/maestro-paper)
* [Improving Perceptual Quality of Drum Transcription with the Expanded Groove MIDI Dataset](https://goo.gl/magenta/e-gmd-paper)

And blog posts:

* [Onsets and Frames: Dual-Objective Piano Transcription](http://g.co/magenta/onsets-frames)
* [The MAESTRO Dataset and Wave2Midi2Wave](https://g.co/magenta/maestro-wave2midi2wave)
* [Improving Perceptual Quality of Drum Transcription with the Expanded Groove MIDI Dataset](https://g.co/magenta/oaf-drums)
---

This Colab notebook is self-contained and should run natively on Google Cloud. The code and checkpoints can also be downloaded separately and run locally, which is recommended if you want to train your own model. Details on how to do this can be found in the [GitHub repo](https://goo.gl/magenta/onsets-frames-code).

# Environment Setup

Installs the packages needed for sequence synthesis and downloads the pretrained checkpoints. This may take a few minutes.

In [None]:
#@title Setup Environment

import glob

print('Copying checkpoints from GCS...')
!rm -r /content/onsets-frames
!mkdir /content/onsets-frames
!gsutil -q -m cp -R gs://magentadata/models/onsets_frames_transcription/*checkpoint*.zip /content/onsets-frames/
!unzip -o /content/onsets-frames/maestro_checkpoint.zip -d /content/onsets-frames/maestro
MAESTRO_CHECKPOINT_DIR = '/content/onsets-frames/maestro/train'
!unzip -o /content/onsets-frames/e-gmd_checkpoint.zip -d /content/onsets-frames/e-gmd
EGMD_CHECKPOINT_DIR = '/content/onsets-frames/e-gmd'
  
print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev ffmpeg  
!pip install pyfluidsynth pretty_midi

!pip install -qU magenta


# Model Initialization

In [None]:
#@title Select Model
# Form field selecting which pretrained model to use. The model-initialization
# cell branches on the prefix of this string ('MAESTRO' or 'E-GMD').
model_type = "MAESTRO (Piano)" #@param ["MAESTRO (Piano)", "E-GMD (Drums)"]


In [None]:
#@title Initialize Model
import tensorflow.compat.v1 as tf
import librosa
import numpy as np

from google.colab import files

from magenta.common import tf_utils
from note_seq import audio_io
from magenta.models.onsets_frames_transcription import audio_label_data_utils
from magenta.models.onsets_frames_transcription import configs
from magenta.models.onsets_frames_transcription import constants
from magenta.models.onsets_frames_transcription import data
from magenta.models.onsets_frames_transcription import infer_util
from magenta.models.onsets_frames_transcription import train_util
import note_seq
from note_seq import midi_io
from note_seq import sequences_lib

# Switch TensorFlow to v1 behavior before any graph ops are created.
tf.disable_v2_behavior()

## Build the model and point it at the pretrained checkpoint.
## This only needs to be run once.

# Pick the config name and checkpoint location for the selected model.
if model_type.startswith('MAESTRO'):
  config_name, checkpoint_dir = 'onsets_frames', MAESTRO_CHECKPOINT_DIR
elif model_type.startswith('E-GMD'):
  config_name, checkpoint_dir = 'drums', EGMD_CHECKPOINT_DIR
else:
  raise ValueError('Unknown Model Type')

config = configs.CONFIG_MAP[config_name]
hparams = config.hparams
if config_name == 'onsets_frames':
  # Only the piano config has cuDNN turned off.
  hparams.use_cudnn = False
hparams.batch_size = 1

# Serialized examples are fed in through this placeholder at upload time.
examples = tf.placeholder(tf.string, [None])

dataset = data.provide_batch(
    examples=examples,
    preprocess_examples=True,
    params=hparams,
    is_training=False,
    shuffle_examples=False,
    skip_n_initial_records=0,
)

estimator = train_util.create_estimator(config.model_fn, checkpoint_dir, hparams)

# The iterator is (re)initialized with the uploaded examples in a later cell.
iterator = tf.data.make_initializable_iterator(dataset)
next_record = iterator.get_next()

# Upload Audio

Run the following cell to upload audio files.

In [None]:
#@title Audio Upload
uploaded = files.upload()
if not uploaded:
  # Fail here with a clear message instead of later, when inference tries to
  # read a record that does not exist.
  raise ValueError('No audio files were uploaded.')

# Serialized examples, one per uploaded file, in upload order.
to_process = []
for fn, wav_data in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(wav_data)))
  # An empty NoteSequence is passed because there are no reference labels;
  # min_length=0 / max_length=-1 are the values used to get a single example.
  example_list = list(
      audio_label_data_utils.process_record(
          wav_data=wav_data,
          sample_rate=hparams.sample_rate,
          ns=note_seq.NoteSequence(),
          example_id=fn,
          min_length=0,
          max_length=-1,
          allow_empty_notesequence=True))
  assert len(example_list) == 1
  to_process.append(example_list[0].SerializeToString())

  print('Processing complete for', fn)

# Re-running this cell would otherwise leak the session created by the
# previous run, so close it before opening a new one.
if 'sess' in globals():
  sess.close()
sess = tf.Session()

sess.run([
    tf.initializers.global_variables(),
    tf.initializers.local_variables()
])

# Point the input pipeline at the newly uploaded examples.
sess.run(iterator.initializer, {examples: to_process})

def transcription_data(params):
  """Returns a dataset holding the next record pulled from `next_record`.

  Each call runs `next_record` once in `sess`, so every inference run consumes
  one uploaded file. `params` is accepted for the input_fn signature and is
  unused.
  """
  del params
  return tf.data.Dataset.from_tensors(sess.run(next_record))
input_fn = infer_util.labels_to_features_wrapper(transcription_data)

Saving MAPS_MUS-mz_331_3_ENSTDkCl-snippet.wav to MAPS_MUS-mz_331_3_ENSTDkCl-snippet (1).wav
User uploaded file "MAPS_MUS-mz_331_3_ENSTDkCl-snippet.wav" with length 2445488 bytes
Processing complete for MAPS_MUS-mz_331_3_ENSTDkCl-snippet.wav


# Inference

Run the following cell to transcribe the files you uploaded. Each run transcribes the next uploaded file, so run the cell once per file.

In [None]:
#@title Run inference
import warnings

# Predictions come back per batch (yield_single_examples=False); exactly one
# batch is expected per run.
predictions = estimator.predict(input_fn, yield_single_examples=False)
prediction_list = list(predictions)
assert len(prediction_list) == 1

# The first entry of 'sequence_predictions' is a serialized NoteSequence.
serialized_prediction = prediction_list[0]['sequence_predictions'][0]
sequence_prediction = note_seq.NoteSequence.FromString(serialized_prediction)

# Silence the DeprecationWarnings triggered by pyfluidsynth.
warnings.filterwarnings("ignore", category=DeprecationWarning)

note_seq.plot_sequence(sequence_prediction)
note_seq.play_sequence(
    sequence_prediction,
    note_seq.midi_synth.fluidsynth,
    colab_ephemeral=False)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /content/onsets-frames/maestro/train/model.ckpt
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calculating metrics for b'MAPS_MUS-mz_331_3_ENSTDkCl-snippet.wav' with length 433
INFO:tensorflow:Reference pitches were length 0, returning empty metrics for b'MAPS_MUS-mz_331_3_ENSTDkCl-snippet.wav':
INFO:tensorflow:prediction_loop marked as finished
INFO:tensorflow:prediction_loop marked as finished


Optionally run the following cell to download a MIDI version of the inferred transcription.

In [None]:
#@title Download MIDI
# Write the predicted sequence to a MIDI file, then offer that file for download.
midi_filename = 'prediction.mid'
midi_io.sequence_proto_to_midi_file(sequence_prediction, midi_filename)

files.download(midi_filename)