Copyright 2020 Google LLC.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

# Onsets and Frames Transcription

Onsets and Frames is an automatic music transcription framework with piano and drums models. This notebook demonstrates running the model on user-supplied recordings. For more details on the architecture of the model and training datasets, see our papers:

* [Onsets and Frames: Dual-Objective Piano Transcription](https://goo.gl/magenta/onsets-frames-paper)
* [Enabling Factorized Piano Music Modeling and Generation with the MAESTRO Dataset](https://goo.gl/magenta/maestro-paper)
* [Improving Perceptual Quality of Drum Transcription with the Expanded Groove MIDI Dataset](https://goo.gl/magenta/e-gmd-paper)

And blog posts:

* [Onsets and Frames: Dual-Objective Piano Transcription
](http://g.co/magenta/onsets-frames)
* [The MAESTRO Dataset and Wave2Midi2Wave](https://g.co/magenta/maestro-wave2midi2wave)
* [Improving Perceptual Quality of Drum Transcription with the Expanded Groove MIDI Dataset](https://g.co/magenta/oaf-drums)
---

This colab notebook is self-contained and should run natively on google cloud. The code and checkpoints can be downloaded separately and run locally, which is recommended if you want to train your own model. Details on how to do this can be found in the [GitHub repo](https://goo.gl/magenta/onsets-frames-code).

# Environment Setup

Includes package installation for sequence synthesis and downloading pretrained checkpoint. May take a few minutes.

In [1]:
#@title Setup Environment

import glob
import os

# print('Copying checkpoints from GCS...')
# !rm -r /content/onsets-frames
# !mkdir /content/onsets-frames
# !gsutil -q -m cp -R gs://magentadata/models/onsets_frames_transcription/*checkpoint*.zip /content/onsets-frames/
# !unzip -o /content/onsets-frames/maestro_checkpoint.zip -d /content/onsets-frames/maestro
# !unzip -o /content/onsets-frames/e-gmd_checkpoint.zip -d /content/onsets-frames/e-gmd
# 
# print('Installing dependencies...')
# !apt-get update -qq && apt-get install -qq libfluidsynth2 fluid-soundfont-gm build-essential libasound2-dev libjack-dev ffmpeg
# !pip install cython wheel
# !pip install pyfluidsynth pretty_midi python-rtmidi
# 
# !pip install -qU magenta

print("Skipping all of this process because of manual setup.")

Skipping all of this process because of manual setup.


In [None]:
# Commands used to create the appropriate environment
# !pip3 install -c constraints.txt magenta==2.1.4
!python3.10 -m venv.venv
!pip3 install -r requirements.txt
!pip3 install --no-deps magenta == 2.1.4

In [None]:
# Commands used to create the conda venv for magenta
!conda craete -n magenta python=3.7
!source activate magenta
!pip3 install python-rtmidi == 1.1.2
!pip3 install magenta == 1.1.8

In [3]:
MAESTRO_CHECKPOINT_DIR = '/content/onsets-frames/maestro/train'
EGMD_CHECKPOINT_DIR = '/content/onsets-frames/e-gmd'

# Model Initializiation

In [4]:
#@title Select Model
model_type = "MAESTRO (Piano)"  #@param ["MAESTRO (Piano)", "E-GMD (Drums)"]


In [33]:
#@title Initialize Model
# import tensorflow as tf
# original import from file
import tensorflow.compat.v1 as tf
# workaround posted in: https://github.com/tensorflow/tensorflow/issues/38800
# (workaround for above command)
# import tensorflow._api.v2.compat.v1 as tf


import librosa
import numpy as np

# commenting this because of local execution
# from google.colab import files

# present in original code
# from magenta.common import tf_utils
from magenta.models.onsets_frames_transcription import audio_label_data_utils
from magenta.models.onsets_frames_transcription import configs
from magenta.models.onsets_frames_transcription import constants
from magenta.models.onsets_frames_transcription import data
from magenta.models.onsets_frames_transcription import infer_util
from magenta.models.onsets_frames_transcription import train_util
# note_seq is disabled because of duplicate music.proto file
from note_seq import audio_io
import note_seq
from note_seq import midi_io
from note_seq import sequences_lib

tf.disable_v2_behavior()

## Define model and load checkpoint
## Only needs to be run once.

if model_type.startswith('MAESTRO'):
    config = configs.CONFIG_MAP['onsets_frames']
    hparams = config.hparams
    hparams.use_cudnn = False
    hparams.batch_size = 1
    checkpoint_dir = MAESTRO_CHECKPOINT_DIR
elif model_type.startswith('E-GMD'):
    config = configs.CONFIG_MAP['drums']
    hparams = config.hparams
    hparams.batch_size = 1
    checkpoint_dir = EGMD_CHECKPOINT_DIR
else:
    raise ValueError('Unknown Model Type')

examples = tf.placeholder(tf.string, [None])

dataset = data.provide_batch(
    examples=examples,
    preprocess_examples=True,
    params=hparams,
    is_training=False,
    shuffle_examples=False,
    skip_n_initial_records=0)

estimator = train_util.create_estimator(
    config.model_fn, checkpoint_dir, hparams)

iterator = tf.data.make_initializable_iterator(dataset)
next_record = iterator.get_next()

INFO:tensorflow:Using config: {'_model_dir': '/content/onsets-frames/maestro/train', '_tf_random_seed': None, '_save_summary_steps': 300, '_save_checkpoints_steps': 300, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': None, '_keep_checkpoint_every_n_hours': 1, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fa74902a2d0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=300, num_shards=None, num_cor

# Upload Audio

Run the following cell to upload audio files.

In [32]:
from typing import List
import os

# from original jupyter code
#@title Audio Upload
# uploaded = files.upload()

directory_path: str = 'files-to-process'
uploaded: List[str] = os.listdir(directory_path)

to_process: List[str] = []
"""
This is a list of binary string of the data which is to be converted. The binary strings are created using the .SerializeToString method. 
TF documentation: All proto messages can be serialized to a binary-string using the .SerializeToString method:
"""
# for fn in uploaded:
#     with open(os.path.join(directory_path, fn)) as file_content:
#         print('User uploaded file "{name}" with length {length} bytes'.format(
#             name=fn, length=file_content.__sizeof__()))
#         wav_data = file_content
#         record = audio_label_data_utils.process_record(
#             wav_data=wav_data,
#             sample_rate=hparams.sample_rate,
#             ns=note_seq.NoteSequence(),
#             example_id=fn,
#             min_length=0,
#             max_length=-1,
#             allow_empty_notesequence=True)
#         print(type(record))
#         first = next(record).SerializeToString()
#         # print(str(type(first)))
#         # example_list = list(record)
#         # assert len(example_list) == 1
#         to_process.append(first.SerializeToString())
# 
#         print('Processing complete for', fn)

sess = tf.Session()

sess.run([
    tf.initializers.global_variables(),
    tf.initializers.local_variables()
])

# sess.run(iterator.initializer, {examples: to_process})
# 
# 
# def transcription_data(params):
#     del params
#     return tf.data.Dataset.from_tensors(sess.run(next_record))
# 
# 
# input_fn = infer_util.labels_to_features_wrapper(transcription_data)

2023-11-23 13:27:22.412749: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2023-11-23 13:27:22.423502: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3599840000 Hz
2023-11-23 13:27:22.424219: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x129a480 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-11-23 13:27:22.424230: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


[None, None]

In [None]:
directory_path: str = 'files-to-process'
uploaded: List[str] = os.listdir(directory_path)

for filename in uploaded:
    print('Starting transcription for %s...', filename)
    sess.run(iterator.initializer, )
    

# Inference

Run the following cell to transcribe the files you uploaded. Each time it runs it will transcribe one of the uploaded files.

In [5]:
#@title Run inference
prediction_list = list(
    estimator.predict(
        input_fn,
        yield_single_examples=False))
assert len(prediction_list) == 1

sequence_prediction = note_seq.NoteSequence.FromString(
    prediction_list[0]['sequence_predictions'][0])

# Ignore warnings caused by pyfluidsynth
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

note_seq.plot_sequence(sequence_prediction)
note_seq.play_sequence(sequence_prediction, note_seq.midi_synth.fluidsynth,
                       colab_ephemeral=False)

NameError: ignored

Optionally run the following cell to download a MIDI version of the inferred transcription.

In [6]:
#@title Download MIDI
midi_filename = ('prediction.mid')
midi_io.sequence_proto_to_midi_file(sequence_prediction, midi_filename)

files.download(midi_filename)

NameError: ignored