In [1]:
import glob
import os
import re
import pickle
import time

import numpy as np
import pandas as pd
import tensorflow.keras.utils as utils

from music21 import converter, instrument, note, chord, stream, midi

# Convert from MIDI to tensor
* Music is more complex than text (e.g., more than one note might happen at once).  
* We use the Music21 library to read MIDI music files.  
* The functions below turn a Music21 "stream" (of notes) into a numpy array of 8-bit integers.  
* All complex rhythms are simplified to 16 note versions. Chords are simplified to the highest note.

## Percussion MIDI files
#### Groove MIDI Dataset (GMD)
The **Groove MIDI Dataset (GMD)** is composed of 13.6 hours of aligned MIDI and (synthesized) audio of human-performed, tempo-aligned expressive drumming. The dataset contains 1,150 MIDI files and over 22,000 measures of drumming.

Source: [Groove MIDI Dataset (GMD)](https://magenta.tensorflow.org/datasets/groove)

In [2]:
!unzip /content/groove-v1.0.0-midionly.zip

Archive:  /content/groove-v1.0.0-midionly.zip
replace groove/drummer8/session2/12_funk_81_beat_4-4.mid? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [3]:
directory = "/content/groove/"
dir_glob_midi = os.path.join(directory, "**/**/**.mid")
path_data_pickle = './data.pkl'

# total number of files
print('total number of files')
midi_files = glob.glob(dir_glob_midi, recursive=True)
len(midi_files)

total number of files


3450

# 1. MidiFile

MIDI 파일은 아래와 같은 표준화된 디지털 음악 표현입니다.

In [4]:
mf = midi.MidiFile()
mf.open(midi_files[0])
mf.read()
mf.close()
mf

<MidiFile 1 tracks
  <MidiTrack 0 -- 68 events
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent SEQUENCE_TRACK_NAME, t=None, track=0, channel=None, data=b'Midi Drums'>
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent INSTRUMENT_NAME, t=None, track=0, channel=None, data=b'Midi Drums'>
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent TIME_SIGNATURE, t=None, track=0, channel=None, data=b'\x04\x02\x18\x08'>
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent KEY_SIGNATURE, t=None, track=0, channel=None, data=b'\x00\x00'>
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent SMTPE_OFFSET, t=None, track=0, channel=None, data=b'!\t8\x0e\x17'>
    <MidiEvent DeltaTime, t=0, track=0, channel=None>
    <MidiEvent SET_TEMPO, t=None, track=0, channel=None, data=b'\x07\xa1 '>
    <MidiEvent DeltaTime, t=8, track=0, channel=None>
    <MidiEvent NOTE_ON, t=None, track=0, channel=10, pitch=38, velocity=127>
   

# 2. Music21 Stream
Music21은 MIDI 파일을 조작할 수 있는 강력한 Python 라이브러리입니다.

In [5]:
stream = midi.translate.midiFileToStream(mf)
stream.show('text')

{0.0} <music21.stream.Part 0x7f36ee1857c0>
    {0.0} <music21.tempo.MetronomeMark animato Quarter=120.0>
    {0.0} <music21.key.Key of C major>
    {0.0} <music21.meter.TimeSignature 4/4>
    {0.0} <music21.note.Note D>
    {0.25} <music21.note.Note D>
    {0.5} <music21.note.Rest rest>
    {0.75} <music21.note.Note C#>
    {1.0} <music21.note.Note C>
    {1.25} <music21.note.Rest rest>
    {1.5} <music21.note.Note A>
    {1.75} <music21.note.Note A>
    {2.0} <music21.note.Rest rest>
    {2.3333} <music21.note.Note G>
    {2.5833} <music21.note.Rest rest>
    {2.6667} <music21.note.Note G>
    {2.9167} <music21.note.Rest rest>
    {3.0} <music21.chord.Chord B3 C2 G3>


# 3. Chord Encoding
Music21 stream object를 가져와 sparse numpy matrix로 인코딩합니다. matrix는 어떤 note가 어떤 timestep에서 연주되는지를 나타냅니다.

In [6]:
MELODY_NOTE_OFF = 128 # (stop playing all previous notes)
MELODY_NO_EVENT = 129 # (no change from previous event)

def streamToNoteArray(stream):
    """
    Convert a Music21 sequence to a numpy array of int8s:
        0-127 - note on at specified pitch
        128   - note off
        129   - no event
    """
    # Part one, extract from stream
    total_length = int(np.round(stream.flat.highestTime / 0.25)) # in semiquavers
    stream_list = []
    for element in stream.flat:
        if isinstance(element, note.Note):
            stream_list.append([np.round(element.offset / 0.25), np.round(element.quarterLength / 0.25), element.pitch.midi])
        elif isinstance(element, chord.Chord):
            stream_list.append([np.round(element.offset / 0.25), np.round(element.quarterLength / 0.25), element.sortAscending().pitches[-1].midi])
    np_stream_list = np.array(stream_list, dtype=int)
    df = pd.DataFrame({'pos': np_stream_list.T[0], 'dur': np_stream_list.T[1], 'pitch': np_stream_list.T[2]})
    df = df.sort_values(['pos','pitch'], ascending=[True, False]) # sort the dataframe properly
    df = df.drop_duplicates(subset=['pos']) # drop duplicate values
    # part 2, convert into a sequence of note events
    output = np.zeros(total_length+1, dtype=np.int16) + np.int16(MELODY_NO_EVENT)  # set array full of no events by default.
    # Fill in the output list
    for i in range(total_length):
        if not df[df.pos==i].empty:
            n = df[df.pos==i].iloc[0] # pick the highest pitch at each semiquaver
            output[i] = n.pitch # set note on
            output[i+n.dur] = MELODY_NOTE_OFF
    return output

In [7]:
arr = streamToNoteArray(stream); arr

array([ 38,  38, 128,  37,  48, 128,  45,  45, 128,  43, 128,  43,  59,
       128], dtype=int16)

In [8]:
arr.shape

(14,)

In [16]:
# !pip install musicautobot
from musicautobot.numpy_encode import *

chordarr = stream2chordarr(stream); chordarr.shape

(14, 1, 128)

#### Chord Encoding Dimensions - (timestep x track x notes)
14 = timesteps in song  
1 = tracks/parts (track 1 = melody, track 2 = chords)  
128 = midi pitch range - [reference](https://www.inspiredacoustics.com/en/MIDI_note_numbers_and_center_frequencies)

## Representation

In [10]:
chordarr

array([[[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]])

In [11]:
# First timestep
ts1 = chordarr[0].nonzero(); ts1

(array([0]), array([38]))

# 4. Note Encoding
Chord encoding을 dense matrix로 변환합니다.  
Sparse matric(timestep x track x notes)는 너무 많은 공간을 차지하기 때문에 0과 1대신 음이 연주되는 시기와 길이만 추적합니다.

In [12]:
npenc = chordarr2npenc(chordarr); npenc.shape

(19, 2)

#### Note Encoding Dimensions: (timesteps, (pitch x duration))
18 = timesteps  
2 = note representation (pitch x duration)  

* note pitch range (1-128)
    * Separator Index = -1
* note duration range (1 - 256)
    * Quarter Note = 4

#### Matrix size 비교:

In [13]:
f'Chord encoding size: {np.prod(chordarr.shape)}', f'Note encoding size: {np.prod(npenc.shape)}'

('Chord encoding size: 1792', 'Note encoding size: 38')

## Representation

Sigle note:

In [14]:
n = npenc[:1]; n

array([[38,  1]])

38 = D2 (note pitch)  
1 = Eighth Note (1 1/16th notes)

Whole note:

In [15]:
npenc

array([[38,  1],
       [-1,  1],
       [38,  1],
       [-1,  2],
       [37,  1],
       [-1,  1],
       [48,  1],
       [-1,  2],
       [45,  1],
       [-1,  1],
       [45,  1],
       [-1,  2],
       [43,  1],
       [-1,  2],
       [43,  1],
       [-1,  1],
       [59,  1],
       [55,  1],
       [36,  1]])

# 5. Tensor Encoding