### Data Files Path Preparation

In [5]:
import os
from os import listdir
from os.path import isfile, join

# Modify this path to your own MAESTRO dataset, or set the MAESTRO_DIR
# environment variable to override it without editing the notebook.
maestro_dir = os.environ.get('MAESTRO_DIR', '/home/scpark/hard/datasets/maestro-v2.0.0')

# Year sub-directories of the dataset that are scanned for MIDI files.
maestro_years = ['2004', '2006', '2008', '2009', '2011', '2013', '2014', '2015', '2017', '2018']
data_dirs = [join(maestro_dir, year) for year in maestro_years]

# Select files by their '.midi' extension. A plain substring test
# ('midi' in name) would also pick up unrelated files whose name merely
# contains "midi" (e.g. 'midi_notes.txt').
# Sorting makes the file order, and therefore the index-based output file
# names used later in the notebook, deterministic across runs.
data_files = sorted(
    join(data_dir, f)
    for data_dir in data_dirs
    for f in listdir(data_dir)
    if f.endswith('.midi') and isfile(join(data_dir, f))
)

print('total midi files : ', len(data_files))

total midi files :  1282


### Event Extraction from MIDI Files

In [6]:
import mido
import numpy as np



def get_eventlist(data_file):
    '''
    Read a MIDI file and return its note and sustain-pedal events as an array.

    Each row of the result is one event:

        event : [Time, Type(ON, OFF, CC), Value1, Value2]

    Time   : running sum of ``msg.time`` over all messages up to and
             including the one that produced the event (mido reports
             seconds when a ``MidiFile`` is iterated).
    Type   : 1 = note on, 0 = note off, 2 = sustain pedal change.
    Value1 : note number for note events, 0 for pedal events.
    Value2 : velocity for note events; 1 (pressed) or 0 (released) for
             pedal events.

    A ``note_on`` message with velocity 0 is treated as a note off.
    Only control change 64 (sustain pedal) is considered, and only its
    transitions are recorded: one event when the value becomes > 0 while
    the pedal is up, and one when it returns to 0 while the pedal is down.

    Parameters
    ----------
    data_file : path of the MIDI file, passed to ``mido.MidiFile``.

    Returns
    -------
    numpy.ndarray of floats with shape (number_of_events, 4); the shape is
    (0, 4) when the file contains no matching events.
    '''

    ON = 1
    OFF = 0
    CC = 2
    SUSTAIN_CONTROL = 64

    midi = mido.MidiFile(data_file)

    current_time = 0
    eventlist = []
    pedal_down = False
    for msg in midi:
        # Time advances on every message, including the ones that are skipped.
        current_time += msg.time

        # Message types are compared with ==. Using `is` on strings tests
        # object identity, which only works by accident of string interning
        # and triggers a SyntaxWarning on Python 3.8+.

        # NOTE ON CASE
        if msg.type == 'note_on' and msg.velocity > 0:
            eventlist.append([current_time, ON, msg.note, msg.velocity])

        # NOTE OFF CASE (explicit note_off, or note_on with zero velocity)
        elif msg.type == 'note_off' or (msg.type == 'note_on' and msg.velocity == 0):
            eventlist.append([current_time, OFF, msg.note, msg.velocity])

        # SUSTAIN PEDAL CASE (all other controllers are ignored)
        elif msg.type == 'control_change' and msg.control == SUSTAIN_CONTROL:
            if not pedal_down and msg.value > 0:
                pedal_down = True
                eventlist.append([current_time, CC, 0, 1])
            elif pedal_down and msg.value == 0:
                pedal_down = False
                eventlist.append([current_time, CC, 0, 0])

    # Force a 2-D float array so that a file without events still yields
    # shape (0, 4) instead of a 1-D empty array.
    return np.array(eventlist, dtype=float).reshape(-1, 4)

# Sample print: convert one randomly chosen file and show its events
index = np.random.randint(len(data_files))
sample_file = data_files[index]
eventlist = get_eventlist(sample_file)
print(eventlist)

[[  0.           2.           0.           1.        ]
 [  1.           1.          42.          35.        ]
 [  1.02604167   1.          49.          30.        ]
 ...
 [326.62291667   0.          30.           0.        ]
 [326.93020833   0.          42.           0.        ]
 [327.16770833   0.          37.           0.        ]]


### MIDI File to Event-List File Conversion

In [4]:
from tqdm import tqdm_notebook as tqdm
import os

# NOTE: this cell relies on `data_files`, `get_eventlist` and `np` defined in
# the cells above; run those first.
dataset_dir = 'dataset_cc'

# Eventlist file will be saved in dataset_dir
os.makedirs(dataset_dir, exist_ok=True)

# converting loop: one '<index>.npz' file per MIDI file, where <index> is the
# position of the file in the sorted `data_files` list.
for i, data_file in enumerate(tqdm(data_files)):
    print(data_file)
    eventlist = get_eventlist(data_file)
    print(eventlist.shape)

    save_file = os.path.join(dataset_dir, str(i))
    # Pass only the array to save. np.savez stores every extra keyword as an
    # array, so on NumPy releases whose savez has no `allow_pickle` parameter
    # the former `allow_pickle=False` argument was written into each archive
    # as a spurious array named 'allow_pickle'.
    np.savez(save_file, eventlist=eventlist)

HBox(children=(IntProgress(value=0, max=1282), HTML(value='')))

/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav.midi
(17273, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_06_Track06_wav.midi
(2477, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_08_Track08_wav.midi
(5864, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_10_Track10_wav.midi
(13519, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--AUDIO_05_R1_2004_02_Track02_wav.midi
(38238, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_05_R1_2004_01_ORIG_MID--AUDIO_05_R1_2004_03_Track03_wav.midi
(12024, 4)
/home/scpark/hard/datasets/maestro-v2.0.0/2004/MIDI-Unprocessed_SMF_05_R1_2004_02-03_ORIG_MID--AUDIO_05_R1_2004_06_Track06_wav.midi
(20346, 4)
/home/s

KeyboardInterrupt: 