# Convert Midi Files to Text

In [1]:
import glob
import pickle
from tqdm import tqdm
import time
import numpy

# !pip install music21
from music21 import converter, instrument, note, chord
from keras.utils import np_utils

from collections import defaultdict
import multiprocessing as mp
import concurrent
print("Number of processors: ", mp.cpu_count())

Number of processors:  8


In [21]:
def extract_notes(filepath):
    ''' Extract notes from a single MIDI file'''
    notes = []
    # print("Parsing %s" % file)

    midi = converter.parse(filepath)

    notes_to_parse = None

    try: # file has instrument parts
        s2 = instrument.partitionByInstrument(midi)
        notes_to_parse = s2.parts[1].recurse() 
    except: # file has notes in a flat structure
        notes_to_parse = midi.flat.notes

    for element in notes_to_parse:
        if isinstance(element, note.Note):
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            notes.append('.'.join(str(n) for n in element.normalOrder))
    print(f'Conversion of "{filepath}" completed')

    with open(filepath.replace('.mid',''), 'wb') as f:
        pickle.dump(notes, f)

    # return notes

In [32]:
def func(path):
    print(path)
    return path[:10]

In [33]:
""" Get all the notes and chords from the midi files"""

# notes = []

#divide list of paths in n equal parts where n = number of cpus available
n = int(len(paths) / mp.cpu_count()) 
paths_split = [paths[i * n:(i + 1) * n] for i in range((len(paths) + n - 1) // n )]  

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(func, paths_split)



In [30]:
paths = glob.glob(r"data\classical_composers\midi\*\*.mid")
paths = [path.replace('\\','/') for path in paths]

# paths
paths = paths[:16]

In [22]:
extract_notes(paths[1])

Conversion of "data/classical_composers/midi/albeniz/alb_esp2.mid" completed


In [27]:
""" Get all the notes and chords from the midi files"""

# notes = []

#divide list of paths in n equal parts where n = number of cpus available
n = int(len(paths) / mp.cpu_count()) 
paths_split = [paths[i * n:(i + 1) * n] for i in range((len(paths) + n - 1) // n )]  

with concurrent.futures.ProcessPoolExecutor() as executor:
    # results = [executor.submit(extract_notes, paths_split[i]) for i in range(mp.cpu_count())]
    results = executor.map(extract_notes, paths_split)

    # for f in concurrent.futures.as_completed(results):
    #     print(f.result())
# #save as pickle file
# with open('data/notes', 'wb') as filepath:
#     pickle.dump(notes, filepath)

In [28]:
for r in results:
    print(r)

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

In [48]:
class MusicData:
    def __init__(self, 
                 paths, 
                 sequence_length = 32, 
                 load = False):

        
        if load: #load already pre-processed data
            with open('data/notes', 'rb') as f:
                self.notes = pickle.load(f)

        else: #pre-process
            print('Parsing midi files:')
            time.sleep(1)
            self.notes = self.get_notes(paths)

        # get amount of pitch names
        self.n_vocab = len(set(self.notes))
        # prepare data for training
        self.X, self.y = self.notes_to_seq(self.notes, self.n_vocab, sequence_length)

    # https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5
    def get_notes(self, paths):
        """ Get all the notes and chords from the midi files"""
        notes = []

        #loop over each file path
        for file in tqdm(paths):
            midi = converter.parse(file)

            # print("Parsing %s" % file)

            notes_to_parse = None

            try: # file has instrument parts
                s2 = instrument.partitionByInstrument(midi)
                notes_to_parse = s2.parts[1].recurse() 
            except: # file has notes in a flat structure
                notes_to_parse = midi.flat.notes

            for element in notes_to_parse:
                if isinstance(element, note.Note):
                    notes.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    notes.append('.'.join(str(n) for n in element.normalOrder))

        #save as pickle file
        with open('data/notes', 'wb') as filepath:
            pickle.dump(notes, filepath)

        return notes



    def notes_to_seq(self, notes, n_vocab,sequence_length):
        """ Prepare the sequences used by the model """

        # get all pitch names
        pitchnames = sorted(set(item for item in notes))

        # create a dictionary to map pitches to integers
        note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

        X = []
        y = []

        # create input sequences and the corresponding outputs
        for i in range(0, len(notes) - sequence_length):
            sequence_in = notes[i:i + sequence_length]
            sequence_out = notes[i + sequence_length]
            X.append([note_to_int[char] for char in sequence_in])
            y.append(note_to_int[sequence_out])

        n_patterns = len(X)

        # reshape the input into a format compatible with LSTM layers
        X = numpy.reshape(X, (n_patterns, sequence_length, 1))
        # normalize input
        X = X / float(n_vocab)

        y = np_utils.to_categorical(y)

        return (X, y)

In [49]:
data = MusicData(paths[:5],100,load=False)

Parsing midi files:
100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


In [54]:
data.X.shape

(3997, 100, 1)