# Convert MIDI Files to Text

In [27]:
import glob
import pickle
import numpy
from tqdm import tqdm
!pip install music21
from music21 import converter, instrument, note, chord
from keras.utils import np_utils
import time




In [28]:
# Collect every .mid file exactly one directory level below the corpus root.
# The pattern uses forward slashes because glob accepts them on Windows as
# well as on POSIX; a backslash pattern matches nothing outside Windows,
# where '\' is an ordinary filename character.
paths = glob.glob("data/classical_composers/midi/*/*.mid")
# On Windows glob joins the matched components with backslashes, so the
# results are still normalised to forward slashes.
paths = [path.replace('\\', '/') for path in paths]


# paths

In [48]:
class MusicData:
    """Turn MIDI files into integer-encoded training sequences.

    The notes and chords of every file are flattened into one list of string
    tokens. That list is cut into fixed-length sliding windows (``X``), each
    paired with the one-hot encoding of the token that follows it (``y``).

    Attributes:
        notes: Token list; one string per note (its pitch) or chord (its
            normal-order pitch classes joined with '.').
        n_vocab: Number of distinct tokens in ``notes``.
        X: Array of shape ``(n_patterns, sequence_length, 1)`` holding token
            indices divided by ``n_vocab``.
        y: One-hot float32 array with one column per distinct token.
    """

    def __init__(self, paths, sequence_length=32, load=False):
        """Load or parse the token list, then build ``X`` and ``y``.

        Args:
            paths: MIDI file paths to parse; ignored when ``load`` is true.
            sequence_length: Number of tokens in each input window.
            load: If true, read the token list previously pickled to
                'data/notes' instead of parsing ``paths``.
        """
        if load:
            # NOTE(security): pickle.load can execute arbitrary code. Only
            # load a 'data/notes' file that get_notes() wrote itself.
            with open('data/notes', 'rb') as f:
                self.notes = pickle.load(f)
        else:
            print('Parsing midi files:')
            # Presumably lets the message reach the output before tqdm
            # starts drawing its progress bar -- kept from the original.
            time.sleep(1)
            self.notes = self.get_notes(paths)

        # Size of the token vocabulary.
        self.n_vocab = len(set(self.notes))
        # Windowed, encoded data ready for training.
        self.X, self.y = self.notes_to_seq(self.notes, self.n_vocab, sequence_length)

    def get_notes(self, paths):
        """Extract note/chord tokens from MIDI files and pickle the result.

        Args:
            paths: Iterable of MIDI file paths.

        Returns:
            A list of string tokens in file order. The same list is written
            to 'data/notes' with pickle as a side effect.
        """
        notes = []

        for midi_path in tqdm(paths):
            midi = converter.parse(midi_path)

            try:
                # Use the second instrument part when the file has one.
                by_instrument = instrument.partitionByInstrument(midi)
                notes_to_parse = by_instrument.parts[1].recurse()
            except Exception:
                # No usable instrument partition (e.g. fewer than two parts):
                # fall back to the flat note stream. Exception rather than a
                # bare except, so Ctrl-C and SystemExit still propagate.
                notes_to_parse = midi.flat.notes

            for element in notes_to_parse:
                if isinstance(element, note.Note):
                    notes.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    notes.append('.'.join(str(n) for n in element.normalOrder))

        # Cache the parsed tokens so later runs can pass load=True.
        with open('data/notes', 'wb') as filepath:
            pickle.dump(notes, filepath)

        return notes

    def notes_to_seq(self, notes, n_vocab, sequence_length):
        """Build sliding-window inputs and one-hot targets from tokens.

        Args:
            notes: List of string tokens.
            n_vocab: Divisor used to scale the integer-encoded inputs.
            sequence_length: Number of tokens in each input window.

        Returns:
            A tuple ``(X, y)``. ``X`` has shape
            ``(n_patterns, sequence_length, 1)`` and holds token indices
            divided by ``n_vocab``. ``y`` is a float32 one-hot array of shape
            ``(n_patterns, number of distinct tokens)``.

        Raises:
            ValueError: If ``notes`` is not longer than ``sequence_length``,
                so no (window, next token) pair exists.
        """
        n_patterns = len(notes) - sequence_length
        if n_patterns <= 0:
            raise ValueError(
                "need more than %d notes to build a sequence, got %d"
                % (sequence_length, len(notes))
            )

        # Sorted so the token -> integer mapping is deterministic.
        pitchnames = sorted(set(notes))
        note_to_int = {name: index for index, name in enumerate(pitchnames)}

        encoded = [note_to_int[token] for token in notes]

        # Window i covers tokens [i, i + sequence_length); its target is the
        # token immediately after the window.
        X = [encoded[i:i + sequence_length] for i in range(n_patterns)]
        y = encoded[sequence_length:]

        # Reshape the input into a format compatible with LSTM layers.
        X = numpy.reshape(X, (n_patterns, sequence_length, 1))
        # Normalize input.
        X = X / float(n_vocab)

        # One column per vocabulary entry. Inferring the width from max(y)
        # would drop columns whenever the highest-indexed token occurs only
        # inside the first window and is therefore never a target.
        one_hot = numpy.zeros((n_patterns, len(pitchnames)), dtype='float32')
        one_hot[numpy.arange(n_patterns), y] = 1.0

        return (X, one_hot)

In [49]:
# Parse only the first five files, using 100-token input windows.
data = MusicData(paths[:5], sequence_length=100, load=False)

Parsing midi files:
100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


In [54]:
# Shape of the training inputs: (n_patterns, sequence_length, 1).
data.X.shape

(3997, 100, 1)