In [56]:
import pandas as pd
import random
import os

from music21 import converter, instrument, note, chord

In [2]:
# Directory that holds the raw MIDI archive.
# Built with os.path.join so the separator matches the host OS instead of
# hard-coding the Windows backslash.
DataPath = os.path.join("Data", "Immersic_Archive")

# Empty DataFrame that the following cells fill in
df = pd.DataFrame()

In [5]:
# List every entry in the data directory.
# Only bare file names are stored here; the cells that parse the files join
# each name with DataPath themselves.
FilePaths = os.listdir(DataPath)
# print(FilePaths) # Uncomment to see the file names

# Rebuild the DataFrame from scratch so that re-running this cell always
# starts from the full directory listing (assigning a full-length column to
# an already filtered frame would fail with a length mismatch).
df = pd.DataFrame({"file_paths": FilePaths})

# Keep only MIDI files. The original row labels are kept (no reset_index)
# because a later cell looks a row up by its label; .copy() gives an
# independent frame so later column assignments do not warn.
df = df[df["file_paths"].str.endswith(".mid")].copy()
df

Unnamed: 0,file_paths
1,1. MLP - Sweet - C#min9-BMaj9-C#min9-BMaj9-G#7...
2,10. MLP - Old Timer - Em9-A9-DMaj7-B7.mid
3,2. MLP - Jazz - Cmin11-F13-Dmin7-G13.mid
4,3. MLP - Sentimental Melody - GbMaj7-Ebm9.mid
5,4. MLP - Crunchy - Em7-Ebdim-Dmin11-G7b9.mid
...,...
62,LofiPianoSample7.mid
63,LofiPianoSample8.mid
64,LofiPianoSample9.mid
65,merge_from_ofoct.mid


In [33]:
# Scratch check: extract notes and chords from a single MIDI file
test_instruments = []

# Pick one sample file by its row label
test_path = df.loc[37, "file_paths"]

# Parse the MIDI file into a music21 stream object
test_midi = converter.parse(os.path.join(DataPath, test_path))

# Materialise the flattened stream once, then split its elements by type:
# single notes are stored as pitch names, chords as their normal order
# joined with dots
flat_elements = list(test_midi.flat)
test_notes = [
    str(item.pitch)
    for item in flat_elements
    if isinstance(item, note.Note)
]
test_chords = [
    '.'.join(map(str, item.normalOrder))
    for item in flat_elements
    if isinstance(item, chord.Chord)
]

# Show what was extracted
print(test_notes)
print(test_chords)


['B5', 'F#6', 'G5', 'A5', 'B5', 'D6', 'B5', 'F#6', 'G5', 'A5', 'B5', 'D6', 'E5']
['7.9.11.2.4', '11.2', '7.11.2', '7.9.11.2.4', '0.2.4.7.9', '7.9.11.2.4', '11.2', '7.11.2', '7.9.11.2.4', '0.2.4.7.9', '9.0']


In [44]:
# Per-file accumulators filled by the extraction loop: one entry per MIDI file
Notes, Chords, Instruments = [], [], []

In [45]:
# Extract notes, chords and instrument counts from every MIDI file.
#
# The accumulators are reset here so that re-running this cell does not
# append a second copy of every file's data (which would make the column
# assignment below fail with a length mismatch).
Notes, Chords, Instruments = [], [], []

for file in df["file_paths"]:
    notes = []
    chords = []

    midi = converter.parse(os.path.join(DataPath, file))

    try:
        # Preferred path: split the score by instrument and read the first part
        parts = instrument.partitionByInstrument(midi)
        notes_to_parse = parts.parts[0].recurse()
        instrument_count = len(parts.parts)
    except Exception:
        # Best-effort fallback when the file cannot be partitioned (or has
        # no parts): read the notes of the flattened stream instead.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt still stop the loop.
        # NOTE(review): newer music21 releases prefer .flatten() over .flat -
        # confirm the installed version before switching.
        notes_to_parse = midi.flat.notes
        # 0 marks "no instrument partition available"; the count is set here
        # so it never comes from a previous file's `parts`.
        instrument_count = 0

    for element in notes_to_parse:
        if isinstance(element, note.Note):
            # Single notes are stored as their pitch name, e.g. 'F#5'
            notes.append(str(element.pitch))
        elif isinstance(element, chord.Chord):
            # Chords are stored as their normal order joined by dots, e.g. '7.11.2'
            chords.append('.'.join(str(n) for n in element.normalOrder))

    Instruments.append(instrument_count)
    Notes.append(notes)
    Chords.append(chords)

# Attach the results as new columns (one assignment per column is enough)
df["notes"] = Notes
df["chords"] = Chords
df["instruments"] = Instruments

df

[['F#5', 'B4', 'F#5', 'C#5', 'F#5', 'B4', 'G#3', 'G#5', 'F#5'], ['G3', 'E6', 'F#6', 'E6', 'B5', 'F#6', 'F#6', 'A5', 'E6', 'A5', 'B5', 'G3', 'E6', 'F#6', 'E6', 'B5', 'F#6', 'F#6', 'A5', 'E6', 'A5', 'B5'], ['F5', 'E-5', 'A4', 'G4', 'A4', 'A4', 'G4', 'F5', 'E-5', 'A4', 'G4', 'A4', 'A4', 'G4'], [], [], ['B-3', 'C#5', 'B-4', 'F#4', 'E-4', 'F#4', 'G#4', 'C#5', 'B-4', 'F#4', 'E-4', 'F#4', 'G#4', 'C5', 'C#5', 'B-4', 'F#4', 'E-4', 'F#4', 'G#4', 'B-4', 'F#4', 'E-4', 'F4', 'G#4'], ['F#2', 'G#4', 'E4', 'F#2', 'E4', 'B4', 'E4'], ['D2', 'C4', 'E4', 'C2', 'B3', 'D4', 'B3', 'C4', 'E4', 'C4'], ['E5', 'A4', 'C5', 'D5', 'A4', 'C5', 'G5', 'A4', 'C5', 'G5', 'E5', 'A4', 'C5', 'D5', 'C5', 'E5', 'A4', 'C5', 'D5', 'G5', 'G5', 'E5', 'A4', 'C5', 'D5', 'C5'], ['E5', 'E-5', 'E5', 'G#4', 'E5', 'E-5', 'E5', 'A4', 'E5', 'E5', 'E-5', 'E5', 'G#5', 'F#5', 'E5', 'E-5', 'E5', 'G#4', 'E5', 'E-5', 'E5', 'A4', 'E5', 'E5', 'E-5', 'E5'], ['E4', 'G6', 'F6', 'E6', 'A5', 'G5', 'D6', 'E5', 'E5', 'E4', 'G5', 'G3', 'C5', 'D5', 'E4',

Unnamed: 0,file_paths,notes,chords,instruments
1,1. MLP - Sweet - C#min9-BMaj9-C#min9-BMaj9-G#7...,"[F#5, B4, F#5, C#5, F#5, B4, G#3, G#5, F#5]","[8.1, 3.4.8.11, 6.11, 10.1.3.6, 8.1, 3.4.8.11,...",1
2,10. MLP - Old Timer - Em9-A9-DMaj7-B7.mid,"[G3, E6, F#6, E6, B5, F#6, F#6, A5, E6, A5, B5...","[4.6.7.11, 7.11, 6.7.11.2, 2.7, 7.9.1, 2.7, 6....",1
3,2. MLP - Jazz - Cmin11-F13-Dmin7-G13.mid,"[F5, E-5, A4, G4, A4, A4, G4, F5, E-5, A4, G4,...","[10.0.2.3.5.7, 3.5.9, 2.5.9, 9.0.2.5, 11.2.5.7...",1
4,3. MLP - Sentimental Melody - GbMaj7-Ebm9.mid,[],[],2
5,4. MLP - Crunchy - Em7-Ebdim-Dmin11-G7b9.mid,[],"[11.2.4.7, 0, 11.0.3.6, 9, 4.7.9.0, 5, 4.5.8.1...",1
...,...,...,...,...
62,LofiPianoSample7.mid,"[E5, A5, A5, F#5, D5, D5, E5, E5, A5, D5, D5, ...","[11.2.6, 7.9.2, 11.2.6, 7.9.2, 9.1, 11.2.6, 6....",1
63,LofiPianoSample8.mid,"[B4, C#5, G#4, F#4, B4, C#5, F#5, E5, D5, C#5,...","[6.9.1, 4.6.9, 6.9.11, 2.5, 1.4, 7.9, 9.1.2, 7...",1
64,LofiPianoSample9.mid,"[C6, B5, A5, G5, E-4, A5, C6, B5, A5, G5, D6, ...","[9.2, 6.7.11, 6.7.11, 11.2.4, 2.4.7, 9.0.4, 6....",1
65,merge_from_ofoct.mid,[],[],2


In [46]:
# Remove files for which no notes or no chords were extracted
has_no_notes = df["notes"].map(len) == 0
has_no_chords = df["chords"].map(len) == 0
df.drop(index=df.index[has_no_notes | has_no_chords], inplace=True)

df

Unnamed: 0,file_paths,notes,chords,instruments
1,1. MLP - Sweet - C#min9-BMaj9-C#min9-BMaj9-G#7...,"[F#5, B4, F#5, C#5, F#5, B4, G#3, G#5, F#5]","[8.1, 3.4.8.11, 6.11, 10.1.3.6, 8.1, 3.4.8.11,...",1
2,10. MLP - Old Timer - Em9-A9-DMaj7-B7.mid,"[G3, E6, F#6, E6, B5, F#6, F#6, A5, E6, A5, B5...","[4.6.7.11, 7.11, 6.7.11.2, 2.7, 7.9.1, 2.7, 6....",1
3,2. MLP - Jazz - Cmin11-F13-Dmin7-G13.mid,"[F5, E-5, A4, G4, A4, A4, G4, F5, E-5, A4, G4,...","[10.0.2.3.5.7, 3.5.9, 2.5.9, 9.0.2.5, 11.2.5.7...",1
7,5. MLP - Confident - Ebmin7-Fmin7-DbMaj7.mid,"[B-3, C#5, B-4, F#4, E-4, F#4, G#4, C#5, B-4, ...","[1.3.6, 5.8.0, 10.1.3.6, 5.8.0, 10.1.3.6, 5.8....",1
8,6. MLP - Bright - F#min9-EMaj7-C#7.mid,"[F#2, G#4, E4, F#2, E4, B4, E4]","[9.1.4, 3.4.8.11, 11.1.5, 9.1, 3.4.8.11, 5.8.1...",1
...,...,...,...,...
61,LofiPianoSample6.mid,"[D5, B-4, F4, G4, B-4, G#4, G4, G#4, F4, E-4, ...","[0.2.3.7, 7.10.0.3, 0.2.3.7, 5.8.0, 10.2.5, 5....",1
62,LofiPianoSample7.mid,"[E5, A5, A5, F#5, D5, D5, E5, E5, A5, D5, D5, ...","[11.2.6, 7.9.2, 11.2.6, 7.9.2, 9.1, 11.2.6, 6....",1
63,LofiPianoSample8.mid,"[B4, C#5, G#4, F#4, B4, C#5, F#5, E5, D5, C#5,...","[6.9.1, 4.6.9, 6.9.11, 2.5, 1.4, 7.9, 9.1.2, 7...",1
64,LofiPianoSample9.mid,"[C6, B5, A5, G5, E-4, A5, C6, B5, A5, G5, D6, ...","[9.2, 6.7.11, 6.7.11, 11.2.4, 2.4.7, 9.0.4, 6....",1


In [55]:
# Test the data type of some random entry.
# Rows were dropped earlier, so the index labels are no longer contiguous:
# pick a label that actually exists instead of a random integer, which could
# hit a missing label (or one past the end) and raise KeyError.
x = df.index[random.randrange(len(df))]
print(f"Entry {x}'s data types:")
print(type(df.loc[x, "file_paths"]))
print(type(df.loc[x, "notes"]))
print(type(df.loc[x, "chords"]))
print(type(df.loc[x, "instruments"]))


Entry 47's data types:
<class 'str'>
<class 'list'>
<class 'list'>
<class 'numpy.int64'>


In [50]:
# Save the preprocessed table as CSV.
# The path is built with os.path.join so it works on any OS, and the output
# directory is created first so the write does not fail when it is missing.
output_dir = os.path.join("Data", "Preprocessed")
os.makedirs(output_dir, exist_ok=True)
df.to_csv(os.path.join(output_dir, "preprocessed_data.csv"), index=False)