# Part 1 - Feature Extraction (MIDI)

In [1]:
import pandas as pd
import numpy as np
import mido

from sklearn.model_selection import train_test_split

In [None]:
dfx = pd.read_csv("./maestro-v3.0.0-midi/maestro-v3.0.0/maestro-v3.0.0.csv")
dfx = dfx[['midi_filename', 'canonical_composer']]

In [2]:
df = dfx
df

Unnamed: 0,midi_filename,canonical_composer
0,2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R...,Alban Berg
1,2008/MIDI-Unprocessed_03_R2_2008_01-03_ORIG_MI...,Alban Berg
2,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,Alban Berg
3,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,Alexander Scriabin
4,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,Alexander Scriabin
...,...,...
1271,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,Wolfgang Amadeus Mozart
1272,2004/MIDI-Unprocessed_XP_14_R1_2004_04_ORIG_MI...,Wolfgang Amadeus Mozart
1273,2004/MIDI-Unprocessed_SMF_12_01_2004_01-05_ORI...,Wolfgang Amadeus Mozart
1274,2018/MIDI-Unprocessed_Recital17-19_MID--AUDIO_...,Wolfgang Amadeus Mozart


# Modules

In [3]:
def extract_notes_mido(midi_file):
    try:
        mid = mido.MidiFile(midi_file)
        midi_notes = []
        for msg in mid:
            if msg.type == 'note_on':
                midi_notes.append(msg.note)
        return midi_notes
    except Exception as e:
        # Log the error
        print(f"Error processing {midi_file}: {e}")
        return None

i = 1
notes_list = []
for index, row in df.iterrows():
    if i % 10 == 0:
        print(f"Processed {i} files")
    i += 1
    
    # Extract notes
    notes = extract_notes_mido("./maestro-v3.0.0-midi/maestro-v3.0.0/" + row['midi_filename'])
    if notes is not None:
        notes_list.append(notes)
    else:
        notes_list.append([])  # Append an empty list if notes are None

# list ---> DataFrame
df['notes'] = notes_list


df.drop(columns=['midi_filename'], inplace=True)
print(df)

Processed 10 files
Processed 20 files
Processed 30 files
Processed 40 files
Processed 50 files
Processed 60 files
Processed 70 files
Processed 80 files
Processed 90 files
Processed 100 files
Processed 110 files
Processed 120 files
Processed 130 files
Processed 140 files
Processed 150 files
Processed 160 files
Processed 170 files
Processed 180 files
Processed 190 files
Processed 200 files
Processed 210 files
Processed 220 files
Processed 230 files
Processed 240 files
Processed 250 files
Processed 260 files
Processed 270 files
Processed 280 files
Processed 290 files
Processed 300 files
Processed 310 files
Processed 320 files
Processed 330 files
Processed 340 files
Processed 350 files
Processed 360 files
Processed 370 files
Processed 380 files
Processed 390 files
Processed 400 files
Processed 410 files
Processed 420 files
Processed 430 files
Processed 440 files
Processed 450 files
Processed 460 files
Processed 470 files
Processed 480 files
Processed 490 files
Processed 500 files
Processed

In [4]:
len(df['notes'][3])

12632

# One Hot Encode Composers

In [5]:
# One-hot encode Composers
one_hot_encoded_labels = pd.get_dummies(df['canonical_composer'])

# Concatenate : one-hot encoded ---> orig DataFrame
df = pd.concat([df, one_hot_encoded_labels], axis=1)

df.drop(columns=['canonical_composer'], inplace=True)
print(df)

                                                  notes  Alban Berg  \
0     [67, 72, 67, 72, 78, 71, 61, 67, 67, 61, 78, 7...           1   
1     [67, 72, 67, 78, 61, 67, 71, 72, 79, 71, 78, 6...           1   
2     [67, 72, 67, 78, 61, 71, 67, 72, 79, 78, 71, 6...           1   
3     [70, 66, 58, 48, 49, 48, 53, 58, 70, 54, 49, 7...           0   
4     [52, 62, 58, 80, 62, 52, 58, 80, 81, 81, 82, 6...           0   
...                                                 ...         ...   
1271  [73, 56, 53, 73, 72, 72, 73, 73, 72, 73, 72, 7...           0   
1272  [77, 81, 65, 77, 72, 81, 65, 72, 76, 79, 70, 7...           0   
1273  [72, 72, 70, 69, 70, 67, 69, 65, 67, 69, 65, 6...           0   
1274  [72, 72, 70, 69, 70, 69, 67, 67, 65, 69, 65, 6...           0   
1275  [79, 48, 48, 52, 52, 48, 77, 79, 76, 77, 48, 7...           0   

      Alexander Scriabin  Antonio Soler  Carl Maria von Weber  \
0                      0              0                     0   
1                

In [6]:
# Prepare the data
df_notes = pd.DataFrame(df['notes'].values.tolist())  # Expand the 'notes' column into separate columns
X = df_notes.fillna(0).values  # Fill NaN values with 0 and convert to numpy array
y = df.drop(columns=['notes']).iloc[:,:].values  # Extract labels and convert to numpy array

In [7]:
X

array([[67., 72., 67., ...,  0.,  0.,  0.],
       [67., 72., 67., ...,  0.,  0.,  0.],
       [67., 72., 67., ...,  0.,  0.,  0.],
       ...,
       [72., 72., 70., ...,  0.,  0.,  0.],
       [72., 72., 70., ...,  0.,  0.,  0.],
       [79., 48., 48., ...,  0.,  0.,  0.]])

In [8]:
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

# Storing the Dataset

In [10]:
df.to_csv("out.tsv", sep='\t', index=False)