# Data Preprocessing

Our dataset is made of files in MIDI format. We need to process them to obtain the relevant features of each songs.

### Libraries

In [1]:
import ast
import os
import re
import shutil
from collections import Counter
from fractions import Fraction
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from music21 import chord, converter, instrument, key, meter, note, tempo

### Useful functions

Since the data was too heavy, we save the dataset into two separate CSV files. The following functions are for easier access

In [7]:
def load_dataframe_from_two_csvs(file1, file2):
    """
    Load and concatenate two CSV files into a single pandas DataFrame.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    full_df = pd.concat([df1, df2], ignore_index=True)
    return full_df

def save_dataframe_to_two_csvs(df, file1, file2):
    """
    Split a DataFrame in half and save it into two CSV files.
    """
    halfway = len(df) // 2
    df.iloc[:halfway].to_csv(file1, index=False)
    df.iloc[halfway:].to_csv(file2, index=False)


## Parse MIDI files and save into CSV format

We copy the files into a single folder

In [None]:
source_dirs = [Path("raw_data/live").resolve(), Path("raw_data/studio").resolve()]
destination_dir = Path("data_alltogheter").resolve()

os.makedirs(destination_dir, exist_ok=True)

file_id = 0

for source_dir in source_dirs:
    for root, _, files in os.walk(source_dir):
        for file in files:
            if file.lower().endswith((".mid", ".midi")):
                source_file = Path(root) / file
                new_filename = f"{file_id}_{file}"
                destination_file = destination_dir / new_filename

                try:
                    shutil.copy2(source_file, destination_file)
                    print(f"Copied: {source_file} → {destination_file}")
                    file_id += 1
                except Exception as e:
                    print(f"Failed to copy {source_file}: {e}")

print("Done copying MIDI files.")


We parse the files, retrieve the information from each MIDI and save it in CSV format

In [None]:
def parse_midi_file(file_path):
    score = converter.parse(file_path)
    
    midi_data = {
        'file_name': os.path.basename(file_path),
        'instrument': [],
        'notes': [],
        'chords': [],
        'velocities': [],
        'durations': [],
        'offsets': [],
        'tempos': [],
        'time_signatures': [],
        'key_signatures': [],
        'track_names': []
    }
    
    for element in score.flat:
        if isinstance(element, tempo.MetronomeMark):
            midi_data['tempos'].append(element.getQuarterBPM())
        if isinstance(element, meter.TimeSignature):
            midi_data['time_signatures'].append(element.ratioString)
        if isinstance(element, key.KeySignature):
            midi_data['key_signatures'].append(element.sharps)
    
    for part in score.parts:
        part_instrument = part.getInstrument().instrumentName or 'Unknown'
        midi_data['instrument'].append(part_instrument)
        
        track_name = part.partName or 'Unnamed Track'
        midi_data['track_names'].append(track_name)
        
        for elem in part.recurse().notes:
            if isinstance(elem, note.Note):
                midi_data['notes'].append(elem.pitch.midi)
                midi_data['durations'].append(elem.quarterLength)
                midi_data['offsets'].append(elem.offset)
                midi_data['velocities'].append(elem.volume.velocity)
            elif isinstance(elem, chord.Chord):
                chord_pitches = [p.midi for p in elem.pitches]
                midi_data['chords'].append(chord_pitches)
                midi_data['durations'].append(elem.quarterLength)
                midi_data['offsets'].append(elem.offset)
                midi_data['velocities'].append(elem.volume.velocity)
    
    return midi_data


def process_midi_files(midi_dir):
    all_data = []
    
    for root, _, files in os.walk(midi_dir):
        for file in files:
            if file.lower().endswith('.mid') or file.lower().endswith('.midi'):
                file_path = os.path.join(root, file)
                midi_data = parse_midi_file(file_path)
                all_data.append(midi_data)
    
    return all_data


def save_data_to_csv(all_data, output_file):
    df = pd.DataFrame(all_data)
    
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

midi_directory = 'data_alltogheter'
output_csv_file = 'data_preprocessed/data.csv'

all_midi_data = process_midi_files(midi_directory)

save_data_to_csv(all_midi_data, output_csv_file)

  midi_data = parse_midi_file(file_path)


Data saved to data_preprocessed/data.csv


## Preliminary Data Exploration and Cleaning

We want to have a preliminary look at the values we have collected to check if we have to already do any preprocessing (missing values, element type does not correspond to what we expect, etc)

In [13]:
root = 'data_preprocessed/'
file1 = root + 'data_part1.csv'
file2 = root + 'data_part2.csv'

df = load_dataframe_from_two_csvs(file1, file2)

print("The columns in the dataset are:")
print(df.columns)
print("The number of values in the dataset is:", len(df))

if (len(df) == len(df1) + len(df2)):
    print("The number of values in the dataset is correct.")

The columns in the dataset are:
Index(['file_name', 'instrument', 'track_names', 'notes', 'chords',
       'velocities', 'durations', 'offsets', 'tempos', 'time_signatures',
       'key_signatures', 'ordered_events'],
      dtype='object')
The number of values in the dataset is: 2775
The number of values in the dataset is correct.


We go through each of the attributes we have collected from the MIDI format:
- Instrument
- Notes
- Chords
- Velocities
- Durations
- Offsets
- Tempos
- Time_signatures
- Key_signatures

### Instrument

In [39]:
print("Instrument column summary:")
print(df['instrument'].describe())
print("Unique values:", df['instrument'].nunique())
print("Most common values:")
print(df['instrument'].value_counts().head(10))

Instrument column summary:
count            2775
unique              1
top       ['Unknown']
freq             2775
Name: instrument, dtype: object
Unique values: 1
Most common values:
instrument
['Unknown']    2775
Name: count, dtype: int64


We don't have any instrument information in the dataset, but judging from the name of the dataset it is probably piano

### Notes

In [45]:
def midi_to_name_note(n):
    try:
        return note.Note(n).nameWithOctave
    except:
        return str(n)

In [46]:
all_notes = []

for row in df['notes'].dropna():
    if isinstance(row, str):
        try:
            notes_list = ast.literal_eval(row)
        except Exception:
            continue
    else:
        notes_list = row
    all_notes.extend(notes_list)

notes_series = pd.Series(all_notes)
top_notes = notes_series.value_counts().head(10)

print("Summary of all notes in the dataset:")
print(notes_series.describe())

print("\nMost common notes:")
for midi_val, count in top_notes.items():
    print(f"{midi_to_name_note(midi_val)} ({midi_val}): {count} times")

Summary of all notes in the dataset:
count    2.730308e+06
mean     6.216901e+01
std      1.432184e+01
min      2.100000e+01
25%      5.300000e+01
50%      6.200000e+01
75%      7.200000e+01
max      1.080000e+02
dtype: float64

Most common notes:
F4 (65): 95563 times
G4 (67): 95028 times
C4 (60): 94973 times
D4 (62): 89063 times
C5 (72): 87045 times
B-4 (70): 78955 times
E-4 (63): 78195 times
A4 (69): 76656 times
G3 (55): 75575 times
E4 (64): 74487 times


### Chords

In [49]:
def midi_to_chord(n):
    try:
        return chord.Chord(midin_notes).pitchedCommonName
    except:
        return str(n)

In [None]:
all_chords = []

for row in df['chords'].dropna():
    if isinstance(row, str):
        try:
            notes_list = ast.literal_eval(row)
        except Exception:
            continue
    else:
        notes_list = row
    all_chords.extend(notes_list)

chords_series = pd.Series(all_chords)
top_chords = chords_series.value_counts().head(10)

print("Summary of all chords in the dataset:")
print(chords_series.describe())

print("\nMost common chords:")
for midi_val, count in top_chords.items():
    print(f"{midi_to_chord(midi_val)} ({midi_val}): {count} times")


Most common chords:
[60, 65] ([60, 65]): 3818 times
[60, 63] ([60, 63]): 3729 times
[62, 65] ([62, 65]): 3688 times
[58, 70] ([58, 70]): 3654 times
[54, 60] ([54, 60]): 3594 times
[55, 60] ([55, 60]): 3578 times
[58, 63] ([58, 63]): 3576 times
[60, 72] ([60, 72]): 3569 times
[57, 62] ([57, 62]): 3545 times
[55, 67] ([55, 67]): 3406 times


### Velocities

In [22]:
print("Velocities column summary:")
print(df['velocities'].describe())
print("Unique values:", df['velocities'].nunique())
print("Most common values:")
print(df['velocities'].value_counts().head(10))

Velocities column summary:
count                                                  2775
unique                                                 2775
top       [51, 35, 52, 48, 48, 50, 50, 50, 49, 42, 56, 4...
freq                                                      1
Name: velocities, dtype: object
Unique values: 2775
Most common values:
velocities
[51, 35, 52, 48, 48, 50, 50, 50, 49, 42, 56, 49, 49, 62, 39, 45, 56, 53, 52, 49, 43, 64, 50, 61, 64, 59, 55, 71, 60, 55, 56, 53, 55, 53, 52, 56, 59, 59, 50, 49, 51, 49, 50, 46, 46, 55, 61, 48, 54, 56, 51, 62, 66, 55, 66, 62, 65, 59, 51, 59, 63, 56, 64, 54, 49, 62, 50, 69, 58, 53, 56, 46, 68, 60, 50, 51, 53, 49, 60, 50, 43, 44, 53, 55, 57, 55, 58, 62, 59, 56, 50, 51, 52, 55, 49, 49, 50, 55, 52, 47, 54, 58, 50, 52, 61, 51, 63, 50, 55, 45, 44, 54, 59, 56, 71, 59, 68, 60, 42, 47, 50, 63, 58, 56, 64, 57, 66, 55, 62, 58, 65, 69, 69, 56, 73, 48, 57, 54, 49, 52, 60, 51, 53, 49, 61, 57, 69, 56, 55, 64, 72, 61, 64, 44, 62, 60, 53, 58, 71, 51, 63, 54, 6

### Durations

In [14]:
print("Durations column summary:")
print(df['durations'].describe())
print("Unique values:", df['durations'].nunique())
print("Most common values:")
print(df['durations'].value_counts().head(10))

Durations column summary:
count                                                  2775
unique                                                 2775
top       [0.25, 2.5, 0.25, Fraction(1, 3), Fraction(1, ...
freq                                                      1
Name: durations, dtype: object
Unique values: 2775
Most common values:
durations
[0.25, 2.5, 0.25, Fraction(1, 3), Fraction(1, 3), 0.25, Fraction(1, 3), 0.25, Fraction(1, 3), 0.75, 1.5, Fraction(1, 3), 0.25, 1.0, Fraction(1, 3), Fraction(1, 3), 1.75, 0.25, Fraction(17, 12), Fraction(1, 3), 1.0, 0.25, 0.25, 0.25, 0.25, Fraction(1, 3), Fraction(1, 3), Fraction(1, 3), Fraction(1, 3), Fraction(2, 3), 0.25, 0.25, 0.25, Fraction(1, 3), 2.0, Fraction(1, 3), Fraction(4, 3), Fraction(1, 3), Fraction(1, 3), 1.0, 0.25, 2.0, 1.25, 0.25, 0.5, Fraction(17, 12), Fraction(1, 3), 0.5, 0.25, 0.25, 2.0, 1.5, Fraction(1, 3), Fraction(1, 3), 0.25, 0.25, 2.0, 0.25, 1.75, 0.25, Fraction(1, 3), Fraction(1, 3), Fraction(1, 3), Fraction(1, 3), Fracti

We notice that in the column Durations there are Fraction objects that we want to evaluate as floats

In [15]:
def parse_and_flatten_durations(durations_column, precision=4):
    all_durations = []
    cleaned_rows = []

    for row in durations_column.dropna():
        cleaned_row = []
        if isinstance(row, str):
            input_str = row.strip()[1:-1]
            try:
                parsed_items = eval(f"[{input_str}]", {"Fraction": Fraction})
            except Exception:
                parsed_items = []

            for item in parsed_items:
                rounded = round(float(item), precision)
                cleaned_row.append(rounded)
                all_durations.append(rounded)

        else:
            for item in row:
                try:
                    rounded = round(float(item), precision)
                    cleaned_row.append(rounded)
                    all_durations.append(rounded)
                except Exception:
                    continue

        cleaned_rows.append(cleaned_row)

    durations_column.update(pd.Series([str(row) for row in cleaned_rows], index=durations_column.dropna().index))

    return all_durations

all_durations = parse_and_flatten_durations(df['durations'])
print(f"Mean duration: {np.mean(all_durations):.4f}")
print(f"Standard deviation of durations: {np.std(all_durations):.4f}")

Mean duration: 0.9670
Standard deviation of durations: 1.2823


In [None]:
root = 'data_preprocessed/'
file1 = root + 'data_part2_1.csv'
file2 = root + 'data_part2_2.csv'
save_dataframe_to_two_csvs(df, file1, file2)

### Offsets

In [16]:
print("Offsets column summary:")
print(df['offsets'].describe())
print("Unique values:", df['offsets'].nunique())
print("Most common values:")
print(df['offsets'].value_counts().head(10))

Offsets column summary:
count                                                  2775
unique                                                 2775
top       [0.5, 1.25, 3.75, Fraction(5, 3), Fraction(7, ...
freq                                                      1
Name: offsets, dtype: object
Unique values: 2775
Most common values:
offsets
[0.5, 1.25, 3.75, Fraction(5, 3), Fraction(7, 3), 2.75, 3.5, 0.25, 0.75, Fraction(4, 3), 2.25, Fraction(4, 3), 2.0, 2.25, Fraction(10, 3), Fraction(11, 3), 2.25, Fraction(7, 3), 0.0, Fraction(1, 3), Fraction(2, 3), 1.75, 2.0, 2.25, 2.5, 3.0, Fraction(11, 3), 1.0, Fraction(4, 3), 3.0, 0.0, 0.0, 0.5, 1.0, Fraction(4, 3), 3.5, 0.0, 2.0, Fraction(7, 3), 2.75, 3.75, 0.5, 2.75, 2.75, 3.0, 0.0, Fraction(1, 3), 0.75, 1.25, 1.75, 2.0, Fraction(1, 3), Fraction(8, 3), 3.0, Fraction(10, 3), 3.75, Fraction(1, 3), 3.5, 0.0, 0.25, 0.5, Fraction(4, 3), 1.75, Fraction(7, 3), Fraction(8, 3), 3.5, 0.75, 1.75, Fraction(8, 3), 3.0, 3.25, 3.5, 0.75, 2.0, Fraction(11, 3), F

We notice that in the column Offsets there are Fraction objects that we want to evaluate as floats

In [17]:
def parse_and_flatten_offsets(offsets_column, precision=4):
    all_offsets = []
    cleaned_rows = []

    for row in offsets_column.dropna():
        cleaned_row = []
        if isinstance(row, str):
            input_str = row.strip()[1:-1]
            try:
                parsed_items = eval(f"[{input_str}]", {"Fraction": Fraction})
            except Exception:
                parsed_items = []

            for item in parsed_items:
                rounded = round(float(item), precision)
                cleaned_row.append(rounded)
                all_offsets.append(rounded)

        else:
            for item in row:
                try:
                    rounded = round(float(item), precision)
                    cleaned_row.append(rounded)
                    all_offsets.append(rounded)
                except Exception:
                    continue

        cleaned_rows.append(cleaned_row)

    offsets_column.update(pd.Series([str(row) for row in cleaned_rows], index=offsets_column.dropna().index))

    return all_offsets

all_offsets = parse_and_flatten_offsets(df['offsets'])
print(f"Mean offset: {np.mean(all_offsets):.4f}")
print(f"Standard deviation of offset: {np.std(all_offsets):.4f}")

Mean offset: 1.5558
Standard deviation of offsets: 1.2686


In [18]:
root = 'data_preprocessed/'
file1 = root + 'data_part2_1.csv'
file2 = root + 'data_part2_2.csv'
save_dataframe_to_two_csvs(df, file1, file2)

### Tempos

In [19]:
print("Tempos column summary:")
print(df['tempos'].describe())
print("Unique values:", df['tempos'].nunique())
print("Most common values:")
print(df['tempos'].value_counts().head(10))

Tempos column summary:
count        2775
unique          1
top       [120.0]
freq         2775
Name: tempos, dtype: object
Unique values: 1
Most common values:
tempos
[120.0]    2775
Name: count, dtype: int64


### Time signatures

In [23]:
print("Time signatures column summary:")
print(df['time_signatures'].describe())
print("Unique values:", df['time_signatures'].nunique())
print("Most common values:")
print(df['time_signatures'].value_counts().head(10))

Time signatures column summary:
count        2775
unique          1
top       ['4/4']
freq         2775
Name: time_signatures, dtype: object
Unique values: 1
Most common values:
time_signatures
['4/4']    2775
Name: count, dtype: int64


### Key signatures

In [21]:
print("Key signatures column summary:")
print(df['key_signatures'].describe())
print("Unique values:", df['key_signatures'].nunique())
print("Most common values:")
print(df['key_signatures'].value_counts().head(10))

Key signatures column summary:
count     2775
unique       1
top         []
freq      2775
Name: key_signatures, dtype: object
Unique values: 1
Most common values:
key_signatures
[]    2775
Name: count, dtype: int64


### Ordered events

The ordered events column gives us the order at which notes and chord alternate in each song

In [24]:
print("Ordered events column summary:")
print(df['ordered_events'].describe())
print("Unique values:", df['ordered_events'].nunique())
print("Most common values:")
print(df['ordered_events'].value_counts().head(10))

Ordered events column summary:
count                                                  2775
unique                                                 2775
top       ['n', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'n', ...
freq                                                      1
Name: ordered_events, dtype: object
Unique values: 2775
Most common values:
ordered_events
['n', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'n', 'n', 'c', 'c', 'n', 'n', 'n', 'n', 'c', 'n', 'n', 'n', 'n', 'c', 'c', 'c', 'c', 'n', 'n', 'c', 'c', 'c', 'c', 'c', 'n', 'c', 'n', 'n', 'n', 'n', 'c', 'c', 'n', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'n', 'n', 'n', 'c', 'c', 'c', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'c', 'c', 'c', 'n', 'n', 'n', 'c', 'c', 'n', 'n', 'n', 'c', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'c', 'c', 'c', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'c', 'n', 'n', 'c', 'n', 'n', 'c', 'c', 'n', 'n', 'c', 'n', 'c', 'c', 'n', 'n', 'n', 'c

In [40]:
def parse_and_flatten_events(events_column, precision=4):
    events_count = dict()
    events_count['n'] = 0
    events_count['c'] = 0


    for row in events_column.dropna():
        cleaned_row = []
        if isinstance(row, str):
            input_str = row.strip()[1:-1]
            try:
                parsed_items = input_str.split(',')
            except Exception:
                parsed_items = []

            for item in parsed_items:
                item = item.strip()
                item = item.strip("\'\"")
                events_count[item] += 1

    return events_count

all_events = parse_and_flatten_events(df['ordered_events'])
print(f"Fraction of notes over total events: {(all_events['n'] / (all_events['n'] + all_events['c'])):.4f}")
print(f"Fraction of chords over total events: {(all_events['c'] / (all_events['n'] + all_events['c'])):.4f}")

Fraction of notes over total events: 0.6045
Fraction of chords over total events: 0.3955


## Save to CSV

In [42]:
root = 'data_processed/'
file1 = root + 'data_part1.csv'
file2 = root + 'data_part2.csv'
save_dataframe_to_two_csvs(df, file1, file2)