# Data Preprocessing

Our dataset is made of files in MIDI format. We need to process them to obtain the relevant features of each songs.

### Libraries

In [None]:
pip install seaborn

In [7]:
import ast
import os
import re
import shutil
from collections import Counter
from fractions import Fraction
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from music21 import chord, converter, instrument, key, meter, note, tempo

## Parse MIDI files and save into CSV format

We copy the files into a single folder

In [None]:
source_dirs = [Path("raw_data/live").resolve(), Path("raw_data/studio").resolve()]
destination_dir = Path("data_alltogheter").resolve()

os.makedirs(destination_dir, exist_ok=True)

file_id = 0

for source_dir in source_dirs:
    for root, _, files in os.walk(source_dir):
        for file in files:
            if file.lower().endswith((".mid", ".midi")):
                source_file = Path(root) / file
                new_filename = f"{file_id}_{file}"
                destination_file = destination_dir / new_filename

                try:
                    shutil.copy2(source_file, destination_file)
                    print(f"Copied: {source_file} → {destination_file}")
                    file_id += 1
                except Exception as e:
                    print(f"Failed to copy {source_file}: {e}")

print("Done copying MIDI files.")


We parse the files, retrieve the information from each MIDI and save it in CSV format

In [None]:
def parse_midi_file(file_path):
    score = converter.parse(file_path)
    
    midi_data = {
        'file_name': os.path.basename(file_path),
        'instrument': [],
        'notes': [],
        'chords': [],
        'velocities': [],
        'durations': [],
        'offsets': [],
        'tempos': [],
        'time_signatures': [],
        'key_signatures': [],
        'track_names': []
    }
    
    for element in score.flat:
        if isinstance(element, tempo.MetronomeMark):
            midi_data['tempos'].append(element.getQuarterBPM())
        if isinstance(element, meter.TimeSignature):
            midi_data['time_signatures'].append(element.ratioString)
        if isinstance(element, key.KeySignature):
            midi_data['key_signatures'].append(element.sharps)
    
    for part in score.parts:
        part_instrument = part.getInstrument().instrumentName or 'Unknown'
        midi_data['instrument'].append(part_instrument)
        
        track_name = part.partName or 'Unnamed Track'
        midi_data['track_names'].append(track_name)
        
        for elem in part.recurse().notes:
            if isinstance(elem, note.Note):
                midi_data['notes'].append(elem.pitch.midi)
                midi_data['durations'].append(elem.quarterLength)
                midi_data['offsets'].append(elem.offset)
                midi_data['velocities'].append(elem.volume.velocity)
            elif isinstance(elem, chord.Chord):
                chord_pitches = [p.midi for p in elem.pitches]
                midi_data['chords'].append(chord_pitches)
                midi_data['durations'].append(elem.quarterLength)
                midi_data['offsets'].append(elem.offset)
                midi_data['velocities'].append(elem.volume.velocity)
    
    return midi_data


def process_midi_files(midi_dir):
    all_data = []
    
    for root, _, files in os.walk(midi_dir):
        for file in files:
            if file.lower().endswith('.mid') or file.lower().endswith('.midi'):
                file_path = os.path.join(root, file)
                midi_data = parse_midi_file(file_path)
                all_data.append(midi_data)
    
    return all_data


def save_data_to_csv(all_data, output_file):
    df = pd.DataFrame(all_data)
    
    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

midi_directory = 'data_alltogheter'
output_csv_file = 'data_preprocessed/data.csv'

all_midi_data = process_midi_files(midi_directory)

save_data_to_csv(all_midi_data, output_csv_file)

  midi_data = parse_midi_file(file_path)


Data saved to data_preprocessed/data.csv


# Preliminary Data Exploration and Cleaning

We want to have a preliminary look at the values we have collected to check if we have to already do any preprocessing (missing values, element type does not correspond to what we expect, etc)

In [12]:
root = 'data_preprocessed/'
df1 = pd.read_csv(root + 'data_part1.csv')
df2 = pd.read_csv(root + 'data_part2.csv')
df = pd.concat([df1, df2], ignore_index=True)

print("The columns in the dataset are:")
print(df.columns)
print("The number of values in the dataset is:", len(df))

if (len(df) == len(df1) + len(df2)):
    print("The number of values in the dataset is correct.")


The columns in the dataset are:
Index(['file_name', 'instrument', 'track_names', 'notes', 'chords',
       'velocities', 'durations', 'offsets', 'tempos', 'time_signatures',
       'key_signatures', 'ordered_events'],
      dtype='object')
The number of values in the dataset is: 2775
The number of values in the dataset is correct.


We go through each of the attributes we have collected from the MIDI format:
- Instrument
- Notes
- Chords
- Velocities
- Durations
- Offsets
- Tempos
- Time_signatures
- Key_signatures

In [17]:
import pandas as pd
from collections.abc import Iterable
from collections import Counter
import numpy as np

def is_listlike_column(series):
    """Check if the elements in a column are list-like (but not strings)."""
    return all(isinstance(x, Iterable) and not isinstance(x, str) for x in series.dropna())

def flatten_column(series):
    """Flatten all elements of a list-like column into a single list."""
    return [item for sublist in series.dropna() for item in sublist]

def summarize_feature(series, top_k=5):
    name = series.name
    print(f"\n--- {name.upper()} ---")

    if is_listlike_column(series):
        flat = flatten_column(series)
        if all(isinstance(x, (int, float, np.number)) for x in flat):
            print(f"Total items: {len(flat)}")
            print(f"Mean: {np.mean(flat):.2f}")
            print(f"Std Dev: {np.std(flat):.2f}")
            print(f"Min: {min(flat)}")
            print(f"Max: {max(flat)}")
        else:
            counts = Counter(flat).most_common(top_k)
            print(f"Top {top_k} most common values:")
            for value, count in counts:
                print(f"{value}: {count}")
                
    elif pd.api.types.is_numeric_dtype(series):
        print(f"Mean: {series.mean():.2f}")
        print(f"Std Dev: {series.std():.2f}")
        print(f"Min: {series.min()}")
        print(f"Max: {series.max()}")
        
    elif pd.api.types.is_string_dtype(series) or pd.api.types.is_categorical_dtype(series):
        counts = series.value_counts().head(top_k)
        print("Most common values:")
        print(counts)
        
    else:
        print("Unsupported or empty column.")


for col in df.columns:
    if col not in ['file_name', 'track_names', 'ordered_events']:
        summarize_feature(df[col])



--- INSTRUMENT ---
Most common values:
instrument
['Unknown']    2775
Name: count, dtype: int64

--- NOTES ---
Most common values:
notes
[88, 38, 38, 45, 45, 45, 45, 45, 38, 45, 61, 46, 46, 57, 38, 46, 38, 53, 52, 45, 38, 45, 61, 45, 38, 57, 59, 43, 45, 52, 46, 46, 46, 38, 63, 58, 65, 60, 38, 44, 45, 45, 64, 45, 61, 57, 59, 61, 45, 61, 57, 44, 38, 39, 38, 45, 45, 45, 45, 62, 64, 66, 62, 45, 55, 45, 38, 45, 59, 45, 45, 38, 45, 64, 61, 66, 38, 45, 45, 45, 45, 66, 57, 62, 38, 66, 69, 40, 64, 66, 57, 67, 57, 66, 47, 47, 47, 47, 47, 40, 46, 40, 64, 40, 67, 66, 67, 48, 67, 47, 40, 71, 40, 60, 47, 74, 55, 70, 45, 57, 69, 47, 47, 62, 66, 38, 49, 59, 55, 59, 61, 62, 45, 45, 64, 66, 57, 57, 38, 62, 69, 38, 59, 38, 45, 38, 45, 61, 66, 66, 45, 64, 64, 57, 69, 45, 44, 45, 38, 44, 45, 38, 62, 38, 66, 64, 64, 73, 57, 57, 71, 69, 38, 40, 40, 67, 57, 47, 59, 62, 64, 55, 47, 57, 71, 69, 47, 47, 66, 69, 67, 67, 62, 74, 40, 67, 64, 52, 74, 74, 47, 76, 52, 76, 71, 67, 62, 45, 64, 64, 62, 55, 55, 69, 47, 7

In [21]:
def summarize_feature(series, top_k=5):
    name = series.name
    print(f"--- {name} ---")
    
    if pd.api.types.is_numeric_dtype(series):
        print(f"Mean: {series.mean():.2f}")
        print(f"Std Dev: {series.std():.2f}")
        print(f"Min: {series.min()}")
        print(f"Max: {series.max()}")
    elif pd.api.types.is_string_dtype(series) or pd.api.types.is_categorical_dtype(series):
        counts = series.value_counts().head(top_k)
        print("Most common values:")
        print(counts)
    else:
        print("Summary not available for this data type.")
    print()


for col in df.columns:
    if col not in ['file_name', 'track_names', 'ordered_events']:
        summarize_feature(df[col])

--- instrument ---
Most common values:
instrument
['Unknown']    2775
Name: count, dtype: int64

--- notes ---
Most common values:
notes
[88, 38, 38, 45, 45, 45, 45, 45, 38, 45, 61, 46, 46, 57, 38, 46, 38, 53, 52, 45, 38, 45, 61, 45, 38, 57, 59, 43, 45, 52, 46, 46, 46, 38, 63, 58, 65, 60, 38, 44, 45, 45, 64, 45, 61, 57, 59, 61, 45, 61, 57, 44, 38, 39, 38, 45, 45, 45, 45, 62, 64, 66, 62, 45, 55, 45, 38, 45, 59, 45, 45, 38, 45, 64, 61, 66, 38, 45, 45, 45, 45, 66, 57, 62, 38, 66, 69, 40, 64, 66, 57, 67, 57, 66, 47, 47, 47, 47, 47, 40, 46, 40, 64, 40, 67, 66, 67, 48, 67, 47, 40, 71, 40, 60, 47, 74, 55, 70, 45, 57, 69, 47, 47, 62, 66, 38, 49, 59, 55, 59, 61, 62, 45, 45, 64, 66, 57, 57, 38, 62, 69, 38, 59, 38, 45, 38, 45, 61, 66, 66, 45, 64, 64, 57, 69, 45, 44, 45, 38, 44, 45, 38, 62, 38, 66, 64, 64, 73, 57, 57, 71, 69, 38, 40, 40, 67, 57, 47, 59, 62, 64, 55, 47, 57, 71, 69, 47, 47, 66, 69, 67, 67, 62, 74, 40, 67, 64, 52, 74, 74, 47, 76, 52, 76, 71, 67, 62, 45, 64, 64, 62, 55, 55, 69, 47, 71

In [None]:
def safe_parse_list_column(column_name):
    parsed = []
    for raw in df[column_name].dropna():
        try:
            parsed_list = ast.literal_eval(raw)
            if isinstance(parsed_list, list):
                parsed.extend(parsed_list)
        except Exception as e:
            print(f"Skipping malformed row in {column_name}: {raw}")
    return parsed

notes = safe_parse_list_column("notes")
durations = safe_parse_list_column("durations")
instruments = safe_parse_list_column("instrument")
tempos = safe_parse_list_column("tempos")
keys = safe_parse_list_column("key_signatures")
times = safe_parse_list_column("time_signatures")
velocities = safe_parse_list_column("velocities")

When trying to safely parse all the columns in the dataset we noticed that for the "durations" column there are some anormaly. We have to fix how the data was saved to be able to use this column.

In [11]:
def clean_and_parse_list_column(column_name):
    parsed = []
    for raw in df[column_name].dropna():
        if not isinstance(raw, str):
            continue
        raw = raw.strip()
        if not raw.startswith("[") or not raw.endswith("]"):
            continue
        try:
            raw = re.sub(r'^\[|]$', '', raw)
            items = raw.split(",")
            cleaned_items = []
            for item in items:
                item = item.strip().strip("'\"")
                try:
                    val = float(item)
                    cleaned_items.append(val)
                except ValueError:
                    cleaned_items.append(item)
            parsed.extend(cleaned_items)
        except Exception as e:
            print(f"Skipped malformed row: {raw}")
    return parsed


notes = clean_and_parse_list_column("notes")
durations = clean_and_parse_list_column("durations")
instruments = clean_and_parse_list_column("instrument")
tempos = clean_and_parse_list_column("tempos")
keys = clean_and_parse_list_column("key_signatures")
times = clean_and_parse_list_column("time_signatures")
velocities = clean_and_parse_list_column("velocities")


### Notes

In [13]:
print("Notes:")
print(f"Unique notes: {len(set(notes))}")
print(pd.Series(notes).describe())

Notes:
Unique notes: 88
count    1.831923e+06
mean     6.181524e+01
std      1.393873e+01
min      2.100000e+01
25%      5.200000e+01
50%      6.200000e+01
75%      7.100000e+01
max      1.080000e+02
dtype: float64


### Durations

In [29]:
print("\nDurations:")
print(pd.Series(durations).describe())
print(df["durations"].head())


Durations:
count     4039871
unique        412
top            3)
freq       852548
dtype: object
0    [0.5, 1.0, 0.75, 0.5, 0.75, 0.25, 2.0, 1.25, 1...
1    [0.25, 2.5, Fraction(5, 3), 0.75, 0.75, Fracti...
2    [2.75, 2.5, 2.25, Fraction(5, 3), Fraction(1, ...
3    [2.0, Fraction(1, 3), 1.0, 0.25, 0.75, 1.5, 0....
4    [1.25, 0.5, Fraction(1, 3), 1.0, 2.25, 1.0, Fr...
Name: durations, dtype: object


In [38]:
def convert_duration(value, precision=4):
    if isinstance(value, list):
        return [round(float(v), precision) if isinstance(v, Fraction) else round(v, precision) for v in value]
    elif isinstance(value, Fraction):
        return round(float(value), precision)
    try:
        return round(float(value), precision)
    except:
        return None

def process_durations(durations_column):
    cleaned_durations = []
    for item in durations_column:
        try:
            if isinstance(item, str):
                item = eval(item)
            cleaned_durations.append(convert_duration(item))
        except Exception as e:
            cleaned_durations.append(None)
    return cleaned_durations

df['durations'] = process_durations(df['durations'])

print(df['durations'].head())

0    [0.5, 1.0, 0.75, 0.5, 0.75, 0.25, 2.0, 1.25, 1...
1    [0.25, 2.5, 1.6667, 0.75, 0.75, 1.6667, 0.5, 1...
2    [2.75, 2.5, 2.25, 1.6667, 0.3333, 1.0, 0.6667,...
3    [2.0, 0.3333, 1.0, 0.25, 0.75, 1.5, 0.5, 0.5, ...
4    [1.25, 0.5, 0.3333, 1.0, 2.25, 1.0, 2.3333, 2....
Name: durations, dtype: object


### Instruments

In [15]:
print("\nInstruments:")
print(pd.Series(instruments).value_counts())


Instruments:
Unknown    2093
Name: count, dtype: int64


### Key Signatures

In [16]:
print("\nKey Signatures:")
print(pd.Series(keys).value_counts())


Key Signatures:
    2093
Name: count, dtype: int64


### Time Signatures

In [17]:
print("\nTime Signatures:")
print(pd.Series(times).value_counts())


Time Signatures:
4/4    2093
Name: count, dtype: int64


### Tempos

In [18]:
print("\nTempos:")
print(pd.Series(tempos).describe())


Tempos:
count    2093.0
mean      120.0
std         0.0
min       120.0
25%       120.0
50%       120.0
75%       120.0
max       120.0
dtype: float64


### Velocities

In [19]:
print("\nVelocities:")
print(pd.Series(velocities).describe())


Velocities:
count    3.036504e+06
mean     5.950470e+01
std      1.008162e+01
min      1.500000e+01
25%      5.200000e+01
50%      6.000000e+01
75%      6.700000e+01
max      8.800000e+01
dtype: float64
