# Data Cleaning and Exploration

Now we want to clean the dataset and explore the values.

### Libraries

In [5]:
from pathlib import Path
import shutil
import os
from music21 import converter, note, chord, instrument, meter, tempo, key
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import ast
import re
from fractions import Fraction

## Data cleaning


### Checking instrument types

In [None]:
import os
import pretty_midi

# Path to the directory containing MIDI files
midi_dir = "data_alltogheter"

# Get list of .mid or .midi files (first 100)
midi_files = [f for f in os.listdir(midi_dir) if f.lower().endswith(('.mid', '.midi'))]

# Iterate over each file
for filename in midi_files:
    filepath = os.path.join(midi_dir, filename)
    
    try:
        midi_data = pretty_midi.PrettyMIDI(filepath)
        printed_header = False  # Only print filename if non-piano instrument is found

        for instrument in midi_data.instruments:
            if not instrument.is_drum and instrument.program != 0:
                if not printed_header:
                    print(f"\n--- Processing: {filename} ---")
                    printed_header = True

                name = instrument.name if instrument.name else "Unnamed"
                program = instrument.program
                instrument_name = pretty_midi.program_to_instrument_name(program)
                print(f"Instrument: {instrument_name}, Program: {program}, Name: {name}, Drum: False")

    except Exception as e:
        print(f"Error processing {filename}: {e}")


### Converting fractions

In [None]:
# Load your CSV
df = pd.read_csv("data_preprocessed/data.csv")

# Step 1: Safe conversion of Fraction(x, y) strings
def safe_eval_fraction(obj):
    try:
        if isinstance(obj, str):
            # Replace all Fraction(x, y) with actual float values
            obj = re.sub(
                r'Fraction\((\d+),\s*(\d+)\)',
                lambda m: str(float(Fraction(int(m.group(1)), int(m.group(2))))),
                obj
            )
            obj = ast.literal_eval(obj)
        return obj
    except Exception as e:
        return None

# Step 2: Convert to rounded float
def convert_duration(value, precision=4):
    try:
        if isinstance(value, list):
            return [round(float(v), precision) for v in value]
        return round(float(value), precision)
    except:
        return None

# Step 3: Full processing function
def process_column(column):
    return [convert_duration(safe_eval_fraction(val)) for val in column]

# Apply to both columns
if 'durations' in df.columns:
    df['durations'] = process_column(df['durations'])

if 'offsets' in df.columns:
    df['offsets'] = process_column(df['offsets'])

# Save once at the end
df.to_csv("data_preprocessed/data.csv", index=False)
print("Updated 'durations' and 'offsets' columns successfully.")


## Preliminary Data Exploration

In [33]:
df = pd.read_csv("data_preprocessed/data.csv")
print("Columns in your CSV file:")
print(df.columns)

Columns in your CSV file:
Index(['file_name', 'instrument', 'notes', 'chords', 'velocities', 'durations',
       'offsets', 'tempos', 'time_signatures', 'key_signatures',
       'track_names'],
      dtype='object')


In [34]:
for i in range(3):
    print(f"Song {i+1} - {df.loc[i, 'file_name']}")
    print("Number of notes:", len(df.loc[i, 'notes']))
    print("Number of chords:", len(df.loc[i, 'chords']))
    print("Number of offsets:", len(df.loc[i, 'offsets']))
    print("Number of durations:", len(df.loc[i, 'durations']))
    print("Example notes and offsets:")
    for n, o in zip(df.loc[i, 'notes'][:5], df.loc[i, 'offsets'][:5]):
        print(f"Note {n} at time {o}")
    print("Example chords:", df.loc[i, 'chords'][:5])
    print('-'*30)


Song 1 - 0_Don't Ask Why - Live At Maybeck Recital Hall  Berkeley, CA.midi
Number of notes: 4906
Number of chords: 9547
Number of offsets: 11513
Number of durations: 12422
Example notes and offsets:
Note [ at time [
Note 4 at time 1
Note 8 at time .
Note , at time 2
Note   at time 5
Example chords: [[44,
------------------------------
Song 2 - 1000_Flight to Jordan.midi
Number of notes: 3012
Number of chords: 5276
Number of offsets: 6903
Number of durations: 7421
Example notes and offsets:
Note [ at time [
Note 5 at time 1
Note 5 at time .
Note , at time 2
Note   at time 5
Example chords: [[36,
------------------------------
Song 3 - 1001_How Deep Is the Ocean.midi
Number of notes: 2741
Number of chords: 7358
Number of offsets: 7060
Number of durations: 7604
Example notes and offsets:
Note [ at time [
Note 3 at time 1
Note 6 at time .
Note , at time 2
Note   at time 5
Example chords: [[51,
------------------------------


In [12]:
def safe_parse_list_column(column_name):
    parsed = []
    for raw in df[column_name].dropna():
        try:
            parsed_list = ast.literal_eval(raw)
            if isinstance(parsed_list, list):
                parsed.extend(parsed_list)
        except Exception as e:
            print(f"Skipping malformed row in {column_name}: {raw}")
    return parsed

notes = safe_parse_list_column("notes")
durations = safe_parse_list_column("durations")
instruments = safe_parse_list_column("instrument")
tempos = safe_parse_list_column("tempos")
keys = safe_parse_list_column("key_signatures")
times = safe_parse_list_column("time_signatures")
velocities = safe_parse_list_column("velocities")

print(f"Notes parsed: {len(notes)}")
print(f"Durations parsed: {len(durations)}")

Notes parsed: 1831923
Durations parsed: 3036504


When trying to safely parse all the columns in the dataset we noticed that for the "durations" column there are some anormaly. We have to fix how the data was saved to be able to use this column.

In [13]:
def clean_and_parse_list_column(df, column_name):
    parsed = []
    for raw in df[column_name].dropna():
        if not isinstance(raw, str):
            continue
        raw = raw.strip()
        if not raw.startswith("[") or not raw.endswith("]"):
            continue
        try:
            raw = re.sub(r'^\[|\]$', '', raw)
            items = raw.split(",")
            cleaned_items = []
            for item in items:
                item = item.strip().strip("'\"")
                try:
                    val = float(item)
                    cleaned_items.append(val)
                except ValueError:
                    cleaned_items.append(item)
            parsed.extend(cleaned_items)
        except Exception as e:
            print(f"Skipped malformed row: {raw}")
    return parsed

# Parse each relevant column
notes = clean_and_parse_list_column(df, "notes")
chords = clean_and_parse_list_column(df, "chords")
durations = clean_and_parse_list_column(df, "durations")
velocities = clean_and_parse_list_column(df, "velocities")
offsets = clean_and_parse_list_column(df, "offsets")
tempos = clean_and_parse_list_column(df, "tempos")
time_signatures = clean_and_parse_list_column(df, "time_signatures")
key_signatures = clean_and_parse_list_column(df, "key_signatures")
instruments = clean_and_parse_list_column(df, "instrument")
track_names = clean_and_parse_list_column(df, "track_names")

# Optional: print out sample sizes
print(f"Notes parsed: {len(notes)}")
print(f"Chords parsed: {len(chords)}")
print(f"Durations parsed: {len(durations)}")
print(f"Velocities parsed: {len(velocities)}")
print(f"Offsets parsed: {len(offsets)}")
print(f"Tempos parsed: {len(tempos)}")
print(f"Time Signatures parsed: {len(time_signatures)}")
print(f"Key Signatures parsed: {len(key_signatures)}")
print(f"Instruments parsed: {len(instruments)}")
print(f"Track Names parsed: {len(track_names)}")


Notes parsed: 1831923
Chords parsed: 3305994
Durations parsed: 3036504
Velocities parsed: 3036504
Offsets parsed: 3036504
Tempos parsed: 2093
Time Signatures parsed: 2093
Key Signatures parsed: 2093
Instruments parsed: 2093
Track Names parsed: 2093


### Notes

In [None]:
print("Notes:")
print(f"Unique notes: {len(set(notes))}")
print(pd.Series(notes).describe())

Notes:
Unique notes: 88
count    1.831923e+06
mean     6.181524e+01
std      1.393873e+01
min      2.100000e+01
25%      5.200000e+01
50%      6.200000e+01
75%      7.100000e+01
max      1.080000e+02
dtype: float64


### Durations

In [None]:
print("\nDurations:")
print(pd.Series(durations).describe())
print(df["durations"].head())


Durations:
count     4039871
unique        412
top            3)
freq       852548
dtype: object
0    [0.5, 1.0, 0.75, 0.5, 0.75, 0.25, 2.0, 1.25, 1...
1    [0.25, 2.5, Fraction(5, 3), 0.75, 0.75, Fracti...
2    [2.75, 2.5, 2.25, Fraction(5, 3), Fraction(1, ...
3    [2.0, Fraction(1, 3), 1.0, 0.25, 0.75, 1.5, 0....
4    [1.25, 0.5, Fraction(1, 3), 1.0, 2.25, 1.0, Fr...
Name: durations, dtype: object


In [None]:
def convert_duration(value, precision=4):
    if isinstance(value, list):
        return [round(float(v), precision) if isinstance(v, Fraction) else round(v, precision) for v in value]
    elif isinstance(value, Fraction):
        return round(float(value), precision)
    try:
        return round(float(value), precision)
    except:
        return None

def process_durations(durations_column):
    cleaned_durations = []
    for item in durations_column:
        try:
            if isinstance(item, str):
                item = eval(item)
            cleaned_durations.append(convert_duration(item))
        except Exception as e:
            cleaned_durations.append(None)
    return cleaned_durations

df['durations'] = process_durations(df['durations'])

print(df['durations'].head())

0    [0.5, 1.0, 0.75, 0.5, 0.75, 0.25, 2.0, 1.25, 1...
1    [0.25, 2.5, 1.6667, 0.75, 0.75, 1.6667, 0.5, 1...
2    [2.75, 2.5, 2.25, 1.6667, 0.3333, 1.0, 0.6667,...
3    [2.0, 0.3333, 1.0, 0.25, 0.75, 1.5, 0.5, 0.5, ...
4    [1.25, 0.5, 0.3333, 1.0, 2.25, 1.0, 2.3333, 2....
Name: durations, dtype: object


### Instruments

In [None]:
print("\nInstruments:")
print(pd.Series(instruments).value_counts())


Instruments:
Unknown    2093
Name: count, dtype: int64


### Key Signatures

In [None]:
print("\nKey Signatures:")
print(pd.Series(keys).value_counts())


Key Signatures:
    2093
Name: count, dtype: int64


### Time Signatures

In [None]:
print("\nTime Signatures:")
print(pd.Series(times).value_counts())


Time Signatures:
4/4    2093
Name: count, dtype: int64


### Tempos

In [None]:
print("\nTempos:")
print(pd.Series(tempos).describe())


Tempos:
count    2093.0
mean      120.0
std         0.0
min       120.0
25%       120.0
50%       120.0
75%       120.0
max       120.0
dtype: float64


### Velocities

In [None]:
print("\nVelocities:")
print(pd.Series(velocities).describe())


Velocities:
count    3.036504e+06
mean     5.950470e+01
std      1.008162e+01
min      1.500000e+01
25%      5.200000e+01
50%      6.000000e+01
75%      6.700000e+01
max      8.800000e+01
dtype: float64
