In [7]:
import os
import pretty_midi

# Path to the directory containing MIDI files
midi_dir = "data_alltogheter"

# Collect every .mid / .midi file in the directory (case-insensitive match).
# The list is sorted because os.listdir() returns entries in arbitrary order,
# which would make the report order differ between runs and machines.
midi_files = sorted(
    f for f in os.listdir(midi_dir) if f.lower().endswith(('.mid', '.midi'))
)

# Report every non-drum instrument whose program number is not 0
# (program 0 is "Acoustic Grand Piano" in General MIDI).
# Files with nothing to report produce no output at all.
for filename in midi_files:
    filepath = os.path.join(midi_dir, filename)

    try:
        midi_data = pretty_midi.PrettyMIDI(filepath)
        printed_header = False  # Only print filename if non-piano instrument is found

        for instrument in midi_data.instruments:
            # Drum tracks and program-0 tracks are not reported.
            if instrument.is_drum or instrument.program == 0:
                continue

            if not printed_header:
                print(f"\n--- Processing: {filename} ---")
                printed_header = True

            name = instrument.name or "Unnamed"
            program = instrument.program
            instrument_name = pretty_midi.program_to_instrument_name(program)
            print(f"Instrument: {instrument_name}, Program: {program}, Name: {name}, Drum: False")

    except Exception as e:
        # Best-effort scan: a file that cannot be parsed is reported and skipped
        # so one bad file does not abort the whole pass.
        print(f"Error processing {filename}: {e}")


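All the songs contain only one instrument: Acoustic Grand Piano (General MIDI program 0). The check above printed nothing, which means no file has a non-drum track with any other program.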

In [8]:
import pandas as pd
from fractions import Fraction

# Load the preprocessed dataset. This same file is overwritten at the end of
# this cell, after the 'durations' column has been converted.
df = pd.read_csv('data_preprocessed/data.csv')

# Define your conversion functions
def convert_duration(value, precision=4):
    """Convert a duration, or a list of durations, to rounded float(s).

    Args:
        value: A number, ``Fraction``, numeric string, or a list of those.
        precision: Number of decimal places passed to ``round``.

    Returns:
        A list of rounded floats when ``value`` is a list. Otherwise a single
        rounded float, or ``None`` when ``value`` cannot be converted to float.

    Raises:
        TypeError, ValueError, OverflowError: If ``value`` is a list and one of
            its elements cannot be converted to float.
    """
    if isinstance(value, list):
        # float() accepts Fraction, int, float and numeric strings alike, so
        # every element goes through the same conversion as the scalar path.
        return [round(float(v), precision) for v in value]
    try:
        return round(float(value), precision)
    except (TypeError, ValueError, OverflowError):
        # Not convertible to float (None, non-numeric text, ...). Only
        # conversion errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return None

def process_durations(durations_column):
    """Parse and convert every entry of a durations column.

    String entries are evaluated as Python expressions and the result is
    passed to ``convert_duration``. Non-string entries are passed to
    ``convert_duration`` unchanged.

    Args:
        durations_column: Iterable of raw entries (e.g. a DataFrame column).

    Returns:
        A list with one converted entry per input entry. An entry that raises
        during evaluation or conversion is recorded as ``None``.
    """
    cleaned_durations = []
    for item in durations_column:
        try:
            if isinstance(item, str):
                # SECURITY: eval() executes arbitrary Python, so it must never
                # be fed untrusted files. The namespace is limited to
                # ``Fraction`` with no builtins to reduce the blast radius.
                # This is a mitigation, not a sandbox. A fresh namespace per
                # item keeps entries from influencing each other.
                item = eval(item, {"__builtins__": {}, "Fraction": Fraction})
            cleaned_durations.append(convert_duration(item))
        except Exception:
            # Best-effort: an unparsable entry becomes None instead of
            # aborting the whole column.
            cleaned_durations.append(None)
    return cleaned_durations

# Replace the raw 'durations' column with its converted form. Entries that
# process_durations() could not convert come back as None.
df['durations'] = process_durations(df['durations'])

# Save the updated DataFrame back to CSV. NOTE: this overwrites the input file
# in place, so this cell keeps no copy of the original 'durations' text.
df.to_csv('data_preprocessed/data.csv', index=False)

print("Updated durations saved to data_preprocessed/data.csv.")


Updated durations saved to data_preprocessed/data.csv.


In [10]:
import pandas as pd
from fractions import Fraction
import ast
import re

# Load the CSV; this cell later overwrites the same file with the converted
# 'durations' column.
df = pd.read_csv("data_preprocessed/data.csv")

# Step 1: Convert string representations of Fraction objects into real Fraction objects
def safe_eval_fraction(obj):
    """Parse a string holding Python literals and ``Fraction(...)`` calls.

    Accepts text such as ``'Fraction(1, 2)'`` or ``'[Fraction(1, 2), 0.25]'``
    and returns the corresponding objects (``Fraction`` instances, numbers,
    lists, tuples, ...) without executing arbitrary code.

    Args:
        obj: The value to parse. Non-string values are returned unchanged.

    Returns:
        The parsed object, or the original ``obj`` unchanged when it is not a
        string or cannot be parsed.
    """
    if not isinstance(obj, str):
        return obj

    def _convert(node):
        # Walk lists/tuples element by element so a Fraction(...) call may
        # appear at any nesting depth inside them.
        if isinstance(node, (ast.List, ast.Tuple)):
            items = [_convert(element) for element in node.elts]
            return items if isinstance(node, ast.List) else tuple(items)
        # The only call allowed is a plain positional Fraction(a) / Fraction(a, b).
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Name)
            and node.func.id == "Fraction"
            and not node.keywords
            and 1 <= len(node.args) <= 2
        ):
            return Fraction(*(ast.literal_eval(arg) for arg in node.args))
        # Everything else must be a plain literal; literal_eval raises otherwise.
        return ast.literal_eval(node)

    try:
        return _convert(ast.parse(obj.strip(), mode="eval").body)
    except Exception:
        return obj  # Return original if conversion fails

# Step 2: Convert Fraction or numeric types to float with desired precision
def convert_duration(value, precision=4):
    """Convert a duration, or a list of durations, to rounded float(s).

    Args:
        value: A number, ``Fraction``, numeric string, or a list of those.
        precision: Number of decimal places passed to ``round``.

    Returns:
        A list of rounded floats when ``value`` is a list. Otherwise a single
        rounded float, or ``None`` when ``value`` cannot be converted to float.

    Raises:
        TypeError, ValueError, OverflowError: If ``value`` is a list and one of
            its elements cannot be converted to float.
    """
    if isinstance(value, list):
        # float() handles Fraction and plain numbers identically, so no
        # per-type branching is needed.
        return [round(float(v), precision) for v in value]
    try:
        return round(float(value), precision)
    except (TypeError, ValueError, OverflowError):
        # Not convertible to float (None, non-numeric text, ...). Only
        # conversion errors are caught, so KeyboardInterrupt and SystemExit
        # still propagate.
        return None

# Step 3: Full process function
def process_durations(durations_column):
    """Parse and convert every entry of a durations column.

    Each entry is passed through ``safe_eval_fraction`` and then
    ``convert_duration``.

    Args:
        durations_column: Iterable of raw entries (e.g. a DataFrame column).

    Returns:
        A list with one converted entry per input entry. An entry that raises
        during parsing or conversion is recorded as ``None``.
    """
    cleaned_durations = []
    for item in durations_column:
        try:
            parsed_item = safe_eval_fraction(item)
            cleaned_durations.append(convert_duration(parsed_item))
        except Exception:
            # Best-effort per row. ``Exception`` rather than a bare ``except``
            # so that interrupting the kernel stops the loop instead of being
            # recorded as a None row.
            cleaned_durations.append(None)
    return cleaned_durations

# Convert the 'durations' column, then write the frame back to the same CSV
# it was loaded from (the file is overwritten in place, without the index).
df['durations'] = process_durations(df['durations'])
df.to_csv("data_preprocessed/data.csv", index=False)

print("All Fraction strings converted and saved.")


All Fraction strings converted and saved.
