# Music Data Preprocessing

## Imports

In [None]:
from pathlib import Path
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='pygame')

from funcs import *

## Defining Constants

In [None]:
# File paths
# NOTE: str(Path.cwd()) has no trailing separator, so one is inserted
# explicitly before "music/". The directory strings keep their trailing "/"
# because they are used as plain string prefixes when file names are built.
PROJECT_PATH = Path.cwd()  # Assumes notebook is run from project root

RAW_DATA_PATH = f"{PROJECT_PATH}/music/Raw MIDI Files/NESMDB/"

CLEANED_DATA_PATH = f"{PROJECT_PATH}/music/cleaned_data/"

# Preprocessing parameters
MIN_TRACK_DURATION = 2.5        # 2.5 seconds
MIN_NOTE_DURATION = 0.0625      # 32nd note at 120 BPM - any smaller is unnecessary!

## Cleaning Functions

In [None]:
# Drop every note whose duration falls below a threshold
def remove_small_notes(df: pd.DataFrame, min_duration: float = MIN_NOTE_DURATION):
    """Return the rows of ``df`` whose "duration" is at least ``min_duration``.

    The surviving rows are renumbered from 0. ``df`` itself is not modified.
    """
    long_enough = df["duration"] >= min_duration
    kept = df.loc[long_enough]
    return kept.reset_index(drop=True)

In [None]:
# Remove "percussion" and "breath noise" tracks
def remove_auxillary_tracks(df: pd.DataFrame):
    """Return ``df`` without rows whose "instrument" is an auxiliary track.

    Rows whose "instrument" value is exactly "Breath Noise" or "Percussion"
    are dropped. The result is renumbered from 0, matching what
    ``remove_small_notes`` does, so the returned frame has no index gaps.
    ``df`` itself is not modified.
    """
    is_auxiliary = df["instrument"].isin(["Breath Noise", "Percussion"])
    return df[~is_auxiliary].reset_index(drop=True)

## Preprocessing Pipeline

In [None]:
#======================================================================
# Main preprocessing pipeline
#======================================================================
# Mirrors the directory tree under RAW_DATA_PATH into CLEANED_DATA_PATH.
# Every ".mid" file that survives the filters below is written out as
# "<name>_cleaned.mid"; each file ends up counted as exactly one of
# successful / skipped / failed, and a summary is printed at the end.

# Establishing Directory Structure

# 1) Ensure destination directory exists
if not os.path.exists(CLEANED_DATA_PATH):
    print(f"Creating root destination path: {CLEANED_DATA_PATH}")
os.makedirs(CLEANED_DATA_PATH, exist_ok=True)

# Count only MIDI files: anything else is ignored by the loop below, so
# counting it would keep the summary rates from adding up to 100%.
total_files = sum(
    1
    for _, _, walked_files in os.walk(RAW_DATA_PATH)
    for walked_name in walked_files
    if walked_name.lower().endswith(".mid")
)
successful = 0
skipped = 0
failed = 0

# 2) Walk through all subdirectories
for root, dirs, files in os.walk(RAW_DATA_PATH):

    # 2.1) Compute relative path from the source root
    rel_path = os.path.relpath(root, RAW_DATA_PATH)

    # 2.2) Construct destination path for this folder
    dst_path = os.path.join(CLEANED_DATA_PATH, rel_path)
    if not os.path.exists(dst_path):
        print(f"Creating destination path: {dst_path}")
    os.makedirs(dst_path, exist_ok=True)

    print(f"Processing subfolder: {rel_path}")

    #------------------------------------------------------------------

    # Cleaning data

    for fname in files:
        if not fname.lower().endswith(".mid"):
            continue

        src_file = os.path.join(root, fname)

        #-- Build destination filename - keep structure intact
        dst_file = os.path.join(
            dst_path,
            os.path.splitext(fname)[0] + "_cleaned.mid"
        )

        # 1) Filter out MIDI files with durations too short
        try:
            midi = pretty_midi.PrettyMIDI(src_file)
        except Exception as e:
            print(f"\033[91mFailed to load MIDI file: {fname}: {str(e)}\033[0m")
            failed += 1
            continue
        if midi.get_end_time() < MIN_TRACK_DURATION:
            print(f"\033[91mSkipping file ({rel_path}): {fname} (duration too short)\033[0m")
            skipped += 1
            continue

        # 2) Load MIDI files to DataFrame
        print(f"Processing file ({rel_path}): {fname}")
        try:
            df = midi_to_notes(src_file)
        except Exception as e:
            print(f"\033[91mFailed to process {fname}: {str(e)}\033[0m")
            failed += 1
            continue

        # 3) Cleaning functions

        #-- 3.1) Skip (remove) empty DataFrames
        if df.empty:
            print(f"\033[91mSkipping empty dataframe for file: {fname}\033[0m")
            skipped += 1
            continue

        #-- 3.2) Remove notes with duration less than a threshold
        df = remove_small_notes(df, MIN_NOTE_DURATION)
        if df.empty: # if the dataframe is empty after trimming:
            print(f"\033[91mRemoving dataframe - no remaining notes after trimming: {fname}\033[0m")
            skipped += 1
            continue

        #-- 3.3) Remove "percussion" and "breath noise" tracks
        df = remove_auxillary_tracks(df)
        if df.empty: # only auxiliary tracks were present - nothing left to save
            print(f"\033[91mRemoving dataframe - no remaining notes after removing auxiliary tracks: {fname}\033[0m")
            skipped += 1
            continue

        # 4) Save DataFrame to MIDI file
        # Guarded like the load/convert steps so that one bad file cannot
        # abort the whole batch.
        try:
            notes_to_midi(df, dst_file)
        except Exception as e:
            print(f"\033[91mFailed to save {fname}: {str(e)}\033[0m")
            failed += 1
            continue
        successful += 1

print("Preprocessing completed.")
print("="*50)
print(f"Total files: {total_files} (Successful: {successful}, Skipped: {skipped}, Failed: {failed})")
if total_files:
    print(f"Success rate: {(successful / (total_files)) * 100:.2f}%")
    print(f"Skipped rate: {(skipped / (total_files)) * 100:.2f}%")
    print(f"Failed rate: {(failed / (total_files)) * 100:.2f}%")
else:
    # Avoid ZeroDivisionError when the source tree holds no MIDI files.
    print(f"No MIDI files found under: {RAW_DATA_PATH}")
print("="*50)


## Testing Cleaned Files

In [None]:
# Pick one song and play it before and after cleaning for a by-ear comparison.
#song_name = "219_Mappy_00_01Main" # Mappy main theme
song_name = "322_SuperMarioBros__00_01RunningAbout" # Mario Bros main theme

# Both files live in the "train" subfolder of their respective trees.
raw_song_file = f"{RAW_DATA_PATH}train/{song_name}.mid"
cleaned_song_file = f"{CLEANED_DATA_PATH}train/{song_name}_cleaned.mid"

# Raw version first, then the cleaned one.
for midi_path in (raw_song_file, cleaned_song_file):
    play_midi(midi_path)