# Data Preprocessing

This notebook covers the preprocessing steps required to convert raw MIDI files into structured tabular data, which will be used for model training in our music generation project.

### Libraries

In [None]:
import os
import shutil
from pathlib import Path

import pandas as pd
from music21 import chord, converter, key, meter, note, tempo

### Useful functions

To handle large datasets, we define helper functions to load data from split CSVs and optionally save new processed DataFrames back into separate files. This approach keeps the workflow modular and scalable.

In [3]:
def load_dataframe_from_two_csvs(file1, file2):
    """
    Read two CSV files and return their rows stacked into one DataFrame.

    Rows from ``file1`` come first, followed by the rows from ``file2``. The
    original row labels are discarded and the result is re-indexed from 0.
    """
    frames = [pd.read_csv(csv_path) for csv_path in (file1, file2)]
    return pd.concat(frames, ignore_index=True)

def save_dataframe_to_two_csvs(df, file1, file2):
    """
    Write a DataFrame to two CSV files, roughly half of the rows in each.

    The first ``len(df) // 2`` rows go to ``file1`` and the remaining rows go
    to ``file2``, so with an odd row count the extra row lands in ``file2``.
    The index is not written to either file.
    """
    split_at = len(df) // 2
    first_part, second_part = df.iloc[:split_at], df.iloc[split_at:]
    for chunk, target in ((first_part, file1), (second_part, file2)):
        chunk.to_csv(target, index=False)


## Parse MIDI files and save into CSV format

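We copy all MIDI files into a single folder, prefixing each file name with a running id so that files sharing a name in different source folders do not overwrite each other.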

In [None]:
# Folders that hold the raw MIDI files, and the single flat folder they are
# gathered into.
source_dirs = [Path("raw_data/live").resolve(), Path("raw_data/studio").resolve()]
destination_dir = Path("data_alltogheter").resolve()

os.makedirs(destination_dir, exist_ok=True)

# Running counter prefixed to every copied file name, so that files sharing a
# name in different source folders do not overwrite each other.
file_id = 0

for source_dir in source_dirs:
    for root, dirs, files in os.walk(source_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directories in place (which steers the walk) and the file names
        # makes the id given to each file reproducible across runs.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith((".mid", ".midi")):
                source_file = Path(root) / file
                new_filename = f"{file_id}_{file}"
                destination_file = destination_dir / new_filename

                try:
                    shutil.copy2(source_file, destination_file)
                    print(f"Copied: {source_file} → {destination_file}")
                    # Only successful copies consume an id.
                    file_id += 1
                except OSError as e:
                    # Copy failures (permissions, missing file, disk full, ...)
                    # are reported and skipped so one bad file does not stop
                    # the whole batch.
                    print(f"Failed to copy {source_file}: {e}")

print("Done copying MIDI files.")

We parse each MIDI file, extract its musical information, and save the result in CSV format.

In [None]:
def parse_midi_file(file_path):
    """
    Parse a MIDI file and extract relevant musical information.

    Parameters
    ----------
    file_path : str or path-like
        Location of the MIDI file; passed to ``music21.converter.parse``.

    Returns
    -------
    dict
        ``file_name``        base name of ``file_path``
        ``instrument``       one instrument name per part ('Unknown' if unset)
        ``track_names``      one part name per part ('Unnamed Track' if unset)
        ``notes``            MIDI pitch number of every single note
        ``chords``           list of MIDI pitch numbers for every chord
        ``durations``        quarter-length of every note and chord
        ``offsets``          ``.offset`` of every note and chord
        ``velocities``       ``.volume.velocity`` of every note and chord
        ``tempos``           quarter-note BPM of every metronome mark
        ``time_signatures``  ratio string of every time signature
        ``key_signatures``   number of sharps of every key signature

    Notes
    -----
    ``durations``, ``offsets`` and ``velocities`` receive one entry per note
    *and* per chord, in the order they are encountered, while ``notes`` and
    ``chords`` are separate lists. Their lengths therefore differ as soon as a
    file contains both notes and chords.
    NOTE(review): confirm downstream code does not index these lists in
    parallel with ``notes`` or ``chords``.
    """

    score = converter.parse(file_path)

    midi_data = {
        'file_name': os.path.basename(file_path),
        'instrument': [],
        'notes': [],
        'chords': [],
        'velocities': [],
        'durations': [],
        'offsets': [],
        'tempos': [],
        'time_signatures': [],
        'key_signatures': [],
        'track_names': []
    }

    # Score-wide markings. flatten() replaces the deprecated `.flat` property
    # and walks every element of the score as a single flat stream.
    for element in score.flatten():
        if isinstance(element, tempo.MetronomeMark):
            midi_data['tempos'].append(element.getQuarterBPM())
        if isinstance(element, meter.TimeSignature):
            midi_data['time_signatures'].append(element.ratioString)
        if isinstance(element, key.KeySignature):
            midi_data['key_signatures'].append(element.sharps)

    for part in score.parts:
        part_instrument = part.getInstrument().instrumentName or 'Unknown'
        midi_data['instrument'].append(part_instrument)

        track_name = part.partName or 'Unnamed Track'
        midi_data['track_names'].append(track_name)

        for elem in part.recurse().notes:
            if isinstance(elem, note.Note):
                midi_data['notes'].append(elem.pitch.midi)
            elif isinstance(elem, chord.Chord):
                midi_data['chords'].append([p.midi for p in elem.pitches])
            else:
                # Anything that is neither a Note nor a Chord is ignored.
                continue

            # Timing and dynamics are recorded the same way for notes and chords.
            midi_data['durations'].append(elem.quarterLength)
            # NOTE(review): presumably elem.offset here is relative to the
            # element's immediate container rather than to the start of the
            # part; confirm this is the intended reference frame.
            midi_data['offsets'].append(elem.offset)
            midi_data['velocities'].append(elem.volume.velocity)

    return midi_data


def process_midi_files(midi_dir):
    """
    Parse every MIDI file found under ``midi_dir`` (searched recursively).

    Files are recognised by a case-insensitive ``.mid`` / ``.midi`` extension
    and are visited in sorted order, so the order of the returned records is
    the same on every run.

    Parameters
    ----------
    midi_dir : str or path-like
        Root folder to search.

    Returns
    -------
    list of dict
        One ``parse_midi_file`` result per MIDI file; empty if none is found.

    Any exception raised by ``parse_midi_file`` propagates to the caller.
    """
    all_data = []

    for root, dirs, files in os.walk(midi_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directories in place steers the walk, and sorting the file names
        # fixes the order within each folder.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(('.mid', '.midi')):
                file_path = os.path.join(root, file)
                all_data.append(parse_midi_file(file_path))

    return all_data


def save_data_to_csv(all_data, output_file):
    """
    Write the parsed MIDI records to a CSV file.

    Parameters
    ----------
    all_data : list of dict
        Records to save; each dict becomes one row and its keys the columns.
    output_file : str or path-like
        Destination CSV. Missing parent folders are created first.

    The index is not written. A confirmation line is printed once the file
    has been saved.
    """
    df = pd.DataFrame(all_data)

    # DataFrame.to_csv does not create missing folders, so make sure the
    # target folder exists. Only path-like targets are touched; anything else
    # is handed to to_csv unchanged.
    if isinstance(output_file, (str, os.PathLike)):
        Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(output_file, index=False)
    print(f"Data saved to {output_file}")

# Folder that holds the gathered MIDI files, and the CSV file the parsed data
# is written to.
midi_directory = 'data_alltogheter'
output_csv_file = 'data_preprocessed/data.csv'

# Parse every MIDI file found under midi_directory into one dict per file.
all_midi_data = process_midi_files(midi_directory)

# Write the parsed records to output_csv_file, one row per file.
save_data_to_csv(all_midi_data, output_csv_file)

  midi_data = parse_midi_file(file_path)


Data saved to data_preprocessed/data.csv
