Running through all the files

In [None]:
import os

# Root of the dataset; its immediate subdirectories are walked below.
parent_directory = "lmd_matched_clean"

# Walk exactly one level down: each subfolder, then each regular file in it.
for folder in os.listdir(parent_directory):
    folder_path = os.path.join(parent_directory, folder)
    if not os.path.isdir(folder_path):
        # Anything at the top level that is not a directory is ignored.
        continue

    print(f"Entering folder: {folder}")
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            # Nested directories (and other non-files) are not descended into.
            continue
        print(f"Processing file: {filename} in folder: {folder}")
        # Add your file processing logic here


In [None]:
from typing import Dict, List, Any
import os
# Dataset root: each immediate subdirectory name is used as a genre label.
base_dir = "lmd_matched_clean"

# Lookup structures filled in by the scan below:
#   genre_to_filepaths: genre name -> paths of the ".mid" files in that folder
#   filepath_to_genre:  path of a ".mid" file -> genre name of its folder
#   filepath_list:      every collected path, in scan order
genre_to_filepaths: Dict[str, List[str]] = {}
filepath_to_genre: Dict[str, str] = {}
filepath_list: List[str] = []

for genre in os.listdir(base_dir):
    genre_folder: str = os.path.join(base_dir, genre)
    if not os.path.isdir(genre_folder):
        # Stray files next to the genre folders are skipped.
        continue

    # Only names ending in ".mid" (case-sensitive) are kept.
    midi_paths: List[str] = [
        os.path.join(genre_folder, name)
        for name in os.listdir(genre_folder)
        if name.endswith(".mid")
    ]

    genre_to_filepaths[genre] = midi_paths
    filepath_to_genre.update(dict.fromkeys(midi_paths, genre))
    filepath_list.extend(midi_paths)

# save each to json file
# import json
# with open("genre_to_filepaths.json", "w") as f:
#     json.dump(genre_to_filepaths, f)
    
# with open("filepath_to_genre.json", "w") as f:
#     json.dump(filepath_to_genre, f)

# # Save the list of filepaths
# with open("filepath_list.json", "w") as f:
#     json.dump(filepath_list, f)

MIDI file processing functions

In [None]:
# Basic Metadata
def get_midi_basic_metadata(midi, filepath):
    """Collect file-level facts about one MIDI file.

    Args:
        midi: Parsed MIDI object exposing ``instruments`` and
            ``get_end_time()``; the driver loop in this file passes a
            ``PrettyMIDI`` instance.
        filepath: Path of the file on disk. It must be a key of the
            module-level ``filepath_to_genre`` mapping.

    Returns:
        A dict with the keys ``genre``, ``number_of_tracks`` (the number of
        entries in ``midi.instruments``), ``duration_seconds`` (the value of
        ``midi.get_end_time()``) and ``file_size_kb`` (size in bytes / 1024).

    Raises:
        KeyError: If ``filepath`` is not present in ``filepath_to_genre``.
        OSError: If the file size cannot be read from disk.
    """
    genre = filepath_to_genre[filepath]
    track_count = len(midi.instruments)
    duration = midi.get_end_time()
    size_kb = os.path.getsize(filepath) / 1024

    return {
        "genre": genre,
        "number_of_tracks": track_count,
        "duration_seconds": duration,
        "file_size_kb": size_kb,
    }
    
def get_midi_musical_features(midi, filepath):
    """Extract tempo, key/time signatures and pitch range from a MIDI object.

    Args:
        midi: Parsed MIDI object exposing ``estimate_tempo()``,
            ``key_signature_changes``, ``time_signature_changes`` and
            ``instruments`` (each with a ``notes`` sequence whose items have
            a ``pitch``); the driver loop in this file passes a
            ``PrettyMIDI`` instance.
        filepath: Not used here; accepted so that all feature extractors in
            this file share the same ``(midi, filepath)`` signature.

    Returns:
        A dict with the keys ``tempo``, ``key_signatures`` and
        ``time_signatures`` (each change rendered with ``str``), and
        ``pitch_range`` as a ``(lowest, highest)`` tuple over all notes.

    Raises:
        ValueError: If no instrument contains any note, because the pitch
            range is taken with ``min``/``max`` over an empty collection.
            NOTE(review): ``estimate_tempo()`` may raise as well — confirm
            against the pretty_midi documentation.
    """
    tempo = midi.estimate_tempo()
    key_names = list(map(str, midi.key_signature_changes))
    meter_names = list(map(str, midi.time_signature_changes))

    # Flatten every note pitch across all instruments once, then take both
    # extremes from the same list.
    pitches = [
        note.pitch
        for instrument in midi.instruments
        for note in instrument.notes
    ]
    lowest = min(pitches)
    highest = max(pitches)

    return {
        "tempo": tempo,
        "key_signatures": key_names,
        "time_signatures": meter_names,
        "pitch_range": (lowest, highest),
    }
    
    
def get_midi_performance_features(midi, filepath):
    """Summarise note velocities and note density for a MIDI object.

    Args:
        midi: Parsed MIDI object exposing ``instruments`` (each with a
            ``notes`` sequence whose items have a ``velocity``) and
            ``get_end_time()``; the driver loop in this file passes a
            ``PrettyMIDI`` instance.
        filepath: Not used here; accepted so that all feature extractors in
            this file share the same ``(midi, filepath)`` signature.

    Returns:
        A dict with the keys ``velocity_range`` (``(lowest, highest)``),
        ``average_velocity`` (sum of velocities / number of notes) and
        ``note_density`` (number of notes / ``midi.get_end_time()``).

    Raises:
        ValueError: If no instrument contains any note.
        ZeroDivisionError: If notes exist but ``get_end_time()`` returns 0.
    """
    # One flat list of velocities feeds the range, the mean and the count.
    velocities = [
        note.velocity
        for instrument in midi.instruments
        for note in instrument.notes
    ]
    softest = min(velocities)
    loudest = max(velocities)

    note_count = len(velocities)
    mean_velocity = sum(velocities) / note_count
    density = note_count / midi.get_end_time()

    return {
        "velocity_range": (softest, loudest),
        "average_velocity": mean_velocity,
        "note_density": density,
    }

def get_midi_structural_features(midi, filepath):
    """Summarise structural properties of a MIDI object.

    Args:
        midi: Parsed MIDI object exposing ``time_signature_changes``,
            ``get_tempo_changes()`` and ``instruments`` (each with a ``name``
            and a ``notes`` sequence); the driver loop in this file passes a
            ``PrettyMIDI`` instance.
        filepath: Not used here; accepted so that all feature extractors in
            this file share the same ``(midi, filepath)`` signature.

    Returns:
        A dict with the keys:
            ``time_signature_changes``: number of time-signature entries.
            ``tempo_changes``: length of the first array returned by
                ``get_tempo_changes()``.
            ``instrument_usage``: note count per instrument name. When
                several instruments share a name, their counts are added
                together instead of the last one overwriting the others.
    """
    # Accumulate per name: MIDI files often contain several instruments with
    # the same (or an empty) name, and a plain dict comprehension would keep
    # only the last one's note count.
    instrument_usage = {}
    for instrument in midi.instruments:
        instrument_usage[instrument.name] = (
            instrument_usage.get(instrument.name, 0) + len(instrument.notes)
        )

    sections = {
        "time_signature_changes": len(midi.time_signature_changes),
        # NOTE(review): presumably index 0 holds the change times and includes
        # the initial tempo — confirm against the pretty_midi documentation.
        "tempo_changes": len(midi.get_tempo_changes()[0]),
        "instrument_usage": instrument_usage,
    }
    return sections


In [None]:
from pretty_midi import PrettyMIDI
# Per-category result stores, each keyed by file path: { filepath: data }.
basic_metadata = {}
musical_features = {}
performance_features = {}
structural_features = {}
# Combined view: { filepath: { category name: data or {} } }.
dataset_analysis: Dict[str, Dict[str, Any]] = {}

# (category name, result store, extractor) in the order they are run.
feature_stages = (
    ("basic_metadata", basic_metadata, get_midi_basic_metadata),
    ("musical_features", musical_features, get_midi_musical_features),
    ("performance_features", performance_features, get_midi_performance_features),
    ("structural_features", structural_features, get_midi_structural_features),
)

for filepath in filepath_list:
    try:
        midi = PrettyMIDI(filepath)
        # Stages run in order; the first failure skips the remaining stages
        # for this file but keeps whatever was already stored.
        for _, store, extractor in feature_stages:
            store[filepath] = extractor(midi, filepath)
    except Exception as e:
        print(f"Error processing {filepath}: {e}")

    # Every file gets an entry, with {} for any category that was not
    # computed because of an error above.
    dataset_analysis[filepath] = {
        label: store.get(filepath, {}) for label, store, _ in feature_stages
    }
