# Parsing MIDI functions

In [97]:
# Basic Metadata
def get_midi_basic_metadata(midi, filepath):
    metadata = {
        "genre": filepath_to_genre[filepath],
        "number_of_tracks": len(midi.instruments),
        "duration_seconds": midi.get_end_time(),
        "file_size_kb": os.path.getsize(filepath) / 1024,
    }
    return metadata
    
def get_midi_musical_features(midi, filepath):
    features = {
        "tempo": midi.estimate_tempo(),
        "key_signatures": [str(key) for key in midi.key_signature_changes],
        "time_signatures": [str(ts) for ts in midi.time_signature_changes],
        "pitch_range": (
            min(note.pitch for instrument in midi.instruments for note in instrument.notes),
            max(note.pitch for instrument in midi.instruments for note in instrument.notes),
        ),
    }
    return features
    
    
def get_midi_performance_features(midi, filepath):
    performance = {
        "velocity_range": (
            min(note.velocity for instrument in midi.instruments for note in instrument.notes),
            max(note.velocity for instrument in midi.instruments for note in instrument.notes),
        ),
        "average_velocity": sum(
            note.velocity for instrument in midi.instruments for note in instrument.notes
        ) / sum(len(instrument.notes) for instrument in midi.instruments),
        "note_density": sum(len(instrument.notes) for instrument in midi.instruments) / midi.get_end_time(),
    }
    return performance

def get_midi_structural_features(midi, filepath):
    sections = {
        "time_signature_changes": len(midi.time_signature_changes),
        "tempo_changes": len(midi.get_tempo_changes()[0]),  # Number of tempo changes
        "instrument_usage": {instrument.name: len(instrument.notes) for instrument in midi.instruments},
    }
    return sections


In [None]:
from pretty_midi import PrettyMIDI
# Stores midi file information. Format: { filepath: data }
basic_metadata = {}
musical_features = {}
performance_features = {}
structural_features = {}
dataset_analysis: Dict[str, Dict[str, Any]] = {}

for filepath in filepath_list:
    try:
        midi = PrettyMIDI(filepath)
        basic_metadata[filepath] = get_midi_basic_metadata(midi, filepath)
        musical_features[filepath] = get_midi_musical_features(midi, filepath)
        performance_features[filepath] = get_midi_performance_features(midi, filepath)
        structural_features[filepath] = get_midi_structural_features(midi, filepath)
    except Exception as e:
        print(f"Error processing {filepath}: {e}")
    
    dataset_analysis[filepath] = {
        "basic_metadata": basic_metadata.get(filepath, {}),
        "musical_features": musical_features.get(filepath, {}),
        "performance_features": performance_features.get(filepath, {}),
        "structural_features": structural_features.get(filepath, {}),
    }


Error processing lmd_matched_clean/Hip-Hop/TRDKSKE128F930A28A_5711e0cb2d8d94fa98bcc1a09d71b70a_Hip-Hop.mid: data byte must be in range 0..127
Error processing lmd_matched_clean/Hip-Hop/TRGSSCI128F933D9C1_c9f0f2df33e5cc7623bfd2351654bad4_Hip-Hop.mid: data byte must be in range 0..127
Error processing lmd_matched_clean/Hip-Hop/TRNROKK128F9311248_0a1955f9b201d9d391d1a7ea5f812e7a_Hip-Hop.mid: data byte must be in range 0..127
Error processing lmd_matched_clean/Hip-Hop/TRLZTRZ128F429C1D5_827af6bbff51012a0ea35227aa4f877a_Hip-Hop.mid: Could not decode key with 5 flats and mode 255
Error processing lmd_matched_clean/Hip-Hop/TRDKSKE128F930A28A_123bed39220cac8584d041f39f8ea82e_Hip-Hop.mid: 
Error processing lmd_matched_clean/Hip-Hop/TRWYVBA128F424D7FC_408cc41309896762473442629f574226_Hip-Hop.mid: data byte must be in range 0..127
Error processing lmd_matched_clean/Hip-Hop/TRRMSGD128F425D56B_7d36cb4fb85945912f1919237ba36f2b_Hip-Hop.mid: 
Error processing lmd_matched_clean/Hip-Hop/TRCQDMP128F42483

In [None]:
import json

# Save dataset analysis to a JSON file
output_file: str = "dataset_analysis.json"
with open(output_file, "w") as json_file:
    json.dump(dataset_analysis, json_file, indent=4)

print(f"Dataset analysis saved to {output_file}")

# Step 1: Import Necessary Libraries

In [100]:
# We'll use `pandas` to create and manipulate the DataFrame, and `json` to save the final data in JSON format.
import pandas as pd
import json
from typing import Dict, List, Any


# Step 2: Load and parse the file

In [56]:
file_path = 'msd_genre_classification.cls'  # Replace with your actual file path

# Read the file into a list of dictionaries based on the header format
data = []
with open(file_path, 'r') as file:
    for line in file:
        # Skip comments and empty lines
        if line.startswith('#') or not line.strip():
            continue
        # Split each line into fields
        fields = line.strip().split('\t')

        # Parse the fields into respective columns
        track_id = fields[0]
        seed_genre = fields[1]
        num_labels = int(fields[2])
        labels = fields[3:]

        # Group labels and their strengths
        label_data = {labels[i]: float(labels[i+1]) for i in range(0, len(labels), 2)}

        # Append data as a dictionary
        data.append({
            'track_id': track_id,
            'seed_genre': seed_genre,
            'num_labels': num_labels,
            'labels': label_data
        })

# Convert the parsed data into a DataFrame
df = pd.DataFrame(data)
print("Initial DataFrame:")
print(len(df))
df.head()

Initial DataFrame:
677038


Unnamed: 0,track_id,seed_genre,num_labels,labels
0,TRAAAAK128F9318786,Rock,201,"{'Rock': 0.6766169, 'Metal': 0.09950249, 'Hard..."
1,TRAAAAV128F421A322,Rock,8,"{'Rock': 0.5, 'Punk': 0.5}"
2,TRAAAAW128F429D538,Hip-Hop,133,"{'Hip-Hop': 0.48872182, 'Hip-Hop/Rap': 0.27067..."
3,TRAAAAY128F42A73F0,World,1,{'World': 1.0}
4,TRAAABD128F429CF47,Rock,40,"{'Rock': 0.4, 'Rock/Pop': 0.15, 'Oldies': 0.1,..."


# Step 3: Identify and sort genres by num_labels


In [57]:
# This will allow us to filter genres with sufficient data later.
print(len(df))
genre_counts = df['seed_genre'].value_counts()
num_genres = len(genre_counts)
num_genres_under_50 = (genre_counts < 50).sum()
num_genres_above_10000 = (genre_counts > 10_000).sum()
print(f"Number of genres: {num_genres}")
print(f"Number of genres with counts under 50: {num_genres_under_50}")
print(f"Number of genres with counts above 10,000: {num_genres_above_10000}")
print("Genre Counts:\n", genre_counts)

677038
Number of genres: 6152
Number of genres with counts under 50: 6024
Number of genres with counts above 10,000: 14
Genre Counts:
 Rock                   261242
Pop                     57210
Electronic              38235
Jazz                    37844
Hip-Hop                 31580
                        ...  
General Spoken              1
Reggae/folk/version         1
Classic Ska                 1
Deep Chill House            1
Tanzania                    1
Name: seed_genre, Length: 6152, dtype: int64


# Step 4: Filter categories with low couts or unuseal genres

In [66]:
# Threshold for "big genres"
# 10,000 = 14 Genres , 11,000 = 11 Genres, 13,000 = 9
big_genre_threshold = 10_000

# Count occurrences of each seed genre
genre_counts = df['seed_genre'].value_counts()

# Filter big genres
big_genres = genre_counts[genre_counts > big_genre_threshold].index
big_genres_df = df[df['seed_genre'].isin(big_genres)]

print(f"Number of Big Genres: {len(big_genres)}")
print(f"Filtered DataFrame rows for Big Genres: {len(big_genres_df)}")
print("Sample of Filtered DataFrame:")
print(big_genres_df.head())

# Validate the genre counts
print(f"Total genres: {len(genre_counts)}")
print(f"Genres with counts under 50: {(genre_counts < 50).sum()}")
print(f"Genres with counts above {big_genre_threshold}: {(genre_counts > big_genre_threshold).sum()}")

Number of Big Genres: 14
Filtered DataFrame rows for Big Genres: 574269
Sample of Filtered DataFrame:
             track_id seed_genre  num_labels  \
0  TRAAAAK128F9318786       Rock         201   
1  TRAAAAV128F421A322       Rock           8   
2  TRAAAAW128F429D538    Hip-Hop         133   
3  TRAAAAY128F42A73F0      World           1   
4  TRAAABD128F429CF47       Rock          40   

                                              labels  
0  {'Rock': 0.6766169, 'Metal': 0.09950249, 'Hard...  
1                         {'Rock': 0.5, 'Punk': 0.5}  
2  {'Hip-Hop': 0.48872182, 'Hip-Hop/Rap': 0.27067...  
3                                     {'World': 1.0}  
4  {'Rock': 0.4, 'Rock/Pop': 0.15, 'Oldies': 0.1,...  
Total genres: 6152
Genres with counts under 50: 6024
Genres with counts above 10000: 14


# Step 5: Remove songs with less than 10 user labels

In [75]:
# Filter to include only songs with 10 or more labels
min_song_threshold = 10
big_genres_df_10_plus_songs = big_genres_df[big_genres_df['num_labels'] >= min_song_threshold]
print(f"Filtered DataFrame with songs having 5 or more labels: {big_genres_df_10_plus_songs.shape[0]} songs")

# Remove songs with 'other' as the seed genre (if any)
explicit_genre_exclude = ['other', 'Electronica/Dance', 'Soundtrack']
big_genres_df_final = big_genres_df_10_plus_songs[big_genres_df_10_plus_songs['seed_genre'].isin(explicit_genre_exclude) == False]
print(f"Filtered DataFrame after removing 'other' genre: {big_genres_df_final.shape[0]} songs")

# Display a sample of the filtered DataFrame
print("Sample of the filtered DataFrame:")
print(big_genres_df_final.head())


Filtered DataFrame with songs having 5 or more labels: 339099 songs
Filtered DataFrame after removing 'other' genre: 324807 songs
Sample of the filtered DataFrame:
             track_id seed_genre  num_labels  \
0  TRAAAAK128F9318786       Rock         201   
2  TRAAAAW128F429D538    Hip-Hop         133   
4  TRAAABD128F429CF47       Rock          40   
8  TRAAAED128E0783FAB       Jazz        2227   
9  TRAAAEF128F4273421       Rock         181   

                                              labels  
0  {'Rock': 0.6766169, 'Metal': 0.09950249, 'Hard...  
2  {'Hip-Hop': 0.48872182, 'Hip-Hop/Rap': 0.27067...  
4  {'Rock': 0.4, 'Rock/Pop': 0.15, 'Oldies': 0.1,...  
8  {'Jazz': 0.6897171, 'Pop': 0.061517738, 'Gener...  
9  {'Rock': 0.45303866, 'New Wave': 0.13812155, '...  


# Step 6: Clean the labels for each songs (Remove low probability and unused genres)

In [76]:
# Define a probability threshold for keeping genres
probability_threshold = 0.01

def clean_labels(label_data, threshold, valid_genres):
    """
    Remove low-probability genres from the label dictionary.

    Parameters:
        label_data (dict): Dictionary of genres and their probabilities.
        threshold (float): Minimum probability to retain a genre.
        valid_genres (list[str]): List of valid genres to retain.

    Returns:
        dict: Cleaned label dictionary with high-probability genres.
    """
    return {genre: prob for genre, prob in label_data.items() if prob >= threshold and genre in valid_genres}

# Apply the cleaning function to the 'labels' column
big_genres_df_final['labels'] = big_genres_df_final['labels'].apply(
    clean_labels,
    threshold=probability_threshold,
    valid_genres=list(big_genres)
)

# Remove rows where the cleaned 'labels' dictionary is empty (i.e., no genres meet the threshold)
big_genres_df_final = big_genres_df_final[big_genres_df_final['labels'].apply(bool)]

print(f"DataFrame after cleaning labels: {big_genres_df_final.shape[0]} songs")
print("Sample of cleaned labels:")
print(big_genres_df_final[['track_id', 'labels']].head())


DataFrame after cleaning labels: 312781 songs
Sample of cleaned labels:
             track_id                                   labels
0  TRAAAAK128F9318786                      {'Rock': 0.6766169}
2  TRAAAAW128F429D538                  {'Hip-Hop': 0.48872182}
4  TRAAABD128F429CF47              {'Rock': 0.4, 'Pop': 0.075}
8  TRAAAED128E0783FAB  {'Jazz': 0.6897171, 'Pop': 0.061517738}
9  TRAAAEF128F4273421   {'Rock': 0.45303866, 'Pop': 0.0718232}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Install basic depencies

Setup functions and variables

In [None]:
# Base directory for your dataset
base_dir = "lmd_matched_clean"

# Dictionaries to store mappings
genre_to_filepaths: Dict[str, List[str]] = {}
filepath_to_genre: Dict[str, str] = {}
filepath_list: List[str] = []

# Iterate through the directories and populate mappings
for genre in os.listdir(base_dir):
    genre_folder: str = os.path.join(base_dir, genre)
    if os.path.isdir(genre_folder):
        genre_to_filepaths[genre] = []
        for file in os.listdir(genre_folder):
            if file.endswith(".mid"):
                filepath: str = os.path.join(genre_folder, file)
                genre_to_filepaths[genre].append(filepath)
                filepath_to_genre[filepath] = genre
                filepath_list.append(filepath)



In [95]:
from itertools import islice

small_section = dict(islice(genre_to_filepaths.items(), 10))
print(small_section)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Save midi file data analysis to json

# Save the cleaned data to JSON and .cls file

In [102]:
# We'll save the final DataFrame in two formats as requested.
json_output_path = 'cleaned_data.json'
with open(json_output_path, 'w') as json_file:
    json.dump(big_genres_df_final.to_dict(orient='records'), json_file, indent=2)
print(f"Data saved to {json_output_path}")

# CLS format (tab-separated as per original format)
cls_output_path = 'cleaned_data.cls'
with open(cls_output_path, 'w') as cls_file:
    for _, row in big_genres_df_final.iterrows():
        line = f"{row['track_id']}\t{row['seed_genre']}\t{row['num_labels']}"
        for label, strength in row['labels'].items():
            line += f"\t{label}\t{strength}"
        cls_file.write(line + '\n')
print(f"Data saved to {cls_output_path}")

Data saved to cleaned_data.json


KeyboardInterrupt: 

# Process Original lmd_matched folder and move to lmd_matched_clean (with genres)


In [77]:
import os
import json
import shutil
import pretty_midi

# Paths
aligned_folder = 'lmd_matched'
match_scores_path = 'match_scores.json'
destination_folder = 'lmd_matched_clean'

In [78]:
# Load match scores
with open(match_scores_path, 'r') as f:
    match_scores = json.load(f)

# Initialize metadata storage
metadata_list = []
numOfExceptions: int = 0

# Function to generate directory paths
def msd_id_to_dirs(track_id: str) -> str:
    """Generate the directory structure for a track ID."""
    return os.path.join(track_id[2], track_id[3], track_id[4], track_id)

# Function to get MIDI file path
def get_midi_path(track_id: str, midi_md5: str) -> str:
    """Generate the full path to the MIDI file."""
    return os.path.join(aligned_folder, msd_id_to_dirs(track_id), midi_md5 + '.mid')

# Function to create new filename
def get_cleaned_midi_new_filename(track_id: str, midi_md5: str, genre: str) -> str:
    """Generate the cleaned filename in the format {track_id}_{midi_md5}_{genre}.mid."""
    return f"{track_id}_{midi_md5}_{genre}.mid"


In [79]:
# DON'T RUN UNLESS YOU WANT TO PARSE AND COPY FROM ORIGINIAL DATASET TO THE CLEAN

In [84]:
# Process files
# DON'T RUN UNLESS YOU WANT TO PARSE AND COPY FROM ORIGINAL DATASET TO THE CLEAN
for _, row in big_genres_df_final.iterrows():
    track_id = row['track_id']
    genre = row['seed_genre']
    
    # Check if track_id exists in match_scores
    if track_id in match_scores:
        for midi_md5_filename, score in match_scores[track_id].items():
            if score > 0.5:
                # Locate MIDI file
                try:
                    midi_path = get_midi_path(track_id, midi_md5_filename)
                    if os.path.exists(midi_path):
                        # Copy to genre folder
                        new_midi_filename = get_cleaned_midi_new_filename(track_id, midi_md5_filename, genre)
                        genre_folder = os.path.join(destination_folder, genre)
                        os.makedirs(genre_folder, exist_ok=True)  # Ensure the genre folder exists
                        destination_path = os.path.join(genre_folder, new_midi_filename)
                        shutil.copy(midi_path, destination_path)
                        # print(f"Copied {midi_path} to {destination_path}")
                except Exception as ex:
                    print(f"Error processing file {midi_md5_filename}: {ex}")
                    numOfExceptions += 1

In [86]:
print(numOfExceptions)

185
