In [1]:
import pretty_midi
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import sys
import os
from pathlib import Path

# Import PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

print("Python version")
print(sys.version)
print(sys.version_info)

print(f"\nPytorch version {torch.__version__}")

# Set pytorch to use cuda on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
    print(f"Using device: '{device}' cuda: {torch.version.cuda}")
    # Get the name of the current CUDA device
    gpu_name = torch.cuda.get_device_name(0) # 0 for the first GPU
    print(f"GPU Name: {gpu_name}")
else:
    print("No CUDA-enabled GPU found or PyTorch not configured for CUDA.")
    print(f"Using device: '{device}'")

Python version
3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 12:58:53) [MSC v.1929 64 bit (AMD64)]
sys.version_info(major=3, minor=12, micro=11, releaselevel='final', serial=0)

Pytorch version 2.5.1
Using device: 'cuda' cuda: 12.4
GPU Name: NVIDIA GeForce RTX 4090


In [2]:
dir_name = 'music'

artist_list = []
file_list = {}

directory_path = Path('music')
for file_path in directory_path.iterdir():
    if file_path.is_file():
        print(file_path)
    else:
        artist_list.append(file_path.name)

print(artist_list)

for artist_name in artist_list:
    file_list[artist_name] = []
    directory_path = Path(os.path.join('music', artist_name))
    for file_path in directory_path.iterdir():
        if file_path.is_file():
            if '.mid' in str(file_path):
                file_list[artist_name].append(str(file_path))
        else:
            directory_path_sub = Path(os.path.join('music', artist_name, file_path.name))
            for file_path2 in directory_path_sub.iterdir():
                if file_path2.is_file():
                    if '.mid' in str(file_path2):
                        file_list[artist_name].append(str(file_path2))
                else:
                    directory_path_sub_sub = Path(os.path.join('music', artist_name, file_path.name, file_path2.name))
                    for file_path3 in directory_path_sub_sub.iterdir():
                        if file_path3.is_file():
                            if '.mid' in str(file_path3):
                                file_list[artist_name].append(str(file_path3))
                        else:
                            print(file_path3)

for artist_name in artist_list:
    print(f"{artist_name} number of midi files={len(file_list[artist_name])}")


print(file_list)


['Bach', 'Beethoven', 'Chopin', 'Mozart']
Bach number of midi files=925
Beethoven number of midi files=212
Chopin number of midi files=136
Mozart number of midi files=257
{'Bach': ['music\\Bach\\AveMaria.mid', 'music\\Bach\\Bwv0525 Sonate en trio n1.mid', 'music\\Bach\\Bwv0526 Sonate en trio n2.mid', 'music\\Bach\\Bwv0527 Sonate en trio n3.mid', 'music\\Bach\\Bwv0528 Sonate en trio n4.mid', 'music\\Bach\\Bwv0529 Sonate en trio n5.mid', 'music\\Bach\\Bwv0530 Sonate en trio n6.mid', 'music\\Bach\\Bwv0531 Prelude and Fugue.mid', 'music\\Bach\\Bwv0532 Toccata and Fugue.mid', 'music\\Bach\\Bwv0533 Prelude and Fugue.mid', 'music\\Bach\\Bwv0535 Prelude and Fugue.mid', 'music\\Bach\\Bwv0536 Prelude and Fugue.mid', 'music\\Bach\\Bwv0537 Fantasia and Fugue.mid', "music\\Bach\\Bwv0538 Toccata and Fugue ''Dorian''.mid", 'music\\Bach\\Bwv0539 Prelude and Fugue.mid', 'music\\Bach\\Bwv0540 Toccata and Fugue.mid', 'music\\Bach\\Bwv0541 Prelude and Fugue.mid', 'music\\Bach\\Bwv0542 Fantasia and Fugue.m

In [3]:
# first_file = str(file_list[artist_list[0]][0])

# print(first_file)

In [4]:
# # Load your MIDI file (replace 'your_music.mid' with your file)
# midi_data = pretty_midi.PrettyMIDI(first_file)

# # Determine the sampling frequency for the piano rolls
# fs = 100 # 100 samples per second

# # for instrument in midi_data.instruments:
# #     print(instrument)

# # Get the piano roll for each instrument
# piano_rolls = []
# piano_rolls_end = []
# for instrument in midi_data.instruments:
#     piano_rolls_end.append(instrument.get_end_time())
#     piano_roll = instrument.get_piano_roll(fs=fs)
#     piano_rolls.append(piano_roll)

# max_end = np.max(piano_rolls_end)

# for i in range(len(piano_rolls_end)):
#     if piano_rolls_end[i] < max_end:
#         # Total padding required is the difference between the longest piano_roll and current
#         total_pad = int(fs * (max_end - piano_rolls_end[i]))

#         end_pad = int((max_end - piano_rolls_end[i]) * fs)

#         if total_pad > end_pad:
#             start_pad = total_pad - end_pad
#         else:
#             start_pad = 0
        
#         end_pad_arr = np.zeros((piano_rolls[i].shape[0], end_pad))

#         piano_rolls[i] = np.concatenate((piano_rolls[i], end_pad_arr), axis=1)

#         if start_pad > 0:
#             start_pad_arr = np.zeros((piano_rolls[i].shape[0], start_pad))
#             piano_rolls[i] = np.concatenate((start_pad_arr, piano_rolls[i]), axis=1)




# # for piano_roll in piano_rolls:
# #     print(piano_roll.shape)

# # Stack the piano rolls to create the 3D matrix
# music_matrix_3d = np.stack(piano_rolls, axis=0)

# # The dimensions of music_matrix_3d will be (number_of_instruments, number_of_pitches, number_of_time_steps)
# print(music_matrix_3d.shape)

In [5]:
def midi_to_piano_roll(midi_file_path, fs=100, min_pitch=0, max_pitch=127):
    """
    Converts a MIDI file to a piano roll representation.

    Args:
        midi_file_path (str): Path to the MIDI file.
        fs (int): Sampling frequency (frames per second) for the piano roll.
                  A higher value means finer temporal resolution.
        min_pitch (int): Minimum MIDI pitch to include (0-127).
        max_pitch (int): Maximum MIDI pitch to include (0-127).

    Returns:
        numpy.ndarray: A 2D numpy array representing the piano roll,
                       shape (n_pitches, n_time_steps).
                       Values typically represent velocity (0-127).
                       Returns None if the MIDI file cannot be processed.
    """
    try:
        midi_data = pretty_midi.PrettyMIDI(midi_file_path)
    except Exception as e:
        print(f"Error loading MIDI file {midi_file_path}: {e}")
        return None

    # Get piano roll for all instruments
    piano_roll = midi_data.get_piano_roll(fs=fs)

    # Crop to desired pitch range
    piano_roll = piano_roll[min_pitch:max_pitch+1, :]

    # Normalize velocity to [0, 1] if desired (common for neural networks)
    piano_roll = piano_roll / 127.0

    return piano_roll

In [6]:
def prepare_data_for_lstm(sequence_length, fs=100, min_pitch=21, max_pitch=108,overlap=0):  # A0 to C8, standard piano range
    all_sequences = []
    all_labels = [] # Assuming you have a way to get labels, e.g., from folder names

    n_pitches = max_pitch - min_pitch + 1

    # start label ID at zero
    current_label = 0

    for artist_name in artist_list:
        print(f"Starting {artist_name} with {len(file_list[artist_name])} midi files")
        midi_files = file_list[artist_name]
        
        for midi_file in midi_files:
                # print(type(midi_file))
                # print(midi_file)
                piano_roll = midi_to_piano_roll(str(midi_file), fs=fs, min_pitch=min_pitch, max_pitch=max_pitch)
                if piano_roll is None:
                    continue

                # Transpose piano_roll to (time_steps, pitches) for LSTM
                piano_roll = piano_roll.T

                # Segment the piano roll into sequences
                num_time_steps = piano_roll.shape[0]
                step_size = sequence_length - overlap
                
                for i in range(0, num_time_steps - sequence_length + 1, step_size):
                    sequence = piano_roll[i:i + sequence_length, :]
                    all_sequences.append(sequence)
                    all_labels.append(current_label)

        if not all_sequences:
            print("No sequences processed. Check your MIDI file paths and parameters.")
            return None, None
        # Increment label Id
        current_label += 1

    X = np.array(all_sequences)
    y = np.array(all_labels)

    # # Convert labels to one-hot encoding
    # num_classes = len(label_map)

    print(f"\nTotal sequences: {X.shape[0]}")
    print(f"Input shape (X): {X.shape} (samples, sequence_length, pitches)")
    print(f"Output shape (y): {y.shape} (samples, num_classes)")

    return X, y

In [7]:
# --- Configuration ---
SEQUENCE_LENGTH = 100 # Number of time steps in each input sequence (corresponds to 1 second if fs=100)
FS = 10 # Piano roll sampling frequency (100 frames per second)
MIN_PITCH = 21 # A0
MAX_PITCH = 108 # C8
OVERLAP = 0 # Overlap between sequences to capture more context

# --- 1. Prepare Data ---
print("Preparing data...")
X, y = prepare_data_for_lstm(sequence_length=SEQUENCE_LENGTH, fs=FS, min_pitch=MIN_PITCH, max_pitch=MAX_PITCH, overlap=OVERLAP)

Preparing data...
Starting Bach with 925 midi files




Starting Beethoven with 212 midi files
Error loading MIDI file music\Beethoven\Anhang 14-3.mid: Could not decode key with 3 flats and mode 255
Starting Chopin with 136 midi files
Starting Mozart with 257 midi files
Error loading MIDI file music\Mozart\Piano Sonatas\Nueva carpeta\K281 Piano Sonata n03 3mov.mid: Could not decode key with 2 flats and mode 2

Total sequences: 36637
Input shape (X): (36637, 100, 88) (samples, sequence_length, pitches)
Output shape (y): (36637,) (samples, num_classes)


In [8]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.5, bidirectional=False):
        super(LSTMClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional # Store bidirectional flag

        # LSTM layer with bidirectional option and dropout
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True,
                            dropout=dropout_prob if num_layers > 1 else 0,
                            bidirectional=bidirectional)

        # Dropout layer after the last LSTM output
        self.dropout = nn.Dropout(dropout_prob)

        # The input to the final FC layer doubles if bidirectional is True
        # because the outputs from the forward and backward directions are concatenated.
        fc_input_size = hidden_size * 2 if bidirectional else hidden_size
        self.fc = nn.Linear(fc_input_size, output_size) 

    def forward(self, x):
        # Initialize hidden and cell states
        # For bidirectional LSTM, h0 and c0 size will be (num_layers * num_directions, batch_size, hidden_size)
        num_directions = 2 if self.bidirectional else 1
        h0 = torch.zeros(self.num_layers * num_directions, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * num_directions, x.size(0), self.hidden_size).to(x.device)

        # Forward propagate LSTM
        # 'out' will have shape (batch_size, sequence_length, hidden_size * num_directions)
        out, _ = self.lstm(x, (h0, c0))

        # Take the output from the last time step.
        # For bidirectional, the last output 'out[:, -1, :]' already contains the concatenated
        # last hidden states from both forward and backward directions.
        out = self.dropout(out[:, -1, :])

        # Pass through the final fully connected layer
        out = self.fc(out)
        return out

In [9]:
class CustomDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [10]:
# Hyperparameters
num_layers = 2
hidden_size = 64
input_size = X.shape[2]
output_size = len(artist_list)

# --- Data splitting: Now with train, validation, and test ---
# First split: 70% for training + validation, 30% for test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Second split: 80% of X_temp/y_temp for training, 20% for validation
# This means: 0.7 * 0.8 = 0.56 (56%) training, 0.7 * 0.2 = 0.14 (14%) validation
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)

print(f"Dataset sizes: Train={len(X_train)}, Val={len(X_val)}, Test={len(X_test)}")

# Convert to tensors
X_train = torch.from_numpy(X_train).float().to(device)
y_train = torch.from_numpy(y_train).long().to(device) # Labels as long for CrossEntropyLoss

X_val = torch.from_numpy(X_val).float().to(device)
y_val = torch.from_numpy(y_val).long().to(device) # Labels as long for CrossEntropyLoss

X_test = torch.from_numpy(X_test).float().to(device)
y_test = torch.from_numpy(y_test).long().to(device) # Labels as long for CrossEntropyLoss

# Create DataLoaders
train_dataset = CustomDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Create validation DataLoader
val_dataset = CustomDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False) # No need to shuffle validation data

# Model, Loss, Optimizer
model = LSTMClassifier(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,\
                        output_size=output_size, dropout_prob=0.5, bidirectional=True).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100
print("\n--- Training Model ---")
for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train() # Set model to training mode
    running_loss = 0.0
    for batch_sequences, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_sequences)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_sequences.size(0) # Accumulate loss per sample

    epoch_train_loss = running_loss / len(train_loader.dataset)

    # --- Validation Phase ---
    model.eval() # Set model to evaluation mode
    val_running_loss = 0.0
    correct_val = 0
    total_val = 0
    with torch.no_grad(): # Disable gradient calculation for validation
        for val_sequences, val_labels in val_loader:
            val_outputs = model(val_sequences)
            val_loss = criterion(val_outputs, val_labels)
            val_running_loss += val_loss.item() * val_sequences.size(0)

            _, predicted_val = torch.max(val_outputs.data, 1)
            total_val += val_labels.size(0)
            correct_val += (predicted_val == val_labels).sum().item()

    epoch_val_loss = val_running_loss / len(val_loader.dataset)
    epoch_val_accuracy = 100 * correct_val / total_val

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, '
          f'Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.2f}%')


# --- Evaluation on Test Set (after all epochs) ---
print("\n--- Evaluating Model on Test Set ---")
model.eval() # Set model to evaluation mode
with torch.no_grad():
    correct_test = 0
    total_test = 0
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test).item() # Calculate test loss
    _, predicted_test = torch.max(test_outputs.data, 1)
    total_test += y_test.size(0)
    correct_test += (predicted_test == y_test).sum().item()

    test_accuracy = 100 * correct_test / total_test
    print(f'Test Loss: {test_loss:.4f}')
    print(f'Test Accuracy: {test_accuracy:.2f}%')

Dataset sizes: Train=20516, Val=5129, Test=10992

--- Training Model ---
Epoch 1/100, Train Loss: 1.1739, Val Loss: 1.1204, Val Acc: 49.05%
Epoch 2/100, Train Loss: 1.0863, Val Loss: 1.0336, Val Acc: 55.43%
Epoch 3/100, Train Loss: 1.0637, Val Loss: 1.0394, Val Acc: 54.98%
Epoch 4/100, Train Loss: 1.0319, Val Loss: 1.0022, Val Acc: 58.30%
Epoch 5/100, Train Loss: 1.0290, Val Loss: 0.9980, Val Acc: 56.91%
Epoch 6/100, Train Loss: 0.9877, Val Loss: 0.9173, Val Acc: 61.81%
Epoch 7/100, Train Loss: 0.9248, Val Loss: 0.8786, Val Acc: 63.50%
Epoch 8/100, Train Loss: 0.9063, Val Loss: 0.8620, Val Acc: 64.16%
Epoch 9/100, Train Loss: 0.8691, Val Loss: 0.8274, Val Acc: 66.25%
Epoch 10/100, Train Loss: 0.8274, Val Loss: 0.8002, Val Acc: 66.87%
Epoch 11/100, Train Loss: 0.7963, Val Loss: 0.7562, Val Acc: 69.31%
Epoch 12/100, Train Loss: 0.7610, Val Loss: 0.7485, Val Acc: 68.71%
Epoch 13/100, Train Loss: 0.7314, Val Loss: 0.7461, Val Acc: 68.49%
Epoch 14/100, Train Loss: 0.7034, Val Loss: 0.7119, 

In [11]:
# --- Original Evaluation Section (modified for per-class accuracy) ---
print("\n--- Evaluating Model (Overall and Per-Class) ---")
model.eval() # Set model to evaluation mode (disables dropout, batchnorm etc.)
with torch.no_grad(): # Disable gradient calculation for inference
    correct = 0
    total = 0
    
    # Initialize lists to store correct predictions and total samples for each class
    # The size should be `output_size` (number of classes)
    class_correct = list(0. for i in range(output_size))
    class_total = list(0. for i in range(output_size))

    test_outputs = model(X_test)
    _, predicted = torch.max(test_outputs.data, 1) # Get the class with the highest probability
    
    # Calculate overall accuracy
    total += y_test.size(0)
    correct += (predicted == y_test).sum().item()

    # Calculate per-class accuracy
    # Compare predicted with true labels
    c = (predicted == y_test).squeeze() # c is a boolean tensor indicating correct/incorrect for each sample
    
    # Iterate through the test samples (or use a DataLoader for larger test sets)
    # For simplicity, we are processing the entire X_test here.
    for i in range(y_test.size(0)):
        label = y_test[i].item() # Get the true class label for this sample
        class_correct[label] += c[i].item() # Increment correct count if prediction was right
        class_total[label] += 1 # Increment total count for this class

    overall_accuracy = 100 * correct / total
    print(f'Overall Accuracy on test set: {overall_accuracy:.2f}%')

    print("\nAccuracy per class:")
    for i in range(output_size):
        if class_total[i] > 0:
            # Assuming you have a way to map the class index (i) back to a class name
            # If you used LabelEncoder, you can use label_encoder.inverse_transform([i])[0]
            # For this dummy data, we'll just use the index.
            class_accuracy = 100 * class_correct[i] / class_total[i]
            print(f'  Class {i}: {class_accuracy:.2f}% ({int(class_correct[i])}/{int(class_total[i])})')
        else:
            print(f'  Class {i}: No samples in test set for this class.')


--- Evaluating Model (Overall and Per-Class) ---
Overall Accuracy on test set: 77.44%

Accuracy per class:
  Class 0: 86.37% (3491/4042)
  Class 1: 70.59% (2146/3040)
  Class 2: 69.20% (609/880)
  Class 3: 74.79% (2266/3030)
