# **Microproyecto 1:** Generación de Notas Musicales

El objetivo de este Notebook es entrenar un modelo de secuencia que, a partir de un contexto de notas previas (`pitch`, `step`, `duration`, y opcionalmente `velocity`), prediga la siguiente nota con las mismas características, siguiendo el estilo de un compositor y un instrumento de los presentes en el dataset.

Presentado por: 
- Manuel Estévez Bretón - me.estevez-breton10@uniandes.edu.co
- Ling Lung Zuñiga - l.lung@uniandes.edu.co
- Victoria Orellana Guerrero - v.orellana@uniandes.edu.co
- Jorge Paternina Montiel - j.paterninam@uniandes.edu.co 
- Benjamin Perdomo Morales - b.perdomom@uniandes.edu.co

## **Notebook Set-up**

In [138]:
# Instalar dependencias
# %pip install -q pretty_midi ipywidgets tqdm

In [139]:
# Importaciones básicas
import os
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import glob

import numpy as np
import pandas as pd

# Visualización
import matplotlib.pyplot as plt
from tqdm import tqdm # Para visualizar barras de progreso

# Audio/MIDI
import pretty_midi

# Deep Learning 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset

# Reproducibilidad
import random

def set_seed(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Python's ``random`` module, NumPy's global generator and PyTorch's
    default generator all receive the same seed. When CUDA is available,
    every CUDA device is seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

print('Librarías importadas correctamente. Torch CUDA:', torch.cuda.is_available())

Librarías importadas correctamente. Torch CUDA: True


## **Carga de Datos**

In [140]:
# Root folder that contains one sub-folder per composer
base_path = '../dataset/music_artist'

# Restrict the dataset to the tracks of a single composer
composer = 'mozart'
composer_path = os.path.join(base_path, composer)
# glob returns paths in arbitrary, filesystem-dependent order. Sorting keeps
# the order in which files are processed (and therefore the positional
# train/test split built from the concatenated notes) reproducible.
midi_files = sorted(glob.glob(os.path.join(composer_path, '*.mid')))

print(f"Se encontraron {len(midi_files)} archivos MIDI para el compositor: '{composer}'")

Se encontraron 21 archivos MIDI para el compositor: 'mozart'


In [141]:
def count_instrument_files(midi_files):
    """
    Count, for every instrument name, how many MIDI files contain it.

    An instrument that appears on several tracks of the same file is counted
    once for that file. Files that cannot be parsed are reported and skipped.

    Args:
        midi_files: iterable of paths to MIDI files.

    Returns:
        collections.Counter mapping instrument name -> number of files.
    """
    from collections import Counter

    files_per_instrument = Counter()

    print("Buscando instrumentos en los archivos MIDI...")
    for midi_path in tqdm(midi_files, desc="Analizando archivos"):
        try:
            pm = pretty_midi.PrettyMIDI(midi_path)
            names_in_file = {
                pretty_midi.program_to_instrument_name(track.program)
                for track in pm.instruments
            }
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole scan.
            print(f"Ocurrió un error menor en el archivo {midi_path}: {e}")
            continue
        files_per_instrument.update(names_in_file)
    return files_per_instrument


def list_available_instruments(midi_files):
    """
    Scan a list of MIDI files and return the sorted list of all unique
    instrument names found in them.
    """
    return sorted(count_instrument_files(midi_files))


instrument_file_counts = count_instrument_files(midi_files)
available_instruments = sorted(instrument_file_counts)
print("\n--- Instrumentos Disponibles ---")
for instrument in available_instruments:
    print(f"{instrument}")

if not available_instruments:
    raise ValueError("No se encontró ningún instrumento en los archivos MIDI.")

# Select the instrument by frequency: the one present in the largest number
# of files. The list of unique names cannot be used for this (every name
# occurs exactly once in it), so the per-file counts are used instead.
# Ties are broken alphabetically so the choice is deterministic.
selected_instrument = min(
    available_instruments,
    key=lambda name: (-instrument_file_counts[name], name),
)
print(f"\nEl instrumento seleccionado es: {selected_instrument}")

Buscando instrumentos en los archivos MIDI...


Analizando archivos: 100%|██████████| 21/21 [00:01<00:00, 14.71it/s]


--- Instrumentos Disponibles ---
Acoustic Grand Piano

El instrumento seleccionado es: Acoustic Grand Piano





In [142]:
def extract_notes_by_instrument(midi_path, target_instrument_name):
    """
    Open a MIDI file and extract the notes of one instrument.

    Only the first track whose General MIDI program name equals
    ``target_instrument_name`` is used. Notes are sorted by start time and
    described by four features:

    - ``pitch``: MIDI pitch of the note.
    - ``step``: time between this note's start and the previous note's start
      (for the first note, its own start time).
    - ``duration``: note end minus note start.
    - ``velocity``: MIDI velocity of the note.

    Args:
        midi_path: path of the MIDI file to read.
        target_instrument_name: instrument name to look for.

    Returns:
        pandas.DataFrame with one row per note, or an empty DataFrame when
        the file cannot be read or contains no matching track.
    """
    try:
        pm = pretty_midi.PrettyMIDI(midi_path)
    except Exception as e:
        # Best effort: report the unreadable file instead of hiding the
        # failure, and let the caller skip it via the empty DataFrame.
        print(f"Ocurrió un error menor en el archivo {midi_path}: {e}")
        return pd.DataFrame()

    instrument_to_extract = next(
        (
            track for track in pm.instruments
            if pretty_midi.program_to_instrument_name(track.program) == target_instrument_name
        ),
        None,
    )
    if instrument_to_extract is None:
        return pd.DataFrame()

    extracted_notes = []
    prev_start = 0.0
    for note in sorted(instrument_to_extract.notes, key=lambda n: n.start):
        extracted_notes.append({
            'pitch': note.pitch,
            'step': note.start - prev_start,
            'duration': note.end - note.start,
            'velocity': note.velocity,
        })
        prev_start = note.start

    return pd.DataFrame(extracted_notes)

# Smoke test: extract the selected instrument from the first MIDI file and
# preview the first rows of the resulting table.
df_test = extract_notes_by_instrument(midi_files[0], selected_instrument) 
print(df_test.head())

   pitch      step  duration  velocity
0     66  0.000000  0.423042        60
1     74  0.000000  0.423042        73
2     69  0.000000  0.423042        60
3     79  0.846084  0.111615        66
4     78  0.101468  0.111615        55


In [143]:
all_composer_notes = []
print(f"\nProcesando todos los archivos de '{composer}'")
for file in tqdm(midi_files, desc="Extrayendo notas"):
    # Extract the instrument that was actually selected. The module-level
    # name `instrument` is only the leftover variable of an earlier printing
    # loop and holds whichever name was printed last.
    notes_df = extract_notes_by_instrument(file, selected_instrument)
    if not notes_df.empty:
        all_composer_notes.append(notes_df)

# Concatenate the per-file DataFrames into a single one
if all_composer_notes:
    full_dataset = pd.concat(all_composer_notes, ignore_index=True)
    print(f"Número total de notas extraídas: {len(full_dataset)}")
    print("Primeras 10 filas del dataset completo:")
    print(full_dataset.head(10))
else:
    print("\nNo se pudieron extraer notas de ningún archivo. Revisar rutas o archivos MIDI.")


Procesando todos los archivos de 'mozart'


Extrayendo notas: 100%|██████████| 21/21 [00:01<00:00, 14.47it/s]

Número total de notas extraídas: 38545
Primeras 10 filas del dataset completo:
   pitch      step  duration  velocity
0     66  0.000000  0.423042        60
1     74  0.000000  0.423042        73
2     69  0.000000  0.423042        60
3     79  0.846084  0.111615        66
4     78  0.101468  0.111615        55
5     76  0.101468  0.111615        54
6     78  0.101468  0.111615        54
7     81  0.101468  0.111615        65
8     79  0.101468  0.112511        55
9     78  0.101468  0.121475        55





**¿Por qué no incluimos un ID de la pista en el dataset?**
Nuestro objetivo es que el modelo aprenda el lenguaje musical de un compositor, no que memorice canciones específicas. Por tal motivo, tratamos toda su obra como un único texto musical. El objetivo es encontrar patrones de su estilo, no sobreajustarnos a una canción. 

## **Dataload**

La música clásica, como la de Mozart, suele estar escrita en compases (8 notas). Por ende, se seleccionarán múltiplos de ese valor para la creación de las secuencias.

Inicialmente, se comenzará con 32 (4 compases), pues con estos se compone una subfrase melódica. Esto, entonces, le da a la red neuronal suficiente contexto, tanto a nivel armónico como a nivel melódico, para aprender los patrones de las composiciones del artista. De no funcionar adecuadamente, se ajustará este valor a 64 o, si no se logra entrenar en un tiempo adecuado, a 16. (Nota: las celdas de código que siguen generan las secuencias con una longitud de 128.)

In [144]:
# (number of notes, number of features) of the concatenated dataset
print(full_dataset.shape)

(38545, 4)


In [145]:
def create_sequences(df, seq_lenght):
    """
    Build sliding-window (context, next-row) pairs from a table of notes.

    Window ``i`` contains rows ``i .. i + seq_lenght - 1`` and its target is
    row ``i + seq_lenght``. The windows are gathered with a single NumPy
    indexing operation instead of one pandas slice per row.

    Args:
        df: DataFrame (or any 2-D array-like) with one row per note.
        seq_lenght: number of consecutive rows in every context window.

    Returns:
        Tuple ``(xs, ys)`` of NumPy arrays where ``xs`` has shape
        ``(n, seq_lenght, n_features)`` and ``ys`` has shape
        ``(n, n_features)``, with ``n = max(len(df) - seq_lenght, 0)``.
        When the input is too short the arrays are empty but keep those
        shapes.

    Raises:
        ValueError: if ``seq_lenght`` is negative or ``df`` is not 2-D.
    """
    if seq_lenght < 0:
        raise ValueError("seq_lenght must be non-negative")

    values = np.asarray(df)
    if values.ndim != 2:
        raise ValueError("df must be two-dimensional (rows x features)")

    n_windows = max(len(values) - seq_lenght, 0)
    # Row r, column c holds the index of the c-th note of window r.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_lenght)[None, :]

    xs = values[window_rows]
    # Copy so the targets never alias the caller's data.
    ys = values[seq_lenght:seq_lenght + n_windows].copy()
    return xs, ys

In [146]:
# Positional 80/20 split without shuffling: the first 80% of the rows are
# used for training and the remaining rows are held out for testing.
train_size_threshold = int(0.8 * len(full_dataset))

train_data = full_dataset.iloc[:train_size_threshold].copy()
test_data = full_dataset.iloc[train_size_threshold:].copy()

# Sanity check: both parts together must account for every row
print(train_data.shape, test_data.shape, sep='\n')
print(len(train_data) + len(test_data))

(30836, 4)
(7709, 4)
38545


In [147]:
# Training windows: 128 consecutive notes as context, the following note as target
X_train, y_train = create_sequences(train_data, 128)
print(X_train.shape, y_train.shape)

(30708, 128, 4) (30708, 4)


In [148]:
# Test windows, built with the same 128-note context as the training windows
X_test, y_test = create_sequences(test_data, 128)
print(X_test.shape, y_test.shape)

(7581, 128, 4) (7581, 4)


In [149]:
# Wrap the training windows and their targets as float tensors
train_inputs = torch.from_numpy(X_train).float()
train_targets = torch.from_numpy(y_train).float()
dataset_train = TensorDataset(train_inputs, train_targets)

In [150]:
# Wrap the test windows and their targets as float tensors
test_inputs = torch.from_numpy(X_test).float()
test_targets = torch.from_numpy(y_test).float()
dataset_test = TensorDataset(test_inputs, test_targets)

## **Arquitectura GRU**

### Instanciamiento

In [None]:
class GRUNet(nn.Module):
    """Stacked GRU that predicts the next note from a window of notes.

    The GRU reads the whole window; the output of its last time step goes
    through dropout and a linear layer that produces the prediction.

    Args:
        input_size: number of features per note in the input window.
        hidden_size: size of the GRU hidden state.
        output_size: number of predicted features.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability applied before the linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=4, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the prediction for a batch ``x`` of shape (batch, seq_len, input_size)."""
        # Zero initial hidden state, created directly on the input's device
        # and with the input's dtype so non-float32 models also work.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_size,
            device=x.device, dtype=x.dtype,
        )

        out, _ = self.gru(x, h0)

        # Keep only the representation of the last time step of each window
        last_step = out[:, -1, :]

        return self.fc(self.dropout(last_step))

### Training loop

In [152]:
# Each note has four features (pitch, step, duration, velocity) and the model
# predicts the same four features for the next note.
INPUT_FEATURES = 4
OUTPUT_FEATURES = 4
HIDDEN_UNITS = 512

model_gru = GRUNet(
    input_size=INPUT_FEATURES,
    hidden_size=HIDDEN_UNITS,
    output_size=OUTPUT_FEATURES
)

# Training hyper-parameters
BATCH_SIZE = 64
LEARNING_RATE = 0.001
NUM_EPOCHS = 15

# Next-note prediction is treated as a regression over all four features
criterion = nn.MSELoss()
optimizer = optim.Adam(model_gru.parameters(), lr=LEARNING_RATE)

dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_gru.to(device)
# Training mode enables the dropout layer
model_gru.train()

print(f'Training start on {device}')


for epoch in range(NUM_EPOCHS):

    # Sum of per-sample losses accumulated over the epoch
    running_loss = 0.0

    for seqs, labels in dataloader_train:
        seqs, labels = seqs.to(device), labels.to(device)

        # Clear the gradients left by the previous batch
        optimizer.zero_grad()

        outputs = model_gru(seqs)

        loss = criterion(outputs, labels)

        loss.backward()

        optimizer.step()

        # The loss is a batch mean; weight it by the batch size so that a
        # smaller final batch does not distort the epoch average.
        running_loss += loss.item() * seqs.size(0)

    epoch_loss = running_loss / len(dataset_train)
    print(f'epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {epoch_loss:.4f}')

print('Training finished')

Training start on cuda
epoch [1/15], Loss: 220.0514
epoch [2/15], Loss: 49.3249
epoch [3/15], Loss: 49.1831
epoch [4/15], Loss: 49.5430
epoch [5/15], Loss: 49.4478
epoch [6/15], Loss: 42.3398
epoch [7/15], Loss: 26.3052
epoch [8/15], Loss: 23.8187
epoch [9/15], Loss: 23.2539
epoch [10/15], Loss: 22.6975
epoch [11/15], Loss: 20.2508
epoch [12/15], Loss: 19.0903
epoch [13/15], Loss: 18.9162
epoch [14/15], Loss: 18.5693
epoch [15/15], Loss: 18.1803
Training finished


### Evaluation loop

In [153]:
from sklearn.metrics import mean_squared_error
# Test batches are iterated in dataset order (no shuffling)
dataloader_test = DataLoader(dataset_test, batch_size=BATCH_SIZE, shuffle=False)

def evaluate_model(model, data_loader, criterion, device):
    """
    Evaluate a model on every batch produced by ``data_loader``.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model (nn.Module): model to evaluate.
        data_loader: iterable yielding ``(inputs, targets)`` tensor batches.
        criterion: loss function returning the mean loss of a batch.
        device (torch.device): device on which the batches are evaluated.

    Returns:
        Tuple ``(avg_loss, rmse)``: the sample-weighted average of
        ``criterion`` and the root mean squared error over all predicted
        values.

    Raises:
        ValueError: if ``data_loader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    squared_error_sum = 0.0
    n_samples = 0
    n_values = 0

    with torch.no_grad():
        for seqs, labels in data_loader:
            seqs, labels = seqs.to(device), labels.to(device)

            outputs = model(seqs)

            # The criterion returns a batch mean; weight it by the batch size.
            batch_size = seqs.size(0)
            total_loss += criterion(outputs, labels).item() * batch_size

            # Accumulate the squared error in float64 on the CPU instead of
            # keeping every prediction in memory.
            diff = (outputs - labels).cpu().double()
            squared_error_sum += diff.pow(2).sum().item()

            n_samples += batch_size
            n_values += labels.numel()

    if n_samples == 0:
        raise ValueError("data_loader did not yield any samples to evaluate")

    # Divide by the number of samples actually seen: it can differ from
    # len(data_loader.dataset) when the loader drops batches or uses a sampler.
    avg_loss = total_loss / n_samples

    # RMSE is the square root of the MSE over all predicted values
    rmse = np.sqrt(squared_error_sum / n_values)

    return avg_loss, rmse

# --- Run Evaluation ---
val_mse_loss, val_rmse = evaluate_model(model_gru, dataloader_test, criterion, device)

print(f"Validation Finished.")
print(f"Average MSE Loss: {val_mse_loss:.4f}")
print(f"Root Mean Squared Error (RMSE): {val_rmse:.4f}")

Validation Finished.
Average MSE Loss: 11.1036
Root Mean Squared Error (RMSE): 3.3322


### **Generación de música**

In [154]:
def generate_notes(model, initial_seed, num_notes_to_generate, device, 
                   max_pitch=127, min_pitch=0, 
                   max_velocity=127, min_velocity=1):
    """
    Generate notes autoregressively with a trained next-note model.

    At every step the model predicts one note from the current context
    window. The prediction is post-processed into valid note parameters,
    stored, and appended to the window while the oldest note is dropped, so
    the window length stays equal to the seed length.

    Args:
        model (nn.Module): model mapping a (1, seq_len, 4) window to a (1, 4)
            prediction.
        initial_seed (torch.Tensor | np.ndarray): starting window of shape
            (seq_len, 4) with features ordered pitch, step, duration, velocity.
        num_notes_to_generate (int): number of notes to generate.
        device (torch.device): device on which the model is run.
        max_pitch, min_pitch: inclusive bounds for the rounded pitch.
        max_velocity, min_velocity: inclusive bounds for the rounded velocity.

    Returns:
        list: one float32 NumPy array of shape (4,) per generated note.

    Raises:
        ValueError: if ``initial_seed`` is not two-dimensional.
    """
    model.eval()  # Disable dropout while generating

    # Accept tensors and NumPy arrays alike and normalise them to float32
    seed = torch.as_tensor(initial_seed, dtype=torch.float32)
    if seed.dim() != 2:
        raise ValueError("initial_seed must have shape (seq_len, n_features)")

    # Add the batch dimension: (1, seq_len, 4)
    current_sequence = seed.unsqueeze(0).to(device)

    generated_notes_list = []

    with torch.no_grad():
        for _ in range(num_notes_to_generate):
            # 1. Prediction: model output has shape (1, 4). It is copied into
            #    an independent float32 array so the in-place edits below
            #    never touch the model's output tensor.
            prediction = model(current_sequence).squeeze(0).cpu().numpy()
            pred_note_np = np.array(prediction, dtype=np.float32)

            # 2. Post-processing: turn continuous predictions into valid
            #    note parameters.
            # a. Pitch and velocity are integers: round, then clip to range.
            pred_note_np[0] = np.clip(np.round(pred_note_np[0]), min_pitch, max_pitch)
            pred_note_np[3] = np.clip(np.round(pred_note_np[3]), min_velocity, max_velocity)

            # b. Step and duration are times in seconds: keep them positive.
            pred_note_np[1] = np.clip(pred_note_np[1], 0.01, None)
            pred_note_np[2] = np.clip(pred_note_np[2], 0.01, None)

            generated_notes_list.append(pred_note_np)

            # 3. Feedback: append the new note (as a (1, 1, 4) tensor) and
            #    drop the oldest note of the window.
            new_note_tensor = torch.from_numpy(pred_note_np).view(1, 1, -1).to(device)
            current_sequence = torch.cat((current_sequence[:, 1:, :], new_note_tensor), dim=1)

    return generated_notes_list

In [155]:
# X_train is the NumPy array of training windows with shape
# (n_windows, seq_len, 4); in the run recorded above it is (30708, 128, 4).
# The indices are chosen far apart to get distinct musical starting points.
seed_index_1 = 0    # First window of the training data
seed_index_2 = 8000 # A window later in the dataset
seed_index_3 = 15000 # Another window much later

seed_1 = X_train[seed_index_1] # shape (seq_len, 4)
seed_2 = X_train[seed_index_2] # shape (seq_len, 4)
seed_3 = X_train[seed_index_3] # shape (seq_len, 4)

# Create a list of seeds
seeds = [seed_1, seed_2, seed_3]

In [156]:
def notes_to_audio_file(generated_notes, output_path, instrument_name='Acoustic Grand Piano'):
    """
    Write a list of generated notes to a MIDI file.

    Despite the function name no audio is rendered: the notes are saved as a
    MIDI file whose path is ``output_path`` with its extension replaced by
    ``.mid``.

    ``step`` is interpreted as in the extracted training data: the time
    between the start of the previous note and the start of this note. Notes
    with a small step therefore overlap, as notes of a chord do.

    Args:
        generated_notes (list): 4-feature sequences (pitch, step, duration,
            velocity), one per note.
        output_path (str): requested output path (e.g. 'song_1.wav').
        instrument_name (str): General MIDI instrument name.

    Returns:
        str: path of the MIDI file that was written.
    """
    try:
        program = pretty_midi.instrument_name_to_program(instrument_name)
    except ValueError:
        print(f"Warning: Instrument '{instrument_name}' not found. Using default.")
        program = pretty_midi.instrument_name_to_program('Acoustic Grand Piano')

    instrument = pretty_midi.Instrument(program=program)

    prev_start = 0.0  # Start time of the previously written note
    for note_features in generated_notes:
        pitch = int(round(float(note_features[0])))
        step = float(note_features[1])
        duration = float(note_features[2])
        velocity = int(round(float(note_features[3])))

        # The step is measured from the previous note's START (not its end),
        # mirroring how the feature is computed when notes are extracted.
        start = prev_start + step
        end = start + duration

        instrument.notes.append(
            pretty_midi.Note(velocity=velocity, pitch=pitch, start=start, end=end)
        )
        prev_start = start

    pm = pretty_midi.PrettyMIDI()
    pm.instruments.append(instrument)

    # Swap only the file extension; a plain substring replacement would also
    # rewrite '.wav' inside directory names and ignore other extensions.
    midi_path = str(Path(output_path).with_suffix('.mid'))
    pm.write(midi_path)
    print(f"MIDI file saved to: {midi_path}")
    return midi_path

In [157]:
# --- Setup ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_NOTES = 200
# Requested output names; notes_to_audio_file saves MIDI files, replacing
# the '.wav' extension with '.mid'.
output_paths = ['generated_song_1.wav', 'generated_song_2.wav', 'generated_song_3.wav']

# `seeds` is the list of NumPy seed windows built in the previous cell:
# seeds = [seed_1, seed_2, seed_3] 

for i, seed_np in enumerate(seeds):
    print(f"\n--- Generating Song {i+1} with Seed {i+1} ---")
    
    # Convert the NumPy seed window to a float PyTorch tensor
    seed_tensor = torch.from_numpy(seed_np).float() 
    
    # 1. Generate the notes autoregressively from the seed
    generated_notes = generate_notes(model_gru, seed_tensor, NUM_NOTES, device)
    
    # 2. Save the generated notes as a MIDI file
    notes_to_audio_file(generated_notes, output_paths[i])

print("\nAll 3 songs generated successfully (MIDI). Check the console for WAV rendering status.")


--- Generating Song 1 with Seed 1 ---
MIDI file saved to: generated_song_1.mid

--- Generating Song 2 with Seed 2 ---
MIDI file saved to: generated_song_2.mid

--- Generating Song 3 with Seed 3 ---
MIDI file saved to: generated_song_3.mid

All 3 songs generated successfully (MIDI). Check the console for WAV rendering status.


## **Arquitectura LSTM (Long Short-Term Memory)**