Develop and train a deep neural network to generate music sequences using the MAESTRO Dataset. The
model should be capable of composing piano music that mimics the style of the training data. Load and
process the MIDI files, analyze their structure, and convert them into piano roll or token-based sequences.
Split the dataset into Training, Validation, and Test sets, apply scaling and normalization, and optionally
use data augmentation techniques. Design a deep learning model using LSTM, GRU, or Transformer-based
architectures, train and evaluate it, and analyze Loss, Accuracy, and the quality of generated music. Finally,
discuss model performance, music generation quality, challenges, and potential improvements.

In [None]:
!pip install pretty_midi
!pip install music21
!pip install pandas
!pip install tensorflow
!pip install scikit-learn
!pip install numpy

Collecting pretty_midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty_midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty_midi
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty_midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592287 sha256=0a7e927d97afbef0804fef4d7f3bc32c0a69bcbc02b5d44fcd41d658103644ad
  Stored in directory: /root/.cache/pip/wheels/e6/95/ac/15ceaeb2823b04d8e638fd1495357adb8d26c00ccac9d7782e
Successfully built pretty_midi
Installing collected packages: mido, pretty_midi
Successf

In [None]:
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Package 'libfluidsynth1' has no installation candidate


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf

# Check TensorFlow version and available devices
print("TensorFlow version:", tf.__version__)
print("Available devices:", tf.config.list_physical_devices())

# Force CPU usage if GPU is causing issues
tf.config.set_visible_devices([], 'GPU')

TensorFlow version: 2.18.0
Available devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pretty_midi
from sklearn.model_selection import train_test_split

class SimpleMusicGenerator:
    def __init__(self, data_path, sequence_length=32):
        """
        Initialize the music generator

        Args:
            data_path (str): Path to MAESTRO dataset
            sequence_length (int): Length of sequences for training
        """
        self.data_path = data_path
        self.sequence_length = sequence_length
        self.pitch_range = 88  # Standard piano range
        self.time_step = 0.25  # Quarter note time step

    def load_midi(self, midi_path):
        """Load and process MIDI file"""
        try:
            midi_data = pretty_midi.PrettyMIDI(midi_path)

            # Get piano roll with a specific time step
            piano_roll = midi_data.get_piano_roll(fs=1/self.time_step)

            # Trim to standard piano range (21-108)
            piano_roll = piano_roll[21:109, :]

            # Normalize and convert to binary (note on/off)
            piano_roll = (piano_roll > 0).astype(np.float32)

            return piano_roll.T  # Return time x pitch

        except Exception as e:
            print(f"Error loading {midi_path}: {e}")
            return None

    def prepare_sequences(self, piano_roll):
        """Create training sequences from piano roll"""
        sequences = []
        next_notes = []

        for i in range(0, len(piano_roll) - self.sequence_length, 1):
            sequence = piano_roll[i:i + self.sequence_length]
            next_note = piano_roll[i + self.sequence_length]
            sequences.append(sequence)
            next_notes.append(next_note)

        return np.array(sequences), np.array(next_notes)

    def create_dataset(self):
        """Create training dataset from MIDI files"""
        all_sequences = []
        all_next_notes = []

        # Load metadata
        metadata = pd.read_csv(os.path.join(self.data_path, 'maestro-v3.0.0.csv'))

        # Process first 100 files (for faster training)
        for idx, row in metadata.head(100).iterrows():
            midi_path = os.path.join(self.data_path, row['midi_filename'])
            if os.path.exists(midi_path):
                piano_roll = self.load_midi(midi_path)
                if piano_roll is not None:
                    sequences, next_notes = self.prepare_sequences(piano_roll)
                    all_sequences.extend(sequences)
                    all_next_notes.extend(next_notes)
                    print(f"Processed file {idx + 1}/100")

        # Convert to numpy arrays
        X = np.array(all_sequences)
        y = np.array(all_next_notes)

        # Split dataset
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        return (X_train, y_train), (X_val, y_val)

    def build_model(self):
        """Create the model architecture"""
        model = keras.Sequential([
            # Input layer
            layers.Input(shape=(self.sequence_length, self.pitch_range)),

            # LSTM layers
            layers.LSTM(256, return_sequences=True),
            layers.Dropout(0.3),
            layers.LSTM(256),
            layers.Dropout(0.3),

            # Output layer
            layers.Dense(self.pitch_range, activation='sigmoid')
        ])

        return model

    def train(self, epochs=50, batch_size=64):
        """Train the model"""
        print("Preparing dataset...")
        (X_train, y_train), (X_val, y_val) = self.create_dataset()

        print("Building model...")
        self.model = self.build_model()

        # Compile model
        self.model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        # Callbacks
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=5,
                restore_best_weights=True
            ),
            keras.callbacks.ModelCheckpoint(
                'best_model.h5',
                save_best_only=True
            )
        ]

        # Train
        print("Training model...")
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks
        )

        return history

    def generate(self, seed_sequence, length=128, temperature=1.0):
        """Generate new music sequence"""
        generated = seed_sequence.copy()

        for _ in range(length):
            # Prepare input
            sequence = generated[-self.sequence_length:]

            # Get prediction
            pred = self.model.predict(sequence[np.newaxis, ...], verbose=0)[0]

            # Apply temperature
            pred = np.log(pred + 1e-7) / temperature
            pred = np.exp(pred) / np.sum(np.exp(pred))

            # Sample from prediction
            new_note = (pred > 0.5).astype(np.float32)

            # Add to generated sequence
            generated = np.vstack([generated, new_note])

        return generated

    def save_midi(self, piano_roll, output_path):
        """Save piano roll as MIDI file"""
        pm = pretty_midi.PrettyMIDI()
        piano = pretty_midi.Instrument(program=0)  # Acoustic Grand Piano

        # Convert piano roll to notes
        for time_idx, step in enumerate(piano_roll):
            for note_idx, is_active in enumerate(step):
                if is_active:
                    note = pretty_midi.Note(
                        velocity=64,
                        pitch=note_idx + 21,  # Adjust for piano range
                        start=time_idx * self.time_step,
                        end=(time_idx + 1) * self.time_step
                    )
                    piano.notes.append(note)

        pm.instruments.append(piano)
        pm.write(output_path)

# Example usage
def main():

    # Initialize generator
    generator = SimpleMusicGenerator('drive/MyDrive/maestro_dataset')

    # Train model
    history = generator.train(epochs=50)

    # Generate music
    seed = generator.create_dataset()[0][0][0]  # Get a seed sequence
    generated = generator.generate(seed, length=128)

    # Save as MIDI
    generator.save_midi(generated, 'generated_music.midi')

if __name__ == "__main__":
    main()

Preparing dataset...
Processed file 1/100
Processed file 2/100
Processed file 3/100
Processed file 4/100
Processed file 5/100
Processed file 6/100
Processed file 7/100
Processed file 8/100
Processed file 9/100
Processed file 10/100
Processed file 11/100
Processed file 12/100
Processed file 13/100
Processed file 14/100
Processed file 15/100
Processed file 16/100
Processed file 17/100
Processed file 18/100
Processed file 19/100
Processed file 20/100
Processed file 21/100
Processed file 22/100
Processed file 23/100
Processed file 24/100
Processed file 25/100
Processed file 26/100
Processed file 27/100
Processed file 28/100
Processed file 29/100
Processed file 30/100
Processed file 31/100
Processed file 32/100
Processed file 33/100
Processed file 34/100
Processed file 35/100
Processed file 36/100
Processed file 37/100
Processed file 38/100
Processed file 39/100
Processed file 40/100
Processed file 41/100
Processed file 42/100
Processed file 43/100
Processed file 44/100
Processed file 45/10



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 12ms/step - accuracy: 0.0489 - loss: 0.1627 - val_accuracy: 0.1247 - val_loss: 0.0884
Epoch 2/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1360 - loss: 0.0931



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 11ms/step - accuracy: 0.1360 - loss: 0.0931 - val_accuracy: 0.1574 - val_loss: 0.0761
Epoch 3/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.1562 - loss: 0.0841 - val_accuracy: 0.1726 - val_loss: 0.0763
Epoch 4/50
[1m2503/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1603 - loss: 0.0831



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 12ms/step - accuracy: 0.1603 - loss: 0.0831 - val_accuracy: 0.1760 - val_loss: 0.0701
Epoch 5/50
[1m2499/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1687 - loss: 0.0793



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 11ms/step - accuracy: 0.1687 - loss: 0.0793 - val_accuracy: 0.1791 - val_loss: 0.0700
Epoch 6/50
[1m2500/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1698 - loss: 0.0803



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1698 - loss: 0.0803 - val_accuracy: 0.1720 - val_loss: 0.0699
Epoch 7/50
[1m2501/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1759 - loss: 0.0788



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1759 - loss: 0.0788 - val_accuracy: 0.1814 - val_loss: 0.0687
Epoch 8/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1788 - loss: 0.0778 - val_accuracy: 0.1791 - val_loss: 0.0690
Epoch 9/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1796 - loss: 0.0756



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 11ms/step - accuracy: 0.1796 - loss: 0.0756 - val_accuracy: 0.1843 - val_loss: 0.0668
Epoch 10/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1815 - loss: 0.0738



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1815 - loss: 0.0738 - val_accuracy: 0.1948 - val_loss: 0.0663
Epoch 11/50
[1m2501/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1831 - loss: 0.0729



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1831 - loss: 0.0729 - val_accuracy: 0.1928 - val_loss: 0.0659
Epoch 12/50
[1m2499/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1846 - loss: 0.0719



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1846 - loss: 0.0719 - val_accuracy: 0.1921 - val_loss: 0.0655
Epoch 13/50
[1m2502/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1860 - loss: 0.0718



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1860 - loss: 0.0718 - val_accuracy: 0.1964 - val_loss: 0.0655
Epoch 14/50
[1m2500/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1863 - loss: 0.0705



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.1863 - loss: 0.0705 - val_accuracy: 0.2038 - val_loss: 0.0650
Epoch 15/50
[1m2501/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1860 - loss: 0.0702



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 12ms/step - accuracy: 0.1860 - loss: 0.0702 - val_accuracy: 0.1915 - val_loss: 0.0649
Epoch 16/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 11ms/step - accuracy: 0.1874 - loss: 0.0695 - val_accuracy: 0.1999 - val_loss: 0.0650
Epoch 17/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1876 - loss: 0.0694



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 11ms/step - accuracy: 0.1876 - loss: 0.0694 - val_accuracy: 0.2063 - val_loss: 0.0647
Epoch 18/50
[1m2503/2504[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - accuracy: 0.1897 - loss: 0.0685



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1897 - loss: 0.0685 - val_accuracy: 0.2052 - val_loss: 0.0644
Epoch 19/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1905 - loss: 0.0682 - val_accuracy: 0.1970 - val_loss: 0.0645
Epoch 20/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1913 - loss: 0.0675 - val_accuracy: 0.1998 - val_loss: 0.0649
Epoch 21/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1908 - loss: 0.0671



[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.1908 - loss: 0.0671 - val_accuracy: 0.1951 - val_loss: 0.0643
Epoch 22/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 12ms/step - accuracy: 0.1900 - loss: 0.0661 - val_accuracy: 0.2030 - val_loss: 0.0645
Epoch 23/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 11ms/step - accuracy: 0.1917 - loss: 0.0655 - val_accuracy: 0.2108 - val_loss: 0.0645
Epoch 24/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 12ms/step - accuracy: 0.1925 - loss: 0.0656 - val_accuracy: 0.1945 - val_loss: 0.0648
Epoch 25/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 11ms/step - accuracy: 0.1947 - loss: 0.0648 - val_accuracy: 0.2194 - val_loss: 0.0648
Epoch 26/50
[1m2504/2504[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accura