## Imports

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Reshape, Dropout, LSTM, Bidirectional
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from music21 import converter, instrument, note, chord, stream
from pathlib import Path
import matplotlib.pyplot as plt

2023-12-11 13:16:47.141231: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 13:16:49.376892: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-11 13:16:49.377007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-11 13:16:49.703115: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-11 13:16:50.401839: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-11 13:16:50.403682: I tensorflow/core/platform/cpu_feature_guard.cc:1

## Constants and hyperparameters

In [2]:
def create_midi(prediction_output, filename):
    """ convert the output from the prediction to notes and create a midi file
        from the notes """
    offset = 0
    output_notes = []

    # create note and chord objects based on the values generated by the model
    for item in prediction_output:
        pattern = item[0]
        
        # pattern is a chord
        if ('.' in pattern) or pattern.isdigit():
            notes_in_chord = pattern.split('.')
            notes = []
            for current_note in notes_in_chord:
                new_note = note.Note(int(current_note))
                new_note.storedInstrument = instrument.Piano()
                notes.append(new_note)
            new_chord = chord.Chord(notes)
            new_chord.offset = offset
            output_notes.append(new_chord)
        # pattern is a note
        else:
            new_note = note.Note(pattern)
            new_note.offset = offset
            new_note.storedInstrument = instrument.Piano()
            output_notes.append(new_note)

        # increase offset each iteration so that notes do not stack
        offset += 0.5

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp='{}.mid'.format(filename))

In [9]:
def get_ds():
    """ Get the notes and chords for each midi file in the ./midi_songs directory 
    emotion = "Q1", "Q2", "Q3", 'Q4' """
    
    songs = []
    labels = []

    for file in Path("/workspaces/Transformer_GANs_music_generation/Dataset/midis_emopia/X_train").glob("*.mid"):
        # Extract the label from the filename
        label = file.stem[:2]  

        midi = converter.parse(file)
        notes_to_parse = midi.flat.notes

        song = []
        for element in notes_to_parse:
            if isinstance(element, note.Note):
                song.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                song.append('.'.join(str(n) for n in element.normalOrder))
        
        songs.append(song)
        labels.append(label)

    return songs, labels

def preprocess_data(songs, labels):
    """ Prepare the sequences and labels used by the Neural Network """
    sequence_length = 100

    # get all pitch names
    pitchnames = sorted(set(item for song in songs for item in song))
    # create a dictionary to map pitches to integers
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

    # create a dictionary to map labels to integers
    label_to_int = dict((label, number) for number, label in enumerate(sorted(set(labels))))

    network_input = []
    network_output = []
    network_labels = []

    # create input sequences and the corresponding outputs for each song
    for song, label in zip(songs, labels):
        for i in range(0, len(song) - sequence_length, 1):
            sequence_in = song[i:i + sequence_length]
            sequence_out = song[i + sequence_length]
            network_input.append([note_to_int[char] for char in sequence_in])
            network_output.append(note_to_int[sequence_out])
            network_labels.append(label_to_int[label])

    n_patterns = len(network_input)

    # reshape the input into a format compatible with LSTM layers
    network_input = np.reshape(network_input, (n_patterns, sequence_length))
    # normalize input
    network_input = network_input / float(len(set(pitchnames)))

    network_output = np.array(network_output)
    network_output = keras.utils.to_categorical(network_output)

    network_labels = np.array(network_labels)

    return (network_input, network_output, network_labels)


ds = get_ds()
ds = preprocess_data(*ds)

In [15]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, Reshape, Dropout, LSTM, Bidirectional
from tensorflow.keras.layers import BatchNormalization, LeakyReLU, Flatten, Embedding, multiply, concatenate
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from music21 import converter, instrument, note, chord, stream
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

NUM_LABELS = 4 # number of emotion labels
SEQ_LEN = 100 # length of music sequence
LATENT_DIM = 100 # latent dimension for generator input
BATCH_SIZE = 16
EPOCHS = 100
SAMPLE_INTERVAL = 1

def create_midi(prediction_output, filename):
    """ convert the output from the prediction to notes and create a midi file
        from the notes """
    offset = 0
    output_notes = []

    # create note and chord objects based on the values generated by the model
    for item in prediction_output:
        pattern = str(item)
             
        # pattern is a chord
        if ('.' in pattern) or pattern.isdigit():
            notes_in_chord = pattern.split('.')
            notes = []
            for current_note in notes_in_chord:
                new_note = note.Note(int(current_note))
                new_note.storedInstrument = instrument.Piano()
                notes.append(new_note)
            new_chord = chord.Chord(notes)
            new_chord.offset = offset
            output_notes.append(new_chord)
        # pattern is a note
        else:
            new_note = note.Note(pattern)
            new_note.offset = offset
            new_note.storedInstrument = instrument.Piano()
            output_notes.append(new_note)

        # increase offset each iteration so that notes do not stack
        offset += 0.5

    midi_stream = stream.Stream(output_notes)
    midi_stream.write('midi', fp='{}.mid'.format(filename))



class GAN():
    def __init__(self, rows, choose_emotion_label = lambda: np.random.randint(0, NUM_LABELS)):
        self.seq_length = rows
        self.seq_shape = (self.seq_length, 1)
        self.latent_dim = 100
        self.num_labels = 4
        self.choose_emotion_label = choose_emotion_label
        self.disc_loss = []
        self.gen_loss =[]
        
        optimizer = tf.keras.optimizers.legacy.Adam(0.0002, 0.5)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise and the target label as input
        # and generates the corresponding digit of that label
        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,))
        seq = self.generator([noise, label])

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated sequence as input and determines validity
        valid = self.discriminator([seq, label])

        # The combined model  (stacked generator and discriminator)
        # Trains generator to fool discriminator
        self.combined = Model([noise, label], valid)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)


    def build_discriminator(self):
        model = Sequential()
        model.add(LSTM(512, input_shape=self.seq_shape, return_sequences=True))
        model.add(Bidirectional(LSTM(512)))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.summary()

        seq = Input(shape=self.seq_shape)
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_labels+1, 100)(label))
        flat_seq = Flatten()(seq)
        model_input = multiply([flat_seq, label_embedding])
        validity = model(model_input)

        return Model([seq, label], validity)
    

    def build_generator(self):

        model = Sequential()
        model.add(Dense(256, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(np.prod(self.seq_shape), activation='tanh'))
        model.add(Reshape(self.seq_shape))
        model.summary()
        

        noise = Input(shape=(self.latent_dim,))
        label = Input(shape=(1,), dtype='int32')
        label_embedding = Flatten()(Embedding(self.num_labels+1, 100)(label))
        model_input = multiply([noise, label_embedding])
        seq = model(model_input)

    
        return Model([noise, label], seq)

    def train(self, epochs, batch_size=128, sample_interval=50):

        # Load and convert the data
        X_train = ds[0]
        y_train = ds[2]
        print("X_train shape: ", X_train.shape)
        print("y_train shape: ", y_train.shape)


        # Adversarial ground truths
        real = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        
        # Training the model
        for epoch in range(epochs):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random batch of note sequences
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            seqs = X_train[idx]
            labels = y_train[idx]

            # Generate a batch of new note sequences
            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))
            gen_seqs = self.generator.predict([noise, labels])

            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch([seqs, labels], real)
            d_loss_fake = self.discriminator.train_on_batch([gen_seqs, labels], fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------

            # Select a random batch of note sequences
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            seqs = X_train[idx]
            labels = y_train[idx]


            # Train the generator
            g_loss = self.combined.train_on_batch([noise, labels], real)

            # Plot the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))
            self.disc_loss.append(d_loss[0])
            self.gen_loss.append(g_loss)
            # If at save interval => save generated sequence samples
            if epoch % sample_interval == 0:
                chosen_label = self.choose_emotion_label()
                print("Generating for emotion: ", chosen_label)
                self.generate_for_emotion(seqs, chosen_label, 1, True)

        self.plot_loss()
    def generate_for_emotion(self, input_seqs, input_labels, num_samples=1, train = True):  
        """ Generate a piano midi file based on user chosen emotion and a trained model """
        if train:
           
            # Get pitch names and store in a dictionary
            input_labels = np.array([input_labels])
            notes = input_seqs
            flat_notes = [pitch for item in notes for pitch in item]
            pitchnames = sorted(set(flat_notes))
            int_to_note = dict((number, note) for number, note in enumerate(pitchnames))
            
            # Use random noise to generate sequences
            noise = np.random.normal(0, 1, (1, self.latent_dim))
            predictions = self.generator.predict([noise, input_labels])
            
            pred_notes = [x*242+242 for x in predictions[0]]
            
            # Map generated integer indices to note names, with error handling
            pred_notes_mapped = []
            for x in pred_notes:
                index = int(x)
                if index in int_to_note:
                    pred_notes_mapped.append(int_to_note[index])
                else:
                    # Fallback mechanism: Choose a default note when the index is out of range
                    pred_notes_mapped.append('C5')
            
            # create number of notes to be generated
            self.num_samples = num_samples
            for i in range(self.num_samples):
                create_midi(pred_notes_mapped, 'trained_{}gan_output{}'.format(input_labels[0], i))

        if train == False:
            # Get pitch names and store in a dictionary
            input_labels = np.array([input_labels])
            notes = input_seqs
            flat_notes = [pitch for item in notes for pitch in item]
            pitchnames = sorted(set(flat_notes))
            int_to_note = dict((number, note) for number, note in enumerate(pitchnames))
            
            # use other sequences to generate sequences
            noise = np.random.normal(0, 1, (1, self.latent_dim))
            predictions = self.generator.predict([noise, input_labels])
            
            pred_notes = [x*242+242 for x in predictions[0]]
            
            # Map generated integer indices to note names, with error handling
            pred_notes_mapped = []
            for x in pred_notes:
                index = int(x)
                if index in int_to_note:
                    pred_notes_mapped.append(int_to_note[index])
                else:
                    # Fallback mechanism: Choose a default note when the index is out of range
                    pred_notes_mapped.append('C5')
            
            # create number of notes to be generated
            self.num_samples = num_samples
            for i in range(self.num_samples):
                create_midi(pred_notes_mapped, 'generated_{}gan_output{}'.format(input_labels[0], i))
            return pred_notes_mapped
        
    def plot_loss(self):
        plt.plot(self.disc_loss, c='red')
        plt.plot(self.gen_loss, c='blue')
        plt.title("GAN Loss per Epoch")
        plt.legend(['Discriminator', 'Generator'])
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.savefig('GAN_Loss_per_Epoch_final{}.png'.format(self.choose_emotion_label()))
        plt.close()


    def save(self):
        """ Save the model """
        self.generator.save("generator0.h5")
        self.discriminator.save("discriminator0.h5")
        self.combined.save("combined0.h5")
        self.generator.save_weights("generator_weights0.h5")
        self.discriminator.save_weights("discriminator_weights0.h5")
        self.combined.save_weights("combined_weights0.h5")
        print("Saved model to disk")

if __name__ == '__main__':
    
    gan = GAN(SEQ_LEN, lambda:0)
    gan.train(epochs=EPOCHS, batch_size=BATCH_SIZE, sample_interval=SAMPLE_INTERVAL)
    gan.save()

# generate new musics based on new seq


In [4]:
def get_ds_test():
    """ Get the notes and chords for each midi file in the ./midi_songs directory 
    emotion = "Q1", "Q2", "Q3", 'Q4' """
    
    songs = []
    labels = []

    for file in Path("/workspaces/Transformer_GANs_music_generation/Dataset/midis_emopia/X_test").glob("*.mid"):
        # Extract the label from the filename
        label = file.stem[:2]  

        midi = converter.parse(file)
        notes_to_parse = midi.flat.notes

        song = []
        for element in notes_to_parse:
            if isinstance(element, note.Note):
                song.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                song.append('.'.join(str(n) for n in element.normalOrder))
        
        songs.append(song)
        labels.append(label)

    return songs, labels

def preprocess_data(songs, labels):
    """ Prepare the sequences and labels used by the Neural Network """
    sequence_length = 100

    # get all pitch names
    pitchnames = sorted(set(item for song in songs for item in song))
    # create a dictionary to map pitches to integers
    note_to_int = dict((note, number) for number, note in enumerate(pitchnames))

    # create a dictionary to map labels to integers
    label_to_int = dict((label, number) for number, label in enumerate(sorted(set(labels))))

    network_input = []
    network_output = []
    network_labels = []

    # create input sequences and the corresponding outputs for each song
    for song, label in zip(songs, labels):
        for i in range(0, len(song) - sequence_length, 1):
            sequence_in = song[i:i + sequence_length]
            sequence_out = song[i + sequence_length]
            network_input.append([note_to_int[char] for char in sequence_in])
            network_output.append(note_to_int[sequence_out])
            network_labels.append(label_to_int[label])

    n_patterns = len(network_input)

    # reshape the input into a format compatible with LSTM layers
    network_input = np.reshape(network_input, (n_patterns, sequence_length))
    # normalize input
    network_input = network_input / float(len(set(pitchnames)))

    network_output = np.array(network_output)
    network_output = keras.utils.to_categorical(network_output)

    network_labels = np.array(network_labels)

    return (network_input, network_output, network_labels)


ds_test = get_ds_test()
ds_test = preprocess_data(*ds_test)

  return self.iter().getElementsByClass(classFilterList)


In [17]:
# use trained model to generate music
gan = GAN(SEQ_LEN, lambda:3)
gan.generator.load_weights("generator_weights3.h5")
gan.generate_for_emotion(ds_test[0], 3, 50, False)

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_20 (LSTM)              (None, 100, 512)          1052672   
                                                                 
 bidirectional_10 (Bidirect  (None, 1024)              4198400   
 ional)                                                          
                                                                 
 dense_60 (Dense)            (None, 512)               524800    
                                                                 
 leaky_re_lu_50 (LeakyReLU)  (None, 512)               0         
                                                                 
 dense_61 (Dense)            (None, 256)               131328    
                                                                 
 leaky_re_lu_51 (LeakyReLU)  (None, 256)               0         
                                                     

  index = int(x)


[0.21515892420537897,
 0.044009779951100246,
 0.3080684596577017,
 0.1882640586797066,
 0.6161369193154034,
 0.14425427872860636,
 0.31295843520782396,
 0.49877750611246946,
 0.8410757946210269,
 0.9755501222493888,
 0.03178484107579462,
 'C5',
 'C5',
 0.9682151589242054,
 'C5',
 0.02689486552567237,
 0.019559902200488997,
 0.4449877750611247,
 0.726161369193154,
 0.07090464547677261,
 'C5',
 0.2567237163814181,
 0.3276283618581907,
 0.9951100244498777,
 'C5',
 0.16625916870415647,
 0.07579462102689487,
 0.21271393643031786,
 0.4743276283618582,
 0.8581907090464548,
 0.7530562347188264,
 0.7677261613691931,
 0.13447432762836187,
 'C5',
 0.15892420537897312,
 0.5378973105134475,
 'C5',
 0.5476772616136919,
 0.7628361858190709,
 0.823960880195599,
 0.1100244498777506,
 'C5',
 'C5',
 0.8581907090464548,
 'C5',
 0.6136919315403423,
 0.43765281173594134,
 0.2371638141809291,
 'C5',
 0.9633251833740831,
 0.0,
 0.32273838630806845,
 0.5232273838630807,
 0.034229828850855744,
 0.19315403422982

# Hear some samples

In [21]:
file = converter.parse('/workspaces/Transformer_GANs_music_generation/generated_0gan_output46.mid')
file.show('midi')