# Full data preprocessing for C-GAN

In [32]:
# Traditional imports
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import pandas as pd

# Music and image imports
from imageio import imwrite
from music21 import converter, instrument, note, chord, converter
from PIL import Image, ImageOps

## Converting midi file to images and vice versa

### From midi file to image

In [2]:
# Intermediary function
def extractNote(element):
    return int(element.pitch.ps)

In [3]:
# Intermediary function
def extractDuration(element):
    return element.duration.quarterLength

In [4]:
# Intermediary function
def get_notes(notes_to_parse):

    """
    Get all the notes and chords from the midi files into a dictionary containing:
        - Start: unit time at which the note starts playing
        - Pitch: pitch of the note
        - Duration: number of time units the note is played for
    """
    durations = []
    notes = []
    start = []

    for element in notes_to_parse:
        if isinstance(element, note.Note):
            if element.isRest:
                continue

            start.append(element.offset)
            notes.append(extractNote(element))
            durations.append(extractDuration(element))

        elif isinstance(element, chord.Chord):
            if element.isRest:
                continue
            for chord_note in element:
                start.append(element.offset)
                durations.append(extractDuration(element))
                notes.append(extractNote(chord_note))

    return {"start":start, "pitch":notes, "dur":durations}

In [5]:
def midi2image(midi_path, output_folder_path, max_repetitions = float("inf"), resolution = 0.25, lowerBoundNote = 21, upperBoundNote = 127, maxSongLength = 106):

    """
    1) Transform a midi file into a set of images:
        - Each image has a size of 106 (all notes between lowerBound and upperBound) x 106 time units (maxSongLength)
        - One time unit corresponds to 0.25 (resolution) beat from the original music
    2) Store images into the corresponding sub-folder (identified by music piece name) of the 'output_folder_path' folder
    """

    output_folder = f"{output_folder_path}{midi_path.split('/')[-1].replace('.mid', '')}"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    mid = converter.parse(midi_path)

    instruments = instrument.partitionByInstrument(mid)

    data = {}

    try:
        i=0
        for instrument_i in instruments.parts:
            notes_to_parse = instrument_i.recurse()

            notes_data = get_notes(notes_to_parse)
            if len(notes_data["start"]) == 0:
                continue

            if instrument_i.partName is None:
                data["instrument_{}".format(i)] = notes_data
                i+=1
            else:
                data[instrument_i.partName] = notes_data

    except:
        notes_to_parse = mid.flat.notes
        data["instrument_0"] = get_notes(notes_to_parse)

    for instrument_name, values in data.items():

        pitches = values["pitch"]
        durs = values["dur"]
        starts = values["start"]

        index = 0
        while index < max_repetitions:
            matrix = np.zeros((upperBoundNote-lowerBoundNote,maxSongLength))


            for dur, start, pitch in zip(durs, starts, pitches):
                dur = int(dur/resolution)
                start = int(start/resolution)

                if not start > index*(maxSongLength+1) or not dur+start < index*maxSongLength:
                    for j in range(start,start+dur):
                        if j - index*maxSongLength >= 0 and j - index*maxSongLength < maxSongLength:
                            matrix[pitch-lowerBoundNote,j - index*maxSongLength] = 255

            if matrix.any(): # If matrix contains no notes (only zeros) don't save it
                output_filename = os.path.join(output_folder, midi_path.split('/')[-1].replace(".mid",f"_{instrument_name}_{index}.png"))
                imwrite(output_filename,matrix.astype(np.uint8))
                index += 1
            else:
                break

### From image to midi file

In [6]:
# Intermediary function
def column2notes(column, lowerBoundNote = 21):
    notes = []
    for i in range(len(column)):
        if column[i] > 255/2:
            notes.append(i+lowerBoundNote)
    return notes

In [7]:
# Intermediary function
def updateNotes(newNotes, prevNotes, resolution = 0.25): 
    res = {} 
    for note in newNotes:
        if note in prevNotes:
            res[note] = prevNotes[note] + resolution
        else:
            res[note] = resolution
    return res

In [8]:
def image2midi(image_path, lowerBoundNote = 21, resolution = 0.25):
    """
    From an existing image:
        - Convert to notes
        - Save result as a midi file in the subfolder 'music_piece_name' of the 'data_output_sound' folder 
    """
    
    output_folder = f"../../data_output_midi/{image_path.split('/')[-2]}"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    output_filename = os.path.join(output_folder, image_path.split("/")[-1].replace(".png",".mid"))
    print(output_filename)
    
    with ImageOps.grayscale(Image.open(image_path)) as image:
        im_arr = np.frombuffer(image.tobytes(), dtype=np.uint8)
        print(im_arr.shape)
        try:
            im_arr = im_arr.reshape((image.size[1], image.size[0]))
        except:
            im_arr = im_arr.reshape((image.size[1], image.size[0],3))
            im_arr = np.dot(im_arr, [0.33, 0.33, 0.33])
    
    offset = 0
    output_notes = []

    # create note and chord objects based on the values generated by the model

    prev_notes = updateNotes(im_arr.T[0,:],{}, resolution = resolution)
    for column in im_arr.T[1:,:]:
        notes = column2notes(column, lowerBoundNote=lowerBoundNote)
        # pattern is a chord
        notes_in_chord = notes
        old_notes = prev_notes.keys()
        for old_note in old_notes:
            if not old_note in notes_in_chord:
                new_note = note.Note(old_note,quarterLength=prev_notes[old_note])
                new_note.storedInstrument = instrument.Piano()
                if offset - prev_notes[old_note] >= 0:
                    new_note.offset = offset - prev_notes[old_note]
                    output_notes.append(new_note)
                elif offset == 0:
                    new_note.offset = offset
                    output_notes.append(new_note)                    
                else:
                    print(offset,prev_notes[old_note],old_note)

        prev_notes = updateNotes(notes_in_chord,prev_notes)

        # increase offset each iteration so that notes do not stack
        offset += resolution

    for old_note in prev_notes.keys():
        new_note = note.Note(old_note,quarterLength=prev_notes[old_note])
        new_note.storedInstrument = instrument.Piano()
        new_note.offset = offset - prev_notes[old_note]

        output_notes.append(new_note)

    prev_notes = updateNotes(notes_in_chord,prev_notes)

    midi_stream = stream.Stream(output_notes)
    
    midi_stream.write('midi', fp=output_filename)

## From midi files, create a clean image dataset

In [9]:
def get_clean_midi_data_as_images(midi_path, output_folder_path, image_height = 106, image_length = 106):

    """
    Iterate on all midi files from the 'midi_path' folder to:
        - Keep music pieces with one piano only
        - Transform the midi file into images
        - Store all corresponding images into a 'music_piece' subfolder of the 'output_folder_path'
    """
    # Storing all midi files into a 'files_raw' list
    files_raw = [file for file in os.listdir(midi_path)]

    # Storing all midi files with only one piano in a 'files' list
    files = []
    for file in files_raw:
        try:
            mid = converter.parse(f'{midi_path}/{file}')
            file_instruments = instrument.partitionByInstrument(mid)
            if len(file_instruments)==1:
                files.append(file)
        except:
            pass

    # Iterating on all files from 'files' list to create images
    for file in files:
        file_path = f"{midi_path}/{file}"
        midi2image(file_path, output_folder_path)

In [10]:
def clean_images(input_path, output_path, height_image = 106, length_image = 106):
    """
    Iterate on all images created in the 'input_path' folder:
        - Resize images to height_image x length_image
        - Transform them into pure black and white images
        - Save them in a 'music piece' subfolder of the 'output_path' folder
       
    --> Input path: path to folder with input images (e.g., '../../data_test/Input_image')
    --> Output path: path to folder where we wish to save output reshaped images (e.g., '../../data_test/Input_image_cleaned')
    """
    
    for music in os.listdir(input_path):
        
        output_folder = f'{output_path}/{music}' # Creating one sub_folder for each music piece in the 'output_path' folder
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        
        for image in os.listdir(f"{input_path}/{music}"):
            image_path = f'{input_path}/{music}/{image}'
            image_read = Image.open(image_path) # Reading each image
            new_image = image_read.resize((106,106)) # Resizing each image
            new_image = new_image.convert("1") # Convert each image to pure black and white
            new_image.save(f'{output_folder}/{image}') # Saving each image

## Get clean array dataset from clean image dataset

In [11]:
def get_pixels_array(input_path): 
    """
    Generate an array containing all images from 'input_path' folder in array format
    
    --> input_path = path of the folder containing clean images (e.g., '../../data_image_cleaned') 
    """
    
    pixels = []
    for music in os.listdir(input_path):
        for image in os.listdir(f"{input_path}/{music}"):
            image_path = f'{input_path}/{music}/{image}'
            image_read = Image.open(image_path) # Reading each image
            pixels_image = np.array(image_read.getdata()).astype('float32') # Store all pixel values in an array, each i_th-sequence contains the values of pixels in a i_th-row
            pixels_image = pixels_image / 255.0 # All the values are 0 (black) and white (255). Normalize pixel values to be between 0 and 1
            pixels.append(pixels_image.reshape(106, 106,1)) # Reshape pixels to be a matrix

    pixels = np.array(pixels)
    
    return pixels

In [14]:
midi_path ='../../data_raw/'
output_folder_path = '../../data_image/'
get_clean_midi_data_as_images(midi_path, output_folder_path)

In [16]:
input_path = '../../data_image'
output_path = '../../data_image_cleaned'
clean_images(input_path, output_path)

In [12]:
# Implementing the function for the entire dataset
input_path = '../../data_image_cleaned'
pixels = get_pixels_array(input_path)
pixels.shape

(1425, 106, 106, 1)

## Model

In [13]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input,Dense, Reshape, Flatten, BatchNormalization, Conv2D, Conv2DTranspose, LeakyReLU, Dropout
from tensorflow.keras import Model, Sequential
from tensorflow.keras.utils import plot_model

2023-09-14 16:34:16.758754: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-14 16:34:16.760926: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-14 16:34:16.803471: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-14 16:34:16.804456: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Discriminator

In [14]:
height_image = 106
length_image = 106

In [17]:
#the input is an image (black and white) with 106x106 pixels
input_shape = (length_image, height_image, 1)
inputs = Input(input_shape)

#block1
convolutional_layer_1 = Conv2D(64, (4,4), strides=(2,2), padding='same', input_shape=input_shape) (inputs)
activation_1 = LeakyReLU(alpha=0.2) (convolutional_layer_1)
dropout_1 = Dropout(0.5) (activation_1)

#block2
convolutional_layer_2 = Conv2D(64, (4,4), strides=(2,2), padding='same', input_shape=input_shape) (dropout_1)
activation_2 = LeakyReLU(alpha=0.2) (convolutional_layer_2)
dropout_2 = Dropout(0.5) (activation_2)

flattened_layer = Flatten()(dropout_2)
batch_normalization_layer = BatchNormalization()(flattened_layer)
output_discriminator = Dense(1, activation="sigmoid")(batch_normalization_layer)

discriminator_model = Model(inputs, outputs=output_discriminator)

discriminator_model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0002, beta_1=0.5), metrics=['accuracy'])

discriminator_model.summary()



Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 106, 106, 1)]     0         
                                                                 
 conv2d_4 (Conv2D)           (None, 53, 53, 64)        1088      
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 53, 53, 64)        0         
                                                                 
 dropout_4 (Dropout)         (None, 53, 53, 64)        0         
                                                                 
 conv2d_5 (Conv2D)           (None, 27, 27, 64)        65600     
                                                                 
 leaky_re_lu_5 (LeakyReLU)   (None, 27, 27, 64)        0         
                                                                 
 dropout_5 (Dropout)         (None, 27, 27, 64)        0   

In [24]:
! pip install pydot



In [25]:
! pip install graphviz

Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz
Successfully installed graphviz-0.20.1


## Generator

In [28]:
latent_dimension = 100

In [29]:
input_shape = (latent_dimension)
inputs = Input(input_shape)

dense_1 = Dense(128*53*53, input_dim=latent_dimension)(inputs)
activation_1 = LeakyReLU(alpha=0.2)(dense_1)
reshape_layer = Reshape( (53,53,128))(activation_1)

dense_2 = Dense(1024)(reshape_layer)
conv2d_transposed_layer_1 = Conv2DTranspose(1024,(4,4), strides=(2,2), padding="same")(dense_2)

dense_3 = Dense(1024)(conv2d_transposed_layer_1)
activation_2 = LeakyReLU(alpha=0.2)(dense_3)
dense_4 = Dense(1024)(activation_2)
conv2d_transposed_layer_1 = Conv2DTranspose(1,(7,7), padding="same", activation='sigmoid')(dense_4)

generator_model = Model(inputs, outputs=conv2d_transposed_layer_1)

generator_model.summary()


Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 100)]             0         
                                                                 
 dense_3 (Dense)             (None, 359552)            36314752  
                                                                 
 leaky_re_lu_6 (LeakyReLU)   (None, 359552)            0         
                                                                 
 reshape (Reshape)           (None, 53, 53, 128)       0         
                                                                 
 dense_4 (Dense)             (None, 53, 53, 1024)      132096    
                                                                 
 conv2d_transpose (Conv2DTr  (None, 106, 106, 1024)    16778240  
 anspose)                                                        
                                                           

## GAN (consolidation of generator and discriminator)

In [31]:
discriminator_model.trainable = False
GAN = Sequential()
GAN.add(generator_model)
GAN.add(discriminator_model)

GAN.compile(loss='binary_crossentropy', optimizer= Adam(lr=0.0002, beta_1=0.5))

GAN.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_3 (Functional)        (None, 106, 106, 1)       55374465  
                                                                 
 model_2 (Functional)        (None, 1)                 299969    
                                                                 
Total params: 55674434 (212.38 MB)
Trainable params: 55374465 (211.24 MB)
Non-trainable params: 299969 (1.14 MB)
_________________________________________________________________


In [33]:
def generate_real_music_samples(pixels, num_samples):
  #generate num_samples random integer number between 0 and the size of total dataset of pixels
  #these numbers/indexes will be the indexes of converted image that we will use as real music sample
  indexes_real_image = np.random.randint(0, pixels.shape[0], num_samples)
  real_choosen_images = pixels[indexes_real_image]
  #create a ground truth of 1 for each of the images (due to the fact are real images)
  ground_truth_choosen_images = np.ones( (num_samples, 1))
  return real_choosen_images, ground_truth_choosen_images

In [34]:
def generate_latent_samples(latent_dimension, num_samples):
  #generate latent_dimension*num_samples array of random values taken from x axis of Normal Distribution
  latent_samples = np.random.randn(latent_dimension*num_samples)
  #reshape to have num_samples entries each one with latent_dimension values
  latent_samples = latent_samples.reshape(num_samples, latent_dimension)
  return latent_samples

In [35]:
#a useful function to compute accuracy of the discriminator in predicting correctly both real and fake music samples

def show_current_discriminator_accuracy(discriminator_model, generator_model, pixels, latent_dimension):

  num_samples_to_test = 100

  #generate real music samples
  real_music_samples, ground_truth_real_music_samples = generate_real_music_samples(pixels, num_samples_to_test)
  
  #generate fake music samples
  latent_samples = generate_latent_samples(latent_dimension, num_samples_to_test)
  images_predicted_from_generator = generator_model.predict(latent_samples)
  #create a zero ground truth (because are no real images)
  ground_truth_images_predicted_from_generator = np.zeros( (num_samples_to_test, 1))

  #evaluate the accuracy of the discriminator on real music samples
  _, accuracy_on_real = discriminator_model.evaluate(real_music_samples, ground_truth_real_music_samples, verbose=0)
  #evaluate the accuracy of the discriminator on fake music samples
  _, accuracy_on_fake = discriminator_model.evaluate(images_predicted_from_generator, ground_truth_images_predicted_from_generator, verbose=0)

  #print results
  print("   Current accuracy of the discriminator on real music samples:", round(accuracy_on_real*100,3),"%")
  print("   Current accuracy of the discriminator on fake music samples:", round(accuracy_on_fake*100,3),"% \n")

  return accuracy_on_real, accuracy_on_fake

In [36]:
number_of_epochs = 250
number_of_batch_per_epoch = 15
number_of_samples_per_batch = int((pixels.shape[0] / number_of_batch_per_epoch)*0.04)
number_of_samples_to_take_per_batch = int(number_of_batch_per_epoch / 2)

#this array contains one image for each epoch that was generated by the generator. In this way is possible to "see" how the generations changes
images_generated_per_epoch = []

#here i will store loss and accuracy information of the discriminator during the epochs
discriminator_info_per_epoch = pd.DataFrame(columns=['loss_discriminator_on_real_music', 'loss_generator_on_fake_music', 'accuracy_on_real', 'accuracy_on_fake'])  

accuracy_on_fake = 0
accuracy_on_real = 0

num_samples_for_generator = number_of_samples_per_batch*2
num_samples_for_discriminator = number_of_samples_per_batch

#for each epoch...
for id_epoch in range(number_of_epochs):
  print("\n\n epoch:", id_epoch)

  
  #da eliminare (serve solo per fare le epoche da 199 a 500)
  if id_epoch == 0:
    num_samples_for_generator = 25
    num_samples_for_discriminator = 1
    accuracy_on_fake = 95
    accuracy_on_real = 95
  
  #for each batch
  for id_batch in range(number_of_batch_per_epoch):

    if num_samples_for_discriminator > 0:
      #take real music samples from pixels database (the ones obtained after conversion from midi to image and after pure black and white conversion)
      real_music_samples, ground_truth_real_music_samples = generate_real_music_samples(pixels, num_samples_for_discriminator)

      #generate lantent samples
      latent_samples = generate_latent_samples(latent_dimension, num_samples_for_discriminator )

      #use the generator to predict an image giving lantent samples
      images_generated_from_generator = generator_model.predict(latent_samples)
      images_generated_per_epoch.append(images_generated_from_generator[0])

      #create a zero ground truth (because are no real images)
      ground_truth_images_generated_from_generator = np.zeros( (num_samples_for_discriminator, 1))

      #create the input samples to fed to discriminatore which are made up of both real and fake music samples
      discriminator_inputs = np.vstack( (real_music_samples, images_generated_from_generator ))
      discriminator_ground_truth = np.vstack( (ground_truth_real_music_samples,ground_truth_images_generated_from_generator )) 


      #train the discriminator (the one alone) on this current batch
      discriminator_loss, _ = discriminator_model.train_on_batch(discriminator_inputs,discriminator_ground_truth)

    '''
      Now we force the discriminator to stop learning more. The GAN will now have an overtrained discriminator, which however can not be longer trained. 
      Now we generate latent samples that we mark as real (even if they are not), we take the images that the GAN generator has generated, 
      and we feed them to the discriminator. Obviously, being this "better" than the generator will classify them as fake. 
      By forcing the GAN to classify them as true instead, we will adjust the generator weights in such a way as to generate images, given 
      the same random latent samples, that can "make fun" of the discriminator. Note that the discriminator weights will not be affected.
    '''


    #generate latent samples (the double of before)
    latent_samples = generate_latent_samples(latent_dimension, num_samples_for_generator)
    #mark them as real (even if they are not)
    ground_truth_latent_samples = np.ones( (num_samples_for_generator,1) )
    #train the GAN (so, only the generator)
    GAN_loss = GAN.train_on_batch(latent_samples, ground_truth_latent_samples)

    #print some information related with the training
    print("   batch:", id_batch, "   discriminator_loss_on_real_music:", discriminator_loss, "   discriminator_loss_on_fake_images:", GAN_loss, "   #samples to train generator:", num_samples_for_generator,  "   #samples to train discriminator:", num_samples_for_discriminator)

    #this control is intended to train the generator until it reaches a level almost equal to that of the discriminator. If it were not there it would be impossible for the generator to reach the discriminator as this seems to learn very quickly.
    if accuracy_on_real > 0.8 and accuracy_on_fake > 0.6:
      num_samples_for_generator = int((pixels.shape[0] / number_of_batch_per_epoch)*0.25)
      num_samples_for_discriminator = 0
    else:
      num_samples_for_generator = number_of_samples_per_batch*2
      num_samples_for_discriminator = number_of_samples_per_batch 


  #print the current accuracy obtained in classifing correctly both real music samples and fake music samples (the ones generated by the generator)
  accuracy_on_real, accuracy_on_fake = show_current_discriminator_accuracy(discriminator_model, generator_model, pixels, latent_dimension)
  discriminator_info_per_epoch = discriminator_info_per_epoch.append({'loss_discriminator_on_real_music':  discriminator_loss, 'loss_generator_on_fake_music': GAN_loss, 'accuracy_on_real': accuracy_on_real, 'accuracy_on_fake': accuracy_on_fake}, ignore_index=True)



 epoch: 0
   batch: 0    discriminator_loss_on_real_music: 0.35084646940231323    discriminator_loss_on_fake_images: 1.000048279762268    #samples to train generator: 25    #samples to train discriminator: 1
   batch: 1    discriminator_loss_on_real_music: 0.35084646940231323    discriminator_loss_on_fake_images: 0.8907220363616943    #samples to train generator: 23    #samples to train discriminator: 0
   batch: 2    discriminator_loss_on_real_music: 0.35084646940231323    discriminator_loss_on_fake_images: 0.6909752488136292    #samples to train generator: 23    #samples to train discriminator: 0
   batch: 3    discriminator_loss_on_real_music: 0.35084646940231323    discriminator_loss_on_fake_images: 0.6894126534461975    #samples to train generator: 23    #samples to train discriminator: 0
   batch: 4    discriminator_loss_on_real_music: 0.35084646940231323    discriminator_loss_on_fake_images: 0.6891672015190125    #samples to train generator: 23    #samples to train discriminat

AttributeError: 'DataFrame' object has no attribute 'append'