# 8-Bit Music Generation Using RNN-GAN
Seyed Mohammadsaleh Mirzatabatabaei (smsmt@aut.ac.ir)

Salman Amimotlagh (motlaq@aut.ac.ir)

Aria Espahbodi (aria.esp@aut.ac.ir)

---





![RNN vs DNN t](https://salu133445.github.io/musegan/figs/multitrack1.png)

### Download & Import packages

---

In [1]:
!apt install fluidsynth
!cp /usr/share/sounds/sf2/FluidR3_GM.sf2 ./font.sf2
!pip install midi2audio
!pip install music21
!pip install pydub

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin libinput10
  libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5
  libqt5svg5 libqt5widgets5 libwacom-bin libwacom-common libwacom9 libxcb-icccm4 libxcb-image0
  libxcb-keysyms1 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 qsynth qt5-gtk-platformtheme qttranslations5-l10n timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs qt5-image-formats-plugins qtwayland5 jackd
The following NEW packages will be installed:
  fluid-soundfont-gm fluidsynth libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin libinput10
  libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5 libqt5gui5 libqt5network5
  libqt5svg5 libqt5widgets5 libwacom-bin libwacom-common libwacom9 libxcb-icc

In [2]:
import sys, os, glob
import matplotlib.pyplot as plt
import numpy as np
import datetime
import csv
from sklearn.preprocessing import MinMaxScaler

from music21 import *
from midi2audio import FluidSynth
from IPython.display import Audio

from keras.layers import Input, Dense, Reshape, Dropout, LSTM, Bidirectional
from keras.layers import BatchNormalization, Activation, ZeroPadding2D, LeakyReLU
from keras.models import Sequential, Model
from keras.optimizers.legacy import Adam
from keras import utils

import tensorflow as tf

### MUSIC class

In [3]:
class MUSIC():
    def __init__(self, seq_length):
        self.seq_length = seq_length
        self.file_rows = []
        self.trainseq = []

    def parser(self, folderName):
        """ Get all the frequencies and amplitudes from the csv files """

        for file in glob.glob(f"{folderName}/*.csv"):
            print("Parsing %s" % file)
            current_file = []
            with open(file, 'r') as f:
              csvreader = csv.reader(f)
              next(csvreader)  # Skip the header row
              current_file = [[float(value) for value in row[1:]] for row in csvreader] # Don't care about interval column, need floats
            self.file_rows.append(current_file)

    def prepare_sequences(self):
        """ Prepare the sequences used by the Neural Network """

        # create input sequences of size seq_length
        for file in self.file_rows:
            for i in range(len(file) - self.seq_length):
                self.trainseq.append(file[i:i + self.seq_length])

        # Normalize sequences between 0 and 1
        self.trainseq = np.array(self.trainseq)
        self.num_samples, self.sequence_length, self.features = self.trainseq.shape
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.trainseq = self.trainseq.reshape((self.num_samples, self.sequence_length * self.features))
        self.trainseq = self.scaler.fit_transform(self.trainseq)
        return self.trainseq

    def reverse(self, data):
        """ convert the output from the prediction to music format shape/scale """
        reversed_data = self.scaler.inverse_transform(data)

        #Reshape the reversed data back to the original shape
        return reversed_data.reshape((self.sequence_length, self.features))

### Generators and Discriminators

In [34]:
class MODEL():
  def __init__(self, music_obj):
    self.music_obj = music_obj
    self.seq_length = self.music_obj.seq_length
    self.seq_shape = (16 * self.seq_length,1)
    self.latent_dim = 1000
    self.disc_loss = []
    self.gen_loss = []

    optimizer = Adam(1e-4, 0.9)

    # Build and compile the discriminator
    self.discriminator = self.build_discriminator()
    self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Build the generator
    self.generator = self.build_generator()

    # The generator takes noise as input and generates note sequences
    z = Input(shape=(self.latent_dim, 1))
    generated_seq = self.generator(z)

    # For the combined model we will only train the generator
    self.discriminator.trainable = False

    # The discriminator takes generated images as input and determines validity
    validity = self.discriminator(generated_seq)

    # The combined model  (stacked generator and discriminator)
    # Trains the generator to fool the discriminator
    self.combined = Model(z, validity)
    self.combined.compile(loss='binary_crossentropy', optimizer=optimizer) # mean_squared_error

  def build_discriminator(self):
    print(self.seq_shape)
    model = Sequential()
    model.add(LSTM(512, input_shape=self.seq_shape, return_sequences=True))
    model.add(Bidirectional(LSTM(512)))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))

    seq = Input(shape=self.seq_shape)
    validity = model(seq)

    return Model(seq, validity)

  def build_generator(self):
    model = Sequential()
    model.add(LSTM(512, input_shape=(self.latent_dim, 1), return_sequences=True))
    model.add(Bidirectional(LSTM(512)))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(1024))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(np.prod(self.seq_shape), activation='relu'))
    model.add(Reshape(self.seq_shape))

    noise = Input(shape=(self.latent_dim, 1))
    seq = model(noise)

    return Model(noise, seq)

  def train(self, epochs, dataFolder, batch_size=128, sample_interval=50):
    # Load and the data
    self.music_obj.parser(dataFolder)
    sequences = self.music_obj.prepare_sequences()
    print(sequences.dtype)

    print(f"\nNumber of sequences for train: {sequences.shape[0]}\n")

    # Adversarial ground truths
    real = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))

    # Training the model
    for epoch in range(epochs):
      # Training the discriminator
      # Select a random batch of sequences
      index_seqs = np.random.randint(0, sequences.shape[0], batch_size)
      real_seqs = sequences[index_seqs]

      # Random noise for generator input
      noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

      # Generate a batch of new sound sequences
      gen_seqs = self.generator.predict(noise)
      gen_seqs = tf.squeeze(gen_seqs, axis=-1)
      print(f"{real_seqs.shape}-{gen_seqs.shape}")

      # Train the discriminator
      d_loss_real = self.discriminator.train_on_batch(real_seqs, real)
      d_loss_fake = self.discriminator.train_on_batch(gen_seqs, fake)
      d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

      #  Training the Generator
      noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

      # Train the generator (to have the discriminator label samples as real)
      g_loss = self.combined.train_on_batch(noise, real)

      # Print the progress and save into loss lists
      if epoch % sample_interval == 0:
          print("%d / %d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (
          epoch + 1, epochs, d_loss[0], 100 * d_loss[1], g_loss))
          self.disc_loss.append(d_loss[0])
          self.gen_loss.append(g_loss)
      if epoch % 5 == 0:
        print(self.generate())
        self.plot_loss()
    self.save()
    print(f"The C-RNN-GAN model has been trained with {dataFolder} csv music,\n" +
          "and saved.")


  def save(self):
    # Get the current timestamp
    now = datetime.datetime.now()

    # Convert the timestamp to a string
    timestamp = str(now.timestamp())

    # create Model directory if there isn't exist
    if not os.path.exists('Model/'):
      os.makedirs('Model/')

    # save discriminator and generator trained model
    self.discriminator.save(f"Model/discriminator-{timestamp}.h5")
    self.generator.save(f"Model/generator-{timestamp}.h5")
    print("The trained C-RNN-GAN model (generator and discriminator) have been saved in the Model folder.")
    self.plot_loss()


  def generate(self):
    """ Use random noise to generate music"""

    # random noise for network input
    noise = np.random.normal(0, 1, (1, self.latent_dim))
    prediction = self.generator.predict(noise)
    prediction = tf.squeeze(prediction, axis=-1)
    return self.music_obj.reverse(prediction)


  def plot_loss(self):
    """ Plot and save discriminator and generator loss functions per epoch diagram"""
    plt.plot(self.disc_loss, c='red')
    plt.plot(self.gen_loss, c='blue')
    plt.title("GAN Loss per Epoch")
    plt.legend(['Discriminator', 'Generator'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.savefig('GAN_Loss_per_Epoch_final.png', transparent=True)
    plt.close()

In [35]:
music = MUSIC(seq_length=64)
model = MODEL(music_obj=music)
model.train(350, dataFolder='music', batch_size=64, sample_interval=1)

(1024, 1)
Parsing music/0.csv
[[ 535.          540.          545.         ...    0.
     0.            0.        ]
 [ 535.          540.          545.         ...    0.
     0.            0.        ]
 [ 535.          540.          545.         ...    0.
     0.            0.        ]
 ...
 [ 695.          395.          235.         ... 5956.7448444
  7361.99902496 8709.58378163]
 [ 695.          830.          395.         ... 5538.44652595
  6778.40040294 7520.30850044]
 [ 290.          230.          415.         ... 6234.14738005
  7570.75035156 8407.60376721]]
MAX: 40525.52922349852
[0.33668342 0.34343434 0.34848485 ... 0.23156439 0.25000016 0.20746438]
float64

Number of sequences for train: 9873

(64, 1024)-(64, 1024)
1 / 350 [D loss: 0.693824, acc.: 50.00%] [G loss: 0.668421]
[[ 200.          200.          200.         ... 1079.22376668
     0.         1387.35634184]
 [ 282.57750198  200.          200.         ... 1285.10958481
     0.         1007.77543656]
 [ 200.          200. 

  saving_api.save_model(


The trained C-RNN-GAN model (generator and discriminator) have been saved in the Model folder.
The C-RNN-GAN model has been trained with music csv music,
and saved.


### Use Decoder to Create New Music Sequences

In [None]:
# ONLY IF YOU NEED TO RELOAD THE MODEL

# import keras
# music = MUSIC(seq_length=16)
# model = MODEL(music_obj=music)
# model.music_obj.parser("music")
# model.music_obj.prepare_sequences()
# model.generator = keras.models.load_model('Model/generator.h5')

In [31]:
# Write GAN output to csv file

def write_results(values):
    # Create an empty audio segment

    filename = "output.csv"
    with open(filename, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Interval', 'Frequency', 'Volume'])
        for i, row in enumerate(values):
          top_frequencies, top_amplitudes = row[:9], row[9:]
          # print(f"Interval {i + 1}:")
          csv_row = [i]
          csv_row.extend(top_frequencies)
          csv_row.extend(top_amplitudes)
          csvwriter.writerow(csv_row)


In [27]:
# Use trained model to generate new sequences

predictions = model.generate()
print(predictions)
model.save()

[[  496.21392488   632.16311932   740.62980413 ...     0.
      0.         19179.55220108]
 [  200.           200.           736.77666187 ... 28154.54193001
  20455.48976107 23987.81885417]
 [  221.92361467   364.07294571   662.72724867 ... 14227.90603287
  18513.30826812   676.54224265]
 ...
 [  593.92456412   200.           441.40179157 ...  2444.44964726
  14889.49886146  7166.72389757]
 [  716.10273123   524.17732477   360.06577015 ... 14366.24612891
   1011.60625902 18630.00681899]
 [  293.96091104   880.44507504   299.89503771 ... 10721.94956581
  18833.56694721  8598.21598109]]


  saving_api.save_model(


The trained C-RNN-GAN model (generator and discriminator) have been saved in the Model folder.


In [28]:
write_results(predictions)

In [30]:
# This cell is simply a copy of decoder.py

import csv
from pydub import AudioSegment
from pydub.generators import Square

def build_audio(interval_duration):
    filename = "output.csv"
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        num_rows = len(list(reader))
    # Create an empty audio segment
    output_audio = AudioSegment.silent(duration=1000 * interval_duration * num_rows)

    with open(filename, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # Skip the header row

        i = 1
        for row in csvreader:
            frequencies = [float(x) for x in row[1:9]]
            volumes = [float(x) for x in row[9:]]


            for freq, amp in zip(frequencies, volumes):
                sine_wave = Square(freq)
                sine_wave = sine_wave.to_audio_segment(duration=1000 * interval_duration)
                sine_wave = sine_wave - (60 - amp / 15000 * 25)  # Adjust volume
                output_audio = output_audio.overlay(sine_wave, position=i * 1000 * interval_duration)
            i +=1

    # Increase the overall volume
    volume_scaling = 0  # Adjust as needed
    output_audio = output_audio + volume_scaling

    # Export the resulting audio to a WAV file
    output_file_path = 'test_construct.wav'
    output_audio.export(output_file_path, format='wav')


# Set the desired interval duration in seconds
interval_duration = 0.2

build_audio(interval_duration)
