In [52]:
import numpy as np
import torch
import keras
from keras import Sequential
from keras.models import Model
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, Input, Activation, Dropout, BatchNormalization, Concatenate, RepeatVector
from keras.backend import repeat, concatenate
from keras.optimizers import Adam


## 1. Build Generator

### Architecture
 - Input: Music spectrogram
     - Pad to fixed size: (150, 50000, 1)
 - Encoding part
     - Conv Layers: Conv * 4
     - Dense Layers: Dense * 2
 - Generating part
     - LSTM Layers: LSTM layers * 2
 

In [31]:
max_notes = 200  # ???

batch_size = 32
input_rows = 150
input_cols = 50000
input_shape = (input_rows, input_cols, 1)
noise_size = 4
noise_shape = (max_notes, noise_size)
note_size = 4
note_shape = (max_notes, note_size)
context_size = 64
lstm_input_shape = (max_notes, noise_size + context_size)
output_size = 4

dropout_rate = 0.2

In [21]:
def build_generator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate):
    lstm_input_shape = (max_notes, noise_size + context_size)
    
    encoderG = Sequential()
    
    # Conv Layers
    encoderG.add(Conv2D(input_shape=input_shape, filters=32, kernel_size=(3, 3), padding="same"))
    encoderG.add(Activation('relu'))
    encoderG.add(BatchNormalization())
    encoderG.add(MaxPooling2D(pool_size=(3, 300), padding="valid"))
    encoderG.add(Conv2D(filters=64, kernel_size=(3, 3), padding="same"))
    encoderG.add(Activation('relu'))
    encoderG.add(BatchNormalization())
    encoderG.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
    encoderG.add(Conv2D(filters=128, kernel_size=(3, 3), padding="same"))
    encoderG.add(Activation('relu'))
    encoderG.add(BatchNormalization())
    encoderG.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
    encoderG.add(Conv2D(filters=128, kernel_size=(3, 3), padding="same"))
    encoderG.add(Activation('relu'))
    encoderG.add(BatchNormalization())
    encoderG.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))

    encoderG.add(Flatten())
    encoderG.add(Dense(context_size))
    encoderG.add(Activation('relu'))

    #encoderG.summary()
    
    chartmaker = Sequential()

    chartmaker.add(LSTM(input_shape=lstm_input_shape, units=context_size + noise_size, return_sequences=True, return_state=False, stateful=False))
    chartmaker.add(Dropout(dropout_rate))
    chartmaker.add(LSTM(context_size + noise_size, return_sequences=True, return_state=False, stateful=False))
    chartmaker.add(Dropout(dropout_rate))
    chartmaker.add(Dense(output_size))

    #chartmaker.summary()
    
    # Generate contextual encoding for the music
    music_input = Input(shape=input_shape)
    music_context = encoderG(music_input)

    noise_input = Input(shape=(max_notes, noise_size))

    # Repeat and concatenation. Input to the generator
    music_context_repeat = RepeatVector(max_notes)(music_context)
    chartmaker_input = Concatenate()([music_context_repeat, noise_input])

    output = chartmaker(chartmaker_input)

    # Final generator model
    generator = Model(inputs=[music_input, noise_input], outputs=output)
    generator.summary()
    
    return generator

In [22]:
generator = build_generator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 150, 50000, 1 0                                            
__________________________________________________________________________________________________
sequential_9 (Sequential)       (None, 64)           290880      input_7[0][0]                    
__________________________________________________________________________________________________
repeat_vector_4 (RepeatVector)  (None, 200, 64)      0           sequential_9[1][0]               
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 200, 4)       0                                            
__________________________________________________________________________________________________
concatenat

## 2. Build Discriminator

### Architecture

In [23]:
def build_discriminator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate):
    
    encoderD = Sequential()
    
    # Conv Layers
    encoderD.add(Conv2D(input_shape=input_shape, filters=32, kernel_size=(3, 3), padding="same"))
    encoderD.add(Activation('relu'))
    encoderD.add(BatchNormalization())
    encoderD.add(MaxPooling2D(pool_size=(3, 300), padding="valid"))
    encoderD.add(Conv2D(filters=64, kernel_size=(3, 3), padding="same"))
    encoderD.add(Activation('relu'))
    encoderD.add(BatchNormalization())
    encoderD.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
    encoderD.add(Conv2D(filters=128, kernel_size=(3, 3), padding="same"))
    encoderD.add(Activation('relu'))
    encoderD.add(BatchNormalization())
    encoderD.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
    encoderD.add(Conv2D(filters=128, kernel_size=(3, 3), padding="same"))
    encoderD.add(Activation('relu'))
    encoderD.add(BatchNormalization())
    encoderD.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))

    encoderD.add(Flatten())
    encoderD.add(Dense(context_size))
    encoderD.add(Activation('relu'))
    
    chartjudger = Sequential()

    chartjudger.add(LSTM(input_shape=lstm_input_shape, units=context_size + note_size, return_sequences=True, return_state=False, stateful=False))
    chartjudger.add(Dropout(dropout_rate))
    chartjudger.add(LSTM(context_size + note_size, return_sequences=True, return_state=False, stateful=False))
    chartjudger.add(Dropout(dropout_rate))
    chartjudger.add(Dense(1))
    chartjudger.add(Activation('sigmoid'))
    
    music_input = Input(shape=input_shape)
    music_context = encoderD(music_input)

    chart_input = Input(shape=(max_notes, note_size))

    # Repeat and concatenation. Input to the generator
    music_context_repeat = RepeatVector(max_notes)(music_context)
    chartjudger_input = Concatenate()([music_context_repeat, chart_input])

    output = chartjudger(chartjudger_input)

    # Final discriminator model
    discriminator = Model(inputs=[music_input, chart_input], outputs=output)
    discriminator.summary()
    
    return discriminator

In [24]:
discriminator = build_discriminator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            (None, 150, 50000, 1 0                                            
__________________________________________________________________________________________________
sequential_11 (Sequential)      (None, 64)           290880      input_9[0][0]                    
__________________________________________________________________________________________________
repeat_vector_5 (RepeatVector)  (None, 200, 64)      0           sequential_11[1][0]              
__________________________________________________________________________________________________
input_10 (InputLayer)           (None, 200, 4)       0                                            
__________________________________________________________________________________________________
concatenat

## 3. Combined Model


In [26]:
optimizer = Adam(lr=0.002)

In [28]:
# Build and compile the discrimninator
discriminator = build_discriminator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate)
discriminator.compile(loss=['binary_crossentropy'], optimizer=optimizer, metrics=['accuracy'])


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 150, 50000, 1 0                                            
__________________________________________________________________________________________________
sequential_13 (Sequential)      (None, 64)           290880      input_11[0][0]                   
__________________________________________________________________________________________________
repeat_vector_6 (RepeatVector)  (None, 200, 64)      0           sequential_13[1][0]              
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 200, 4)       0                                            
__________________________________________________________________________________________________
concatenat

In [48]:
# Build the generator
generator = build_generator(max_notes, input_shape, noise_size, note_size, context_size, output_size, dropout_rate)



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 150, 50000, 1 0                                            
__________________________________________________________________________________________________
sequential_17 (Sequential)      (None, 64)           290880      input_25[0][0]                   
__________________________________________________________________________________________________
repeat_vector_8 (RepeatVector)  (None, 200, 64)      0           sequential_17[1][0]              
__________________________________________________________________________________________________
input_26 (InputLayer)           (None, 200, 4)       0                                            
__________________________________________________________________________________________________
concatenat

In [36]:
# The generator takes music STFT and random noise as input, and generates the corresponding chart
music_input = Input(shape=input_shape)
noise_input = Input(shape=noise_shape)
chart = generator([music_input, noise_input])

In [38]:
# For the combined model we will only train the generator
discriminator.trainable = False


In [39]:
# The discriminator takes music STFT and the generated chart and determines validity
valid = discriminator([music_input, chart])


In [40]:
# The combined model
combined = Model([music_input, noise_input], valid)
combined.compile(loss=['binary_crossentropy'], optimizer=optimizer)

## 4. Training

In [46]:
num_epochs = 500

# Number of iterations of training on Discriminator vs. number of iterations of training on Generator
num_trainD = 7
num_trainG = 3

In [54]:
# Generate data in each iteration to train the discriminator and generator
def generate_training_data(discriminator, generator):
    # Generate a batch of new music data and corresponding charts. 
    #   music data shape (batch_size, input_rows, input_cols), chart shape (batch_size, max_notes, note_size)
    music, chart_correct = None, None
    # Generate a batch of mis-matching charts
    chart_mismatch = None
    # Generate a batch of random noise from latent space. Noise shape (batch_size, max_notes, noise_size)
    noise = None
    # Forward the music data and noise through Generator and generate fake charts
    chart_fake = generator.predict([music, noise])
    
    return music, noise, chart_correct, chart_mismatch, chart_fake

In [55]:
for epoch in range(num_epochs):
    # labels for valid music chart. shape (batch_size, max_notes, 1)
    valid_label = np.ones((batch_size, max_notes, 1))
    # labels for fake music chart. shape (batch_size, max_notes, 1)
    fake_label = np.zeros((batch_size, max_notes, 1))
    
    # Train Discriminator
    for i_trainD in range(num_trainD):
        # generate batch training data
        music, noise, chart_correct, chart_mismatch, chart_fake = generate_training_data(discriminator, generator)
        
        # Loss value on matching charts
        d_loss_real = discriminator.train_on_batch([music, chart_correct], valid_label)
        # Loss value on mis-matching charts
        d_loss_mismatch = discriminator.train_on_batch([music, chart_mismatch], fake_label)
        # Loss value on fake charts
        d_loss_fake = discriminator.train_on_batch([music, chart_fake], fake_label)
        
        # Final loss
        d_loss = np.add(d_loss_real, np.add(d_loss_mismatch, d_loss_fake) / 2)
        
    # Train Generator
    for i_trainG in range(num_trainG):
        g_loss = combined.train_on_batch([music, noise], valid_label)
        
    # Plot progress
    print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch + 1, d_loss[0], 100*d_loss[1], g_loss))

AttributeError: 'NoneType' object has no attribute 'shape'