# Music Forever!

### TODO:
   - extract well-formed data for training
   - data padding
   - add a terminal chart note indicating the end of the chart
   - add additional information: music length, bpm ( 60 / bpm ), etc.
   - key location as one-hot vector

In [1]:
import json
import os
import numpy as np
import tensorflow as tf
import keras
from keras import Sequential
from keras.models import Model
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, Input, Activation, Dropout, BatchNormalization, Concatenate, RepeatVector, TimeDistributed, Add, Lambda
from keras.backend import repeat, concatenate
from keras.optimizers import Adam
from keras import backend as K


Using TensorFlow backend.


In [2]:
# Test device
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

In [3]:
max_notes = 510  # ???

batch_size = 6
total_size = 170
input_rows = 130
input_cols = 20000
input_shape = (input_rows, input_cols, 1)
noise_size = 12
noise_shape = (max_notes, noise_size)
note_size = 12
note_shape = (max_notes, note_size)
context_size = 32
metadata_size = 2           # music length + bpm
metadata_shape = (max_notes, metadata_size)
lstm_input_shape = (max_notes, noise_size + context_size + metadata_size)
lstm_units = 32
output_size = 12

dropout_rate = 0.1

## 1. Build Generator

### Architecture
 - Input: Music spectrogram + Noise vector + metadata vector
     - Pad to fixed size: (150, 50000, 1)
 - Encoding part
     - Conv Layers: Conv * 4
     - Dense Layers: Dense * 2
 - Generating part
     - LSTM Layers: LSTM layers * 2
 

In [4]:
def build_encoder(input_shape, context_size):
    encoder = Sequential()
    
    # Music encoding Conv layers
    encoder.add(Conv2D(input_shape=input_shape, data_format="channels_last", filters=32, kernel_size=(3, 3), padding="same"))
    encoder.add(Activation('selu'))
    encoder.add(BatchNormalization())
    encoder.add(MaxPooling2D(pool_size=(3, 300), padding="valid"))
    encoder.add(Conv2D(filters=32, data_format="channels_last", kernel_size=(3, 3), padding="same"))
    encoder.add(Activation('selu'))
    encoder.add(BatchNormalization())
    encoder.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
    encoder.add(Conv2D(filters=32, data_format="channels_last", kernel_size=(3, 3), padding="same"))
    encoder.add(Activation('selu'))
    encoder.add(BatchNormalization())
    encoder.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))
#     encoderG.add(Conv2D(filters=128, data_format="channels_last", kernel_size=(3, 3), padding="same"))
#     encoderG.add(Activation('selu'))
#     encoderG.add(BatchNormalization())
#     encoderG.add(MaxPooling2D(pool_size=(3, 3), padding="valid"))

    encoder.add(Flatten())
    encoder.add(Dense(context_size))
    encoder.add(Activation('selu'))

    encoder.summary()
    
    return encoder

In [5]:
def build_generator(encoderG, max_notes, noise_size, note_size, metadata_size, lstm_units, output_size, dropout_rate):
    lstm_input_shape = (max_notes, noise_size + context_size + metadata_size)
    
    # Chart generating LSTM layers
    chartmaker = Sequential()

    chartmaker.add(LSTM(input_shape=lstm_input_shape, units=lstm_units, return_sequences=True, return_state=False, stateful=False))
    chartmaker.add(Dropout(dropout_rate))
    chartmaker.add(LSTM(units=lstm_units, return_sequences=True, return_state=False, stateful=False))
    chartmaker.add(Dropout(dropout_rate))
    
    # Generate contextual encoding for the music
    music_input = Input(shape=input_shape)
    music_context = encoderG(music_input)
    
    # Noise input
    noise_input = Input(shape=(max_notes, noise_size))
    
    # Metadata input
    metadata_input = Input(shape=(max_notes, metadata_size))

    # Repeat and concatenation. Input to the generator
    music_context_repeat = RepeatVector(max_notes)(music_context)          # Repeat max_notes times
    chartmaker_input = Concatenate()([music_context_repeat, noise_input, metadata_input])  # context + noise + metadata

    lstm_output = chartmaker(chartmaker_input)
    
    # Determine the elements of output chart note
    # Predict start_time
    start_time_pred = TimeDistributed(Dense(1), input_shape=(max_notes, lstm_units))(lstm_output)
    # Predict the difference between start_time and end_time and add the difference to start_time prediction to get the end_time
    #   Use ReLU activition to ensure positivity
    difference_syn = TimeDistributed(Dense(1), input_shape=(max_notes, lstm_units))(lstm_output)
    difference_val = Activation('relu')(difference_syn)
    end_time_pred = Add()([start_time_pred, difference_val])  
    # Predict Is_Long, i.e., key type (long key or short key)
    #   Use Sigmoid activition to realize probability assumption and ensure value within range
    keytype_syn = TimeDistributed(Dense(1), input_shape=(max_notes, lstm_units))(lstm_output)
    keytype_val = Activation('sigmoid')(keytype_syn)
    # Predict which_key, i.e., key location
    #   Use Softmax activitoin to realize multinoulli probability assumption and ensure discrete value
    keyloc_syn = TimeDistributed(Dense(9), input_shape=(max_notes, lstm_units))(lstm_output)
    """keyloc_prob = Activation('softmax')(keyloc_syn)
    
    def softmax2discrete_onehot(x):
        ids = K.argmax(x)
        one_hot = K.one_hot(ids, 9)
        return one_hot
    
    keyloc_val = Lambda(softmax2discrete_onehot)(keyloc_prob)"""
    keyloc_val = Activation('softmax')(keyloc_syn)
    
    # Concatenate the above predictions to form final chart note vector. shape: (batch_size, max_notes, 12)
    notes_pred = Concatenate()([start_time_pred, end_time_pred, keytype_val, keyloc_val])

    # Final generator model
    generator = Model(inputs=[music_input, noise_input, metadata_input], outputs=notes_pred)
    generator.summary()
    
    return generator

In [6]:
encoder = build_encoder(input_shape, context_size)

generator = build_generator(encoder, max_notes, noise_size, note_size, metadata_size, lstm_units, output_size, dropout_rate)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 130, 20000, 32)    320       
_________________________________________________________________
activation_1 (Activation)    (None, 130, 20000, 32)    0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 130, 20000, 32)    128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 43, 66, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 43, 66, 32)        9248      
_________________________________________________________________
activation_2 (Activation)    (None, 43, 66, 32)        0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 43, 66, 32)        128       
__________

## 2. Build Discriminator

### Architecture

In [7]:
def build_discriminator(encoderD, max_notes, noise_size, note_size, metadata_size, lstm_units, output_size, dropout_rate):
    lstm_input_shape = (max_notes, noise_size + context_size + metadata_size)
    
    # Chart judging LSTM layers
    chartjudger = Sequential()

    chartjudger.add(LSTM(input_shape=lstm_input_shape, units=lstm_units, return_sequences=True, return_state=False, stateful=False))
    chartjudger.add(Dropout(dropout_rate))
    chartjudger.add(LSTM(units=lstm_units, return_sequences=True, return_state=False, stateful=False))
    chartjudger.add(Dropout(dropout_rate))
    
    
    chartjudger.add(TimeDistributed(Dense(1), input_shape=(max_notes, lstm_units)))
    chartjudger.add(Activation('sigmoid'))
    
    chartjudger.summary()
    
    # Generate contextual encoding for the music
    music_input = Input(shape=input_shape)
    music_context = encoderD(music_input)

    # Music chart input
    chart_input = Input(shape=(max_notes, note_size))
    
    # Music metadata input
    metadata_input = Input(shape=(max_notes, metadata_size))

    # Repeat and concatenation. Input to the generator
    music_context_repeat = RepeatVector(max_notes)(music_context)           # Repeat max_notes times
    chartjudger_input = Concatenate()([music_context_repeat, chart_input, metadata_input])

    output = chartjudger(chartjudger_input)

    # Final discriminator model
    discriminator = Model(inputs=[music_input, chart_input, metadata_input], outputs=output)
    discriminator.summary()
    
    return discriminator

In [8]:
discriminator = build_discriminator(encoder, max_notes, noise_size, note_size, metadata_size, lstm_units, output_size, dropout_rate)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 510, 32)           10112     
_________________________________________________________________
dropout_3 (Dropout)          (None, 510, 32)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 510, 32)           8320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 510, 32)           0         
_________________________________________________________________
time_distributed_5 (TimeDist (None, 510, 1)            33        
_________________________________________________________________
activation_8 (Activation)    (None, 510, 1)            0         
Total params: 18,465
Trainable params: 18,465
Non-trainable params: 0
_________________________________________________________________
______

## 3. Combined Model


In [9]:
optimizer = Adam(lr=0.0003)

In [10]:
# Build and compile the discrimninator
discriminator.compile(loss=['binary_crossentropy'], optimizer=optimizer, metrics=['accuracy'])


In [11]:
# The generator takes music STFT, random noise, and music metadata as input, and generates the corresponding chart
music_input = Input(shape=input_shape)
noise_input = Input(shape=noise_shape)
metadata_input = Input(shape=metadata_shape)
chart = generator([music_input, noise_input, metadata_input])

In [12]:
# For the combined model we will only train the generator
discriminator.trainable = False


In [13]:
# The discriminator takes music STFT, the generated chart, and the metadata and determines validity
validity = discriminator([music_input, chart, metadata_input])


In [14]:
# The combined model
combined = Model([music_input, noise_input, metadata_input], validity)
combined.compile(loss=['binary_crossentropy'], optimizer=optimizer)
combined.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 130, 20000, 1 0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 510, 12)      0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 510, 2)       0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 510, 12)      66732       input_7[0][0]                    
                                                                 input_8[0][0]                    
          

## 4. Training

In [15]:
num_epochs = 500

# Number of iterations of training on Discriminator vs. number of iterations of training on Generator
num_trainD = 7
num_trainG = 3

music_dir = "data/matrix/"
chart_dir = "data/charts.json"
bpm_dir = "data/bpm_analysis.json"

In [16]:
def generate_minibatch(batch_size, total_size, note_size, music_dir, chart_dir):
    with open(chart_dir) as charts_data:
        charts = json.load(charts_data)
        keys = list(charts.keys())
    with open(bpm_dir) as bpm_data:
        bpms = json.load(bpm_data)
    
    music_batch = np.zeros((batch_size, input_rows, input_cols))
    chart_batch = np.zeros((batch_size, max_notes, note_size))
    mischart_batch = np.zeros((batch_size, max_notes, note_size))
    metadata_batch = np.zeros((batch_size, max_notes, metadata_size))
    
    indices = np.random.randint(low=0, high=total_size, size=batch_size)
    
    nb_classes = 9
    
    for (i, idx) in enumerate(indices):
        filename = keys[idx]
        #print(filename)
        
        # Extract music data
        with open(os.path.join(music_dir, filename + ".json")) as music_data:  
            music_raw = np.asarray(json.load(music_data))
            music_batch[i, :music_raw.shape[0], :music_raw.shape[1]] = music_raw
            
        # Extract correct chart data
        chart_raw = np.asarray(charts[filename])
        chart_batch[i, :chart_raw.shape[0], :3] = chart_raw[:, :3]
        # Key location integer to one-hot vector
        targets = [x if x != 9 else 8 for x in chart_raw[:, 3].reshape(-1).astype(int).tolist()]
        chart_batch[i, :chart_raw.shape[0], 3:] = np.eye(nb_classes)[targets]
        
        # Extract metadata data
        metadata_batch[i, :chart_raw.shape[0], :1] = chart_raw[:, 4:]
        metadata_batch[i, :chart_raw.shape[0], 1:] = 60 / bpms[filename.replace('_origin', '')]  # length for every beat
        
        # Extract mismatch chart data
        while True:
            misidx = np.random.randint(low=0, high=total_size)
            if misidx != idx:
                break
        misfilename = keys[misidx]
        mischart_raw = np.asarray(charts[misfilename])
        mischart_batch[i, :mischart_raw.shape[0], :3] = mischart_raw[:, :3]
        # Key location integer to one-hot vector
        targets = [x if x != 9 else 8 for x in mischart_raw[:, 3].reshape(-1).astype(int).tolist()]
        mischart_batch[i, :mischart_raw.shape[0], 3:] = np.eye(nb_classes)[targets]
        
    # Reshape music_batch into correct shape: (batch, input_rows, input_cols, 1)
    music_batch = music_batch[:, :, :, np.newaxis]
        
    return music_batch, chart_batch, mischart_batch, metadata_batch, np.asarray(keys)[indices]
    

In [17]:
# Generate data in each iteration to train the discriminator and generator
def generate_training_data(discriminator, generator):
    # Generate a batch of new music data and corresponding correct chart and mismatch chart. 
    #   music data shape (batch_size, input_rows, input_cols), 
    #   chart shape (batch_size, max_notes, note_size)
    #   metadata shape (batch_size, max_notes, metadata_size)
    music, chart_correct, chart_mismatch, metadata, _ = generate_minibatch(batch_size, total_size, note_size, music_dir, chart_dir)

    # Generate a batch of random noise from latent space. Noise shape (batch_size, max_notes, noise_size)
    #   Randomization according to actual range of the chart note vector entries
    noise = np.zeros((batch_size, max_notes, noise_size))
    noise[:, :, 0:2] = np.random.rand(batch_size, max_notes, 2)
    noise[:, :, 2] = np.random.randint(low=0, high=2, size=(batch_size, max_notes))
    noise[:, :, 3] = np.random.randint(low=0, high=10, size=(batch_size, max_notes))
    # Forward the music data and noise through Generator and generate fake charts
    chart_fake = generator.predict([music, noise, metadata])
    
    return music, noise, chart_correct, chart_mismatch, chart_fake, metadata

In [20]:
for epoch in range(num_epochs):
    # labels for valid music chart. shape (batch_size, max_notes, 1)
    valid_label = np.ones((batch_size, max_notes, 1))
    # labels for fake music chart. shape (batch_size, max_notes, 1)
    fake_label = np.zeros((batch_size, max_notes, 1))
    
    # Train Discriminator
    print("Training Discriminator:")
    for i in range(num_trainD):
        # generate batch training data
        music, noise, chart_correct, chart_mismatch, chart_fake, metadata = generate_training_data(discriminator, generator)

        # Loss value on matching charts
        d_loss_real = discriminator.train_on_batch([music, chart_correct, metadata], valid_label)
        # Loss value on mis-matching charts
        d_loss_mismatch = discriminator.train_on_batch([music, chart_mismatch, metadata], fake_label)
        # Loss value on fake charts
        d_loss_fake = discriminator.train_on_batch([music, chart_fake, metadata], fake_label)
        
        d_loss_real = discriminator.train_on_batch([music, chart_correct, metadata], valid_label)

        # Final loss
        d_loss = np.add(d_loss_real[0], np.add(d_loss_mismatch[0], d_loss_fake[0]) / 2)
        #print ("\tEpoch: %d D Iteration: %d [D loss: %f, acc.: %.2f%%]" % (epoch + 1, i_trainD + 1, d_loss[0], 100*d_loss[1]))
        print ("\tEpoch: %d [D loss: %f, real acc.: %.2f%%, mismatch acc.:%.2f%%, fake acc.:%.2f%%]" % (epoch + 1, d_loss, 100*d_loss_real[1], 100*d_loss_mismatch[1], 100*d_loss_fake[1]))

        
    # Train Generator
    print("Training Generator:")
    for i_trainG in range(num_trainG):
        music, noise, chart_correct, chart_mismatch, chart_fake, metadata = generate_training_data(discriminator, generator)
        g_loss = combined.train_on_batch([music, noise, metadata], valid_label)
        print("\tEpoch: %d G Iteration: %d [G loss: %f]" % (epoch + 1, i_trainG + 1, g_loss))
        
    # Plot progress
    print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]\n" % (epoch + 1, d_loss, 100*d_loss[1], g_loss))

Training Discriminator:
	Epoch: 1 [D loss: 1.400621, real acc.: 44.48%, mismatch acc.:54.05%, fake acc.:54.15%]
	Epoch: 1 [D loss: 1.398310, real acc.: 37.65%, mismatch acc.:62.09%, fake acc.:63.04%]
	Epoch: 1 [D loss: 1.396675, real acc.: 48.01%, mismatch acc.:50.26%, fake acc.:52.16%]


KeyboardInterrupt: 

## Prediction and Evaluation

In [64]:
def generate_chart(generator, music, metadata):
    noise = np.zeros((batch_size, max_notes, noise_size))
    noise[:, :, 0:2] = np.random.rand(batch_size, max_notes, 2)
    noise[:, :, 2] = np.random.randint(low=0, high=2, size=(batch_size, max_notes))
    noise[:, :, 3] = np.random.randint(low=0, high=10, size=(batch_size, max_notes))
    chart = generator.predict([music, noise, metadata])
    
    return chart

In [65]:
# Test Output from training set

test_music, test_chart, _, test_metadata, filename = generate_minibatch(1, total_size, note_size, music_dir, chart_dir)
test_generated_chart = generate_chart(generator, test_music, test_metadata)
print(filename)
print(test_generated_chart.shape)

['live397_origin']
(1, 510, 12)


In [66]:
test_generated_chart[0][10]

array([0.32011402, 0.95839703, 0.5640973 , 0.11448622, 0.11867859,
       0.06191122, 0.16747865, 0.1006614 , 0.1363398 , 0.10366485,
       0.1130338 , 0.0837455 ], dtype=float32)

In [67]:
test_chart[0][10]

array([0.11862836, 0.11862836, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 1.        ])

In [68]:
# Output to JSON file 
out_data = np.zeros((max_notes, note_size + metadata_size))
out_data[:, 0:4] = test_generated_chart.reshape((max_notes, note_size))
out_data[:, -1:] = test_metadata
out_dict = dict()
out_dict[filename[0]] = out_data.tolist()

with open("test_output/test_output.json", "w") as outfile:
    json.dump(out_dict, outfile)

ValueError: could not broadcast input array from shape (510,12) into shape (510,4)

In [69]:
def generate_judgement(discriminator, music, chart, metadata):
    judgement = discriminator.predict([music, chart, metadata])
    return judgement

In [70]:
test_judgement = generate_judgement(discriminator, test_music, test_generated_chart, test_metadata)
test_judgement

array([[[0.5493005 ],
        [0.58614075],
        [0.61067706],
        [0.62682027],
        [0.63755965],
        [0.6447999 ],
        [0.64974195],
        [0.6531483 ],
        [0.6555134 ],
        [0.65716404],
        [0.6583172 ],
        [0.65912163],
        [0.65968096],
        [0.66006786],
        [0.6603324 ],
        [0.6605083 ],
        [0.6606206 ],
        [0.6606887 ],
        [0.6607257 ],
        [0.66074115],
        [0.66074276],
        [0.6607341 ],
        [0.660719  ],
        [0.6607002 ],
        [0.6606793 ],
        [0.66065705],
        [0.66063464],
        [0.6606146 ],
        [0.66059554],
        [0.6605774 ],
        [0.6605615 ],
        [0.66054624],
        [0.6605328 ],
        [0.66052157],
        [0.6605118 ],
        [0.6605028 ],
        [0.66049427],
        [0.6604862 ],
        [0.66048026],
        [0.6604751 ],
        [0.66046953],
        [0.66046554],
        [0.6604624 ],
        [0.6604596 ],
        [0.6604572 ],
        [0

In [71]:
D_test_result = discriminator.test_on_batch([test_music, test_generated_chart, test_metadata], fake_label[0:1])
D_test_result

[1.1844616, 0.0]

In [72]:
test_correctchart_judgement = generate_judgement(discriminator, test_music, test_chart, test_metadata)
test_correctchart_judgement

array([[[0.5492484 ],
        [0.5860656 ],
        [0.610527  ],
        [0.6267769 ],
        [0.6374122 ],
        [0.6446731 ],
        [0.6495014 ],
        [0.6529652 ],
        [0.6554645 ],
        [0.6570971 ],
        [0.6581166 ],
        [0.6588166 ],
        [0.6594572 ],
        [0.659959  ],
        [0.6601979 ],
        [0.6605523 ],
        [0.6606587 ],
        [0.66074467],
        [0.6608571 ],
        [0.6608123 ],
        [0.66071755],
        [0.6605279 ],
        [0.66048396],
        [0.6606504 ],
        [0.6607105 ],
        [0.66061264],
        [0.660654  ],
        [0.66068894],
        [0.660703  ],
        [0.6606857 ],
        [0.66072285],
        [0.6607239 ],
        [0.6605789 ],
        [0.6603691 ],
        [0.6603291 ],
        [0.6603052 ],
        [0.6601801 ],
        [0.66007996],
        [0.6601496 ],
        [0.6602901 ],
        [0.66040534],
        [0.6605018 ],
        [0.6606757 ],
        [0.6607183 ],
        [0.6606931 ],
        [0

## Supervised learning for Generator

In [22]:

batch_size = 4
num_trainG = 10
num_epochs = 100


In [32]:
# Custom loss function
alpha = 100 # weight

def custom_loss(y_true, y_pred):
    # Take first two elements and calculate mean squared error
    y_pred_1 = K.slice(y_pred, start=[0, 0, 0], size=[batch_size, max_notes, 2])
    y_true_1 = K.slice(y_true, start=[0, 0, 0], size=[batch_size, max_notes, 2])
    mse = K.mean(K.square(y_pred_1 - y_true_1), axis=-1)
    
    # Take remaining elements and calculate categorical cross entropy
    #y_pred_2 = K.slice(y_pred, start=[0, 0, 2], size=[batch_size, max_notes, output_size])
    #y_true_2 = K.slice(y_true, start=[0, 0, 2], size=[batch_size, max_notes, output_size])
    #cce = K.categorical_crossentropy(y_pred_2, y_true_2, axis=-1)
    
    #loss = mse * 100 + cce
    return mse
    #return cce
    #return loss

In [33]:
optimizer = Adam(lr=0.0003)

generator.compile(loss=custom_loss, optimizer=optimizer)


In [34]:
for epoch in range(num_epochs):
    
    print("Supervised training Generator:")
    for i_trainG in range(num_trainG):
        music, chart_correct, chart_mismatch, metadata, _ = generate_minibatch(batch_size, total_size, note_size, music_dir, chart_dir)
        #music, chart_correct, chart_mismatch, metadata, _ = generate_minibatch(batch_size, 10, note_size, music_dir, chart_dir)

        noise = np.zeros((batch_size, max_notes, noise_size))
        noise[:, :, 0:2] = np.random.rand(batch_size, max_notes, 2)
        noise[:, :, 2] = np.random.randint(low=0, high=2, size=(batch_size, max_notes))
        noise[:, :, 3] = np.random.randint(low=0, high=10, size=(batch_size, max_notes))
        
        g_loss = generator.train_on_batch([music, noise, metadata], chart_correct)
        print("\tEpoch: %d G Iteration: %d [G loss: %f]" % (epoch + 1, i_trainG + 1, g_loss))
    

Supervised training Generator:
	Epoch: 1 G Iteration: 1 [G loss: 0.049120]
	Epoch: 1 G Iteration: 2 [G loss: 0.056556]
	Epoch: 1 G Iteration: 3 [G loss: 0.047802]
	Epoch: 1 G Iteration: 4 [G loss: 0.059213]
	Epoch: 1 G Iteration: 5 [G loss: 0.046745]
	Epoch: 1 G Iteration: 6 [G loss: 0.054946]


KeyboardInterrupt: 