In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
from pathlib import Path
import os
import numpy as np

In [2]:
NP_FILEPATH = Path('midi-np/')

In [3]:
width = 0
for file in os.listdir(NP_FILEPATH):
    width += np.load(NP_FILEPATH / file).shape[0]
combined = np.zeros((width, 88))
i = 0
for file in os.listdir(NP_FILEPATH):
    arr = np.load(NP_FILEPATH / file)
    combined[i:i+len(arr), :] = arr
    i += len(arr)

In [4]:
bars = np.array([combined[j:j+48] for j in range(0, len(combined)-48+1, 48)])

In [5]:
batch_size = 32
timesteps = 48

In [6]:
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

In [7]:
latent_dim = 256

input_layer = layers.Input(shape=(timesteps, 88))
encoder_layer_1 = layers.LSTM(512, return_sequences=True)(input_layer)
encoder_layer_2 = layers.LSTM(256, return_sequences=True)(encoder_layer_1)
encoder_layer_3 = layers.LSTM(256, return_sequences=True)(encoder_layer_2)
encoder_flattened = layers.Flatten()(encoder_layer_3)
z_mean = layers.Dense(latent_dim, name="z_mean")(encoder_flattened)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(encoder_flattened)
z = Sampling()([z_mean, z_log_var])
encoder = keras.Model(input_layer, [z_mean, z_log_var, z], name="encoder")
encoder.summary()

2022-04-15 20:56:01.337815: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 48, 88)]     0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 48, 512)      1230848     ['input_1[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 48, 256)      787456      ['lstm[0][0]']                   
                                                                                                  
 lstm_2 (LSTM)                  (None, 48, 256)      525312      ['lstm_1[0][0]']                 
                                                                                            

In [8]:
latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(timesteps * 88, activation="relu")(latent_inputs)
x = layers.Reshape((timesteps, 88))(x)
decoder_layer_1 = layers.LSTM(256, return_sequences=True)(x)
decoder_layer_2 = layers.LSTM(256, return_sequences=True)(decoder_layer_1)
decoder_layer_3 = layers.LSTM(512, return_sequences=True)(decoder_layer_2)
output_layer = layers.Dense(88, activation="sigmoid")(decoder_layer_3)
decoder = keras.Model(latent_inputs, output_layer, name="decoder")
decoder.summary()

Model: "decoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 256)]             0         
                                                                 
 dense (Dense)               (None, 4224)              1085568   
                                                                 
 reshape (Reshape)           (None, 48, 88)            0         
                                                                 
 lstm_3 (LSTM)               (None, 48, 256)           353280    
                                                                 
 lstm_4 (LSTM)               (None, 48, 256)           525312    
                                                                 
 lstm_5 (LSTM)               (None, 48, 512)           1574912   
                                                                 
 dense_1 (Dense)             (None, 48, 88)            4514

In [9]:
class VAE(keras.Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
#             print(reconstruction.shape)
#             print(data.shape)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    keras.losses.binary_crossentropy(data, reconstruction), axis=1
                )
            )
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

In [10]:
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath="models/vae/")

In [15]:
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())

In [16]:
vae.fit(bars, epochs=5, batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
 181/2377 [=>............................] - ETA: 38:38 - loss: 4.9405 - reconstruction_loss: 4.8874 - kl_loss: 7.9415e-08

KeyboardInterrupt: 

In [109]:
# latent_vec = np.random.normal(size=32)
# latent_vec = np.expand_dims(latent_vec, 0)
# x_decoded = vae.decoder.predict(latent_vec)
# x_decoded

array([[[1.6792715e-03, 1.4395416e-03, 1.3616383e-03, ...,
         1.6097128e-03, 1.9726753e-03, 1.5690327e-03],
        [2.8264523e-04, 3.0541420e-04, 1.1182722e-04, ...,
         2.1216273e-04, 3.3915043e-04, 5.9333444e-04],
        [2.0489097e-04, 2.3820996e-04, 7.8449804e-05, ...,
         1.5148520e-04, 2.6133657e-04, 4.7060847e-04],
        ...,
        [1.9216537e-04, 2.2709370e-04, 7.4026786e-05, ...,
         1.4230609e-04, 2.5025010e-04, 4.4876337e-04],
        [1.9216537e-04, 2.2712350e-04, 7.4026852e-05, ...,
         1.4230609e-04, 2.5025010e-04, 4.4876337e-04],
        [1.9216537e-04, 2.2712350e-04, 7.4026852e-05, ...,
         1.4224648e-04, 2.5025010e-04, 4.4873357e-04]]], dtype=float32)