In [None]:
import sys
import librosa
import matplotlib.pyplot as plt
import librosa.display
import IPython.display as ipd
import os
import tensorflow as tf
import numpy as np

In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Specify a filename on your drive, or alternatively upload something to the colab folder.
# Something about 20 seconds long is a good start.
filename = "/content/drive/My Drive/test.wav"

# Decode the audio. `x` (the samples) and `sr` (the sample rate) are both used by
# later cells, so this load is required even if you skip the plot below.
# Use librosa.load(filename, sr=None) to keep the file's native sample rate instead.
x, sr = librosa.load(filename)  # resamples to librosa's default rate (22050 Hz)

print(sr)

# Plot the waveform. Newer librosa releases provide `waveshow` in place of the older
# `waveplot`, so use whichever one this installation offers.
plt.figure(figsize=(14, 5))
plot_waveform = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
plot_waveform(x, sr=sr)
plt.title("Input waveform")
plt.show()

# This lets you play the audio file. You can also load arrays and play those.
ipd.Audio(filename)  # load a local WAV file


Mounted at /content/drive
22050


<Figure size 1400x500 with 0 Axes>

In [None]:
# Wrap the decoded audio samples in a float32 tensor so TensorFlow's signal
# processing ops can consume them, then show its shape as the cell output.
data_tf = tf.convert_to_tensor(x, dtype=np.float32)
data_tf.shape

TensorShape([595470])

In [None]:
# Short-time Fourier transform of the audio: 2048-sample frames, hopping 512 samples.
n = tf.signal.stft(data_tf, frame_length=2048, frame_step=512)

In [None]:
# Keep only the magnitude of each complex STFT bin (phase is discarded).
magnitude_spectrograms = tf.math.abs(n)

In [None]:
# Organise the STFT magnitude frames into training pairs:
#   x_frames[k] -> a window of `sequence_length` consecutive frames
#   y_frames[k] -> the single frame that immediately follows that window
start = 0
sequence_length = 40  # number of context frames fed to the model per example
step = 1

# Convert to NumPy once up front; slicing the TensorFlow tensor on every iteration
# and then calling np.array() on a list of tensors is much slower.
spectrogram_frames = np.asarray(magnitude_spectrograms)

# A window starting at i needs frame i + sequence_length as its target, so the valid
# (exclusive) upper bound for i is N - sequence_length. The previous bound of
# N - sequence_length - 1 silently dropped the final training example.
end = spectrogram_frames.shape[0] - sequence_length
if end <= start:
    raise ValueError(
        "Audio is too short: got {} STFT frames but need more than {}.".format(
            spectrogram_frames.shape[0], sequence_length
        )
    )

# Note: dedicated names are used here on purpose. Reusing `x` would overwrite the
# audio samples loaded earlier and break re-running the tensor-conversion cell.
window_starts = np.arange(start, end, step)
x_frames = np.stack(
    [spectrogram_frames[i:i + sequence_length] for i in window_starts]
)
y_frames = spectrogram_frames[window_starts + sequence_length]



In [None]:
# ---- Training hyperparameters ----
learning_rate        = 0.001
amount_epochs        = 300
batch_size           = 64
loss_type            = "mse"
weight_decay         = 0.0001  # NOTE(review): not referenced anywhere in the visible notebook

# ---- Recurrent network configuration ----
rnn_type             = "lstm"  # only used to label the saved model file
number_rnn_layers    = 2
rnn_number_units     = 128

# Input is (sequence_length, frequency_bins); output is one frame of frequency bins.
input_dimensions = [x_frames.shape[1], x_frames.shape[2]]
output_dimension = y_frames.shape[1]

model = tf.keras.Sequential()

# Normalise the raw magnitudes before they reach the recurrent layers.
model.add(tf.keras.layers.BatchNormalization(input_shape=input_dimensions))

# Stack the LSTM layers: every layer except the last emits the full sequence so the
# next LSTM can consume it; the last one emits only its final state.
for layer_index in range(number_rnn_layers):
    is_last_layer = layer_index == (number_rnn_layers - 1)
    model.add(
        tf.keras.layers.LSTM(rnn_number_units, return_sequences=not is_last_layer)
    )

# Project to one predicted magnitude per frequency bin (linear output for regression).
model.add(tf.keras.layers.Dense(output_dimension))
model.add(tf.keras.layers.Activation('linear'))

opt = tf.keras.optimizers.Adam(learning_rate)
model.compile(optimizer=opt, loss=loss_type)

# this model trains much much faster than the prior models

In [None]:
# Train the network to predict the next spectrogram frame from each window of frames.
model.fit(
    x=x_frames,
    y=y_frames,
    batch_size=batch_size,
    epochs=amount_epochs,
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [None]:
# Save your model if you want.
# The file name records the sample rate, the STFT frame length (2048) and frame step
# (512), and the RNN configuration the model was built with.
model_save_path = "/content/drive/My Drive/models/model_{}_{}_{}_{}_{}_{}.h5".format(
    sr,
    2048,
    512,
    rnn_type,
    number_rnn_layers,
    rnn_number_units,
)
model.save(model_save_path)



In [None]:
# Also you could just load a model you've already trained.
# The path is built from the same values as the save cell (including the actual
# sample rate `sr` rather than a hard-coded 22050), so a model saved from this
# notebook can always be found again, even when the audio was loaded at another rate.
model_load_path = "/content/drive/My Drive/models/model_{}_{}_{}_{}_{}_{}.h5".format(
    sr,
    2048,
    512,
    rnn_type,
    number_rnn_layers,
    rnn_number_units,
)
model = tf.keras.models.load_model(model_load_path)


In [None]:
# Generate an audio file by repeatedly predicting the next spectrogram frame.

import soundfile as sf

sequence_length_max = 1000  # number of new frames to generate
impulse_scale       = 1.0   # gain applied to every seed window taken from the data
random_chance       = 0.01  # per-step probability of re-seeding from a random window
random_strength     = 0.2   # NOTE(review): not used anywhere in this cell

dimension1 = x_frames.shape[1]  # frames per input window
dimension2 = x_frames.shape[2]  # frequency bins per frame
shape = (1, dimension1, dimension2)  # a batch holding a single window

# Derived from the data instead of hard-coded, so it stays correct if the STFT
# frame length is changed.
window_size = dimension2 - 1

# Seed the generation with a random training window.
# np.random.randint's upper bound is exclusive, so len(x_frames) makes every window
# (including the last one) selectable and also works when only one window exists.
random_index = np.random.randint(0, len(x_frames))
impulse = np.array(x_frames[random_index]) * impulse_scale

# Collect frames in a list and stack once at the end; growing an array with
# np.vstack on every step re-copies everything generated so far.
generated_frames = list(impulse)

for _ in range(sequence_length_max):
    # verbose=0 stops Keras printing a progress bar for each single-window prediction.
    prediction = model.predict(impulse.reshape(shape), verbose=0)
    generated_frames.append(prediction[0])

    # By default the next input is the most recent `dimension1` generated frames.
    impulse = np.stack(generated_frames[-dimension1:])

    # Occasionally replace the next input with a random window from the training
    # data. (No reseeding here: calling np.random.seed() inside the loop would
    # discard any seed set earlier for reproducibility.)
    if np.random.random_sample() < random_chance:
        random_index = np.random.randint(0, len(x_frames))
        impulse = np.array(x_frames[random_index]) * impulse_scale

predicted_magnitudes = np.stack(generated_frames).reshape(-1, dimension2)

# Griffin-Lim estimates a phase for the magnitude-only spectrogram; librosa expects
# the array as (frequency_bins, frames), hence the transpose.
audio = librosa.griffinlim(predicted_magnitudes.T)

# Write with the same sample rate used for playback below, so the file on disk and
# the inline player agree.
sf.write('test.wav', audio, sr, subtype='PCM_24')

ipd.Audio(audio, rate=sr)