In [4]:
!pip install pedalboard
!pip install -q keras-tcn --no-dependencies



In [5]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tcn import TCN, tcn_full_summary
from pedalboard.io import AudioFile
import math

In [159]:
# Sample rate the input audio is resampled to when read, and the rate the
# processed audio is written back with.
sr = 44100

# N: number of new samples in each window (passed to nnModel as inputSamples
# and to processSequenceToBatch as n).
N = 400
# M: number of preceding samples prepended to each window as look-back context
# (passed to nnModel as pastSamples and to processSequenceToBatch as m).
M = 1400

In [152]:
def processSequenceToBatch(sequence, n, m):
    """Cut a stereo signal into overlapping windows for the network.

    Window ``i`` holds the ``m`` samples that precede chunk ``i`` (zeros where
    the signal has not started yet) followed by the ``n`` samples of chunk
    ``i`` itself, i.e. signal samples ``[i*n - m, (i+1)*n)``.

    Every complete ``n``-sample chunk is emitted; only a trailing partial
    chunk (fewer than ``n`` samples) is discarded. The previous implementation
    also dropped the last complete chunk and crashed when no window could be
    produced.

    Parameters
    ----------
    sequence : np.ndarray
        Array of shape ``(channels, samples)``. The rows are copied into a
        two-row buffer, so ``channels`` must be 2 (or 1, which NumPy
        broadcasts to both rows).
    n : int
        Number of new samples per window; must be positive.
    m : int
        Number of look-back samples per window; must be non-negative.

    Returns
    -------
    np.ndarray
        float64 array of shape ``(samples // n, m + n, 2)``. The first axis
        is empty when the signal is shorter than ``n`` samples.

    Raises
    ------
    ValueError
        If ``n`` is not positive or ``m`` is negative.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of samples, got %r" % (n,))
    if m < 0:
        raise ValueError("m must be a non-negative number of samples, got %r" % (m,))

    nChannels = 2  # the network consumes and produces stereo audio

    seqLen = sequence.shape[1]
    nSubSequences = seqLen // n
    usableLen = nSubSequences * n

    # Prefix the signal with m zeros so that every window, including the
    # first one, can be read as one contiguous slice of length m + n.
    padded = np.zeros((nChannels, m + usableLen))
    padded[:, m:] = sequence[:, :usableLen]

    # Pre-allocating the result keeps the (0, m + n, 2) shape well defined
    # even when there is not a single complete chunk.
    finalBatch = np.empty((nSubSequences, m + n, nChannels))
    for i in range(nSubSequences):
        start = i * n
        # Stored as (time, channel), the layout the Conv1D input expects.
        finalBatch[i] = padded[:, start:start + m + n].T

    return finalBatch

In [170]:
class nnModel:
    """Stack of three dilated Conv1D layers mapping stereo windows to stereo output.

    The network takes windows of ``pastSamples + inputSamples`` time steps
    with 2 channels. The convolutions use the default (unpadded) mode, so a
    layer with dilation rate ``d`` and kernel size ``k`` shortens the time
    axis by ``d * (k - 1)``. The kernel size is chosen so that the three
    layers together remove exactly ``pastSamples`` steps, leaving an output
    of ``inputSamples`` time steps with 2 channels.
    """

    # Dilation rates of the three Conv1D layers, in order.
    _DILATION_RATES = (2, 4, 8)

    @staticmethod
    def _kernelSizeFor(pastSamples, dilationRates):
        """Return the kernel size k satisfying sum(dilationRates) * (k - 1) == pastSamples.

        Raises ValueError when no integer kernel size satisfies the equation.
        """
        totalDilation = sum(dilationRates)
        if pastSamples < 0 or pastSamples % totalDilation != 0:
            raise ValueError(
                "pastSamples must be a non-negative multiple of %d so the "
                "network output has exactly inputSamples steps, got %r"
                % (totalDilation, pastSamples)
            )
        return int(pastSamples) // totalDilation + 1

    def __init__(self, inputSamples, pastSamples):
        """Build and compile the model.

        Parameters
        ----------
        inputSamples : int
            Number of output time steps per window (N).
        pastSamples : int
            Number of look-back time steps per window (M); must be a
            multiple of the summed dilation rates (14).

        Raises
        ------
        ValueError
            If ``pastSamples`` does not admit an integer kernel size.
        """
        self.N = inputSamples
        self.M = pastSamples
        # Derived rather than hard-coded: for M = 1400 this is 1400 / 14 + 1 = 101.
        self.K = self._kernelSizeFor(self.M, self._DILATION_RATES)

        # Use the constructor arguments, not the notebook-level N and M, so
        # the model matches what the caller asked for.
        inputs = tf.keras.Input(shape=(self.N + self.M, 2))

        firstRate, secondRate, thirdRate = self._DILATION_RATES
        x = tf.keras.layers.Conv1D(filters=32, kernel_size=self.K, dilation_rate=firstRate, activation='tanh', kernel_initializer='random_normal')(inputs)
        x = tf.keras.layers.Conv1D(filters=32, kernel_size=self.K, dilation_rate=secondRate, activation='relu', kernel_initializer='random_normal')(x)
        outputs = tf.keras.layers.Conv1D(filters=2, kernel_size=self.K, dilation_rate=thirdRate, activation='tanh', kernel_initializer='random_normal')(x)

        model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
        # metrics is passed as a list, the form Keras documents; newer Keras
        # releases reject a bare string.
        model.compile(optimizer="adam", loss="mse", metrics=["mse"])

        self.model = model

    def getInfo(self):
        """Print the Keras layer summary of the model."""
        self.model.summary()

    def trainNetwork(self, x, y, batchSize, epochs, verbose=1):
        """Fit the model on inputs ``x`` and targets ``y``; returns what ``model.fit`` returns."""
        return self.model.fit(x=x, y=y, batch_size=batchSize, epochs=epochs, verbose=verbose)

    def predict(self, batchAudioStereo):
        """Run the model on a batch of windows and join the results per channel.

        Parameters
        ----------
        batchAudioStereo : np.ndarray
            Batch of windows, passed unchanged to ``model.predict``.

        Returns
        -------
        np.ndarray
            Array of shape ``(channels, batch * steps)``: for each output
            channel, the predicted windows concatenated in batch order.
        """
        res = np.asarray(self.model.predict(batchAudioStereo))
        # (batch, steps, channels) -> (channels, batch, steps); merging the
        # last two axes then concatenates the windows of each channel.
        return res.transpose((2, 0, 1)).reshape(res.shape[2], -1)

In [135]:
# Training-data preparation, currently disabled: the whole cell is a bare
# string literal, so none of the statements inside it are executed.
# NOTE(review): Y_train is windowed exactly like X_train, i.e. N + M samples
# per window, while the model summary reports N output samples per window.
# Presumably the targets need trimming to the last N samples of each window
# before they can be passed to trainNetwork — confirm before re-enabling.
'''
with AudioFile("clean.wav").resampled_to(sr) as i: cleanAudio_mono = i.read(i.frames)
with AudioFile("effect.wav").resampled_to(sr) as i: fxAudio = i.read(i.frames)

cleanAudio = np.array([cleanAudio_mono[0], cleanAudio_mono[0]])
print(cleanAudio.shape)
print(fxAudio.shape)

X_train = processSequenceToBatch(cleanAudio, N, M)
Y_train = processSequenceToBatch(fxAudio, N, M)
'''

In [171]:
# Build the network for windows of N new samples plus M look-back samples,
# then print its layer summary.
neuralNetwork = nnModel(inputSamples=N, pastSamples=M)
neuralNetwork.getInfo()

Model: "model_45"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_64 (InputLayer)       [(None, 1800, 2)]         0         
                                                                 
 conv1d_112 (Conv1D)         (None, 1600, 32)          6496      
                                                                 
 conv1d_113 (Conv1D)         (None, 1200, 32)          103456    
                                                                 
 conv1d_114 (Conv1D)         (None, 400, 2)            6466      
                                                                 
Total params: 116418 (454.76 KB)
Trainable params: 116418 (454.76 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [172]:
# Read the whole loop, resampled to the working sample rate.
with AudioFile("loop.wav").resampled_to(sr) as i:
    loop_mono = i.read(i.frames)

# Use the first row of the read data for both channels of a two-row signal.
loop = np.array([loop_mono[0]] * 2)

# Cut the signal into windows of M look-back samples plus N new samples.
loop = processSequenceToBatch(loop, N, M)

# NOTE(review): no visible cell trains the model or loads weights before this
# call — confirm the network is meant to run with its initial weights.
output = neuralNetwork.predict(loop)

# Write the result with one file channel per row of the output array.
with AudioFile('processed-output.wav', 'w', sr, output.shape[0]) as f:
    f.write(output)

