## Noise cancellation using a variational autoencoder

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import librosa # audio processing
import math


from tqdm import tqdm

In [2]:
# Sampling rate handed to librosa.load (as `sr`) when reading every wav file.
SAMPLE_RATE = 22050
# STFT window size: passed as `n_fft` to librosa.stft in log_spectogram.
FRAME_SIZE = 512
# STFT hop size: passed as `hop_length` to librosa.stft in log_spectogram.
HOP_LENGTH = 256

In [3]:
# Collect the duration (in seconds) of every clean recording so a common
# clip length can be chosen for trimming / padding all audio files.
durations = []
for dirname, _, filenames in os.walk('/kaggle/input/noizeus_corpora-master/NOIZEUS/clean_noizeus/wav'):
    for filename in sorted(filenames):
        file_path = os.path.join(dirname, filename)
        # `path=` is the current keyword; `filename=` is a deprecated alias
        # that librosa warns will be removed in version 1.0.
        durations.append(librosa.get_duration(path=file_path))
print(sorted(durations))

	This alias will be removed in version 1.0.
  durations.append(librosa.get_duration(filename= file_path))


[2.116, 2.203375, 2.245875, 2.283625, 2.327625, 2.403875, 2.435875, 2.49175, 2.514375, 2.538375, 2.5395, 2.551125, 2.594, 2.63575, 2.639875, 2.651125, 2.66725, 2.671375, 2.69775, 2.725125, 2.74775, 2.816125, 2.8455, 2.915375, 2.9295, 2.930375, 2.93325, 3.009625, 3.471, 3.508]


#### Every signal is fixed to a duration of 2.2 s: signals shorter than 2.2 s are zero-padded on the right, and longer ones are trimmed to 2.2 s.

In [4]:
# Fixed clip length that every audio signal is trimmed or zero-padded to.
DURATION = 2.20 # IN SECONDS 
# The same clip length expressed as a number of samples at SAMPLE_RATE.
SAMPLES_DURATION = int(DURATION * SAMPLE_RATE)

def pad_signal(signal, target_length=None):
    """Right-pad a 1-D signal with zeros up to a fixed number of samples.

    Args:
        signal: 1-D array of audio samples.
        target_length: desired number of samples. When omitted, the
            module-level SAMPLES_DURATION is used (the original behaviour).

    Returns:
        A new array of at least `target_length` samples. A signal that is
        already long enough is returned unpadded instead of raising on a
        negative pad width.
    """
    if target_length is None:
        target_length = SAMPLES_DURATION
    # Clamp at zero: np.pad rejects negative pad widths.
    missing_samples = max(0, target_length - len(signal))
    return np.pad(signal, (0, missing_samples), mode="constant")  # zeros go on the right
def log_spectogram(signal):
    """Return the dB-scaled magnitude spectrogram of `signal`.

    The STFT uses FRAME_SIZE as the FFT size and HOP_LENGTH as the hop.
    The last row of the STFT output is dropped before taking magnitudes.
    """
    complex_stft = librosa.stft(signal, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)
    # Drop the final row of the STFT (first axis).
    trimmed_stft = complex_stft[:-1]
    magnitudes = np.abs(trimmed_stft)
    return librosa.amplitude_to_db(magnitudes)

def min_max_normalizer(array):
    """Rescale `array` linearly into the [0, 1] range.

    Args:
        array: numpy array of values to normalise.

    Returns:
        (norm_array, array_min, array_max): the rescaled array plus the
        original minimum and maximum, which are needed to undo the scaling.
        A constant array (max == min) maps to all zeros instead of NaN.
    """
    max_element = 1
    min_element = 0
    array_min = array.min()
    array_max = array.max()
    shifted = array - array_min
    value_range = array_max - array_min
    if value_range == 0:
        # Constant input: avoid 0/0. Multiplying by 0.0 keeps a float dtype
        # and the input shape while producing zeros.
        unit_array = shifted * 0.0
    else:
        unit_array = shifted / value_range
    norm_array = unit_array * (max_element - min_element) + min_element
    return norm_array, array_min, array_max

def min_max_denormalizer(norm_array , original_min, original_max):
    """Invert min_max_normalizer: map values in [0, 1] back to the original range.

    Args:
        norm_array: array previously scaled into [0, 1].
        original_min: minimum of the array before normalisation.
        original_max: maximum of the array before normalisation.

    Returns:
        Array in the original value range (0 -> original_min, 1 -> original_max).
    """
    max_element = 1
    min_element = 0
    # Subtract the lower bound of the normalised range (not the upper one),
    # otherwise every value comes back shifted down by the full range.
    unit_array = (norm_array - min_element) / (max_element - min_element)
    return unit_array * (original_max - original_min) + original_min


In [5]:
NOIZEUS_ROOT = '/kaggle/input/noizeus_corpora-master/NOIZEUS'
CLEAN_DIR_NAME = 'clean_noizeus'

# Per-file min / max of the log-spectrogram, keyed by file name; needed later
# to undo the min-max normalisation.
audio_min_max = {}


def _load_normalized_features(wav_root):
    """Load every file under `wav_root` and return its normalised log-spectrogram.

    Each signal is loaded at SAMPLE_RATE, trimmed to SAMPLES_DURATION samples,
    zero-padded when shorter, converted with log_spectogram and min-max
    normalised. The per-file min / max are recorded in `audio_min_max`.
    Files are visited in sorted name order within each directory.
    """
    features = []
    for dirname, _, filenames in os.walk(wav_root):
        for filename in sorted(filenames):
            file_path = os.path.join(dirname, filename)
            signal, _sr = librosa.load(file_path, sr=SAMPLE_RATE)
            signal = signal[:SAMPLES_DURATION]
            if len(signal) < SAMPLES_DURATION:
                signal = pad_signal(signal)
            spectrogram = log_spectogram(signal)  # (num_freq_bins, num_time_frames)
            normalized, array_min, array_max = min_max_normalizer(spectrogram)
            features.append(normalized)
            audio_min_max[filename] = {
                'min': array_min,
                'max': array_max
            }
    return features


clean_audios = _load_normalized_features(os.path.join(NOIZEUS_ROOT, CLEAN_DIR_NAME, 'wav'))

X = []
y = []
noise_audios = []
# Sorted so the sample order does not depend on the file system's listing order.
for noise_dir in tqdm(sorted(os.listdir(NOIZEUS_ROOT))):
    if noise_dir == CLEAN_DIR_NAME:
        continue
    noise_audios = _load_normalized_features(os.path.join(NOIZEUS_ROOT, noise_dir, 'wav'))
    if not noise_audios:
        # Nothing to pair with (e.g. a stray file or an empty folder).
        continue
    # Pairing is positional: the i-th noisy file (sorted by name) is matched
    # with the i-th clean file — assumes both folders list the same
    # utterances in the same order; TODO confirm against the corpus layout.
    if len(noise_audios) != len(clean_audios):
        raise ValueError(
            f"{noise_dir}: found {len(noise_audios)} noisy files but "
            f"{len(clean_audios)} clean files; cannot pair them"
        )
    X.extend(noise_audios)
    y.extend(clean_audios)

# Add a trailing channel axis so the arrays are (samples, freq, time, 1).
X = np.array(X)[..., np.newaxis]
y = np.array(y)[..., np.newaxis]

print(X.shape)
print(y.shape)

100%|██████████| 33/33 [00:08<00:00,  3.99it/s]

(960, 256, 190, 1)
(960, 256, 190, 1)





In [6]:
import tensorflow as tf

# Parameters for the STFT used inside the loss functions.
# The hop is stored under its own name so it does not overwrite HOP_LENGTH,
# which log_spectogram reads at call time for feature extraction.
N_FFT = 1024
LOSS_HOP_LENGTH = 512

# Run tf.functions eagerly so print statements show up while debugging.
# NOTE(review): eager execution is slower than graph mode — consider turning
# this off once the code is debugged.
tf.config.run_functions_eagerly(True)


def compute_spectrogram(signal):
    """Return the magnitude of the STFT of `signal`.

    Uses N_FFT as the frame length and LOSS_HOP_LENGTH as the frame step.
    """
    # Short-Time Fourier Transform of the input.
    stft = tf.signal.stft(signal, frame_length=N_FFT, frame_step=LOSS_HOP_LENGTH)
    # Keep only the magnitude.
    return tf.abs(stft)

def hybrid_loss(y_true, y_pred):
    """Sum of a spectrogram-domain MSE and a direct MSE between the inputs.

    The spectrogram term is averaged over axes 1 and 2 of the magnitude
    spectrograms; the direct term is averaged over the last axis of the inputs.
    NOTE(review): presumably written for raw time-domain signals — confirm
    the expected input shape before using it.
    """
    true_mag = tf.abs(compute_spectrogram(y_true))
    pred_mag = tf.abs(compute_spectrogram(y_pred))

    # MSE between magnitude spectrograms.
    spectral_term = tf.reduce_mean(tf.square(pred_mag - true_mag), axis=[1, 2])

    # MSE directly on the inputs.
    direct_term = tf.reduce_mean(tf.square(y_pred - y_true), axis=-1)

    return spectral_term + direct_term

2024-07-19 18:39:49.507418: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-19 18:39:49.507523: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-19 18:39:49.632711: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [32]:
from tensorflow.keras.layers import Layer
def combined_loss(y_target, y_predicted):
    """VAE loss: heavily weighted reconstruction error plus a KL term.

    `mu` and `log_variance` are not parameters; they are read from module
    scope when the loss is evaluated.
    NOTE(review): those globals are the tensors created while building the
    encoder — confirm the KL term really follows the current batch.
    """
    mse_term = reconstruction_loss(y_target, y_predicted)
    kl_term = KLLossLayer()([mu, log_variance])  # KL computed by the custom Keras layer
    weighted_total = 1000000 * mse_term + kl_term
    return weighted_total

def reconstruction_loss(y_target, y_predicted):
    """Per-sample mean squared error between target and prediction.

    Args:
        y_target: target batch; axes 1, 2 and 3 are averaged over.
        y_predicted: predicted batch of the same shape.

    Returns:
        Tensor with one MSE value per sample (axis 0 is kept).
    """
    # Uses tf ops directly so the function does not rely on the backend alias
    # `K`, which is only imported in a later cell. The per-batch debug print
    # of the shapes was removed.
    error = y_target - y_predicted
    return tf.reduce_mean(tf.square(error), axis=[1, 2, 3])

class KLLossLayer(Layer):
    """Keras layer computing -0.5 * sum(1 + log_var - mean^2 - exp(log_var)) over axis 1."""

    def call(self, inputs):
        """`inputs` is a pair (mu, log_variance); returns the summed KL term."""
        mean, log_var = inputs
        per_dimension = 1 + log_var - K.square(mean) - K.exp(log_var)
        return -0.5 * K.sum(per_dimension, axis=1)

In [33]:

def sample_point_from_normal_distribution(args):
    """Draw a sample as mu + exp(log_variance / 2) * epsilon.

    `args` is a pair (mu, log_variance); epsilon is standard-normal noise
    with the same shape as mu.
    """
    mean, log_var = args
    noise = K.random_normal(shape=K.shape(mean), mean=0., stddev=1.)
    std_dev = K.exp(log_var / 2)
    return mean + std_dev * noise

In [34]:
#Used from https://www.kaggle.com/code/residentmario/autoencoders
from keras.layers import Input, Dense,  Conv2D, Conv2DTranspose, GaussianNoise, ReLU, BatchNormalization, Flatten, Lambda, Reshape, Activation, Cropping2D
from tensorflow.keras.layers import MaxPooling2D, UpSampling2D
from keras.models import Model
from keras import regularizers
from keras.losses import MeanSquaredError
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam

LATENT_DIM = 128

# (filters, strides) for each encoder stage, applied in order.
# NOTE(review): the recorded training run aborted with a GPU OOM while
# allocating the [32, 512, 128, 95] output of the first stage; the filter
# counts are largest where the feature maps are largest — consider reversing
# the progression or lowering the batch size.
ENCODER_STAGES = [(512, 2), (256, 2), (128, 2), (64, 2), (32, (2, 1))]
# (filters, strides) for each decoder stage before the final 1-channel layer.
DECODER_STAGES = [(32, (2, 1)), (64, 2), (128, 2), (256, 2)]


def _conv_stage(tensor, conv_cls, filters, strides):
    """Apply one `conv_cls` (3x3, 'same' padding) -> ReLU -> BatchNormalization stage."""
    tensor = conv_cls(
        filters=filters,
        kernel_size=3,
        strides=strides,
        padding="same",
    )(tensor)
    tensor = ReLU()(tensor)
    return BatchNormalization()(tensor)


# ---- Encoder -------------------------------------------------------------
encoder_input = Input(shape=X.shape[1:])
encoder_features = encoder_input
for stage_filters, stage_strides in ENCODER_STAGES:
    encoder_features = _conv_stage(encoder_features, Conv2D, stage_filters, stage_strides)

_shape_before_bottleneck = K.int_shape(encoder_features)[1:]
print("Shape before bottleneck is {}".format(_shape_before_bottleneck))
encoder_features = Flatten()(encoder_features)
# Kept as module-level names because combined_loss reads them.
mu = Dense(LATENT_DIM)(encoder_features)
log_variance = Dense(LATENT_DIM)(encoder_features)

bottleneck = Lambda(sample_point_from_normal_distribution,
                    name="encoder_output")([mu, log_variance])

encoder = Model(encoder_input, bottleneck)

# ---- Decoder -------------------------------------------------------------
decoder_input = Input(shape=(LATENT_DIM,), name="decoder_input")

# Cast to a plain int: np.prod returns a numpy integer.
num_neurons = int(np.prod(_shape_before_bottleneck))
dense_layer = Dense(num_neurons, name="decoder_dense")(decoder_input)
reshape_layer = Reshape(_shape_before_bottleneck)(dense_layer)

decoder_features = reshape_layer
for stage_filters, stage_strides in DECODER_STAGES:
    decoder_features = _conv_stage(decoder_features, Conv2DTranspose, stage_filters, stage_strides)

decoder_features = Conv2DTranspose(
    filters=1,
    kernel_size=3,
    strides=2,
    padding="same",
)(decoder_features)
decoder_features = Activation("sigmoid")(decoder_features)

# The strided encoder rounds odd sizes up, so the decoder can come out larger
# than the input (e.g. 192 columns for a 190-column input). Crop the excess,
# split between both sides, so the output shape matches the targets exactly.
_, decoded_rows, decoded_cols, _ = K.int_shape(decoder_features)
row_excess = decoded_rows - X.shape[1]
col_excess = decoded_cols - X.shape[2]
if row_excess < 0 or col_excess < 0:
    raise ValueError(
        f"Decoder output {(decoded_rows, decoded_cols)} is smaller than the "
        f"input {(X.shape[1], X.shape[2])}; cannot crop to match"
    )
decoder_output = Cropping2D(
    cropping=(
        (row_excess // 2, row_excess - row_excess // 2),
        (col_excess // 2, col_excess - col_excess // 2),
    )
)(decoder_features)

decoder = Model(decoder_input, decoder_output, name="decoder")

# ---- Full autoencoder ----------------------------------------------------
model_input = encoder_input
model_output = decoder(encoder(model_input))
autoencoder = Model(model_input, model_output, name = "autoencoder")

optimizer = Adam(learning_rate = 0.0005)
autoencoder.compile(optimizer=optimizer, loss=combined_loss)

Shape before bottleneck is (8, 12, 32)


In [35]:
# Print the layer-by-layer summary of the full autoencoder model.
autoencoder.summary()

In [36]:
# Print the layer-by-layer summary of the encoder model.
encoder.summary()

In [37]:
# Print the layer-by-layer summary of the decoder model.
decoder.summary()

In [38]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the (noisy, clean) pairs for validation, with a fixed seed.
# NOTE(review): every clean spectrogram is repeated as the target for each
# noise folder, so a random split puts the same clean targets in both
# partitions — confirm this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=69,
)

In [40]:
from tensorflow.keras import mixed_precision


# Train the autoencoder to map the noisy spectrograms (X_*) to their clean
# counterparts (y_*), validating on the held-out split after every epoch.
# NOTE(review): the recorded run aborted in epoch 1 with a GPU
# ResourceExhaustedError while allocating a float tensor of shape
# [32, 512, 128, 95] inside a Conv2D call — consider a smaller batch_size.
autoencoder.fit(X_train, y_train,
                epochs=150,
                batch_size=32,
                validation_data=(X_test, y_test),
                verbose=1)

Epoch 1/150


ResourceExhaustedError: Exception encountered when calling Conv2D.call().

[1m{{function_node __wrapped__Conv2D_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[32,512,128,95] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Conv2D][0m

Arguments received by Conv2D.call():
  • inputs=tf.Tensor(shape=(32, 256, 190, 1), dtype=float32)