In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

import librosa # audio processing
import math

from tqdm import tqdm

In [2]:
SAMPLE_RATE = 22050
# NUM_SEGMENTS = 6
SEGMENT_SIZE = 5000
N_FFT = 2048
HOP_LENGTH = 512 
N_MFCC = 60

In [3]:
# 1. get clean segment MFCCs
# 2. for every other directory get noise segment MFCCs and append them to training set associated with target clean segment MFCC extracted earlier

clean_segment_mfccs = np.array([])
noise_segment_mfccs = np.array([])
audios_segments = [] #2d array with each row mentioning what segments are in single audio file
X = np.array([])
y = np.array([])
mfcc_shape = None
for dirname, _, filenames in os.walk('/kaggle/input/noizeus/noizeus_corpora-master/NOIZEUS/clean_noizeus/wav'):
    for filename in sorted(filenames):
        file_path = str(os.path.join(dirname, filename))
        signal, sr = librosa.load(file_path, sr = SAMPLE_RATE)
        
        for index in range(0, len(signal), SEGMENT_SIZE):
            start_sample = index
            end_sample = index + SEGMENT_SIZE
            if(end_sample > len(signal) + 1 ): # Making sure the last unfilled segment is truncated
                continue
            mfcc = librosa.feature.mfcc(y=signal[start_sample:end_sample],
                                                sr=sr,
                                                n_fft=N_FFT,
                                                n_mfcc=N_MFCC,
                                                hop_length=HOP_LENGTH)

            mfcc = mfcc.T
            mfcc = np.expand_dims(mfcc, axis = (0,3))    
            if clean_segment_mfccs.size == 0:
                clean_segment_mfccs = mfcc
            else:
                clean_segment_mfccs = np.vstack((clean_segment_mfccs, mfcc))
        

X_index = 0
for noise_dir in tqdm(os.listdir('/kaggle/input/noizeus/noizeus_corpora-master/NOIZEUS/')):
    if(noise_dir == 'clean_noizeus'): 
        continue
    data = np.array([])
    noise_segment_mfccs = np.array([])
    for dirname, _, filenames in os.walk(f'/kaggle/input/noizeus/noizeus_corpora-master/NOIZEUS/{noise_dir}/wav'):
        for filename in sorted(filenames):
            audio_segments = []
            file_path = str(os.path.join(dirname, filename))
            signal, sr = librosa.load(file_path, sr = SAMPLE_RATE)

            for segment in range(0, len(signal), SEGMENT_SIZE):
                start_sample = segment
                end_sample = segment + SEGMENT_SIZE
                if(end_sample > len(signal) + 1 ): # Making sure the last unfilled segment is truncated
                    continue
                mfcc = librosa.feature.mfcc(y=signal[start_sample:end_sample],
                                                    sr=sr,
                                                    n_fft=N_FFT,
                                                    n_mfcc=N_MFCC,
                                                    hop_length=HOP_LENGTH)
                mfcc = mfcc.T
                mfcc = np.expand_dims(mfcc, axis = (0,3))
                if noise_segment_mfccs.size == 0:
                    noise_segment_mfccs = mfcc
                else:
                    noise_segment_mfccs = np.vstack((noise_segment_mfccs, mfcc))
                audio_segments.append(X_index)
                X_index +=1
            audios_segments.append(audio_segments)
    if X.size == 0:
        X = noise_segment_mfccs
    else:
        X = np.concatenate((X, noise_segment_mfccs), axis = 0)
    if y.size == 0:
        y = clean_segment_mfccs
    else:
        y = np.concatenate((y, clean_segment_mfccs), axis = 0)

print(X.shape)
print(y.shape)

100%|██████████| 33/33 [01:43<00:00,  3.14s/it]

(10784, 10, 60, 1)
(10784, 10, 60, 1)





In [4]:
import tensorflow as tf
from tensorflow.keras import backend as K
import sys

#Loss function todo: test this later 

#Adjust the data shape to reverse the product and traspose shape to get back the MFCC vector
def adjust_shape(y):
    y = y.reshape(mfcc_shape)
    return y.T

#to make numpy soorta methods work on tf symbolic tensor
tf.experimental.numpy.experimental_enable_numpy_behavior()

def hybrid_loss(y_true, y_pred):
    
    #adjust the shape and transpose 
    y_true = adjust_shape(y_true)
    y_pred = adjust_shape(y_pred)
    # Calculate MSE loss in the MFCC domain
    mse_loss = tf.reduce_mean(tf.square(y_pred - y_true), axis=-1)

    # Function to convert MFCCs to spectrogram (within TensorFlow for efficiency)
    def mfcc_to_spectrogram(mfcc):
        # Convert MFCCs to spectrogram using inverse MFCC
        spectrogram = tf.signal.mfccs_from_log_mel_spectrograms(mfcc, power=2.0, n_fft=N_FFT, hop_length=HOP_LENGTH)
        return spectrogram

    # Convert MFCCs to spectrograms
    y_true_spectrogram = tf.map_fn(mfcc_to_spectrogram, y_true, dtype=tf.float32)
    y_pred_spectrogram = tf.map_fn(mfcc_to_spectrogram, y_pred, dtype=tf.float32)

    # Calculate spectrogram loss (without unnecessary abs)
    spectrogram_loss = tf.reduce_mean(tf.square(y_pred_spectrogram - y_true_spectrogram), axis=-1)

    # Combine MSE and spectrogram losses
    total_loss = mse_loss + spectrogram_loss

    return total_loss


2024-07-14 14:33:10.779249: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-14 14:33:10.779350: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-14 14:33:10.906049: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
from tensorflow.keras.layers import Input, Conv2D, Dense, Flatten, Reshape, Conv2DTranspose
from tensorflow.keras.models import Model
from keras.losses import MeanSquaredError

input_segment = Input(shape=(10, 60, 1))

# Encoding
encoded = Conv2D(256, (3, 3), activation='relu', padding='same', strides=2)(input_segment)
encoded = Flatten()(encoded)
encoded = Dense(128, activation='relu')(encoded)
encoded = Dense(32, activation='relu')(encoded)

# Decoding
decoded = Dense(128, activation='relu')(encoded)
decoded = Dense(5 * 30 * 256, activation='relu')(decoded)  # Adjust dimensions accordingly
decoded = Reshape((5, 30, 256))(decoded)
decoded = Conv2DTranspose(256, kernel_size=3, strides=2, activation='relu', padding='same')(decoded)
decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(decoded)

autoencoder = Model(input_segment, decoded)


autoencoder.compile(optimizer='adadelta', loss=MeanSquaredError)

In [6]:
autoencoder.summary()

In [7]:
# #Used from https://www.kaggle.com/code/residentmario/autoencoders
# from keras.layers import Input, Dense,  Conv2D, Conv2DTranspose, GaussianNoise
# from tensorflow.keras.layers import MaxPooling2D, UpSampling2D
# from keras.models import Model
# from keras import regularizers
# from keras.losses import MeanSquaredError

# input_segment = Input(shape=(10,60,1))
# encoded = Conv2D(256, (3,3), activation = 'relu', padding = 'same' , strides =2 ) (input_segment)
# encoded = Dense(128, activation='relu')(encoded)
# encoded = Dense(32, activation='relu')(encoded) 
# decoded = Dense(128, activation='relu')(encoded)
# decoded = Conv2DTranspose(256, kernel_size=3, strides=2, activation='relu', padding='same') (decoded)
# decoded = Dense(X.shape[1:], activation='sigmoid')(decoded)

# # this model maps an input to its reconstruction
# autoencoder = Model(input_segment, decoded)

# # get the encoder and decoder as seperate models
# # encoder
# encoder = Model(input_segment, encoded)

# # decoder
# encoded_input = Input(shape=(32,))
# decoder_layer1 = autoencoder.layers[-1]
# decoder_layer2 = autoencoder.layers[-2]
# decoder_layer3 = autoencoder.layers[-3]
# decoder = Model(encoded_input, decoder_layer1(decoder_layer2(decoder_layer3(encoded_input))))


# autoencoder.compile(optimizer='adadelta', loss=MeanSquaredError)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 69)

In [9]:
autoencoder.fit(X_train, y_train,
                epochs=30,
                batch_size=256,
                validation_data=(X_test, y_test),
                verbose=1)

Epoch 1/30
[1m 3/30[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 38ms/step - loss: 4320.0200

I0000 00:00:1720967610.424812     112 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1720967610.444242     112 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step - loss: 4353.2432

W0000 00:00:1720967615.938792     110 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 245ms/step - loss: 4354.1650 - val_loss: 4366.3979
Epoch 2/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - loss: 4414.9761 - val_loss: 4365.9639
Epoch 3/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4378.9199 - val_loss: 4362.1016
Epoch 4/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4375.5552 - val_loss: 4352.3691
Epoch 5/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4380.9448 - val_loss: 4350.4971
Epoch 6/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4365.8232 - val_loss: 4349.9722
Epoch 7/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4371.4111 - val_loss: 4349.7427
Epoch 8/30
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 4353.9233 - val_loss: 4349.6133
Epoch 9/3

<keras.src.callbacks.history.History at 0x79bf6e6f7010>

In [None]:
def reconstruct_signal(mfccs):
    return librosa.feature.inverse.mfcc_to_audio(mfccs.T, sr=SAMPLE_RATE, n_fft=N_FFT)

In [None]:
#generate audio for whole audio that is noisy, clean and predicted.
#for 69th audio
segment_indices =audios_segments[69]

y_pred = autoencoder.predict(X[segment_indices[0]: (segment_indices[-1] + 1)])

### Disclaimer: only listen to this if you want your ears to blead. :-( 

In [14]:
import IPython.display as ipd 

original_signal = np.hstack([reconstruct_signal(np.squeeze(segment)) for segment in X[segment_indices[0]: (segment_indices[-1] + 1)]])
predicted_signal = np.hstack([reconstruct_signal(np.squeeze(segment)) for segment in y_pred])
ipd.Audio(predicted_signal, rate = SAMPLE_RATE)

In [15]:
#where as original noisy audio is this (reconstructed one)
ipd.Audio(original_signal, rate = SAMPLE_RATE)

In [None]:
#save model 
autoencoder.save("model.keras")