https://www.analyticsvidhya.com/blog/2022/03/audio-denoiser-a-speech-enhancement-deep-learning-model/

https://towardsdatascience.com/40-open-source-audio-datasets-for-ml-59dc39d48f06

# Importing Libraries 

In [1]:
import numpy as np
import librosa
import soundfile as sf
import tensorflow as tf

# Load the input audio file

In [2]:
# Path of the recording to process (resolved relative to the working directory).
input_file = 'mixkit-small-group-cheer-and-applause-518.wav'

# sr=None asks librosa to keep the file's own sampling rate instead of
# resampling; `sr` then holds that rate and is reused when writing the output.
audio, sr = librosa.load(path=input_file, sr=None)

# Split the audio into overlapping frames

In [3]:
# Framing parameters, in samples: each frame holds `frame_length` samples and
# consecutive frames start `hop_length` samples apart (so they overlap).
frame_length, hop_length = 1024, 256

# librosa returns the frames along the last axis; transpose so that each row
# of `frames` is one frame and iterating over it yields frame after frame.
frames = np.transpose(
    librosa.util.frame(
        audio,
        frame_length=frame_length,
        hop_length=hop_length,
    )
)

# Denoiser Architecture 

In [4]:
class Denoiser(tf.keras.Model):
    """1-D convolutional encoder/decoder operating on raw waveform frames.

    The encoder reshapes a batch of frames ``(batch, samples)`` to
    ``(batch, samples, 1)`` and applies four ``Conv1D`` layers, three of them
    with stride 2 (overall time down-sampling by 8). The decoder mirrors this
    with three stride-2 ``Conv1DTranspose`` layers (up-sampling by 8) followed
    by a single-channel output layer, and flattens back to
    ``(batch, samples)``. The output length equals the input length when
    ``samples`` is a multiple of 8.

    Args:
        output_activation: Activation of the final layer. Defaults to
            ``'tanh'`` so the network can produce values in ``[-1, 1]``,
            matching peak-normalized audio. The previous ``'sigmoid'`` output
            was limited to ``(0, 1)`` and could never emit negative samples.
            Activations carry no weights, so existing checkpoints still load;
            pass ``'sigmoid'`` to reproduce the old behaviour.
    """

    def __init__(self, output_activation='tanh'):
        super().__init__()

        # Encoder: add a channel axis, then widen channels while halving the
        # time resolution three times.
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(None,)),
            tf.keras.layers.Reshape((-1, 1)),
            tf.keras.layers.Conv1D(32, 3, padding='same', activation='relu'),
            tf.keras.layers.Conv1D(64, 3, padding='same', activation='relu', strides=2),
            tf.keras.layers.Conv1D(128, 3, padding='same', activation='relu', strides=2),
            tf.keras.layers.Conv1D(256, 3, padding='same', activation='relu', strides=2),
        ])

        # Decoder: undo the three stride-2 reductions, project to one channel
        # and drop the channel axis again.
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Conv1DTranspose(256, 3, padding='same', activation='relu', strides=2),
            tf.keras.layers.Conv1DTranspose(128, 3, padding='same', activation='relu', strides=2),
            tf.keras.layers.Conv1DTranspose(64, 3, padding='same', activation='relu', strides=2),
            tf.keras.layers.Conv1DTranspose(1, 3, padding='same', activation=output_activation),
            tf.keras.layers.Reshape((-1,)),
        ])

    def call(self, x):
        """Run the encoder followed by the decoder on a batch of frames."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [5]:
# Instantiate the denoiser network.
# NOTE(review): no weights are loaded in this cell, so the model starts from
# its freshly initialized parameters. Its output is not meaningful until
# trained weights are restored (e.g. via `model.load_weights(...)`) or the
# model is trained -- TODO confirm where the pre-trained checkpoint lives.
model = Denoiser()

# Apply noise cancellation to each frame

In [6]:
# Run the model on every frame in a single batched call instead of one
# `predict` call per frame (each call carries a large fixed overhead).
frame_batch = np.asarray(frames, dtype=np.float32)

if frame_batch.shape[0] == 0:
    # Nothing to process; keep the result an (empty) list of frames.
    processed_frames = []
else:
    # Per-frame peak amplitude, kept as a column so it broadcasts over samples.
    frame_peaks = np.max(np.abs(frame_batch), axis=1, keepdims=True)

    # Silent frames have a peak of 0; dividing by it would yield NaN, so
    # divide those frames by 1 instead (they stay all-zero).
    safe_peaks = np.where(frame_peaks > 0, frame_peaks, 1.0)
    normalized_frames = frame_batch / safe_peaks

    # Apply noise cancellation using the model
    denoised_frames = model.predict(normalized_frames)

    # Denormalize with the original peaks; silent frames are scaled back to 0.
    processed_frames = list(denoised_frames * frame_peaks)

























# Adjust the frames to align them

In [7]:
# Reconstruct the signal by overlap-add: frame i was taken from the input at
# offset i * hop_length, so it is written back at that same offset.
# Concatenating the overlapping frames end to end would repeat every segment
# and make the output frame_length / hop_length times too long.
num_frames = len(processed_frames)
total_length = (num_frames - 1) * hop_length + frame_length if num_frames else 0

processed_audio = np.zeros(total_length, dtype=np.float32)
overlap_count = np.zeros(total_length, dtype=np.float32)

for i, frame in enumerate(processed_frames):
    start = i * hop_length
    # Tolerate model outputs that are shorter or longer than frame_length.
    end = start + min(len(frame), frame_length)
    processed_audio[start:end] += frame[:end - start]
    overlap_count[start:end] += 1.0

# Average the samples covered by several frames; positions that no frame
# covered keep their initial value of 0.
np.divide(
    processed_audio,
    overlap_count,
    out=processed_audio,
    where=overlap_count > 0,
)

# Save the processed audio to a file

In [8]:
# Destination of the processed signal (resolved relative to the working directory).
output_file = 'processed_audio.wav'

# Write the result at the sampling rate of the input recording.
sf.write(file=output_file, data=processed_audio, samplerate=sr)