# Neural Network for Speech Enhancement Using Short-Time Fourier Transform (STFT)

In [8]:
import scipy.io
import librosa
import numpy as np
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from IPython.display import Audio

def STFT(x):
    hop = 512
    win = 1024
    D = librosa.stft(x, n_fft=1024, hop_length=hop, win_length=win, window='hann')
    return D


def M_matrix(S, N):
    M = np.zeros((len(S), len(S[0])))
    for i in range(len(S)):
        for j in range(len(S[0])):
            if S[i][j] > N[i][j]:
                M[i][j] = 1
            else:
                M[i][j] = 0

    return M


def W_init(rows, col):
    input_dim = rows
    output_dim = col

    std_dev = np.sqrt(2 / (input_dim + output_dim))
    W = np.random.randn(input_dim, output_dim) * std_dev

    return W


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def sigmoid_prime(x):
    y = sigmoid(x)
    return y * (1 - y)
    

def forward_pass(W, X):
    Y = np.dot(W.T, X)
    return Y


if __name__ == '__main__':
    s, sr = librosa.load('/content/NN_trs.wav', sr=None)
    n, sr = librosa.load('/content/NN_trn.wav', sr=None)
    tx, sr = librosa.load('/content/tex.wav', sr=None)
    ts, sr = librosa.load('/content/tes.wav', sr=None)

    x = s + n
    S = STFT(s)
    N = STFT(n)
    X = STFT(x)
    S_mag = np.abs(S)
    N_mag = np.abs(N)
    X_mag = np.abs(X)
    target = M_matrix(S_mag, N_mag) ##  input_data = X_mag

    input_size = 513
    hidden_size = 50
    output_size = 513

    # Initialize the weights and biases xavier initialization
    weights1 = np.random.randn(input_size, hidden_size)
    biases1 = np.random.randn(hidden_size, 1)
    weights2 = np.random.randn(hidden_size, output_size)
    biases2 = np.random.randn(output_size, 1)

    # Set the learning rate and number of iterations
    l_rate = 0.001
    tol = 1e-4
    epochs = 5000

    # Train the neural network using backpropagation
    for i in range(epochs):
        # Forward pass
        h = np.dot(weights1.T, X_mag) + biases1
        h_activated = sigmoid(h)
        out = np.dot(weights2.T, h_activated) + biases2
        out_activated = sigmoid(out)

        # Calculate the error and its derivative
        loss = target - out_activated
        error1 = loss * sigmoid_prime(out)

        if np.abs(np.mean(loss)) < tol:
            break

        # Backward pass
        h_error = np.dot(weights2, error1)
        h_error1 = h_error * sigmoid_prime(h)

        # Update the weights and biases
        weights2 += l_rate * np.dot(h_activated, error1.T)
        biases2 += l_rate * np.sum(error1, axis=1, keepdims=True)
        weights1 += l_rate * np.dot(X_mag, h_error1.T)
        biases1 += l_rate * np.sum(h_error1, axis=1, keepdims=True)

    TeX = STFT(tx)
    TeX_mag = np.abs(TeX)

    h = np.dot(weights1.T, TeX_mag) + biases1
    h_activated = sigmoid(h)
    out = np.dot(weights2.T, h_activated) + biases2
    out_activated = sigmoid(out)

    result = TeX * out_activated

    s_hat = librosa.istft(result, hop_length=512, win_length=1024, window='hann')

    ### SNR
    ts = ts[0:len(s_hat)]

    num = np.dot(ts.T, ts)
    den = np.dot((ts - s_hat).T, (ts - s_hat))

    SNR = 10 * np.log(num / den)

    print("The value of the SNR is ", SNR)

    # Audio files 
    print("\n Noisy Signal Signal \n")
    display(Audio(data=tx, rate=sr))

    print("\n Clean Signal \n")
    display(Audio(data=s_hat, rate=sr))
  

The value of the SNR is  12.937820169657716

 Noisy Signal Signal 




 Clean Signal 

