In [1]:
pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.2-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.2-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install noisereduce

Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Downloading noisereduce-3.0.3-py3-none-any.whl (22 kB)
Installing collected packages: noisereduce
Successfully installed noisereduce-3.0.3
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf
import noisereduce as nr
import scipy.signal as signal
from scipy.signal import butter, lfilter
import speech_recognition as spr
from scipy.io import wavfile
import IPython.display as ipd

In [20]:
# librosa.load resamples to 22,050 Hz when no rate is given; the rate is spelled
# out here so the value carried by `sample_rate` is visible to the reader.
voice_command, sample_rate = librosa.load(
    '/kaggle/input/testaudios/audio6.wav',
    sr=22050,
)

In [22]:
# Amplitude-modulate the loaded voice command onto a cosine carrier and save it.
carrier_freq = 30000  # carrier frequency in Hz

# A sampled cosine can only represent frequencies below sample_rate / 2. A carrier
# at or above that limit folds back to a lower frequency, so the written file is
# not actually modulated at `carrier_freq`. Report it instead of failing so the
# cell still produces its output file.
nyquist_freq = sample_rate / 2
if carrier_freq >= nyquist_freq:
    aliased_freq = abs(carrier_freq - sample_rate * round(carrier_freq / sample_rate))
    print(
        f"Warning: carrier_freq={carrier_freq} Hz is not below the Nyquist limit "
        f"({nyquist_freq:.0f} Hz) at sample_rate={sample_rate} Hz; "
        f"the sampled carrier aliases to {aliased_freq:.0f} Hz."
    )

t = np.arange(len(voice_command)) / sample_rate  # sample times in seconds

carrier_wave = np.cos(2 * np.pi * carrier_freq * t)

modulated_signal = voice_command * carrier_wave

# Peak-normalise to [-1, 1]. The division is skipped for an empty or all-zero
# signal so the output stays silent instead of turning into NaNs.
peak = np.max(np.abs(modulated_signal)) if modulated_signal.size else 0
if peak > 0:
    modulated_signal = modulated_signal / peak
sf.write("/kaggle/working/modulated_audio6.wav", modulated_signal, sample_rate)

# MICROPHONE SIMULATION

### AUDIO PREPROCESSING

In [5]:
TARGET_LENGTH = 1.5

def preprocess_audio(file_path, target_length=TARGET_LENGTH):
    """Denoise a recording, strip its quiet stretches and force a fixed length.

    Args:
        file_path: Path of the audio file to read.
        target_length: Length of the returned clip in seconds.

    Returns:
        A tuple ``(y_final, sr, y)`` where ``y_final`` is the int16 clip of
        exactly ``int(sr * target_length)`` samples (truncated or zero-padded at
        the end), ``sr`` is the sampling rate reported by ``librosa.load`` and
        ``y`` is the signal exactly as it was loaded.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(file_path, sr=None)

    cleaned = nr.reduce_noise(y=y, sr=sr)
    # Scale to the 16-bit integer range, clipping anything that would overflow.
    cleaned_pcm = np.clip(cleaned * 32767, -32768, 32767).astype(np.int16)

    # Keep only the stretches librosa reports as louder than 35 dB below the peak.
    voiced_segments = [
        cleaned_pcm[begin:stop]
        for begin, stop in librosa.effects.split(cleaned_pcm, top_db=35)
    ]
    voiced = np.concatenate(voiced_segments)

    wanted = int(sr * target_length)
    shortfall = wanted - len(voiced)
    if shortfall < 0:
        y_final = voiced[:wanted]
    else:
        y_final = np.pad(voiced, (0, shortfall), mode="constant")

    return y_final, sr, y

### SPECTROGRAM GENERATION

In [6]:
def generate_spectrogram(audio_path, sr, output_path="/kaggle/working/spectrogram.png"):
    """Render a log-frequency spectrogram of an audio file and save it as an image.

    Args:
        audio_path: Path of the audio file to analyse.
        sr: Ignored. The sampling rate is always re-read from ``audio_path``;
            the parameter is kept so existing callers keep working.
        output_path: Where the image is written. Defaults to the location used
            by the rest of the notebook.

    Returns:
        None. The only result is the image written to ``output_path``.
    """
    # Imported here so the Agg canvas is only needed when a figure is rendered.
    from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

    audio, sr = librosa.load(audio_path, sr=None)

    # Analyse a fixed sample window; high-rate files use a later, longer window
    # and a larger FFT size.
    if sr >= 96000:
        audio = audio[48000:96000]
        window_size = 4096
    else:
        audio = audio[8000:16000]
        window_size = 1024

    window = np.hanning(window_size)
    stft = librosa.stft(audio, n_fft=window_size, hop_length=512, window=window)
    # Scale magnitudes by the window sum.
    out = 2 * np.abs(stft) / np.sum(window)

    # Build the figure directly (not through pyplot) and attach an Agg canvas.
    fig = plt.Figure()
    FigureCanvas(fig)
    ax = fig.add_subplot(111)

    librosa.display.specshow(
        librosa.amplitude_to_db(out, ref=np.max),
        sr=sr, ax=ax, y_axis='log', x_axis='time',
    )

    # Drop axes and padding so the saved image contains only the spectrogram.
    ax.axis('off')

    fig.savefig(output_path, bbox_inches='tight', pad_inches=0)

### PREPROCESSING SPECTROGRAM

In [7]:
def preprocess_image(image_path, target_size=(95, 128)):
    """Load a PNG as a standardized single-channel tensor with a batch axis.

    Args:
        image_path: Path of the PNG file to read.
        target_size: ``(height, width)`` the image is resized to.

    Returns:
        A tensor with a leading axis of size 1 followed by the resized,
        per-image-standardized single-channel image.
    """
    raw_bytes = tf.io.read_file(image_path)
    grayscale = tf.image.decode_png(raw_bytes, channels=1)
    standardized = tf.image.per_image_standardization(
        tf.image.resize(grayscale, target_size)
    )
    # Add the leading batch axis.
    return tf.expand_dims(standardized, axis=0)

### MODEL

In [8]:
class CVAE(tf.keras.Model):
    """Convolutional variational autoencoder for single-channel spectrogram images.

    The encoder maps an image to the mean and log-variance of a diagonal
    Gaussian over a ``latent_dim``-dimensional latent space. The decoder maps a
    latent sample back to an image.
    """

    def __init__(self, input_shape=(95, 128, 1), latent_dim=16):
        """Build the encoder and decoder networks.

        Args:
            input_shape: ``(height, width, channels)`` of the encoder input.
            latent_dim: Number of latent dimensions.
        """
        super().__init__()
        # NOTE(review): some Keras versions expose `input_shape` as a read-only
        # layer property - confirm this assignment is supported by the installed one.
        self.input_shape = input_shape
        self.latent_dim = latent_dim

        # Three stride-2 convolutions, then dense layers producing the
        # concatenated [mean, log_var] vector (2 * latent_dim values).
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=input_shape),
            tf.keras.layers.Conv2D(32, 3, activation='relu', strides=2, padding='same'),
            tf.keras.layers.Conv2D(64, 3, activation='relu', strides=2, padding='same'),
            tf.keras.layers.Conv2D(128, 3, activation='relu', strides=2, padding='same'),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(latent_dim + latent_dim)
        ])

        # Starts from a fixed 12 x 16 x 128 feature map; three stride-2
        # transposed convolutions bring it to 96 x 128 x 1.
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(latent_dim,)),
            tf.keras.layers.Dense(12 * 16 * 128, activation='relu'),
            tf.keras.layers.Reshape((12, 16, 128)),
            tf.keras.layers.Conv2DTranspose(64, 3, activation='relu', strides=2, padding='same'),
            tf.keras.layers.Conv2DTranspose(32, 3, activation='relu', strides=2, padding='same'),
            tf.keras.layers.Conv2DTranspose(1, 3, activation='linear', strides=2, padding='same')
        ])

    def encode(self, x):
        """Return the ``(mean, log_var)`` halves of the encoder output for ``x``."""
        mean_log_var = self.encoder(x)
        mean, log_var = tf.split(mean_log_var, num_or_size_splits=2, axis=1)
        return mean, log_var

    def reparameterize(self, mean, log_var):
        """Draw ``mean + eps * exp(log_var / 2)`` with ``eps`` standard normal.

        The noise shape is taken from ``tf.shape(mean)`` (the runtime shape)
        rather than the static ``mean.shape``, so sampling also works when the
        batch dimension is unknown while the model is being traced.
        """
        eps = tf.random.normal(shape=tf.shape(mean), dtype=mean.dtype)
        return eps * tf.exp(log_var * 0.5) + mean

    def decode(self, z):
        """Decode latent vectors ``z`` and crop the result to its first 95 rows."""
        recon = self.decoder(z)
        # The decoder emits 96 rows; drop the last one to get 95.
        return recon[:, :95, :, :]

    def call(self, inputs):
        """Encode ``inputs``, sample a latent vector and return the reconstruction."""
        mean, log_var = self.encode(inputs)
        z = self.reparameterize(mean, log_var)
        return self.decode(z)

    @classmethod
    def from_config(cls, config):
        """Recreate the model from a config dict.

        Only ``input_shape`` and ``latent_dim`` are read; missing keys fall back
        to the constructor defaults.
        """
        input_shape = config.get('input_shape', (95, 128, 1))
        latent_dim = config.get('latent_dim', 16)
        return cls(input_shape=input_shape, latent_dim=latent_dim)

### LOADING TRAINED MODEL AND THRESHOLD

In [9]:
# Restore the saved model; CVAE is passed so Keras can resolve the custom class.
MODEL_PATH = "/kaggle/input/conv_vae9565/tensorflow2/default/1/convvae_model_9565.h5"
model = tf.keras.models.load_model(MODEL_PATH, custom_objects={'CVAE': CVAE})
print("Model loaded successfully")

Model loaded successfully


In [10]:
# Reconstruction-error cut-off that the detection step compares against.
THRESHOLD_PATH = "/kaggle/input/threshold-new/threshold_9565.npy"
threshold = np.load(THRESHOLD_PATH)
print(f"Threshold loaded: {threshold}")

Threshold loaded: 1.006987601518631


In [11]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

### LOW-PASS FILTERING

In [12]:
def butter_lowpass(cutoff, fs, order=5):
    """Design a digital Butterworth low-pass filter.

    Args:
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of the signal to be filtered.
        order: Filter order.

    Returns:
        The ``(b, a)`` numerator and denominator coefficient arrays.
    """
    # butter() expects the cut-off as a fraction of the Nyquist frequency.
    half_rate = fs / 2
    return butter(order, cutoff / half_rate, btype='low', analog=False)

def lowpass_filter(data, cutoff, fs, order=5):
    """Run ``data`` through a Butterworth low-pass filter.

    Args:
        data: Signal samples to filter.
        cutoff: Cut-off frequency, in the same unit as ``fs``.
        fs: Sampling rate of ``data``.
        order: Filter order.

    Returns:
        The filtered signal produced by a single forward ``lfilter`` pass.
    """
    numerator, denominator = butter_lowpass(cutoff, fs, order=order)
    return lfilter(numerator, denominator, data)

### DEMODULATION

In [13]:
def demodulation(filtered_signal, sample_rate, audio, carrier_freq=30000,
                 cutoff=5000,
                 output_path='/kaggle/working/demodulated_command_new.wav'):
    """Mix a signal with a cosine carrier, low-pass it and write it to disk.

    Args:
        filtered_signal: Samples to demodulate.
        sample_rate: Sampling rate of ``filtered_signal``.
        audio: Reference signal; only its length is used, to build the time
            axis, so it must have as many samples as ``filtered_signal``.
        carrier_freq: Carrier frequency in Hz used for the mixing step.
        cutoff: Cut-off frequency in Hz of the low-pass filter applied after
            mixing.
        output_path: Destination audio file. The default is the absolute path
            that ``decode_command`` reads, so the two steps agree regardless
            of the current working directory.

    Returns:
        None. The demodulated audio is written to ``output_path``.
    """
    t = np.arange(len(audio)) / sample_rate  # sample times in seconds

    carrier_wave = np.cos(2 * np.pi * carrier_freq * t)

    demodulated_signal = filtered_signal * carrier_wave

    # Peak-normalise; skip the division for an empty or all-zero signal so the
    # output stays silent instead of turning into NaNs.
    peak = np.max(np.abs(demodulated_signal)) if demodulated_signal.size else 0
    if peak > 0:
        demodulated_signal = demodulated_signal / peak

    demodulated_filtered_signal = lowpass_filter(demodulated_signal, cutoff, sample_rate)

    sf.write(output_path, demodulated_filtered_signal, sample_rate)

### DECODING THE EMBEDDED COMMAND

In [14]:
def decode_command(audio_path='/kaggle/working/demodulated_command_new.wav'):
    """Transcribe an audio file with the recognizer's Google backend.

    Args:
        audio_path: Audio file to transcribe. Defaults to the file the
            notebook's demodulation step produces.

    Returns:
        The recognized text, or ``None`` when the audio could not be understood
        or the recognition service reported an error. The outcome is printed in
        both cases.
    """
    recognizer = spr.Recognizer()

    with spr.AudioFile(audio_path) as source:
        audio = recognizer.record(source)

    try:
        command_text = recognizer.recognize_google(audio)
    except spr.UnknownValueError:
        print("Could not understand the audio")
        return None
    except spr.RequestError as e:
        print(f"Error with the recognition service: {e}")
        return None

    print(f"Recognized Command: {command_text}")
    return command_text

### MICROPHONE SIMULATION FUNCTION

In [29]:
def microphone_simulation(input_path):
    """Classify a recording by reconstruction error and decode flagged ones.

    The file is preprocessed, rendered to a spectrogram image and passed through
    the global ``model``. Its reconstruction error is compared with the global
    ``threshold``. When the error reaches the threshold, an alert sound is
    displayed and the original signal is low-pass filtered, demodulated and sent
    to speech recognition.

    Args:
        input_path: Path of the audio file to analyse.
    """
    preprocessed_path = "/kaggle/working/preprocessed_audio.wav"
    spectrogram_path = "/kaggle/working/spectrogram.png"

    preprocessed, sr, audio = preprocess_audio(input_path)
    sf.write(preprocessed_path, preprocessed, sr)
    generate_spectrogram(preprocessed_path, sr)
    processed_spectrogram = preprocess_image(spectrogram_path)

    recon = model(processed_spectrogram)
    # Squared error averaged over channels, then over the two spatial axes.
    per_pixel_mse = tf.keras.losses.mse(processed_spectrogram, recon)
    recon_error = tf.reduce_mean(per_pixel_mse, axis=[1, 2]).numpy()[0]

    is_anomaly = recon_error >= threshold
    classification = "Inaudible (Anomaly)" if is_anomaly else "Audible (Normal)"
    print(f"Reconstruction Error: {recon_error:.4f}")
    print(f"Threshold: {threshold:.4f}")
    print(f"\nCLASSIFICATION: {classification}")

    if not is_anomaly:
        print("Everything is good!")
        return

    print("ALERT - Inaudible command detected!!")
    display(ipd.Audio("/kaggle/input/alertaudio/siren-alert-96052.wav", autoplay=True))

    # Low-pass the raw signal, then demodulate and transcribe it.
    filtered_signal = lowpass_filter(audio, 5000, sr)
    demodulation(filtered_signal, sr, audio)
    decode_command()

In [27]:
# Alternative input, currently disabled: a recording from the test-audio dataset.
#input_path = "/kaggle/input/testaudios/audio1.wav"
# Active input: the amplitude-modulated file written to /kaggle/working earlier in the notebook.
input_path = "/kaggle/working/modulated_audio6.wav"

In [30]:
# Run the full detection pipeline on the selected file.
microphone_simulation(input_path)

Reconstruction Error: 1.6060
Threshold: 1.0070

CLASSIFICATION: Inaudible (Anomaly)
ALERT - Inaudible command detected!!


Recognized Command: OK Google restart my phone now
