In [1]:
import sys
import time
import sounddevice as sd
import librosa
import numpy as np
import tensorflow as tf
from scipy.io.wavfile import write

In [8]:
# Tunable settings for audio capture and silence gating
fs = 44_100                # Sample rate in Hz, passed to the input stream
sc = 1                     # Duration of one analysis chunk, in seconds
silence_threshold = 2e-3   # Chunks with RMS below this are treated as silence; tune as needed

In [9]:
# Load the TFLite wake-word model from a fixed location on disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists — consider making it configurable.
tflite_model_path = "C:/Users/HP/Desktop/WakeWordDetection/models/WWD.tflite"
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
# Allocate the interpreter's tensors up front; the detection loop later calls
# set_tensor()/invoke()/get_tensor() on this same interpreter.
interpreter.allocate_tensors()

In [10]:
# Query the interpreter for descriptions of its input and output tensors.
# Only the first entry of each list is used later, through its 'index' key,
# to feed features into the model and to read back the prediction.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [11]:
# Rolling store of captured samples shared between the audio callback and the
# processing loop; it starts out empty.
buffer = np.zeros((0,))

def audio_callback(indata, frames, time, status):
    """Collect each incoming block of audio into the module-level ``buffer``.

    Intended to be passed as the ``callback`` of an ``sd.InputStream``.

    Parameters:
        indata: Block of newly captured samples; it is flattened before being
            appended, so the buffer always stays one-dimensional.
        frames: Unused here.
        time: Unused here. Note that inside this function the name shadows
            the imported ``time`` module.
        status: If truthy, it is printed to stderr.
    """
    global buffer
    if status:
        print(status, file=sys.stderr)
    # Blocks arrive one after another, so appending each of them keeps the
    # recording continuous in real time while it is later consumed in
    # fixed-size segments.
    flattened_block = np.ravel(indata)
    buffer = np.concatenate((buffer, flattened_block))


In [12]:
def compute_rms(audio_data):
    """Return the root-mean-square level of ``audio_data``.

    The samples are squared element-wise, averaged over all elements, and the
    square root of that average is returned.
    """
    squared_samples = np.square(audio_data)
    mean_power = np.mean(squared_samples)
    return np.sqrt(mean_power)

In [None]:
print("Speak Now: ")

# Number of samples in one analysis chunk. int() keeps the value usable as a
# slice index even if `sc` is set to a fractional number of seconds.
chunk_size = int(fs * sc)
# Minimum class-1 score required before a wake word is reported.
detection_threshold = 0.99
# How long to wait before re-checking the buffer when no full chunk is ready.
poll_interval = 0.01

# Open the audio stream; audio_callback fills `buffer` while the
# loop below consumes it one chunk at a time.
with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, dtype='float32'):
    print("Recording... Press i deux fois to stop.")
    try:
        while True:
            # Sleep instead of spinning while waiting for a full chunk, so the
            # loop does not burn CPU or compete with the audio callback.
            if len(buffer) < chunk_size:
                time.sleep(poll_interval)
                continue

            # Take the oldest chunk and keep the remainder for the next pass.
            # NOTE(review): `buffer` is also reassigned by audio_callback; if
            # the callback fires between these two statements its samples can
            # be lost. A lock or queue shared with the callback would fix this.
            audio_chunk = buffer[:chunk_size]
            buffer = buffer[chunk_size:]

            # Skip inference entirely for chunks that are effectively silent.
            rms = compute_rms(audio_chunk)
            if rms < silence_threshold:
                sys.stdout.write('-')  # Indicate silence
                sys.stdout.flush()     # Show progress immediately
                continue

            # 32 MFCC coefficients, averaged over time into one 32-vector.
            mfcc = librosa.feature.mfcc(y=audio_chunk, sr=fs, n_mfcc=32)
            mfcc_processed = np.mean(mfcc.T, axis=0).reshape(32, 1)

            # Zero-pad the single column out to 32 columns, then add batch and
            # channel axes, giving an array of shape (1, 32, 32, 1).
            # Assumes this matches the model's input shape — TODO confirm
            # against input_details[0]['shape'].
            mfcc_processed = np.pad(mfcc_processed, ((0, 0), (0, 31)), 'constant')
            mfcc_processed = mfcc_processed[np.newaxis, ..., np.newaxis]

            # Run inference with the TFLite model
            interpreter.set_tensor(input_details[0]['index'], mfcc_processed.astype(np.float32))
            interpreter.invoke()

            # Get prediction results
            prediction = interpreter.get_tensor(output_details[0]['index'])

            # Assuming class 1 is "Wake Word Detected"
            if prediction[0, 1] > detection_threshold:
                sys.stdout.write('1')  # Wake Word Detected
            else:
                sys.stdout.write('.')  # Wake Word NOT Detected
            sys.stdout.flush()

    except KeyboardInterrupt:
        print("\nDetection stopped by user.")
    except Exception as e:
        print(f"An error occurred: {e}")

Speak Now: 
Recording... Press i deux fois to stop.
--........-.-.-.-............1........-..--.........-...........-.-...-...-.-..............-.-.-..........-.-.-.-.-.-..........-.....-.-.1......-.-..--.--....--..-..-.....-.-........-...-.-.......-....-.-...-.-.......-.-....