In [1]:
import os
import queue
import sys

import librosa
import numpy as np
import sounddevice as sd
import tensorflow as tf  # Use TensorFlow for TFLite support

In [2]:
# Audio capture configuration
fs = 44100  # Sampling rate in Hz (44.1 kHz, the CD-audio rate)
sc = 0.7    # Chunk duration in seconds
silence_threshold = 0.001  # Chunks whose RMS falls below this value are treated as silence
# Number of samples per chunk. round() is used instead of int() because
# 44100 * 0.7 evaluates to 30869.999999999996 in floating point, which int()
# would truncate to 30869 rather than the intended 30870.
chunk_size = round(fs * sc)

In [3]:
# Thread-safe FIFO that carries captured audio chunks from the stream callback
# to the processing loop. maxsize=0 means the queue is unbounded.
audio_queue = queue.Queue(maxsize=0)

In [4]:
# Load the TFLite wake-word model using TensorFlow.
# The model location can be overridden with the WWD_MODEL_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
MODEL_PATH = os.environ.get(
    "WWD_MODEL_PATH",
    "C:/Users/HP/Desktop/WakeWordDetection/models/WWD.tflite",
)
interpreter = tf.lite.Interpreter(model_path=MODEL_PATH)
interpreter.allocate_tensors()  # Required before tensors can be set or read
input_details = interpreter.get_input_details()    # Provides the input tensor index
output_details = interpreter.get_output_details()  # Provides the output tensor index

In [5]:
# Helper function: Compute RMS (Root Mean Square) to detect silence
def compute_rms(audio_chunk):
    """Return the root-mean-square amplitude of an audio chunk.

    Args:
        audio_chunk: Array-like of audio samples, any shape and numeric dtype.

    Returns:
        float: sqrt(mean(x**2)) taken over every sample, or 0.0 when the
        chunk contains no samples.
    """
    # Promote to float64 before squaring: integer PCM (e.g. int16) squared in
    # its own dtype overflows silently and yields a wrong RMS.
    samples = np.asarray(audio_chunk, dtype=np.float64)
    if samples.size == 0:
        # np.mean of an empty array is NaN (plus a RuntimeWarning); report an
        # empty chunk as silence instead.
        return 0.0
    return float(np.sqrt(np.mean(np.square(samples))))

In [6]:
# Audio callback: Capture audio in real-time
def audio_callback(indata, frames, time, status):
    """Receive one block of captured audio from the input stream and enqueue it.

    This function is passed as ``callback`` to ``sd.InputStream``. ``frames``
    and ``time`` are accepted to satisfy that signature but are not used.

    Args:
        indata: Array holding the captured samples for this block.
        frames: Unused.
        time: Unused.
        status: Stream status; printed when truthy.
    """
    if status:
        # A truthy status means the stream reported a condition worth surfacing.
        print(f"Stream status: {status}")
    # Enqueue a private copy rather than `indata` itself.
    # NOTE(review): presumably the stream reuses this buffer once the callback returns — confirm.
    chunk = indata.copy()
    audio_queue.put(chunk)

In [7]:
# Main processing function
def _mfcc_to_model_input(buffer, n_mfcc=40, side=32):
    """Convert one audio chunk into a (1, side, side, 1) float32 model input.

    The chunk's MFCCs are averaged over time, giving one value per
    coefficient. The first ``side`` averages are written into column 0 of a
    ``side`` x ``side`` grid and every other cell is zero padding.

    NOTE(review): with the defaults this drops 8 of the 40 coefficients and all
    temporal structure — confirm it matches the preprocessing used when the
    model was trained.
    """
    mfcc = librosa.feature.mfcc(y=buffer.flatten(), sr=fs, n_mfcc=n_mfcc)
    coeff_means = np.mean(mfcc.T, axis=0)  # Average over time steps -> (n_mfcc,)

    features = np.zeros((side, side), dtype=np.float32)
    rows = min(side, coeff_means.shape[0])
    features[:rows, 0] = coeff_means[:rows]

    # Add batch (front) and channel (back) dimensions.
    return features[np.newaxis, :, :, np.newaxis]


def process_audio(detection_threshold=0.8, poll_timeout=0.1):
    """Consume queued audio chunks forever and print one marker per chunk.

    Markers written to stdout:
        '-'  the chunk's RMS is below ``silence_threshold`` (no inference run)
        '1'  the model's score at ``prediction[0, 1]`` exceeds the threshold
        '.'  inference ran but the score did not exceed the threshold

    Args:
        detection_threshold: Score that ``prediction[0, 1]`` must exceed for
            the chunk to be reported as a wake-word detection.
        poll_timeout: Seconds to wait for a chunk before checking the queue
            again.

    The loop has no exit condition; it ends only when an exception (for
    example KeyboardInterrupt) propagates to the caller.
    """
    while True:
        # Block until a chunk arrives instead of spinning on empty(), which
        # would keep a CPU core busy while the queue is idle. The timeout only
        # bounds each individual wait.
        try:
            buffer = audio_queue.get(timeout=poll_timeout)
        except queue.Empty:
            continue

        # Skip incomplete chunks.
        if len(buffer) < chunk_size:
            continue

        # Silence gate: do not spend an inference on quiet chunks.
        rms = compute_rms(buffer)
        if rms < silence_threshold:
            sys.stdout.write('-')  # Indicate silence
            sys.stdout.flush()
            continue

        # Run inference with the TFLite model.
        model_input = _mfcc_to_model_input(buffer)
        interpreter.set_tensor(input_details[0]['index'], model_input)
        interpreter.invoke()
        prediction = interpreter.get_tensor(output_details[0]['index'])

        # NOTE(review): assumes the output has shape (1, >=2) with index 1
        # being the wake-word class — confirm against the model.
        detected = prediction[0, 1] > detection_threshold
        sys.stdout.write('1' if detected else '.')
        sys.stdout.flush()  # Show each marker as soon as it is produced

In [None]:
# Entry point: open the microphone stream and run the detection loop until interrupted
if __name__ == "__main__":
    print("Listening for the wake word... Press Ctrl+C to stop.")
    try:
        # Mono float32 capture, delivered to audio_callback in blocks of
        # chunk_size samples at the configured sampling rate.
        stream_options = dict(
            callback=audio_callback,
            channels=1,
            samplerate=fs,
            dtype='float32',
            blocksize=chunk_size,
        )
        # The context manager keeps the stream open while process_audio() runs.
        with sd.InputStream(**stream_options):
            process_audio()
    except KeyboardInterrupt:
        print("\nStopping...")
    except Exception as e:
        print(f"An error occurred: {e}")

Listening for the wake word... Press Ctrl+C to stop.
-................1..--....1..-....--.-..1.........1.....1...1--...11.1-.--.-.1-.