In [None]:
import pyaudio
import numpy as np
import json
import openl3  # Import openl3 for extracting embeddings

In [None]:
# Path to the JSON file holding the stored reference embeddings; it is opened
# and parsed with json.load later in this notebook.
input_file = "embeddings/embeddings.json"

In [None]:
# Function to calculate the mean of embeddings
def calculate_mean(embeddings):
    """Return the element-wise mean of every embedding stored in ``embeddings``.

    ``embeddings`` is a mapping whose values are the embeddings to average;
    its keys (timestamps) are ignored. The average is taken across entries
    (axis 0), so the result has the shape of a single embedding.
    """
    vectors = [np.array(vector) for vector in embeddings.values()]
    return np.mean(vectors, axis=0)

# Load the stored embeddings from the JSON file.
# The encoding is stated explicitly: JSON text is UTF-8, whereas open()
# without an encoding falls back to a platform-dependent default.
with open(input_file, 'r', encoding='utf-8') as json_file:
    stored_data = json.load(json_file)

In [None]:
# Audio capture settings shared by the microphone stream and openl3.
SAMPLE_RATE = 16000  # Hz; adjust based on your audio
CHUNK_SIZE = 1024    # Frames read from the microphone per iteration

# Smallest denominator allowed in the cosine similarity, so that an all-zero
# embedding yields a similarity of 0 instead of a division by zero.
MIN_NORM_PRODUCT = np.finfo(np.float64).eps

# The per-word mean embedding and its norm do not change while streaming, so
# compute them once here rather than twice per word for every audio chunk.
# Assumes each stored embedding is a 1-D vector with the same dimensionality
# as the live openl3 embeddings -- TODO confirm against the file's producer.
reference_words = []
for entry in stored_data:
    mean_embedding = calculate_mean(entry["embeddings"])
    reference_words.append(
        (entry["word"], mean_embedding, np.linalg.norm(mean_embedding))
    )

# Initialize PyAudio
p = pyaudio.PyAudio()
stream = None

try:
    # Configure and start the audio stream
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=CHUNK_SIZE)

    while True:
        # Record audio data from the microphone. If processing falls behind
        # the microphone, PyAudio reports an input overflow; skip the missed
        # audio instead of aborting the loop with an exception.
        raw_chunk = stream.read(CHUNK_SIZE, exception_on_overflow=False)
        audio_data = np.frombuffer(raw_chunk, dtype=np.int16)

        # Scale the 16-bit PCM samples to floats in [-1.0, 1.0).
        # NOTE(review): presumably the stored embeddings were extracted from
        # floating-point audio in that range (what audio file readers usually
        # return) -- confirm against the script that produced the JSON file.
        audio_float = audio_data.astype(np.float32) / 32768.0

        # Extract features from the audio data using openl3.
        # NOTE(review): no preloaded `model` is passed, so openl3 presumably
        # loads the model on every call -- consider loading it once up front
        # with the same settings used for the stored embeddings.
        live_embeddings, _ = openl3.get_audio_embedding(audio_float, SAMPLE_RATE)

        # openl3 documents its output as one row per analysis frame, so the
        # norm is taken per row; a single norm over the whole matrix would
        # not give a cosine similarity once there is more than one frame.
        live_norms = np.linalg.norm(live_embeddings, axis=1)

        # Compare live_embeddings with the stored embeddings
        best_match = None
        best_similarity = -np.inf  # Any real similarity beats this

        for stored_word, mean_embedding, mean_norm in reference_words:
            # Cosine similarity between every live frame and the word's
            # mean embedding.
            denominators = np.maximum(live_norms * mean_norm, MIN_NORM_PRODUCT)
            similarity = np.dot(live_embeddings, mean_embedding) / denominators

            # A word is scored by its best-matching frame; `initial` keeps
            # the reduction defined even if no frame was produced.
            frame_best = np.max(similarity, initial=-np.inf)
            if frame_best > best_similarity:
                best_similarity = frame_best
                best_match = stored_word

        print("Predicted word:", best_match)

except KeyboardInterrupt:
    # Ctrl+C (or "Interrupt kernel") is the normal way to stop listening.
    pass
finally:
    # Always release the audio device, including when an unexpected error
    # (not just an interrupt) ends the loop.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()