In [1]:
import os

# Define the target directory (change yours)
target_directory = r"C:\Users\pablosal\Desktop\gbbai-azure-ai-speech-services"

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbbai-azure-ai-speech-services


In [2]:
import azure.cognitiveservices.speech as speechsdk
import numpy as np

In [87]:
AUDIO_FILE_WAV = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//7.wav"
AUDIO_FILE_PCM_MONO = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//aboutSpeechSdk.wav"
AUDIO_FILE_PCM_STEREO_PROCESSED = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//7.pcm"
AUDIO_FILE_PCM_STEREO = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//d6a35a5e-be01-40cd-b9ef-d61fcda699fa.pcm"
AUDIO_FILE_PCM_STEREO_MONO = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//d6a35a5e-be01-40cd-b9ef-d61fcda699fa.wav"
COUNTDOWN_FILE_MONO = "C://Users//pablosal//Desktop//gbbai-azure-ai-speech-services//utils//audio_data//andthewinner.wav"

In [86]:
convert_to_pcm16(AUDIO_FILE_PCM_STEREO, AUDIO_FILE_PCM_STEREO_MONO)

In [85]:
def convert_to_pcm16(input_file_path, output_file_path):
    """
    Converts an audio file to PCM 16-bit format.

    Args:
    input_file_path (str): Path to the input WAV file.
    output_file_path (str): Path to save the output PCM 16-bit WAV file.
    """
    with wave.open(input_file_path, "rb") as wav:
        # Read audio data
        n_channels = wav.getnchannels()
        sample_width = wav.getsampwidth()
        framerate = wav.getframerate()
        n_frames = wav.getnframes()
        audio_data = wav.readframes(n_frames)

        # Convert audio data to numpy array
        if sample_width == 1:  # 8-bit PCM
            data = np.frombuffer(audio_data, dtype=np.uint8) - 128
        elif sample_width == 2:  # 16-bit PCM
            data = np.frombuffer(audio_data, dtype=np.int16)
        else:
            raise ValueError("Unsupported sample width: {}".format(sample_width))

        # Convert to mono if stereo
        if n_channels == 2:
            data = data.reshape(-1, 2).mean(axis=1).astype(np.int16)

    # Write data to a new 16-bit PCM WAV file
    with wave.open(output_file_path, "wb") as mono_wav:
        mono_wav.setnchannels(1)  # Mono
        mono_wav.setsampwidth(2)  # 16-bit samples
        mono_wav.setframerate(framerate)
        mono_wav.writeframes(data.tobytes())

In [4]:
speech_key = os.getenv("SPEECH_KEY")
service_region = os.getenv("SPEECH_REGION")

In [5]:
import wave

In [6]:
from utils.ml_logging import get_logger

logger = get_logger()

In [64]:
def check_audio_file(file_path):
    """
    Checks the format of the audio stream from the provided WAV file and logs the details.
    Returns False if any of the required conditions are not met. Otherwise, returns True.

    Required conditions for the audio format:
    - PCM format (int-16, signed)
    - One channel (mono)
    - 16 bits per sample
    - 8,000 or 16,000 samples per second (16,000 bytes or 32,000 bytes per second)
    - Two-block aligned (16 bits including padding for a sample)

    Parameters:
    file_path (str): Path to the WAV file to be checked.
    """
    with wave.open(file_path, "rb") as wav_file:
        (
            n_channels,
            sampwidth,
            framerate,
            nframes,
            comptype,
            compname,
        ) = wav_file.getparams()

        # Check PCM format (int-16)
        is_pcm_format = comptype == "NONE" and sampwidth == 2
        logger.info(f"PCM Format (int-16): {is_pcm_format}")

        # Check if it's mono
        is_mono = n_channels == 1
        logger.info(f"One Channel (Mono): {is_mono}")

        # Check sample rate
        is_valid_sample_rate = framerate in [8000, 16000]
        logger.info(f"Valid Sample Rate (8000 or 16000 Hz): {is_valid_sample_rate}")

        # Calculate bytes per second
        bytes_per_second = framerate * sampwidth * n_channels
        logger.info(f"Bytes Per Second (16000 or 32000): {bytes_per_second}")

        # Check two-block alignment
        is_two_block_aligned = wav_file.getsampwidth() * n_channels == 2
        logger.info(f"Two-block Aligned: {is_two_block_aligned}")

        # Return False if any condition is not met
        return (
            is_pcm_format and is_mono and is_valid_sample_rate and is_two_block_aligned
        )

In [66]:
check_audio_file(AUDIO_FILE_PCM_MONO)

INFO:__main__:PCM Format (int-16): True
INFO:__main__:One Channel (Mono): True
INFO:__main__:Valid Sample Rate (8000 or 16000 Hz): True
INFO:__main__:Bytes Per Second (16000 or 32000): 32000
INFO:__main__:Two-block Aligned: True


True

In [88]:
check_audio_file(AUDIO_FILE_PCM_STEREO_MONO)

Error: unknown format: 7

In [82]:
check_audio_file(COUNTDOWN_FILE_MONO)

INFO:__main__:PCM Format (int-16): False
INFO:__main__:One Channel (Mono): False
INFO:__main__:Valid Sample Rate (8000 or 16000 Hz): False
INFO:__main__:Bytes Per Second (16000 or 32000): 288000
INFO:__main__:Two-block Aligned: False


False

In [12]:
check_audio_file(AUDIO_FILE_PCM_STEREO)

2023-12-26 01:40:50,706 - micro - MainProcess - INFO     PCM Format (int-16): False (1521368823.py:check_audio_file:21)
2023-12-26 01:40:50,707 - micro - MainProcess - INFO     One Channel (Mono): False (1521368823.py:check_audio_file:25)
2023-12-26 01:40:50,707 - micro - MainProcess - INFO     Valid Sample Rate (8000 or 16000 Hz): True (1521368823.py:check_audio_file:29)
2023-12-26 01:40:50,708 - micro - MainProcess - INFO     Bytes Per Second (16000 or 32000): 16000 (1521368823.py:check_audio_file:33)
2023-12-26 01:40:50,708 - micro - MainProcess - INFO     Two-block Aligned: True (1521368823.py:check_audio_file:37)


False

In [90]:
# Set up logging
import time


def speech_recognition_with_push_stream_vol2(audio: str):
    """
    Recognizes speech from a custom audio source using a push audio stream.
    Converts stereo audio to mono in real-time before pushing it to the stream.
    """
    try:
        speech_config = speechsdk.SpeechConfig(
            subscription=speech_key, region=service_region
        )
        stream = speechsdk.audio.PushAudioInputStream()
        audio_config = speechsdk.audio.AudioConfig(stream=stream)
        speech_recognizer = speechsdk.SpeechRecognizer(
            speech_config=speech_config, audio_config=audio_config
        )

        done = False
        wav_fh = wave.open(audio, "rb")
        final_text = ""

        def update_final_text(evt):
            nonlocal final_text
            final_text += " " + evt.result.text

        def stop_cb(evt):
            logger.info(f"CLOSING on {evt}")
            nonlocal done
            done = True

        speech_recognizer.recognizing.connect(
            lambda evt: logger.info(f"RECOGNIZING: {evt}")
        )
        speech_recognizer.recognized.connect(update_final_text)
        speech_recognizer.session_started.connect(
            lambda evt: logger.info(f"SESSION STARTED: {evt}")
        )
        speech_recognizer.session_stopped.connect(
            lambda evt: logger.info(f"SESSION STOPPED {evt}")
        )
        speech_recognizer.canceled.connect(lambda evt: logger.info(f"CANCELED {evt}"))
        speech_recognizer.session_stopped.connect(stop_cb)
        speech_recognizer.canceled.connect(stop_cb)

        speech_recognizer.start_continuous_recognition()

        while not done:
            frames = wav_fh.readframes(wav_fh.getframerate() // 10)
            if not frames:
                break

            if wav_fh.getnchannels() == 2:
                try:
                    # Interpreting the stereo frame data
                    stereo_data = np.frombuffer(frames, dtype=np.int16)
                    logger.info(f"Stereo data shape: {stereo_data.shape}")
                    # The reshape(-1, 2) method is used to ensure that the stereo data
                    # is reshaped into a 2-column array (representing left and right channels).
                    # Then, the mean is calculated across the columns (axis=1) to produce the
                    # mono audio data.
                    # Reshaping and averaging to convert to mono
                    mono_data = stereo_data.reshape(-1, 2).mean(axis=1).astype(np.int16)
                    logger.info(f"Mono data shape: {mono_data.shape}")
                    # Convert mono_data back to bytes
                    mono_frames = mono_data.tobytes()
                    stream.write(mono_frames)
                except Exception as e:
                    logger.error(f"Error during stereo to mono conversion: {e}")
            else:
                mono_data = np.frombuffer(frames, dtype=np.int16)
                logger.info(f"Mono data shape: {mono_data.shape}")
                mono_frames = mono_data.tobytes()
                stream.write(mono_frames)

            time.sleep(0.1)
    except Exception as e:
        logger.error(f"An error occurred: {e}")
    finally:
        wav_fh.close()
        stream.close()
        speech_recognizer.stop_continuous_recognition()
        return final_text

In [91]:
speech_recognition_with_push_stream_vol2(AUDIO_FILE_PCM_STEREO_MONO)

ERROR:__main__:An error occurred: unknown format: 7


UnboundLocalError: local variable 'wav_fh' referenced before assignment

In [59]:
import time
import wave


def speech_recognition_with_push_stream(audio: str):
    """gives an example how to use a push audio stream to recognize speech from a custom audio
    source"""
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region
    )

    # setup the audio stream
    stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # instantiate the speech recognizer with push stream input
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )
    done = False
    # The number of bytes to push per buffer
    n_bytes = 3500
    wav_fh = wave.open(audio)
    final_text = ""

    def update_final_text(evt):
        nonlocal final_text
        final_text += " " + evt.result.text

    def stop_cb(evt: speechsdk.SessionEventArgs):
        logger.info(f"CLOSING on {evt}")
        nonlocal done
        done = True

    speech_recognizer.recognizing.connect(
        lambda evt: logger.info(f"RECOGNIZING: {evt}")
    )
    speech_recognizer.recognized.connect(update_final_text)
    speech_recognizer.session_started.connect(
        lambda evt: logger.info(f"SESSION STARTED: {evt}")
    )
    speech_recognizer.session_stopped.connect(
        lambda evt: logger.info(f"SESSION STOPPED {evt}")
    )
    speech_recognizer.canceled.connect(lambda evt: logger.info(f"CANCELED {evt}"))
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # start continuous speech recognition
    speech_recognizer.start_continuous_recognition()

    # start pushing data until all data has been read from the file
    try:
        while not done:
            frames = wav_fh.readframes(n_bytes // 2)
            print("read {} bytes".format(len(frames)))
            if not frames:
                break

            stream.write(frames)
            time.sleep(0.1)
    finally:
        # stop recognition and clean up
        wav_fh.close()
        stream.close()
        speech_recognizer.stop_continuous_recognition()
        return final_text

In [61]:
final_text = speech_recognition_with_push_stream(audio=COUNTDOWN_FILE_MONO)

INFO:__main__:SESSION STARTED: SessionEventArgs(session_id=4abc39570d5e4783939c8a895a707651)


read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes


INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(session_id=4abc39570d5e4783939c8a895a707651, result=SpeechRecognitionResult(result_id=88eb102381664080af7fdc54383ebeba, text="the company's", reason=ResultReason.RecognizingSpeech))


read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 bytes
read 10500 byt

INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(session_id=4abc39570d5e4783939c8a895a707651, result=SpeechRecognitionResult(result_id=c029229d3b8c43bf9893f58fdf3f26de, text="but the", reason=ResultReason.RecognizingSpeech))
INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(session_id=4abc39570d5e4783939c8a895a707651, result=SpeechRecognitionResult(result_id=c183022a923849b1bd85f46f3cc1c7ea, text="but the US", reason=ResultReason.RecognizingSpeech))
INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(session_id=4abc39570d5e4783939c8a895a707651, result=SpeechRecognitionResult(result_id=5a9142fed31947b59e35195965ccdfe4, text="but the USAFTA", reason=ResultReason.RecognizingSpeech))
INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(session_id=4abc39570d5e4783939c8a895a707651, result=SpeechRecognitionResult(result_id=d26a223bb43b468ab205bf7a37cad9eb, text="but the USAFTA has", reason=ResultReason.RecognizingSpeech))
INFO:__main__:RECOGNIZING: SpeechRecognitionEventArgs(sessi

In [None]:
import sounddevice as sd
import numpy as np
import azure.cognitiveservices.speech as speechsdk
import threading
import queue


def real_time_audio_stream(queue, sample_rate, channels):
    def callback(indata, outdata, frames, time, status):
        if status:
            print(status)
        # Convert to mono by averaging if input is stereo
        if indata.shape[1] > 1:
            indata = np.mean(indata, axis=1, keepdims=True)
        # Ensure audio data is in 16-bit format
        indata = (indata * 32767).astype(np.int16)
        queue.put(indata.copy())

    # Create a stream
    with sd.Stream(
        callback=callback, samplerate=sample_rate, channels=channels, dtype="int16"
    ):
        print("Streaming started...")
        while True:
            sd.sleep(1000)


def speech_recognition_with_push_stream(queue):
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region
    )
    stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=stream)
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    def stop_cb(evt):
        nonlocal done
        done = True

    done = False
    final_text = ""

    speech_recognizer.recognized.connect(
        lambda evt: print(f"Recognized: {evt.result.text}")
    )
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    speech_recognizer.start_continuous_recognition()
    try:
        while not done:
            data = queue.get()
            if data is None:
                break
            stream.write(data.tobytes())
    finally:
        speech_recognizer.stop_continuous_recognition()
        return final_text


# Queue for transferring audio data
audio_queue = queue.Queue()

# Start the audio capture thread
audio_thread = threading.Thread(
    target=real_time_audio_stream, args=(audio_queue, 16000, 1)
)
audio_thread.start()

# Start speech recognition
recognized_text = speech_recognition_with_push_stream(audio_queue)
print("Recognized Text:", recognized_text)

In [23]:
final_text

' '

In [37]:
import time


def speech_recognition_with_push_stream(file_buffer):
    speech_config = speechsdk.SpeechConfig(
        subscription=speech_key, region=service_region
    )

    # Setup the audio stream
    stream = speechsdk.audio.PushAudioInputStream()
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # Instantiate the speech recognizer with push stream input
    speech_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speech_config, audio_config=audio_config
    )

    recognized_text = []  # List to accumulate recognized text

    # Callback function to collect the recognized text
    def recognized(args):
        if args.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            recognized_text.append(args.result.text)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognized.connect(recognized)
    speech_recognizer.recognizing.connect(
        lambda evt: print("RECOGNIZING: {}".format(evt))
    )
    speech_recognizer.session_started.connect(
        lambda evt: print("SESSION STARTED: {}".format(evt))
    )
    speech_recognizer.session_stopped.connect(
        lambda evt: print("SESSION STOPPED {}".format(evt))
    )
    speech_recognizer.canceled.connect(lambda evt: print("CANCELED {}".format(evt)))

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()

    try:
        # Push data from file_buffer to the stream
        buffer_size = 3200  # Size of each chunk of data
        for i in range(0, len(file_buffer), buffer_size):
            chunk = file_buffer[i : i + buffer_size]
            if not chunk:
                break
            stream.write(chunk)
            time.sleep(0.1)
    finally:
        # Stop recognition and clean up
        stream.close()
        speech_recognizer.stop_continuous_recognition()

    # Print the final recognized text
    final_recognized_text = " ".join(recognized_text)
    print("Final Recognized Text:", final_recognized_text)

In [17]:
# set up cognitive services
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
audio_input = speechsdk.audio.AudioInputStream(file_buffer)
audio_config = speechsdk.audio.AudioConfig(stream=audio_input)
recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config, audio_config=audio_config
)


# handle results
def recognized(args):
    print(f"(recognized)  Reason: {args.result.reason} Text: {args.result.text}")


def canceled(args):
    print("canceled")


recognizer.recognized.connect(recognized)
recognizer.canceled.connect(canceled)

# start!
recognizer.start_continuous_recognition()

RuntimeError: Exception with error code: 
[CALL STACK BEGIN]

    > GetModuleObject
    - audio_config_get_audio_processing_options
    - audio_config_create_audio_input_from_stream
    - ffi_call_win64
    - ffi_call
    - ffi_call
    - DllCanUnloadNow
    - DllCanUnloadNow
    - 00007FFC9DFC7928 (SymFromAddr() error: Attempt to access invalid address.)
    - PyObject_Call
    - PyEval_GetFuncDesc
    - PyEval_EvalFrameDefault
    - PyEval_EvalFrameDefault
    - PyFunction_Vectorcall
    - PyVectorcall_Call
    - PyObject_Call

[CALL STACK END]

Exception with an error code: 0x5 (SPXERR_INVALID_ARG)