In [6]:
import os
# Set the working directory to your project's main folder
os.chdir('/mnt/c/Users/pablosal/Desktop/lilly-workshop-gbb-text-to-speach/')
import azure.cognitiveservices.speech as speechsdk
from utils.ml_logging import get_logger
from dotenv import load_dotenv
load_dotenv()
logger = get_logger()

In [14]:
from pydub import AudioSegment
from pydub.utils import mediainfo

In [21]:
def log_audio_characteristics(file_name: str):
    # Check if file exists
    if not os.path.exists(file_name):
        logger.error(f"File not found: {file_name}")
        return

    try:
        audio = AudioSegment.from_file(file_name)
        base_name, _ = os.path.splitext(file_name)
        pcm_file_name = base_name + ".pcm"

        audio.export(pcm_file_name, format="wav")
        info = mediainfo(pcm_file_name)

        logger.info(f"Audio file characteristics for {pcm_file_name}:")
        logger.info(f"Number of channels: {info['channels']}")
        if info['bits_per_sample'].isdigit():
            sample_width = int(info['bits_per_sample']) / 8
            logger.info(f"Sample width (bytes): {sample_width}")
        else:
            logger.error("Invalid bits_per_sample value")
        logger.info(f"Sampling frequency (Hz): {info['sample_rate']}")
        if info['duration'].replace('.', '', 1).isdigit() and info['sample_rate'].isdigit():
            number_of_frames = int(float(info['duration']) * int(info['sample_rate']))
            logger.info(f"Number of frames: {number_of_frames}")
        else:
            logger.error("Invalid duration or sample rate values")
    except Exception as e:
        logger.error(f"An error occurred: {e}")

In [None]:
from typing import Optional

def log_audio_characteristics(file_name: str) -> Optional[None]:
    """
    Logs the characteristics of an audio file.

    Args:
        file_name (str): The path to the audio file.

    Returns:
        None
    """
    # Check if file exists
    if not os.path.exists(file_name):
        logger.error(f"File not found: {file_name}")
        return

    try:
        audio = AudioSegment.from_file(file_name)
        base_name, _ = os.path.splitext(file_name)
        pcm_file_name = base_name + ".pcm"

        audio.export(pcm_file_name, format="wav")
        info = mediainfo(pcm_file_name)

        logger.info(f"Audio file characteristics for {pcm_file_name}:")
        logger.info(f"Number of channels: {info['channels']}")
        if info['bits_per_sample'].isdigit():
            sample_width = int(info['bits_per_sample']) / 8
            logger.info(f"Sample width (bytes): {sample_width}")
        else:
            logger.error("Invalid bits_per_sample value")
        logger.info(f"Sampling frequency (Hz): {info['sample_rate']}")
        if info['duration'].replace('.', '', 1).isdigit() and info['sample_rate'].isdigit():
            number_of_frames = int(float(info['duration']) * int(info['sample_rate']))
            logger.info(f"Number of frames: {number_of_frames}")
        else:
            logger.error("Invalid duration or sample rate values")
    except Exception as e:
        logger.error(f"An error occurred: {e}")

In [None]:
from typing import Optional

def log_audio_characteristics(file_name: str) -> Optional[None]:
    """
    Logs the characteristics of an audio file.

    Args:
        file_name (str): The path to the audio file.

    Returns:
        None
    """
    # Check if file exists
    if not os.path.exists(file_name):
        logger.error(f"File not found: {file_name}")
        return

    try:
        audio = AudioSegment.from_file(file_name)
        base_name, _ = os.path.splitext(file_name)
        pcm_file_name = base_name + ".pcm"

        audio.export(pcm_file_name, format="wav")
        info = mediainfo(pcm_file_name)

        logger.info(f"Audio file characteristics for {pcm_file_name}:")
        logger.info(f"Number of channels: {info['channels']}")
        if info['bits_per_sample'].isdigit():
            sample_width = int(info['bits_per_sample']) / 8
            logger.info(f"Sample width (bytes): {sample_width}")
        else:
            logger.error("Invalid bits_per_sample value")
        logger.info(f"Sampling frequency (Hz): {info['sample_rate']}")
        if info['duration'].replace('.', '', 1).isdigit() and info['sample_rate'].isdigit():
            number_of_frames = int(float(info['duration']) * int(info['sample_rate']))
            logger.info(f"Number of frames: {number_of_frames}")
        else:
            logger.error("Invalid duration or sample rate values")
    except Exception as e:
        logger.error(f"An error occurred: {e}")

In [22]:
KEY = os.getenv('KEY')
REGION = os.getenv('REGION')
FILE_NAME = '/mnt/c/Users/pablosal/Desktop/lilly-workshop-gbb-text-to-speach/notebooks/dev/8000khz-mulaw-pullstream/7.wav'

In [23]:
log_audio_characteristics(FILE_NAME)

2023-11-15 08:02:35,611 - micro - MainProcess - INFO     Audio file characteristics for /mnt/c/Users/pablosal/Desktop/lilly-workshop-gbb-text-to-speach/notebooks/dev/8000khz-mulaw-pullstream/7.pcm: (2467322109.py:log_audio_characteristics:15)
2023-11-15 08:02:35,612 - micro - MainProcess - INFO     Number of channels: 2 (2467322109.py:log_audio_characteristics:16)
2023-11-15 08:02:35,613 - micro - MainProcess - INFO     Sample width (bytes): 1.0 (2467322109.py:log_audio_characteristics:19)
2023-11-15 08:02:35,614 - micro - MainProcess - INFO     Sampling frequency (Hz): 8000 (2467322109.py:log_audio_characteristics:22)
2023-11-15 08:02:35,615 - micro - MainProcess - INFO     Number of frames: 182880 (2467322109.py:log_audio_characteristics:25)


In [11]:
def from_file_async(file_name: str, key: str, region: str) -> str:
    """
    Transcribes speech from an audio file using Azure Cognitive Services Speech SDK.

    Args:
        file_name (str): The name of the audio file to transcribe.
        key (str): The subscription key for the Speech service.
        region (str): The region for the Speech service.

    Returns:
        str: The transcribed text from the audio file.
    """
    speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
    audio_config = speechsdk.AudioConfig(filename=file_name)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    logger.info(f"Transcribing speech from file: {file_name}")
    result = speech_recognizer.recognize_once_async().get()
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        logger.info(f"Transcription result: {result.text}")
    elif result.reason == speechsdk.ResultReason.NoMatch:
        logger.warning(f"No speech could be recognized: {result.no_match_details}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        logger.error(f"Speech Recognition canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            logger.error(f"Error details: {cancellation_details.error_details}")
    return result.text

In [24]:
import time 
def from_file_continous(file_name: str, key: str, region: str) -> str:
    """performs continuous speech recognition with input from an audio file"""
    # Set up logging
  
    speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
    audio_config = speechsdk.audio.AudioConfig(filename=file_name)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    log_audio_characteristics(file_name)

    done = False
    final_text = ""

    def update_final_text(evt):
        nonlocal final_text
        final_text += ' ' + evt.result.text

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        logger.info('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: logger.info('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(update_final_text)
    speech_recognizer.session_started.connect(lambda evt: logger.info('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: logger.info('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: logger.info('CANCELED {}'.format(evt)))
    # Stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.1)

    speech_recognizer.stop_continuous_recognition()

    return final_text.strip() 

In [25]:
text = from_file_continous(file_name=FILE_NAME, key=KEY, region=REGION)

ValueError: either subscription key or authorization token must be given along with a region

In [50]:
text

'What is the date? May 15th, 1980. Thursday, May 15th, 19180. What is the date? July 6th. Saturday, July 6th, 2024.'

In [12]:
from_file(file_name=FILE_NAME, key=KEY, region=REGION)

2023-11-14 16:46:38,782 - micro - MainProcess - INFO     Transcribing speech from file: /mnt/c/Users/pablosal/Desktop/lilly-workshop-gbb-text-to-speach/notebooks/dev/8000khz-mulaw-pullstream/7.wav (248621148.py:from_file:17)


2023-11-14 16:46:39,395 - micro - MainProcess - INFO     Transcription result: What is the date? (248621148.py:from_file:20)


'What is the date?'

In [13]:
speech_config = speechsdk.SpeechConfig(subscription=KEY, region=REGION)

In [14]:
speech_config.enable_audio_logging()

In [16]:
audio_config = speechsdk.AudioConfig(filename=FILE_NAME)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

In [15]:
# speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

In [33]:
result = speech_recognizer.recognize_once?

[0;31mSignature:[0m [0mspeech_recognizer[0m[0;34m.[0m[0mrecognize_once[0m[0;34m([0m[0;34m)[0m [0;34m->[0m [0mazure[0m[0;34m.[0m[0mcognitiveservices[0m[0;34m.[0m[0mspeech[0m[0;34m.[0m[0mSpeechRecognitionResult[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Performs recognition in a blocking (synchronous) mode. Returns after a single utterance is
recognized. The end of a single utterance is determined by listening for silence at the end
or until a maximum of 15 seconds of audio is processed. The task returns the recognition
text as result. For long-running multi-utterance recognition, use
:py:meth:`.start_continuous_recognition_async` instead.

:returns: The result value of the synchronous recognition.
[0;31mFile:[0m      ~/miniconda3/envs/lilly-speach-to-text/lib/python3.9/site-packages/azure/cognitiveservices/speech/speech.py
[0;31mType:[0m      method

In [32]:
result.text

'What is the date?'

In [24]:
result.g

'{"Id":"0a812f792b094255b31eb6e1abf81c88","RecognitionStatus":"Success","DisplayText":"May 15th, 1980.","Offset":38600000,"Duration":16000000,"Channel":0}'

In [20]:
result.text

'What is the date?'

In [None]:
speech_config

In [None]:
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config)

In [51]:
import azure.cognitiveservices.speech as speechsdk
import time 
import os
from utils.ml_logging import get_logger
import argparse
from dotenv import load_dotenv
load_dotenv()
logger = get_logger()


KEY = os.getenv('KEY')
REGION = os.getenv('REGION')
FILE_NAME = '/mnt/c/Users/pablosal/Desktop/lilly-workshop-gbb-text-to-speach/notebooks/dev/8000khz-mulaw-pullstream/7.wav'
language_understanding_app_id = os.getenv('INTENT_KEY')



intent_config: speechsdk.SpeechConfig = speechsdk.SpeechConfig(subscription=KEY, region=REGION)
audio_config: speechsdk.audio.AudioConfig = speechsdk.audio.AudioConfig(filename=FILE_NAME)
intent_recognizer: speechsdk.intent.IntentRecognizer = speechsdk.intent.IntentRecognizer(speech_config=intent_config, audio_config=audio_config)

# set up the intents that are to be recognized. These can be a mix of simple phrases and
# intents specified through a LanguageUnderstanding Model.
model = speechsdk.intent.LanguageUnderstandingModel(app_id=language_understanding_app_id)

In [65]:
def recognize_intent_continuous(file_name: str, key: str, region: str) -> None:
    """
    Performs continuous intent recognition from input from an audio file.
    Uses the Azure Cognitive Services Speech SDK to set up an intent recognizer,
    add intents to be recognized, and start continuous recognition.
    Prints the output of the recognition to the console.

    Args:
        file_name (str): The name of the audio file to transcribe.
        key (str): The subscription key for the Speech service.
        region (str): The region for the Speech service.
    """
    # Set up the intent recognizer
    intent_config: speechsdk.SpeechConfig = speechsdk.SpeechConfig(subscription=key, region=region)
    audio_config: speechsdk.audio.AudioConfig = speechsdk.audio.AudioConfig(filename=file_name)
    intent_recognizer: speechsdk.intent.IntentRecognizer = speechsdk.intent.IntentRecognizer(speech_config=intent_config, audio_config=audio_config)

    # set up the intents that are to be recognized. These can be a mix of simple phrases and
    # intents specified through a LanguageUnderstanding Model.
    model = speechsdk.intent.LanguageUnderstandingModel(app_id=language_understanding_app_id)
    intents = [
        (model, "HomeAutomation.TurnOn"),
        (model, "HomeAutomation.TurnOff"),
        ("This is a test.", "test"),
        ("Switch the channel to 34.", "34"),
        ("what's the weather like", "weather"),
    ]
    intent_recognizer.add_intents(intents)

    # Connect callback functions to the signals the intent recognizer fires.
    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    intent_recognizer.session_started.connect(lambda evt: print("SESSION_START: {}".format(evt)))
    intent_recognizer.speech_end_detected.connect(lambda evt: print("SPEECH_END_DETECTED: {}".format(evt)))
    # event for intermediate results
    intent_recognizer.recognizing.connect(lambda evt: print("RECOGNIZING: {}".format(evt)))
    # event for final result
    intent_recognizer.recognized.connect(lambda evt: print(
        "RECOGNIZED: {}\n\tText: {} (Reason: {})\n\tIntent Id: {}\n\tIntent JSON: {}".format(
            evt, evt.result.text, evt.result.reason, evt.result.intent_id, evt.result.intent_json)))

    # cancellation event
    intent_recognizer.canceled.connect(lambda evt: print(f"CANCELED: {evt.cancellation_details} ({evt.reason})"))

    # stop continuous recognition on session stopped, end of speech or canceled events
    intent_recognizer.session_stopped.connect(stop_cb)
    intent_recognizer.speech_end_detected.connect(stop_cb)
    intent_recognizer.canceled.connect(stop_cb)

    # And finally run the intent recognizer. The output of the callbacks should be printed to the console.
    intent_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    intent_recognizer.stop_continuous_recognition()
    # </IntentContinuousRecognitionWithFile>

In [66]:
recognize_intent_continuous(file_name=FILE_NAME, key=KEY, region=REGION)

SESSION_START: SessionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a)
RECOGNIZING: IntentRecognitionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a, result=IntentRecognitionResult(result_id=38a249704fe3412385174092ed0eb4f1, text="what is the", intent_id=, reason=ResultReason.RecognizingSpeech))
SPEECH_END_DETECTED: RecognitionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a)
CLOSING on RecognitionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a)
RECOGNIZED: IntentRecognitionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a, result=IntentRecognitionResult(result_id=04c7e8e4b9c747439abfaa2cc6183ecc, text="What is the date?", intent_id=, reason=ResultReason.RecognizedSpeech))
	Text: What is the date? (Reason: ResultReason.RecognizedSpeech)
	Intent Id: 
	Intent JSON: 
RECOGNIZING: IntentRecognitionEventArgs(session_id=5148196610eb42618ea78e19bb722d2a, result=IntentRecognitionResult(result_id=69f1ceb0bf0b43bebe3a27ef0903d909, text="may", intent_id=, reason=ResultR

In [54]:
intents = [
        (model, "HomeAutomation.TurnOn"),
        (model, "HomeAutomation.TurnOff"),
        ("This is a test.", "test"),
        ("Switch the channel to 34.", "34"),
        ("what's the weather like", "weather"),
    ]
intent_recognizer.add_intents(intents)

# Connect callback functions to the signals the intent recognizer fires.
done = False


In [None]:
def stop_cb(evt: speechsdk.SessionEventArgs):
    """callback that signals to stop continuous recognition upon receiving an event `evt`"""
    print('CLOSING on {}'.format(evt))
    nonlocal done
    done = True

In [None]:




intent_recognizer.session_started.connect(lambda evt: print("SESSION_START: {}".format(evt)))
intent_recognizer.speech_end_detected.connect(lambda evt: print("SPEECH_END_DETECTED: {}".format(evt)))
# event for intermediate results
intent_recognizer.recognizing.connect(lambda evt: print("RECOGNIZING: {}".format(evt)))
# event for final result
intent_recognizer.recognized.connect(lambda evt: print(
    "RECOGNIZED: {}\n\tText: {} (Reason: {})\n\tIntent Id: {}\n\tIntent JSON: {}".format(
        evt, evt.result.text, evt.result.reason, evt.result.intent_id, evt.result.intent_json)))

# cancellation event
intent_recognizer.canceled.connect(lambda evt: print(f"CANCELED: {evt.cancellation_details} ({evt.reason})"))

# stop continuous recognition on session stopped, end of speech or canceled events
intent_recognizer.session_stopped.connect(stop_cb)
intent_recognizer.speech_end_detected.connect(stop_cb)
intent_recognizer.canceled.connect(stop_cb)

# And finally run the intent recognizer. The output of the callbacks should be printed to the console.
intent_recognizer.start_continuous_recognition()
while not done:
    time.sleep(.5)

intent_recognizer.stop_continuous_recognition()