Azure AI speech to text \
https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python \
https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master

In [16]:
# Continuous conversation transcription (speech to text with speaker IDs) from a WAV file
import os
import time
import azure.cognitiveservices.speech as speechsdk

def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    """Report a canceled transcription and, when the event carries them, the details.

    Args:
        evt: Event arguments delivered with the transcriber's ``canceled`` event.
    """
    print('Canceled event')

    # The parameter is annotated with the base session event type, so look the
    # details up defensively instead of assuming the attribute is present.
    details = getattr(evt, 'cancellation_details', None)
    if details is None:
        return

    # Printing the reason lets the reader tell an error apart from other
    # causes of cancellation.
    print('\tReason={}'.format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error:
        print('\tErrorDetails={}'.format(details.error_details))

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    """Announce that the transcription session has stopped.

    Args:
        evt: Event arguments supplied by the SDK; accepted to satisfy the
            callback signature but not read here.
    """
    message = 'SessionStopped event'
    print(message)

def conversation_transcriber_transcribed_cb(evt: speechsdk.transcription.ConversationTranscriptionEventArgs):
    """Print the outcome of one ``transcribed`` event.

    Always prints a ``TRANSCRIBED:`` header. For recognized speech it then
    prints the text and the speaker ID; for a no-match result it prints the
    no-match details. Any other result reason prints nothing further.

    Args:
        evt: Event arguments delivered with the transcriber's ``transcribed``
            event; its ``result`` is read for reason, text and speaker ID.
    """
    print('TRANSCRIBED:')

    result = evt.result
    reason = result.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print('\tText={}'.format(result.text))
        print('\tSpeaker ID={}'.format(result.speaker_id))
        return

    if reason == speechsdk.ResultReason.NoMatch:
        print('\tNOMATCH: Speech could not be TRANSCRIBED: {}'.format(result.no_match_details))

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    """Announce that the transcription session has started.

    Args:
        evt: Event arguments supplied by the SDK; accepted to satisfy the
            callback signature but not read here.
    """
    message = 'SessionStarted event'
    print(message)

def recognize_from_file(audio_file_path: str = "..\\data\\speech\\time.wav") -> None:
    """Run conversation transcription on an audio file and print the events.

    Reads the Speech resource key and region from the environment variables
    "AZURE_SPEECH_KEY" and "AZURE_SPEECH_REGION", connects the module-level
    callbacks to the transcriber, starts transcription, and blocks until a
    session-stopped or canceled event arrives.

    Args:
        audio_file_path: Path of the audio file to transcribe. Defaults to the
            sample file that was previously hard-coded.

    Raises:
        ValueError: If either required environment variable is missing or empty.
    """
    # Validate credentials up front so a missing variable produces a clear
    # message instead of None being handed to the SDK.
    subscription_key = os.environ.get('AZURE_SPEECH_KEY')
    service_region = os.environ.get('AZURE_SPEECH_REGION')
    if not subscription_key or not service_region:
        raise ValueError(
            'Environment variables "AZURE_SPEECH_KEY" and "AZURE_SPEECH_REGION" must both be set'
        )

    speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=service_region)
    speech_config.speech_recognition_language = "en-US"

    audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config
    )

    transcribing_stop = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop waiting upon receiving an event `evt`."""
        print('CLOSING on {}'.format(evt))
        nonlocal transcribing_stop
        transcribing_stop = True

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop waiting on either session stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    # Wait on the returned future so start-up has completed before polling.
    conversation_transcriber.start_transcribing_async().get()

    try:
        # Poll the flag set by stop_cb; the SDK invokes callbacks for us.
        while not transcribing_stop:
            time.sleep(.5)
    finally:
        # Always request the stop (even if the wait loop is interrupted) and
        # wait for it to finish before the transcriber goes out of scope.
        conversation_transcriber.stop_transcribing_async().get()

# Main

try:
    recognize_from_file()
except Exception as exc:
    # Cell-level boundary: report the failure as a single line of output.
    print(f"Encountered exception. {exc}")

SessionStarted event
TRANSCRIBED:
	Text=What time is it?
	Speaker ID=Guest-1
Canceled event
CLOSING on ConversationTranscriptionCanceledEventArgs(session_id=d771b0e1732c4246bd76807f9b448a73, result=ConversationTranscriptionResult(result_id=c7509e98c44a4f1885cf4a2dcb1cbe13, speaker_id=, text=, reason=ResultReason.Canceled))
SessionStopped event
CLOSING on SessionEventArgs(session_id=d771b0e1732c4246bd76807f9b448a73)


In [15]:
# single-shot recognition speech to text
import os
import azure.cognitiveservices.speech as speechsdk

def speech_to_text(audio_file_path: str, subscription_key: str, service_region: str) -> None:
    """Run single-shot speech recognition on an audio file and print the outcome.

    Makes one ``recognize_once_async()`` call, so exactly one result is
    produced and reported for the file.

    Args:
        audio_file_path: Path of the audio file to recognize.
        subscription_key: Speech resource key.
        service_region: Speech resource region.
    """
    speech_config = speechsdk.SpeechConfig(subscription=subscription_key, region=service_region)
    audio_input = speechsdk.AudioConfig(filename=audio_file_path)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
    # .get() blocks until the asynchronous recognition has produced its result.
    result = speech_recognizer.recognize_once_async().get()

    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        print("Recognized: {}".format(result.text))
    elif result.reason == speechsdk.ResultReason.NoMatch:
        # Include the details so a no-match outcome can be diagnosed.
        print("No speech could be recognized: {}".format(result.no_match_details))
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Speech Recognition canceled: {}".format(cancellation_details.reason))
        # On an error cancellation, also print the error details.
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print("Error details: {}".format(cancellation_details.error_details))

if __name__ == "__main__":
    # Sample input; "..\\data\\speech\\time.wav" is the other sample file.
    audio_file_path = "..\\data\\speech\\BillGates_2010.wav"

    # Indexing os.environ raises KeyError when a credential is not set.
    subscription_key = os.environ["AZURE_SPEECH_KEY"]
    service_region = os.environ["AZURE_SPEECH_REGION"]

    speech_to_text(
        audio_file_path=audio_file_path,
        subscription_key=subscription_key,
        service_region=service_region,
    )


No speech could be recognized
