In [1]:
# A demo of speech-to-text in Azure, using the API with speech keys and endpoints, in a Jupyter notebook

# a) how to convert audio to text, and

# b) how to convert audio to text and save the converted text to files.

# Inspiration: https://github.com/caiomsouza/Microsoft-Cognitive-Services/tree/master/speech-to-text/nlp-demo-21-april-2021/notebook

In [8]:
# Importing libraries

import os
from pathlib import Path
import azure.cognitiveservices.speech as speechsdk

In [9]:
# Configuring speech key and service region, found in Azure credentials.
# The SPEECH_KEY / SPEECH_REGION environment variables take precedence so the real
# key does not have to be typed into (and saved with) the notebook; the string
# literals are only placeholders used when the variables are not set.

speech_key = os.environ.get("SPEECH_KEY", "key")
service_region = os.environ.get("SPEECH_REGION", "region")

In [10]:
# a) how to convert audio to text

# Absolute path of the audio file that will be transcribed
audio_file_path = Path("audiofile").resolve()

# Credentials for the Speech service
speech_config = speechsdk.SpeechConfig(
    subscription=speech_key,
    region=service_region,
)

# Audio is read from the file named above
audio_input = speechsdk.AudioConfig(filename=str(audio_file_path))

# Recognizer configured for the "en-GB" language
speech_recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    audio_config=audio_input,
    language="en-GB",
)

In [None]:
# a) how to convert audio to text

# Function that runs the recognition and prints the outcome

def recognize_audio_file():
    """Run one single-shot recognition and print what came back.

    Uses the module-level ``speech_recognizer``. Depending on the result's
    reason, prints the recognized text, the no-match details, or the
    cancellation reason (plus the error details when the cancellation was
    caused by an error). Any other reason prints nothing further.
    """
    print('Recognizing first result...')

    outcome = speech_recognizer.recognize_once()
    reason = outcome.reason

    if reason == speechsdk.ResultReason.RecognizedSpeech:
        print('Recognized: {}'.format(outcome.text))
        return

    if reason == speechsdk.ResultReason.NoMatch:
        print('No speech could be recognized: {}'.format(outcome.no_match_details))
        return

    if reason != speechsdk.ResultReason.Canceled:
        return

    # Canceled: always report why, and add the error text for real failures
    details = outcome.cancellation_details
    print('Speech Recognition canceled: {}'.format(details.reason))
    if details.reason == speechsdk.CancellationReason.Error:
        print('Error details: {}'.format(details.error_details))

# Run the single-shot demo when this code is executed as the main module
if __name__ == "__main__":
    recognize_audio_file()

In [None]:
# b) converting audio to text and saving converted texts into files

# importing libraries

import glob
import azure.cognitiveservices.speech as speechsdk
import time
import json
import pandas as pd

# Loading the settings (Speech credentials and audio location) from a JSON config file

config_file_name = "config_file_dev.json"

# Explicit UTF-8 so the platform's default encoding cannot affect how the file is read
with open(config_file_name, 'r', encoding="utf-8") as json_data_file:
    configuration = json.load(json_data_file)

# The configuration itself is deliberately not printed: it contains the subscription key
print("################################")
print("################################")

# Speech SDK
speech_key = configuration["speech_api"]["speech_key"]
service_region = configuration["speech_api"]["service_region"]

# File location
location = configuration["location"]["full_file_path"]

print ("####################################################################################")
# Only report whether a key was found; printing the key would store the secret
# in the notebook's saved output
print("speech_key: " + ("<loaded, hidden>" if speech_key else "<missing>"))
print("service_region: " + service_region)
print("full_file_path: " + location)
print ("####################################################################################")

In [None]:
# b) converting audio to text and saving converted texts into files

# Creating an instance of a speech config with specified subscription key and service region

# speech_key and service_region must already be defined by an earlier cell
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

print ("####################################################################################")
print ("PROGRAM STARTS")
print ("####################################################################################")

def speech_recognize_continuous_from_file(file):
    """Transcribe one audio file with continuous recognition and save the text as CSV.

    The text of every final recognition result is collected, printed, and
    written to ``<file>-speech-to-text-csv-output.csv``.

    Uses the module-level ``speech_key`` and ``service_region`` to build the
    speech configuration.

    Args:
        file: Path of the audio file as a string (the output file name is
            built by appending a suffix to it).
    """
    # <SpeechContinuousRecognitionWithFile>
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_config = speechsdk.audio.AudioConfig(filename=file)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False

    def stop_cb(evt):
        """callback that signals the wait loop to finish upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        # Only raise the flag here. This callback is connected to two events,
        # so it may run more than once; setting the flag is safe to repeat.
        nonlocal done
        done = True

    all_results = []

    def handle_final_result(evt):
        """callback that keeps the text of each final recognition result"""
        all_results.append(evt.result.text)

    speech_recognizer.recognized.connect(handle_final_result)
    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # finish waiting on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and poll until a callback raises the flag
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # Stop on the calling thread, exactly once, instead of from inside an event
    # callback: stopping from within a handler may block on the thread that is
    # delivering the event.
    speech_recognizer.stop_continuous_recognition()
    # </SpeechContinuousRecognitionWithFile>

    print("Printing all results:")
    print(all_results)

    # One row per recognized utterance
    df = pd.DataFrame(all_results)

    file_name = file + r"-speech-to-text-csv-output.csv"
    df.to_csv(file_name)

    print ("Audio File: "+file+" converted successfully")
    print ("####################################################################################")


# Define the files location and list the audio files (*.wav) found anywhere below it.
# NOTE: this assignment replaces any `location` value set earlier in the notebook.
location = 'data'

# "**" only recurses when it is a whole path component, so the directory, "**" and
# the file pattern are joined with path separators. Sorting makes the processing
# order deterministic.
wav_pattern = os.path.join(location, "**", "*.wav")
fileset = sorted(glob.glob(wav_pattern, recursive=True))

# Loop to call function to convert audio files to text
for file in fileset:
    speech_recognize_continuous_from_file(file)
    print(file)

print ("####################################################################################")
print ("PROGRAM END")
print ("####################################################################################")
print ("Thank you for using this code")