# Read Audio Files

Read WAV or MP3 Files with Python and transcribe to text.

### Install the Required Libraries

In [3]:
%pip install azure-cognitiveservices-speech
%pip install openai

Note: you may need to restart the kernel to use updated packages.



### Python Imports

In [4]:
import azure.cognitiveservices.speech as speech_sdk
import sys

import time
import os
from dotenv import load_dotenv
from openai import AzureOpenAI

sys.path.append('..\\code')

load_dotenv(override=True)




import pandas as pd


from IPython.display import display, Markdown, HTML
from PIL import Image
from doc_utils import *
from utils.bcolors import bcolors as bc  


### Make sure we have the Azure Speech information

We will need the Speech APIKEY, REGION and LANGUAGE for this notebook.

When running the below cell, the values should reflect the Azure Speech reource you have created in 

In [None]:
speech_info = {
        'SPEECH_APIKEY': os.environ.get('SPEECH_APIKEY'),
        'SPEECH_REGION': os.environ.get('SPEECH_REGION'),
        'SPEECH_LANGUAGE': os.environ.get('SPEECH_LANGUAGE'),
}

speech_info

In [None]:
model_info = {
        'AZURE_OPENAI_MODEL_WHISPER': os.environ.get('AZURE_OPENAI_MODEL_WHISPER'),
        'AZURE_OPENAI_KEY': os.environ.get('AZURE_OPENAI_KEY'),
        'AZURE_OPENAI_MODEL_WHISPER': os.environ.get('AZURE_OPENAI_MODEL_WHISPER'),
        'AZURE_OPENAI_ENDPOINT_WHISPER': os.environ.get('AZURE_OPENAI_ENDPOINT_WHISPER'),
        'AZURE_OPENAI_VERSION_WHISPER': os.environ.get('AZURE_OPENAI_VERSION_WHISPER'),
        'AZURE_OPENAI_RESOURCE': os.environ.get('AZURE_OPENAI_RESOURCE'),
        'AZURE_OPENAI_MODEL_VISION': os.environ.get('AZURE_OPENAI_MODEL_VISION'),
        'AZURE_OPENAI_MODEL': os.environ.get('AZURE_OPENAI_MODEL'),
}

model_info

### Code Definitions

Defining the functions that will read in the audio file and return the transcription.

In [7]:
# Configure the Azure Speech Service
def config_speech_service():
    try:
        speech_config = speech_sdk.SpeechConfig(
            subscription=speech_info['SPEECH_APIKEY'], 
            region=speech_info['SPEECH_REGION'], 
            speech_recognition_language=speech_info['SPEECH_LANGUAGE'])

        # Set parameters
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs, "5000")
        speech_config.set_property(speech_sdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, "2000")
        speech_config.set_property(speech_sdk.PropertyId.SpeechServiceConnection_EndSilenceTimeoutMs, "5000")
    
    except Exception as ex:
        print(ex)

    return speech_config

# Execute the transcription from file with Azure Speech service 
def speech_recognize_continuous_from_file(speech_config, filename):
    # Performs continuous speech recognition with input from an audio file"""
    audio_config = speech_sdk.AudioConfig(filename=filename)

    speech_recognizer = speech_sdk.SpeechRecognizer(speech_config, audio_config)

    done = False
    transcription = []

    # Callback that signals to stop continuous recognition upon receiving an event `evt`
    def stop_cb(evt: speech_sdk.SessionEventArgs):
        print('CLOSING')
        nonlocal done
        done = True
    
    # Callback that signals the recognition has been canceled
    def speech_recognizer_recognition_canceled_cb(evt: speech_sdk.SessionEventArgs):
        print('Canceled event')

    # Callback that signals the recognition session has been stopped
    def speech_recognizer_session_stopped_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStopped event')

    # Callback while transcribing
    def speech_recognizer_recognizing_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('Transcribing: ', evt.result.text)

    # Callback when a sentence has finished
    def speech_recognizer_transcribed_cb(evt: speech_sdk.SpeechRecognitionEventArgs):
        print('TRANSCRIBED:')
        if evt.result.reason == speech_sdk.ResultReason.RecognizedSpeech:
            print(f'\tText: {evt.result.text}')
            transcription.append(evt.result.text)
        elif evt.result.reason == speech_sdk.ResultReason.NoMatch:
            print(f'\tNOMATCH: Speech could not be TRANSCRIBED: {evt.result.no_match_details}')
            stop_cb(evt)

    # Callback that signal the session has started
    def speech_recognizer_session_started_cb(evt: speech_sdk.SessionEventArgs):
        print('SessionStarted event')

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(speech_recognizer_recognizing_cb)
    speech_recognizer.recognized.connect(speech_recognizer_transcribed_cb)
    speech_recognizer.session_started.connect(speech_recognizer_session_started_cb)
    speech_recognizer.session_stopped.connect(speech_recognizer_session_stopped_cb)
    speech_recognizer.canceled.connect(speech_recognizer_recognition_canceled_cb)
    # stop transcribing on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
            
    final_text = ""
    for text in transcription:
        final_text += text + " \n"
    #print(f'TRANSCRIPTION: [{final_text}]')

    speech_recognizer.stop_continuous_recognition()

    return transcription

def config_whisper():
    whisper_client = AzureOpenAI(
        api_key=model_info['AZURE_OPENAI_KEY'],  
        api_version=model_info['AZURE_OPENAI_VERSION_WHISPER'],
        base_url=f"{model_info['AZURE_OPENAI_ENDPOINT_WHISPER']}/openai/deployments/{model_info['AZURE_OPENAI_MODEL_WHISPER']}"
    )

    return whisper_client


def transcribe_with_whisper(whisper_client, filename):
    try:
        transcript = whisper_client.audio.transcriptions.create(
            file=open(filename, "rb"), 
            model=model_info['AZURE_OPENAI_MODEL_WHISPER']
            )
        return transcript
    
    except Exception as ex:
        return ex

## Read Audio File

Read the audio file and print the transcription out.

### Transcribing using Speech Services (AI Service)

In [None]:
# Usage with Azure Speech service
speech_config=config_speech_service()
#file_path = 'sample_data/sample_audio_parte_accidente.wav'
#file_path = 'sample_data/The_National_Park.wav'
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/call_recording_en.wav'
transcript = speech_recognize_continuous_from_file(speech_config, file_path)
display(transcript)


In [None]:
# Usage with Azure Speech service
speech_config=config_speech_service()
#file_path = 'sample_data/sample_audio_parte_accidente.wav'
#file_path = 'sample_data/The_National_Park.wav'
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/sample_audio_parte_accidente.wav'
transcript = speech_recognize_continuous_from_file(speech_config, file_path)
display(transcript)


### Transcribing using Whisper

In [8]:
# Usage with Whisper
whisper_client = config_whisper()
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/sample_audio_parte_accidente.wav'
transcript2 = transcribe_with_whisper(whisper_client, file_path)
display(transcript2)

Transcription(text='Hola, acabo de tener un accidente y quería notificarlo. Hola, de acuerdo. Espero que esté bien. ¿Qué ha pasado? Estaba conduciendo por la carretera de Colmenar y me he dado un golpe con otro coche. ¿Está usted bien? Sí, solo un poco nervioso. Es normal. ¿Me puede decir su nombre completo? Claro, me llamo Álvaro Gómez Rodríguez. ¿Sabe cuál ha sido la causa del accidente? Creo que he golpeado un bache. De acuerdo. ¿Dónde se ha producido el accidente? En la carretera de Colmenar, pasada la salida 17. ¿Ha habido algún otro herido? Creo que no, pero no estoy seguro. De acuerdo, lo investigaremos. ¿Me puede dar la información del otro conductor? Sí, su nombre es Juan Delgado Rivera. De acuerdo, un momento, por favor. ¿Me puede decir su DNI, por favor? Sí. Es 12345678F. De acuerdo. ¿Qué daños ha sufrido el coche? Se ha roto el faro delantero derecho y se ha pinchado una rueda. ¿Puede conducir el coche? No lo sé, no. Va a venir a recogerlo la grúa. De acuerdo, necesitaremos

In [9]:
# Usage with Whisper
whisper_client = config_whisper()
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/01_Customer_Service_Call.wav'
transcript = transcribe_with_whisper(whisper_client, file_path)
display(transcript)

Transcription(text="Good afternoon and thank you for calling Contoso Suites. This call is being recorded for quality assurance purposes. My name is Cameron Baker. How can I assist you today? Uh, yes, hi. My name is Parker McLean and I'm calling because I wanted to change some of the details on my upcoming stay. Thank you for choosing Contoso Suites, Mr. McLean. Could you please tell me what hotel your reservation is for so I can look up the details of your stay? Uh, I'm staying at the, uh, the Airport Gateway Hotel. Okay, I found your reservation. Can you please confirm your check-in and check-out dates for me? I check in on, uh, on the 9th and check out on... Hang on, let me check. Um, right, right, right, okay. Yeah, so I'm checking in on the 9th and out on the 14th. And I requested two meeting rooms for the 11th and the 12th. Thank you. That matches what I'm seeing on your reservation. What changes were you looking to make today? I'm hoping to change the date from that hotel to the,

In [10]:
#import OAI_client
#from joblib import Parallel, delayed
import time
import itertools
import pdb


def construct_prompt(prompt_parameters):

    prompt = """

    Document Body

    Review of {prompt_parameters}


    |endoftext|

    Write answer for questions only referring to the facts in document above. 
    Sentiment can either be positive, neutral or negative.

    |endoftext|

    ### Questions
    1) What is the overall sentiment?
    2) List the sentiment aspects as bullet points.
    """

    return(prompt)



def extract_sentiment_aspects_for_batch(reviews_batch, batch_name):


    prompt = """

    Document Body

    Review of {prompt_parameters}


    |endoftext|

    Write answer for questions only referring to the facts in document above. 
    Sentiment can either be positive, neutral or negative.

    |endoftext|

    ### Questions
    1) What is the overall sentiment?
    2) List the sentiment aspects as bullet points.
    3) Overall Sentiment
    """



    p = prompt.format(prompt_parameters = reviews_batch)
    print(p)
    output = ask_LLM(p, model_info=model_info)
    print(output)

    results = output
  
    return(results)


In [None]:
# Usage with Whisper
whisper_client = config_whisper()
#file_path = 'sample_data/CNVSample049.wav'
file_path = 'sample_data/01_Customer_Service_Call.wav'
transcript = transcribe_with_whisper(whisper_client, file_path)
display(transcript)


In [12]:
extract_sentiment_aspects_for_batch(transcript, "test")



    Document Body

    Review of Transcription(text="Good afternoon and thank you for calling Contoso Suites. This call is being recorded for quality assurance purposes. My name is Cameron Baker. How can I assist you today? Uh, yes, hi. My name is Parker McLean and I'm calling because I wanted to change some of the details on my upcoming stay. Thank you for choosing Contoso Suites, Mr. McLean. Could you please tell me what hotel your reservation is for so I can look up the details of your stay? Uh, I'm staying at the, uh, the Airport Gateway Hotel. Okay, I found your reservation. Can you please confirm your check-in and check-out dates for me? I check in on, uh, on the 9th and check out on... Hang on, let me check. Um, right, right, right, okay. Yeah, so I'm checking in on the 9th and out on the 14th. And I requested two meeting rooms for the 11th and the 12th. Thank you. That matches what I'm seeing on your reservation. What changes were you looking to make today? I'm hoping to chan

"### Answers\n\n1) The overall sentiment is positive.\n2) List of sentiment aspects:\n   - The customer service representative was polite and helpful.\n   - The customer's needs were addressed promptly, even though the initial request could not be fulfilled.\n   - An alternative option was provided that was close to the desired location and included an upgrade.\n   - The customer was appreciative of the upgrade and the service provided.\n3) Overall Sentiment: Positive"

In [11]:
extract_sentiment_aspects_for_batch(transcript2, "test")



    Document Body

    Review of Transcription(text='Hola, acabo de tener un accidente y quería notificarlo. Hola, de acuerdo. Espero que esté bien. ¿Qué ha pasado? Estaba conduciendo por la carretera de Colmenar y me he dado un golpe con otro coche. ¿Está usted bien? Sí, solo un poco nervioso. Es normal. ¿Me puede decir su nombre completo? Claro, me llamo Álvaro Gómez Rodríguez. ¿Sabe cuál ha sido la causa del accidente? Creo que he golpeado un bache. De acuerdo. ¿Dónde se ha producido el accidente? En la carretera de Colmenar, pasada la salida 17. ¿Ha habido algún otro herido? Creo que no, pero no estoy seguro. De acuerdo, lo investigaremos. ¿Me puede dar la información del otro conductor? Sí, su nombre es Juan Delgado Rivera. De acuerdo, un momento, por favor. ¿Me puede decir su DNI, por favor? Sí. Es 12345678F. De acuerdo. ¿Qué daños ha sufrido el coche? Se ha roto el faro delantero derecho y se ha pinchado una rueda. ¿Puede conducir el coche? No lo sé, no. Va a venir a recogerlo

'### Answers\n\n1) The overall sentiment is neutral.\n2) Sentiment aspects:\n   - The caller is a bit nervous but otherwise unharmed, which could be seen as slightly negative but mostly neutral as the situation is under control.\n   - The conversation is factual and procedural, focusing on gathering information about the accident and the next steps, which is neutral.\n   - The assistance provided by the operator is helpful and efficient, which leans towards a positive sentiment.\n3) Overall Sentiment: Neutral. The conversation mainly revolves around the exchange of necessary information after an accident, without any strong positive or negative emotions displayed.'