In [None]:
%pip install azure-cognitiveservices-speech==1.37.0
# The Azure Speech SDK is pinned to an exact version; it is imported further
# down as `azure.cognitiveservices.speech` by the transcription cell.

In [None]:
# Name of the Azure Key Vault the notebook reads its secrets from; it is
# interpolated into https://<key_vault_name>.vault.azure.net/ by
# get_secrets_from_kv.
# key_vault_name value is set at the time of deployment.
# NOTE(review): presumably a deployment script substitutes the placeholder
# literal below - confirm before reformatting this line.

key_vault_name = 'kv_to-be-replaced'

In [None]:
from trident_token_library_wrapper import PyTridentTokenLibrary as tl

def get_secrets_from_kv(kv_name, secret_name):
    """Read a single secret from an Azure Key Vault.

    Args:
        kv_name: Name of the Key Vault; used to build the vault URL
            ``https://<kv_name>.vault.azure.net/``.
        secret_name: Name of the secret to look up.

    Returns:
        Whatever ``tl.get_secret_with_token`` returns for that secret
        (presumably the secret value as a string - confirm).
    """
    # A token for the "keyvault" audience is requested on every call.
    token = mssparkutils.credentials.getToken("keyvault")
    vault_url = f'https://{kv_name}.vault.azure.net/'
    secret = tl.get_secret_with_token(vault_url, secret_name, token)
    return secret

# Azure OpenAI connection settings. The API type is a fixed literal; the
# version, endpoint and key are each read from Key Vault, in that order.
openai_api_type = "azure"
openai_api_version = get_secrets_from_kv(kv_name=key_vault_name, secret_name="AZURE-OPENAI-VERSION")
openai_api_base = get_secrets_from_kv(kv_name=key_vault_name, secret_name="AZURE-OPENAI-ENDPOINT")
openai_api_key = get_secrets_from_kv(kv_name=key_vault_name, secret_name="AZURE-OPENAI-KEY")

In [None]:
# Azure AI services settings, each read from Key Vault. The key and region are
# later passed to transcribe_from_file to configure the Speech SDK.
ai_services_endpoint = get_secrets_from_kv(kv_name=key_vault_name, secret_name="COG-SERVICES-ENDPOINT")
ai_services_key = get_secrets_from_kv(kv_name=key_vault_name, secret_name="COG-SERVICES-KEY")
ai_services_region = get_secrets_from_kv(kv_name=key_vault_name, secret_name="COG-SERVICES-REGION")
# Example of a single input file path, left disabled:
# wav_file_path = '/lakehouse/default/Files/data/audio_input/Travel_20240417132839.wav'
# Language tag. NOTE(review): transcribe_from_file does not read this
# variable - confirm whether anything else depends on it.
language1 = 'en-US'

In [None]:
# This cell creates new folders within the specified base path in the lakehouse.
# The purpose is to create corresponding folders so files can be moved as they are processed.
import os 

# Root of the lakehouse data area (local mount path)
base_path = '/lakehouse/default/Files/data'

# Sub-folders that audio files are moved into after a failed or successful run
folders = ['audio_failed', 'audio_processed']

# Make sure every sub-folder exists; an already existing folder is not an
# error (exist_ok=True), and a failure is reported without stopping the loop.
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    try:
        os.makedirs(folder_path, exist_ok=True)
    except Exception as e:
        print(f'Failed to create the folder {folder_path}. Error: {e}')
    else:
        print(f'Folder created at: {folder_path}')

In [None]:
# Drop the metadata table if it already exists. The table 'ckm_conv_metadata'
# is written again (mode 'overwrite') by the CSV-loading cell that follows.
spark.sql('drop table if exists ckm_conv_metadata')

In [None]:
from pyspark.sql import functions as F

# Switch Spark to the legacy time parser before any timestamp is parsed.
# NOTE(review): presumably required by the "MM/dd/yyyy h:mm:ss a" pattern
# used below - confirm.
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

# Build the conversation metadata frame in one chain. `df` is also the frame
# the transcription loop iterates over later in the notebook.
df = (
    # every CSV file in the audio input folder, first line used as header
    spark.read.format("csv")
    .option("header", "true")
    .load("Files/data/audio_input/*.csv")
    # text columns -> timestamps
    .withColumn("StartTime", F.to_timestamp("StartTime", "MM/dd/yyyy h:mm:ss a"))
    .withColumn("EndTime", F.to_timestamp("EndTime", "MM/dd/yyyy h:mm:ss a"))
    # Duration = (EndTime - StartTime) / 60, with both timestamps cast to long
    # (epoch seconds), so the value is in minutes, not milliseconds.
    .withColumn(
        "Duration",
        (F.col("EndTime").cast("long") - F.col("StartTime").cast("long")) / 60,
    )
)

# Persist the result as the Delta table 'ckm_conv_metadata', replacing any previous content
df.write.format('delta').mode('overwrite').saveAsTable('ckm_conv_metadata')

# # Display the first 2 rows
# display(df.head(2))


In [None]:
# This code block is designed to transcribe speech from an audio file using Azure's Cognitive Services Speech SDK.
# https://learn.microsoft.com/en-us/azure/ai-services/speech-service/get-started-stt-diarization?tabs=windows&pivots=programming-language-python

# It supports diarization, which distinguishes between different speakers in the audio.
# The transcribed results are returned as a list of all recognized utterances with associated metadata.

import os
import time
import azure.cognitiveservices.speech as speechsdk
import json

# Function to transcribe speech from an audio file
def transcribe_from_file(ai_services_key, ai_services_region, wav_file_path, conversation_id, language="en-US"):
    """Transcribe one audio file with speaker diarization.

    Runs an Azure Speech SDK ``ConversationTranscriber`` over the file and
    blocks (polling every half second) until the SDK signals that the session
    stopped or was canceled, then waits for the transcriber to finish stopping.

    Args:
        ai_services_key: Subscription key passed to ``SpeechConfig``.
        ai_services_region: Region passed to ``SpeechConfig``.
        wav_file_path: Path of the audio file, passed to
            ``AudioConfig(filename=...)``.
        conversation_id: Value stored as the first element of every result row.
        language: Speech recognition language. Defaults to ``"en-US"``, the
            value that was previously hard-coded.

    Returns:
        A list with one entry per recognized utterance, each of the form
        ``[conversation_id, Id, DisplayText, Offset, Duration, Channel, Type,
        SpeakerId]`` where all but the first element are read from the SDK's
        JSON result. The list is empty when nothing was recognized; a
        cancellation caused by an error is only printed, not raised.
    """
    # Rows collected by the `transcribed` callback
    all_results = []

    # Configure the speech service
    speech_config = speechsdk.SpeechConfig(subscription=ai_services_key, region=ai_services_region)
    speech_config.speech_recognition_language = language

    # Audio comes from the given file; the transcriber ties both configs together
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file_path)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config
    )

    # Set to True by stop_cb once the SDK reports the end of the session
    transcribing_stop = False

    def describe_cancellation(result, headline):
        """Print the cancellation reason (and error details, if any) of `result`."""
        details = speechsdk.CancellationDetails(result)
        print(f"{headline}: {details.reason}")
        if details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {details.error_details}")
        return details

    # Callback for when the transcription session starts
    def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
        print('SessionStarted event')

    # Callback that ends the wait loop; connected to both session_stopped and canceled
    def stop_cb(evt):
        nonlocal transcribing_stop
        # Raise the flag first so a problem while logging can never keep the
        # wait loop below spinning.
        transcribing_stop = True
        print(f"Stopping transcription for session id: {evt.session_id}")

        # Only canceled events carry a result; session events do not.
        result = getattr(evt, 'result', None)
        if result is None:
            print("Transcription stopped, but no additional information is available.")
        elif result.reason == speechsdk.ResultReason.Canceled:
            details = describe_cancellation(result, "Transcription was stopped due to cancellation")
            # EndOfStream is a CancellationReason (not a ResultReason): it is
            # how the SDK reports that the whole file has been read.
            if details.reason == speechsdk.CancellationReason.EndOfStream:
                print("Transcription stopped because the end of the audio stream was reached.")
        elif result.reason == speechsdk.ResultReason.NoMatch:
            print("Transcription stopped because no speech could be recognized.")
        else:
            print(f"Transcription stopped for an unknown reason: {result.reason}")

    # Callback for when the transcription is canceled
    def conversation_transcriber_recognition_canceled_cb(evt):
        print("Canceled event")
        describe_cancellation(evt.result, "Canceled event")

    # Callback for when the transcription session stops
    def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
        print(f"SessionStopped event for session id: {evt.session_id}")

        # Report extra details only when the event actually carries a result
        # that has a reason; both checks guard everything that follows.
        result = getattr(evt, 'result', None)
        if result and hasattr(result, 'reason'):
            print(f"Reason for stop: {result.reason}")
            if result.reason == speechsdk.ResultReason.Canceled:
                describe_cancellation(result, "Cancellation reason")

    # Handler for the final result of each utterance
    def handle_final_result(evt):
        print(f"Event type: {type(evt)}")
        print(f"Result reason: {evt.result.reason}")

        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            # The SDK delivers the utterance as a JSON document
            r = json.loads(evt.result.json)
            print(f"JSON result: {r}")
            all_results.append([conversation_id,
                                r["Id"],
                                r["DisplayText"],
                                r["Offset"],
                                r["Duration"],
                                r["Channel"],
                                r["Type"],
                                r["SpeakerId"]
                                ])
        else:
            print("No recognized speech was found.")

    # Connect the callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(handle_final_result)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    # Start the asynchronous transcription
    conversation_transcriber.start_transcribing_async()

    # Wait for the transcription to complete
    while not transcribing_stop:
        time.sleep(.5)

    # Stop the transcriber and wait on the returned future, so the SDK has
    # finished with the audio file before the caller moves it elsewhere.
    conversation_transcriber.stop_transcribing_async().get()
    return all_results


In [None]:
# # spark.sql('drop table if exists ckm_conv_messages')

# from pyspark.sql import SparkSession

# # Create a Spark session
# spark = SparkSession.builder.getOrCreate()

# # Get the schema of the existing table
# schema = spark.table("ckm_conv_messages").schema

# # Create an empty DataFrame with the same schema
# empty_df = spark.createDataFrame([], schema)

# # Overwrite the existing table with the empty DataFrame
# empty_df.write.mode('overwrite').saveAsTable("ckm_conv_messages")


In [None]:
# Drop the messages table if it already exists. The transcription loop further
# down appends to 'ckm_conv_messages', so this keeps rows written by an
# earlier run from accumulating.
spark.sql('drop table if exists ckm_conv_messages')

In [None]:
"""
This script transcribes audio files using the AI services key and region. It iterates over each row in a dataframe, 
constructs the full path of the audio file, and attempts to transcribe the audio. If the transcription is successful 
and not empty, it creates a new dataframe with the transcriptions and writes it to a delta table 'ckm_conv_messages'. 
The 'ckm_conv_messages' table stores the conversation messages with columns such as conversation_id, Id, DisplayText, 
Offset, Duration, Channel, Type, and SpeakerId. Each successfully transcribed audio file is then moved to the 'audio_processed' folder.
If an error occurs during the process, it prints the file that could not be loaded and the error message, and moves the file to the 'audio_failed' folder.
"""

from pyspark.sql import functions as f

# Column names for the rows returned by transcribe_from_file, in the order that
# function builds them. Defined once instead of on every iteration.
df_columns = ["conversation_id","Id","DisplayText","Offset","Duration","Channel","Type","SpeakerId"]

# Transcribe every audio file listed in the metadata DataFrame `df`, append the
# utterances to the 'ckm_conv_messages' Delta table and move the audio file to
# 'audio_processed' (success) or 'audio_failed' (error).
for row in df.collect():
    # A metadata row without a file name cannot be processed. Skip it rather
    # than letting .strip() on a null value abort the whole run.
    if row.FileName is None or not row.FileName.strip():
        print("skipping metadata row without a FileName, ConversationId:", row.ConversationId)
        continue

    # Strip leading and trailing whitespace from the file name
    file_name = row.FileName.strip()
    wav_file_path = '/lakehouse/default/Files/data/audio_input/' + file_name # full path is required for speechSDK
    try:
        r = transcribe_from_file(ai_services_key,ai_services_region,wav_file_path,row.ConversationId)
        if len(r) != 0:
            df_conv = spark.createDataFrame(data=r, schema = df_columns)
            # Single partition so row_id follows the order of the utterances
            df_conv = df_conv.coalesce(1).withColumn("row_id", f.monotonically_increasing_id())

            df_conv.write.format('delta').mode('append').saveAsTable('ckm_conv_messages')
            # Move the processed file to the 'audio_processed' folder.
            # Positional flags False, True: presumably create_path and
            # overwrite of mssparkutils.fs.mv - confirm.
            mssparkutils.fs.mv(('Files/data/audio_input/' + file_name), ('Files/data/audio_processed/' + file_name), False,True)
        else:
            # Nothing was transcribed: the file is intentionally left where it
            # is (unchanged behavior), but no longer silently.
            print("no utterances transcribed, file left in audio_input:", wav_file_path)
    except Exception as e:
        print("could not load:", wav_file_path)
        print("An error occurred:", e)  # Print the exception
        # Move the failed file to the 'audio_failed' folder. The move itself
        # can fail (for example when the listed file does not exist), and that
        # must not stop the remaining files from being processed.
        try:
            mssparkutils.fs.mv(('Files/data/audio_input/' + file_name), ('Files/data/audio_failed/' + file_name), False,True)
        except Exception as move_error:
            print("could not move to audio_failed:", file_name, "-", move_error)


In [None]:
import os
import shutil

# Directory paths
# Source and destination folders (local lakehouse mount paths)
input_dir = '/lakehouse/default/Files/data/audio_input/'
processed_dir = '/lakehouse/default/Files/data/audio_processed/'

# Names of the .csv files currently present in the input folder
csv_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(input_dir)))

# Relocate each of them, under the same name, into the processed folder
for file_name in csv_files:
    shutil.move(
        os.path.join(input_dir, file_name),
        os.path.join(processed_dir, file_name),
    )
