In [None]:
%pip install azure.cognitiveservices.speech
%pip install azure-ai-inference
%pip install azure-search-documents

In [None]:
import openai
import json
import time
import pandas as pd
from datetime import datetime, timedelta
import re
import uuid
import azure.cognitiveservices.speech as speechsdk 

In [None]:
# Name of the Azure Key Vault that get_secrets_from_kv reads secrets from.
# The value looks like a deployment placeholder — TODO confirm it is replaced before running.
key_vault_name = 'kv_to-be-replaced'
# Name of the Azure AI Search index that is created and populated later in this notebook.
# NOTE(review): "trascripts" looks like a typo for "transcripts"; left untouched because the
# string is a runtime identifier — confirm nothing else depends on it before renaming.
index_name = "call_trascripts_index"

In [None]:
from trident_token_library_wrapper import PyTridentTokenLibrary as tl
def get_secrets_from_kv(kv_name, secret_name):
    """Read one secret from an Azure Key Vault.

    Args:
        kv_name: Key Vault name; the endpoint is built as
            ``https://<kv_name>.vault.azure.net/``.
        secret_name: Name of the secret to fetch.

    Returns:
        Whatever ``tl.get_secret_with_token`` returns for that secret.
    """
    # Token for the "keyvault" audience, issued by the notebook runtime.
    token = mssparkutils.credentials.getToken("keyvault")
    vault_url = f'https://{kv_name}.vault.azure.net/'
    secret_value = tl.get_secret_with_token(vault_url, secret_name, token)
    return secret_value

In [None]:
# Key and region later passed to transcribe_from_file for the Speech SDK configuration.
# NOTE(review): the Speech key is read from the 'AZURE-OPENAI-KEY' secret — presumably a
# shared multi-service AI Services key; confirm this is intentional.
ai_services_key = get_secrets_from_kv(key_vault_name,'AZURE-OPENAI-KEY')
ai_services_region = get_secrets_from_kv(key_vault_name, 'AZURE-LOCATION')


In [None]:
# Function to transcribe speech from an audio file
def transcribe_from_file(ai_services_key, ai_services_region, wav_file_path, conversation_id):
    """Transcribe one audio file with the Speech SDK conversation transcriber.

    Starts asynchronous transcription and then polls every 0.5 s until the SDK
    raises either a session-stopped or a canceled event.

    Args:
        ai_services_key: Subscription key passed to ``speechsdk.SpeechConfig``.
        ai_services_region: Service region passed to ``speechsdk.SpeechConfig``.
        wav_file_path: Path of the audio file to transcribe.
        conversation_id: Value stored as the first element of every result row.

    Returns:
        list: One row per recognized phrase, in the order the SDK delivered
        them: ``[conversation_id, Id, DisplayText, Offset, Duration, Channel,
        Type, SpeakerId]`` (fields copied from the SDK's JSON result).
    """
    # Rows collected by the `transcribed` callback
    all_results = []

    # Configure the speech service
    speech_config = speechsdk.SpeechConfig(subscription=ai_services_key, region=ai_services_region)
    speech_config.speech_recognition_language = "en-US"

    # Set up the audio configuration using the provided file path
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file_path)

    # Create a conversation transcriber object
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)

    # Flag flipped by stop_cb; the polling loop below waits on it
    transcribing_stop = False

    # Callback for when the transcription session starts (intentionally silent)
    def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
        pass

    # Callback to signal that continuous recognition should stop
    def stop_cb(evt: speechsdk.SessionEventArgs):
        nonlocal transcribing_stop
        transcribing_stop = True

        # Events without a result carry no further detail to report
        if not hasattr(evt, 'result'):
            return

        reason = evt.result.reason
        # NOTE(review): EndOfStream appears to be a CancellationReason member, not a
        # ResultReason one; getattr keeps this branch from raising AttributeError
        # when the attribute is absent — confirm against the installed SDK.
        end_of_stream = getattr(speechsdk.ResultReason, 'EndOfStream', None)

        if reason == speechsdk.ResultReason.Canceled:
            # Only cancellations caused by an error are reported
            cancellation_details = speechsdk.CancellationDetails(evt.result)
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {cancellation_details.error_details}")
        elif end_of_stream is not None and reason == end_of_stream:
            # End of the audio stream: normal completion, nothing to log
            pass
        elif reason == speechsdk.ResultReason.NoMatch:
            print("Transcription stopped because no speech could be recognized.")
        else:
            print(f"Transcription stopped for an unknown reason: {reason}")

    # Callback for when the transcription is canceled
    def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
        cancellation_details = speechsdk.CancellationDetails(evt.result)
        # If there was an error, print the error details
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {cancellation_details.error_details}")

    # Callback for when the transcription session stops
    def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
        result = getattr(evt, 'result', None)
        if not result:
            return
        # Previously `result.reason` was read outside this guard and could raise
        if not hasattr(result, 'reason'):
            return

        print(f"Reason for stop: {result.reason}")

        # If the result is a cancellation, print the cancellation details
        if result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = speechsdk.CancellationDetails(result)
            print(f"Cancellation reason: {cancellation_details.reason}")
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f"Error details: {cancellation_details.error_details}")

    # Handler for the final result of each transcribed phrase
    def handle_final_result(evt):
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            # Parse the JSON result from the transcription
            r = json.loads(evt.result.json)
            all_results.append([conversation_id,
                                r["Id"],
                                r["DisplayText"],
                                r["Offset"],
                                r["Duration"],
                                r["Channel"],
                                r["Type"],
                                r["SpeakerId"]
                                ])
        else:
            print("No recognized speech was found.")

    # Connect the callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(handle_final_result)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    # Start the asynchronous transcription
    conversation_transcriber.start_transcribing_async()

    # Wait for the transcription to complete
    while not transcribing_stop:
        time.sleep(.5)

    # Stop the asynchronous transcription
    conversation_transcriber.stop_transcribing_async()
    # Return the list of transcribed results
    return all_results

In [None]:
# Folder under Files/ that later cells write transcript JSON files to and read them back from.
foldername = 'data_stt'
# Create the folder only when it does not exist yet.
if not mssparkutils.fs.exists(f'Files/{foldername}/'):
    mssparkutils.fs.mkdirs(f'Files/{foldername}/')

In [None]:
import os

# Transcribe every .wav file in the audio folder and write one JSON document per
# conversation into Files/<foldername>/.
folder_path = '/lakehouse/default/Files/cu_audio_files'
wav_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.wav')]

# The last 19 characters of the file name (before ".wav") hold the start timestamp
timestamp_format = "%Y-%m-%d %H_%M_%S"
download_path = f'/lakehouse/default/Files/{foldername}/'

for wav_file in wav_files:
    file_name = wav_file.split('/')[-1].replace('.wav', '')

    r = transcribe_from_file(ai_services_key, ai_services_region, wav_file, file_name)

    start_time = wav_file.replace(".wav", "")[-19:]
    start_timestamp = datetime.strptime(start_time, timestamp_format)
    print(start_timestamp)
    # File names are expected to look like "convo_<id>_<timestamp>"
    conversation_id = file_name.split('convo_', 1)[1].split('_')[0]

    # Nothing recognized: no JSON file is written for this recording
    if len(r) == 0:
        continue

    # Row layout from transcribe_from_file: index 2 = DisplayText, index 4 = Duration
    duration = sum(row[4] for row in r)
    content = "".join(row[2] + " " for row in r)

    # NOTE(review): Speech SDK durations are documented as 100-nanosecond ticks, so
    # ticks / 100000000 is not minutes (that would be ticks / 600000000). The divisor
    # is kept as-is because stored EndTime/Duration values depend on it — confirm.
    end_timestamp = start_timestamp + timedelta(minutes=int(duration) / 100000000)
    # Drop fractional seconds; yields "YYYY-MM-DD HH:MM:SS"
    EndTime = str(end_timestamp.replace(microsecond=0))

    conversationRow = {
        "ConversationId": conversation_id,
        "StartTime": start_time,
        "EndTime": EndTime,
        "Duration": duration / 100000000,
        "Content": content,
    }
    filename = 'convo_' + str(conversation_id) + '_' + str(start_time) + '.json'

    with open(os.path.join(download_path, filename), 'w', encoding='utf-8') as out_file:
        json.dump(conversationRow, out_file, ensure_ascii=False, indent=4)


In [None]:
# Azure AI Search endpoint and key, read from Key Vault; used below to build the
# index client and the document upload client.
search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")

In [None]:
# Create the search index

from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchFieldDataType,
    SearchableField,
    SearchField,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticPrioritizedFields,
    SemanticField,
    SemanticSearch,
    SearchIndex
)

In [None]:
from azure.core.credentials import AzureKeyCredential 
# Credential and management client for the search service
search_credential = AzureKeyCredential(search_key)
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)

# Index schema: a key, three searchable text fields and one 1536-dimension vector field
fields = [
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchableField(name="chunk_id", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="sourceurl", type=SearchFieldDataType.String),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search: a single HNSW algorithm exposed through the profile the vector field refers to
vector_search = VectorSearch(
    algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
    profiles=[
        VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw"),
    ],
)

# Semantic ranking: chunk_id as the keywords field, content as the content field
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        keywords_fields=[SemanticField(field_name="chunk_id")],
        content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the index, or update it when it already exists
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

# Function: Get Embeddings 
def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key,
                   model_id="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from an Azure OpenAI deployment.

    Args:
        text: Text to embed.
        openai_api_base: Azure OpenAI endpoint URL.
        openai_api_version: API version passed to the client.
        openai_api_key: API key passed to the client.
        model_id: Name passed as ``model`` to the embeddings call. Defaults to
            the value that was previously hard-coded.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # `AzureOpenAI` was referenced without ever being imported, so every call
    # raised NameError; the client class is reached through the `openai`
    # module the notebook already imports.
    client = openai.AzureOpenAI(
        api_version=openai_api_version,
        azure_endpoint=openai_api_base,
        api_key=openai_api_key,
    )

    response = client.embeddings.create(input=text, model=model_id)
    return response.data[0].embedding

# Function: Clean Spaces with Regex
def clean_spaces_with_regex(text):
    """Collapse whitespace runs to one space and runs of dots to one dot.

    Whitespace is normalized first, then any sequence of two or more
    consecutive dots is reduced to a single dot.
    """
    single_spaced = re.compile(r'\s+').sub(' ', text)
    return re.compile(r'\.{2,}').sub('.', single_spaced)

def chunk_data(text, tokens_per_chunk=1024):
    """Split text into chunks of whole sentences, bounded by a word budget.

    The text is first normalized with ``clean_spaces_with_regex`` and split on
    ``'. '``. Sentences are appended to the current chunk (re-joined with
    ``'. '``) while the running count of whitespace-separated words stays
    within ``tokens_per_chunk``; otherwise a new chunk is started. "Tokens"
    here are whitespace-separated words, not model tokens.

    Args:
        text: Text to split.
        tokens_per_chunk: Maximum number of words per chunk. A single sentence
            longer than this still becomes one (oversized) chunk.

    Returns:
        list[str]: Non-empty chunks in document order; ``[]`` for empty text.
    """
    text = clean_spaces_with_regex(text)

    sentences = text.split('. ')  # Split text into sentences
    chunks = []
    current_chunk = ''
    current_chunk_token_count = 0

    for sentence in sentences:
        token_count = len(sentence.split())

        if current_chunk_token_count + token_count <= tokens_per_chunk:
            # Sentence fits: append it, restoring the separator removed by split
            if current_chunk:
                current_chunk += '. ' + sentence
            else:
                current_chunk += sentence
            current_chunk_token_count += token_count
        else:
            # Close the current chunk. The guard matters when the very first
            # sentence is already over budget: an empty chunk used to be emitted.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
            current_chunk_token_count = token_count

    # Add the last chunk
    if current_chunk:
        chunks.append(current_chunk)

    return chunks

In [None]:
# GPT-4o-mini
# api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
# api_type = "azure"
# api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
# endpoint = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")

# endpoint_b = spark.sparkContext.broadcast(endpoint)
# api_key_b = spark.sparkContext.broadcast(api_key)
# api_version_b = spark.sparkContext.broadcast(api_version)
# api_type_b = spark.sparkContext.broadcast(api_type)

# Phi-3
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

# Endpoint and key of the chat-completions deployment used by get_details, read from Key Vault.
endpoint= get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-INFERENCE-ENDPOINT")
api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-INFERENCE-KEY")
# Broadcast both values; get_details reads them through `.value` when it builds its client.
endpoint_b = spark.sparkContext.broadcast(endpoint)
api_key_b = spark.sparkContext.broadcast(api_key)

# client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(api_key))

In [None]:
# get the details of the content from call transcripts
def _parse_json_reply(reply):
    """Parse a model reply as JSON, tolerating Markdown fences and stray text."""
    cleaned = reply.replace("```json", '').replace("```", '')
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # The model sometimes wraps the object in explanatory text; retry on the
        # outermost {...} span before giving up.
        start, end = cleaned.find('{'), cleaned.rfind('}')
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start:end + 1])


def get_details(input_text):
    """Extract summary fields from one call transcript with the chat model.

    Sends a single user message asking for a JSON object with the keys
    ``summary``, ``satisfied``, ``sentiment``, ``topic``, ``keyPhrases`` and
    ``complaint``, then parses the reply.

    Args:
        input_text: Transcript text of one conversation.

    Returns:
        The parsed JSON reply (expected to be a dict with the keys above).

    Raises:
        json.JSONDecodeError: If no JSON object can be parsed from the reply.
    """
    # Construct the prompt
    prompt = f'''You are a JSON formatter for extracting information out of a single chat conversation -
            {input_text}
            Summarize the conversation, key: summary . 
            Is the customer satisfied with the agent interaction (Yes or No), key: satisfied . 
            Identify the sentiment of the conversation (Positive, Neutral, Negative), key: sentiment . 
            Identify the single primary topic of the conversation in 6 words or less,key: topic . 
            Identify the top 10 key phrases as comma seperated string excluding people names , key: keyPhrases .
            Identify the single primary complaint of the conversation in 3 words or less, key: complaint .
            Answer in JSON machine-readable format, using the keys from above. 
            Pretty print the JSON and make sure that it is properly closed at the end and do not generate any other content.'''

    # Endpoint and key come from the broadcast variables defined in an earlier cell
    client = ChatCompletionsClient(
        endpoint=endpoint_b.value,
        credential=AzureKeyCredential(api_key_b.value),
    )
    response = client.complete(
        messages=[
            UserMessage(content=prompt),
        ],
        max_tokens=500,
        temperature=0,
        top_p=1,
    )

    return _parse_json_reply(response.choices[0].message.content)

In [None]:
from pyspark.sql import Row

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Enrich every transcript JSON with model-extracted fields and append the result
# to the `processed_data` Delta table.
table_name = 'processed_data'
df = spark.read.option("multiline", "true").json(f"Files/{foldername}/")

# Struct returned by the UDF; keys match the JSON keys requested in get_details
schema = StructType([
    StructField("summary", StringType(), True),
    StructField("satisfied", StringType(), True),
    StructField("sentiment", StringType(), True),
    StructField("topic", StringType(), True),
    StructField("keyPhrases", StringType(), True),
    StructField("complaint", StringType(), True),
])

# The UDF calls a remote model, so it is not deterministic. Spark treats UDFs as
# deterministic by default and may then invoke them more often than written
# (e.g. once per extracted struct field); marking it nondeterministic avoids that.
get_detail_udf = udf(get_details, returnType=schema).asNondeterministic()

df_processed = (
    df.select("ConversationId", "EndTime", "StartTime", "Content")
      .withColumn("Details", get_detail_udf(col("Content")))
      .select(
          "ConversationId", "EndTime", "StartTime", "Content",
          col("Details.summary").alias("summary"),
          col("Details.satisfied").alias("satisfied"),
          col("Details.sentiment").alias("sentiment"),
          col("Details.topic").alias("topic"),
          col("Details.keyPhrases").alias("keyPhrases"),
          col("Details.complaint").alias("complaint"),
      )
)

# NOTE(review): 'append' adds the same conversations again on every re-run of this
# cell — confirm that is intended.
df_processed.write.format('delta').mode('append').option("overwriteSchema", "true").saveAsTable(table_name)
# display(df_processed)
# display(df_processed)

In [None]:
# Explode the comma-separated keyPhrases string into one row per phrase and
# append the rows to `processed_data_key_phrases`.
table_name = 'processed_data'
sql_stmt = f'select ConversationId, keyPhrases, sentiment, StartTime from {table_name}'
df = spark.sql(sql_stmt)
df_keyPhrases = (
    df.withColumn('keyPhrases', split(df['keyPhrases'], ','))
      .withColumn('keyPhrase', explode(col('keyPhrases')))
      # "a, b" splits into "a" and " b": trim so identical phrases compare equal
      .withColumn('keyPhrase', trim(col('keyPhrase')))
      # a trailing or doubled comma would otherwise produce an empty phrase
      .filter(col('keyPhrase') != '')
      .select('ConversationId', 'keyPhrase', 'sentiment')
)
df_keyPhrases.write.format('delta').mode('append').option("overwriteSchema", "true").saveAsTable('processed_data_key_phrases')
# display(df_keyPhrases)
# display(df_keyPhrases)

In [None]:
# Copy processed_data into km_processed_data, exposing the topic column as `topic`.
# The table written by this notebook has `topic`, not `mined_topic`, so the query
# failed unless another process had added `mined_topic`. Prefer `mined_topic` when
# it exists (previous behavior) and fall back to `topic` otherwise.
processed_columns = spark.table('processed_data').columns
topic_column = 'mined_topic' if 'mined_topic' in processed_columns else 'topic'
sql_stmt = f"select ConversationId, StartTime, EndTime, Content, summary, satisfied, keyPhrases, complaint, {topic_column} as topic from processed_data"
df = spark.sql(sql_stmt)
df.write.format('delta').mode('append').option("overwriteSchema", "true").saveAsTable('km_processed_data')
# display(df)

In [None]:
import base64
from azure.search.documents import SearchClient
# Chunk every transcript JSON, embed each chunk and upload the chunks to the
# search index in batches of 10.
chunk_num = 0
docs = []
counter = 0

# Connection settings for the embedding model. These names were used below but
# never defined, so every embedding call failed and was silently replaced by [].
# NOTE(review): secret names are taken from the commented-out GPT configuration
# earlier in this notebook — confirm they exist in the Key Vault.
openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
openai_api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")

path_name = f'file:/lakehouse/default/Files/{foldername}'
paths = mssparkutils.fs.ls(path_name)

search_client = SearchClient(search_endpoint, index_name, search_credential)

for path in paths:
    data = spark.read.option("multiline", "true").json(path.path)
    text = data.select('Content').collect()[0][0]
    filename = path.name.split('/')[-1]
    document_id = filename.replace('.json', '').replace('convo_', '')

    chunks = chunk_data(text)
    chunk_num = 0
    for chunk in chunks:
        chunk_num += 1
        counter += 1
        chunk_id = document_id + '_' + str(chunk_num).zfill(2)

        try:
            v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
        except Exception as first_error:
            # Wait and retry once before giving up on this chunk
            print(f'Embedding failed for {chunk_id}, retrying in 30s: {first_error}')
            time.sleep(30)
            try:
                v_contentVector = get_embeddings(str(chunk), openai_api_base, openai_api_version, openai_api_key)
            except Exception as retry_error:
                # Best effort, as before: keep the document but without a vector
                print(f'Embedding failed again for {chunk_id}; uploading without vector: {retry_error}')
                v_contentVector = []

        docs.append(
            {
                # Document key: URL-safe base64 of the chunk id
                "id": base64.urlsafe_b64encode(bytes(chunk_id, encoding='utf-8')).decode('utf-8'),
                "chunk_id": chunk_id,
                "content": chunk,
                "sourceurl": filename,
                "contentVector": v_contentVector
            }
        )

        if counter % 10 == 0:
            result = search_client.upload_documents(documents=docs)
            docs = []
            print(f' {str(counter)} uploaded')

    # Pause between files
    time.sleep(4)

# Upload the last, partial batch once. This used to run inside the file loop
# without clearing `docs`, so earlier documents were re-sent after every file.
if docs:
    search_client.upload_documents(documents=docs)
    docs = []