In [None]:
# %pip install azure.cognitiveservices.speech
# %pip install azure-ai-inference
# %pip install azure-search-documents
# %pip install pymssql

In [None]:
# Import required modules
import requests
import time
import os
import pprint
import json
from pathlib import Path
from typing import List
from os import listdir
from os.path import isfile, join

In [None]:
import os 

# Root directory in the lakehouse under which the output folders are created
base_path = '/lakehouse/default/Files/data'

# Sub-folders that must exist before processing starts
folders = ['cu_output']

# Make sure each folder exists; a failure is reported but does not stop the loop
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    try:
        os.makedirs(folder_path, exist_ok=True)
    except Exception as e:
        print(f'Failed to create the folder {folder_path}. Error: {e}')
    else:
        print(f'Folder created at: {folder_path}')

In [None]:
# Get Azure AI credentials from Key Vault
from trident_token_library_wrapper import PyTridentTokenLibrary as tl

key_vault_name = 'kv_to-be-replaced'

def get_secrets_from_kv(kv_name, secret_name):
    """Read one secret from an Azure Key Vault.

    Args:
        kv_name: Name of the Key Vault; used to build the vault endpoint URL.
        secret_name: Name of the secret to read.

    Returns:
        The value returned by ``tl.get_secret_with_token`` for that secret.
    """
    vault_url = f'https://{kv_name}.vault.azure.net/'
    token = mssparkutils.credentials.getToken("keyvault")
    return tl.get_secret_with_token(vault_url, secret_name, token)

# Read the three Content Understanding settings from Key Vault, in this order
endpoint, api_key, api_version = (
    get_secrets_from_kv(key_vault_name, secret_name)
    for secret_name in (
        "AZURE-OPENAI-CU-ENDPOINT",
        "AZURE-OPENAI-CU-KEY",
        "AZURE-OPENAI-CU-VERSION",
    )
)

# Broadcast the values so code running inside Spark UDFs can read them via `.value`
_spark_context = spark.sparkContext
endpoint_b = _spark_context.broadcast(endpoint)
api_key_b = _spark_context.broadcast(api_key)
api_version_b = _spark_context.broadcast(api_version)

In [None]:
# helper method to poll for inferencing results
def poll_for_results(operation_location: str, success_state: str, failed_state: str, timeout: int = 300, interval: int = 2, request_timeout: int = 30):
    """
    Polls the operation location URL until the operation reaches a success or failure state.

    Args:
        operation_location (str): The URL to poll for the operation result.
        success_state (str): The status indicating the operation succeeded.
        failed_state (str): The status indicating the operation failed.
        timeout (int, optional): Maximum total time to wait in seconds. Default is 300 seconds.
        interval (int, optional): Time between polling attempts in seconds. Default is 2 seconds.
        request_timeout (int, optional): Timeout in seconds applied to each individual
            GET request so a stalled connection cannot block forever. Default is 30 seconds.

    Returns:
        dict or None: The final JSON response if the operation succeeded; None if the
        operation failed, a request error occurred, or the overall timeout elapsed.
    """
    headers = {
        'Ocp-Apim-Subscription-Key': api_key_b.value,
        'cogsvc-videoanalysis-face-identification-enable': "true"
    }

    # Use a wall-clock deadline so that time spent inside the HTTP requests
    # counts against `timeout`, not only the time spent sleeping.
    deadline = time.monotonic() + timeout

    while True:
        try:
            response = requests.get(operation_location, headers=headers, timeout=request_timeout)
            response.raise_for_status()
            result = response.json()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            return None

        status = result.get('status')
        if status == success_state:
            return result
        if status == failed_state:
            print(f"Operation failed with status: {status}")
            return None

        # Always poll at least once; stop as soon as the deadline has passed.
        if time.monotonic() >= deadline:
            break
        time.sleep(interval)

    print("Operation timed out.")
    return None

In [None]:
from pyspark.sql import Row

from pyspark.sql.types import *
from pyspark.sql.functions import *
from datetime import datetime, timedelta

def process_audio_file(file_path):
    """
    Sends one audio file to the Content Understanding analyzer and builds a conversation row.

    The conversation id is taken from the text between ``convo_`` and the next ``_`` in
    the path, and the start timestamp from the last 19 characters before ``.wav``
    (format ``%Y-%m-%d %H_%M_%S``). Both are parsed before the service is called so
    that the fallback row can always be populated.

    Args:
        file_path (str): Path of the .wav file to analyze.

    Returns:
        dict: A row with the keys ConversationId, ConversationDate, StartTime, EndTime,
        Duration, Content, summary, satisfied, sentiment, topic, keyPhrases and complaint.
        If the analysis fails for any reason, the text fields are empty strings,
        Duration is 0 and EndTime equals StartTime.

    Raises:
        IndexError, ValueError: If the file name does not contain the expected
            conversation id / timestamp.
        OSError: If the file cannot be read.
    """
    analyzer_id = 'ckm-analyzer'

    # Content Understanding service settings (broadcast so they are available on executors)
    AISERVICE_ENDPOINT = endpoint_b.value
    API_KEY = api_key_b.value
    API_VERSION = api_version_b.value

    headers = {
            'Ocp-Apim-Subscription-Key': API_KEY,
            'Content-Type': 'application/octet-stream',
            'cogsvc-videoanalysis-face-identification-enable': "true"
        }

    # Content Understanding inference path
    PATH_ANALYZER_INFERENCE = "/contentunderstanding/analyzers/{analyzerId}:analyze"

    with open(file_path, 'rb') as f:
        data = f.read()

    url = f"{AISERVICE_ENDPOINT}{PATH_ANALYZER_INFERENCE.format(analyzerId=analyzer_id)}{API_VERSION}"

    # Parse the file-name metadata up front: the fallback row below needs these
    # values even when the service call fails before any result is available.
    conversation_id = file_path.split('convo_', 1)[1].split('_')[0]
    start_time = file_path.replace(".wav", "")[-19:]
    timestamp_format = "%Y-%m-%d %H_%M_%S"  # Adjust format if necessary
    start_timestamp = datetime.strptime(start_time, timestamp_format)
    start_date = start_timestamp.strftime("%Y-%m-%d")

    # Row returned when the analysis cannot be completed
    fallback_row = {
        "ConversationId": conversation_id,
        "ConversationDate": start_date,
        "StartTime": str(start_timestamp),
        "EndTime": str(start_timestamp),
        "Duration": 0,
        "Content": '',
        "summary": '',
        "satisfied": '',
        "sentiment": '',
        "topic": '',
        "keyPhrases": '',
        "complaint": ''
    }

    try:
        response = requests.post(url, headers=headers, data=data)
        response.raise_for_status()

        # The analysis is asynchronous; the result URL is returned in a header
        operation_location = response.headers.get('Operation-Location')
        if not operation_location:
            raise RuntimeError("'Operation-Location' not found in the response headers.")

        # Poll for results
        result = poll_for_results(operation_location, 'Succeeded', 'Failed')
        if result is None:
            raise RuntimeError("analysis did not return a successful result")

        fields = result['result']['contents'][0]['fields']
        duration = int(fields['Duration']['valueString'])
        end_timestamp = str(start_timestamp + timedelta(seconds=duration)).split(".")[0]

        conversationRow = {
            "ConversationId": conversation_id,
            "ConversationDate": start_date,
            "StartTime": str(start_timestamp),
            "EndTime": str(end_timestamp),
            "Duration": duration,
            "Content": fields['content']['valueString'],
            "summary": fields['summary']['valueString'],
            "satisfied": fields['satisfied']['valueString'],
            "sentiment": fields['sentiment']['valueString'],
            "topic": fields['topic']['valueString'],
            "keyPhrases": fields['keyPhrases']['valueString'],
            "complaint": fields['complaint']['valueString']
        }
    except Exception as e:
        # Best effort: keep the batch running and emit an empty row for this file
        print(f"Error: could not analyze {file_path}: {e}")
        conversationRow = fallback_row

    return conversationRow
# test_file = '/lakehouse/default/Files/cu_audio_files_all/convo_05be369b-0a5d-4b6a-b7af-3aef1ba4e6e6_2024-12-08 22_00_00.wav'
# process_audio_file(test_file)

In [None]:
import pandas as pd

# Collect every .wav file in the input folder
folder_path = "/lakehouse/default/Files/cu_audio_files_all"
wav_files = [
    os.path.join(folder_path, name)
    for name in os.listdir(folder_path)
    if name.endswith('.wav')
]
# wav_files  = wav_files[:3]

df_files = spark.createDataFrame([(path,) for path in wav_files], ["file_path"])

# Every field of the struct returned by the UDF is a nullable string
detail_fields = [
    "ConversationId",
    "ConversationDate",
    "StartTime",
    "EndTime",
    "Duration",
    "Content",
    "summary",
    "satisfied",
    "sentiment",
    "topic",
    "keyPhrases",
    "complaint",
]
schema = StructType([StructField(name, StringType(), True) for name in detail_fields])

process_audio_udf = udf(process_audio_file, returnType=schema)

# Run the analyzer per file and flatten the returned struct into top-level columns
df_processed = (
    df_files.select(["file_path"])
    .withColumn("Details", process_audio_udf(col("file_path")))
    .select([col(f"Details.{name}").alias(name) for name in detail_fields])
)

# Shift all dates so that the most recent conversation lands close to the current date
df_processed_pd = df_processed.toPandas()
for ts_column in ('StartTime', 'EndTime', 'ConversationDate'):
    df_processed_pd[ts_column] = pd.to_datetime(df_processed_pd[ts_column], errors='coerce')

max_start_time = df_processed_pd['StartTime'].max()
days_difference = (datetime.today() - max_start_time).days - 1

date_shift = pd.DateOffset(days=days_difference)
df_processed_pd['StartTime'] = df_processed_pd['StartTime'] + date_shift
df_processed_pd['EndTime'] = df_processed_pd['EndTime'] + date_shift
df_processed_pd['ConversationDate'] = (df_processed_pd['ConversationDate'] + date_shift).dt.date
df_processed = spark.createDataFrame(df_processed_pd)

# Replace the table (and its schema) with the freshly processed rows
(
    df_processed.write
    .format('delta')
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .saveAsTable('km_processed_data_cu1')
)

In [None]:
import tiktoken
import openai
import json
from pyspark.sql.types import *

# Configure the module-level OpenAI client for Azure; endpoint and key come from Key Vault
openai.api_type = "azure"
openai.api_version = "2023-07-01-preview" 
openai.api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
openai.api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")

# sql_stmt = 'SELECT distinct topic FROM processed_data_4o_mini'
# Distinct topics of all rows that have non-empty content
df_data = spark.sql('''select distinct topic from km_processed_data_cu1 where content != "" ''')
df = df_data.toPandas()

# Single comma-separated string of topics that is fed to the topic-mining prompt
# NOTE(review): str.join raises TypeError if any topic is null — confirm topics are never null here
topics_str = ', '.join(df['topic'].tolist())
# print(topics_str)

# 2. Choose the right number of topics based on data. Try to keep it as low number of topics as possible.

def call_gpt4(topics_str1):
    """Ask the chat model to mine topics from a comma-separated topic string.

    Args:
        topics_str1: Text that is embedded into the topic-modeling prompt.

    Returns:
        The model reply parsed as JSON, after Markdown code-fence markers are removed.
        The prompt requests a 'topics' node whose entries have 'label' and 'description'.
    """
    topic_prompt = f"""
        You are a data analysis assistant specialized in natural language processing and topic modeling. 
        Your task is to analyze the given text corpus and identify distinct topics present within the data.
        {topics_str1}
        1. Identify the key topics in the text using topic modeling techniques. 
        2. Choose the right number of topics based on data. Try to keep it up to 8 topics.
        3. Assign a clear and concise label to each topic based on its content.
        4. Provide a brief description of each topic along with its label.
        5. Add parental controls, billing issues like topics to the list of topics if the data includes calls related to them.
        
        If the input data is insufficient for reliable topic modeling, indicate that more data is needed rather than making assumptions. 
        Ensure that the topics and labels are accurate, relevant, and easy to understand.

        Return the topics and their labels in JSON format.Always add 'topics' node and 'label', 'description' attriubtes in json.
        Do not return anything else.
        """
    chat_messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": topic_prompt},
    ]
    completion = openai.ChatCompletion.create(
        engine="gpt-4o-mini",  # name of the model deployment
        messages=chat_messages,
        temperature=0,
        max_tokens=2000,
        seed=42,
    )
    reply_text = completion['choices'][0]['message']['content']
    # The model may wrap its JSON in a Markdown code fence; strip the fence markers first
    json_text = reply_text.replace("```json", '').replace("```", '')
    return json.loads(json_text)

# Function to count the number of tokens in a string using tiktoken
def count_tokens(text, encoding='gpt-4'):
    """Return the number of tokens in `text`.

    Args:
        text: String to tokenize.
        encoding: Passed to ``tiktoken.encoding_for_model``, i.e. it is used as a
            model name to select the tokenizer.

    Returns:
        int: Length of the encoded token sequence.
    """
    return len(tiktoken.encoding_for_model(encoding).encode(text))

# Function to split a comma-separated string into chunks that fit within max_tokens
def split_data_into_chunks(text, max_tokens=2000, encoding='gpt-4'):
    """Split a comma-separated string into chunks of at most `max_tokens` tokens.

    Items are stripped of surrounding whitespace, blank items are skipped, and the
    items of each chunk are re-joined with ``', '``. Only the tokens of the items
    themselves are counted; the separators are not.

    Args:
        text: Comma-separated string to split.
        max_tokens: Token budget per chunk.
        encoding: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        list[str]: The chunks in input order. Empty input yields an empty list.
        A single item that alone exceeds `max_tokens` is emitted as its own
        (oversized) chunk, since items are never split.
    """
    tokenizer = tiktoken.encoding_for_model(encoding)
    all_chunks = []
    current_chunk = []
    current_token_count = 0

    for item in text.split(','):
        item = item.strip()  # Clean up any extra whitespace
        if not item:
            continue

        item_token_count = len(tokenizer.encode(item))

        # Flush only a non-empty chunk: otherwise an oversized first item would
        # produce an empty-string chunk.
        if current_chunk and current_token_count + item_token_count > max_tokens:
            all_chunks.append(', '.join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(item)
        current_token_count += item_token_count

    # Append the last chunk if it has any content
    if current_chunk:
        all_chunks.append(', '.join(current_chunk))
    return all_chunks


# Token budget per chunk.
# NOTE(review): the original comment cites 4096 for GPT-4 while the value used is 3096 —
# presumably headroom for the prompt and the reply; confirm the intended limit.
max_tokens = 3096

# Split the topic string into chunks that fit the budget.
# NOTE(review): `chunks` is not read by any other statement in the visible cells.
chunks = split_data_into_chunks(topics_str, max_tokens)

def reduce_data_until_fits(topics_str, max_tokens):
    """Reduce a topic string chunk by chunk until it fits the token budget, then mine topics.

    If `topics_str` already fits into `max_tokens` tokens, it is sent to `call_gpt4`
    directly. Otherwise it is split into chunks, each chunk is reduced to its mined
    topic labels, and the combined labels are processed again.

    Args:
        topics_str: Comma-separated topics.
        max_tokens: Token budget for a single `call_gpt4` request.

    Returns:
        The parsed JSON object returned by `call_gpt4` for the final, reduced input.
    """
    # Compare token counts (not character counts) against the token budget
    if count_tokens(topics_str) <= max_tokens:
        return call_gpt4(topics_str)

    chunks = split_data_into_chunks(topics_str, max_tokens)
    reduced_data = []

    for idx, chunk in enumerate(chunks, start=1):
        print(f"Processing chunk {idx}/{len(chunks)}...")
        try:
            topics_object = call_gpt4(chunk)
            reduced_data.extend(topic['label'] for topic in topics_object['topics'])
        except Exception as e:
            # Best effort: a failing chunk is skipped, the rest is still reduced
            print(f"Error processing chunk {idx}: {str(e)}")

    combined_data = ", ".join(reduced_data)

    # Guard against endless recursion when a pass does not shrink the input
    if len(combined_data) >= len(topics_str):
        return call_gpt4(combined_data)
    return reduce_data_until_fits(combined_data, max_tokens)

# res = reduce_data_until_fits(topics_str, max_tokens)
# Mine the topics from the full topic string in a single request
topics_object = call_gpt4(topics_str)
# res = json.loads(res.replace("```json",'').replace("```",''))

# Each mined topic is stored as a (label, description) pair of nullable strings
schema = StructType([
    StructField('label', StringType(), True), 
    StructField('description', StringType(), True)
])
df = spark.createDataFrame(topics_object['topics'], schema)
# The table name is spelled 'mined_topcis_cu'; the later read uses the same spelling
df.write.format('delta').mode('overwrite').option("overwriteSchema", "true").saveAsTable('mined_topcis_cu')
# display(df)

In [None]:
import time
def get_mined_topic_mapping(input_text, list_of_topics, retry_delay=50, max_retries=1):
    """Ask the chat model which topic from `list_of_topics` is closest to `input_text`.

    Args:
        input_text: Text (here: a conversation topic) to be mapped.
        list_of_topics: The allowed topics, embedded into the prompt as text.
        retry_delay: Seconds to wait before retrying a failed request. Default 50.
        max_retries: Number of retries after the first failed attempt. Default 1.

    Returns:
        str: The content of the model's reply.

    Raises:
        Exception: Whatever the last attempt raised, once all retries are used up.
    """
    # Construct the prompt
    prompt = f'''You are a data analysis assistant to help find the closest topic for a given text {input_text} 
                from a list of topics - {list_of_topics}.
                ALLWAYS only return a topic from list - {list_of_topics}. Do not add any other text.'''
    system_prompt = 'You are a helpful assistant.'
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]

    attempt = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                engine="gpt-4o-mini", # The deployment name you chose when you deployed the GPT-35-Turbo or GPT-4 model.
                messages=messages,
                temperature = 0,
                max_tokens = 2000
            )
            break
        except Exception as e:
            # Give up (and surface the error) once the retry budget is exhausted
            if attempt >= max_retries:
                raise
            attempt += 1
            print(f"Topic mapping request failed ({e}); retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    return response['choices'][0]['message']['content']
    # return(json.loads(res.replace("```json",'').replace("```",'')))

# Load the mined topic labels written by the topic-mining cell
df_topics = spark.sql('select * from mined_topcis_cu').toPandas()
mined_topics_list = df_topics['label'].tolist()
# NOTE(review): `mined_topics` is not read by any other statement in the visible cells
mined_topics =  ", ".join(mined_topics_list)

# Map the topic of every row with non-empty content to one of the mined topics.
# This issues one chat-completion request per row.
sql_stmt = "select * from km_processed_data_cu1 where content != ''"
df_processed_data = spark.sql(sql_stmt).toPandas()
df_processed_data['mined_topic'] = df_processed_data['topic'].apply(lambda x: get_mined_topic_mapping(x,str(mined_topics_list)))

# sql_stmt = "select * from km_processed_data_cu1 where content != '' "
# df_processed_data = spark.sql(sql_stmt).toPandas()
# counter = 0
# # call get_mined_topic_mapping function for each row in the dataframe and update the mined_topic column in the database table
# for index, row in df_processed_data.iterrows():
#     mined_topic_str = get_mined_topic_mapping(row['topic'], str(mined_topics_list))
#     # update the dataframe
#     df_processed_data.at[index, 'mined_topic'] = mined_topic_str

# Persist the rows, now including the `mined_topic` column, replacing the table and its schema
df = spark.createDataFrame(df_processed_data)
df.write.format('delta').mode('overwrite').option("overwriteSchema", "true").saveAsTable('km_processed_data_cu2')

In [None]:
# move to after data padding
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Columns needed for the key-phrase breakdown; the mined topic is exposed as `topic`
sql_stmt = '''select ConversationId,keyphrases,sentiment, mined_topic as topic, StartTime from km_processed_data_cu2'''
df = spark.sql(sql_stmt)

# Split the comma-separated key phrases and emit one row per phrase
df_keyPhrases = (
    df.withColumn('keyPhrases', split(df['keyPhrases'], ','))
    .withColumn('keyPhrase', explode(col('keyPhrases')))
    .select('ConversationId', 'keyPhrase', 'sentiment', 'topic', 'StartTime')
)

(
    df_keyPhrases.write
    .format('delta')
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .saveAsTable('km_processed_data_keyphrases_cu2')
)

In [None]:
import pymssql
# SQL database connection settings are read from Key Vault
server = get_secrets_from_kv(key_vault_name,"SQLDB-SERVER")
database = get_secrets_from_kv(key_vault_name,"SQLDB-DATABASE")
username = get_secrets_from_kv(key_vault_name,"SQLDB-USERNAME")
password = get_secrets_from_kv(key_vault_name,"SQLDB-PASSWORD")

# `conn` and `cursor` are reused by the following cells and closed in the key-phrase insert cell
conn = pymssql.connect(server, username, password, database)
cursor = conn.cursor()
print("Connected to the database")

# sql_stmt = 'SELECT distinct topic FROM processed_data'
# cursor.execute(sql_stmt)

# rows = cursor.fetchall()
# column_names = [i[0] for i in cursor.description]
# df = pd.DataFrame(rows, columns=column_names)
# print(df)

In [None]:
# cursor.execute('DROP TABLE IF EXISTS processed_data')
# conn.commit()

# create_processed_data_sql = """CREATE TABLE processed_data (
#                 ConversationId varchar(255) NOT NULL PRIMARY KEY,
#                 StartTime varchar(255),
#                 EndTime varchar(255),
#                 Content varchar(max),
#                 summary varchar(3000),
#                 satisfied varchar(255),
#                 sentiment varchar(255),
#                 topic varchar(255),
#                 key_phrases nvarchar(max),
#                 complaint varchar(255), 
#                 mined_topic varchar(255)
#             );"""
# cursor.execute(create_processed_data_sql)
# conn.commit()


# Read the processed conversations from the lakehouse table; `keyphrases` is renamed
# to `key_phrases` to match the column name used in the SQL insert below
sql_stmt = '''select ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment,
 keyphrases as key_phrases, complaint, topic, mined_topic from km_processed_data_cu2'''
df_cu2_processed = spark.sql(sql_stmt).toPandas()
# cursor.execute(sql_stmt)

# rows = cursor.fetchall()
# column_names = [i[0] for i in cursor.description]
# df = pd.DataFrame(rows, columns=column_names)
# df.rename(columns={'mined_topic': 'topic'}, inplace=True)
# print(df.columns)
# Insert every conversation into the SQL table `processed_data` using a parameterized
# statement; all rows are committed together after the loop
for idx, row in df_cu2_processed.iterrows():
    # row['ConversationId'] = str(uuid.uuid4())
    cursor.execute(f"INSERT INTO processed_data (ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, topic, key_phrases, complaint, mined_topic) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (row['ConversationId'], row['StartTime'], row['EndTime'], row['Content'], row['summary'], row['satisfied'], row['sentiment'], row['topic'], row['key_phrases'], row['complaint'], row['mined_topic']))
conn.commit()

In [None]:
# Ids of the conversations inserted above
conversationIds_tuple = tuple(df_cu2_processed['ConversationId'].tolist())

# Comma-separated list of single-quoted ids, interpolated into `IN (...)` clauses by later cells.
# NOTE(review): the ids are quoted but not escaped, and an empty list would yield `IN ()` —
# confirm ids never contain a quote and that at least one conversation exists.
conversationId_vals = ', '.join(f"'{v}'" for v in conversationIds_tuple)
# print(conversationId_vals)

In [None]:
import uuid
import random
# Read back the conversations inserted above; they serve as templates for synthetic rows
sql_stmt = f"SELECT * FROM processed_data WHERE ConversationId IN ({conversationId_vals})"
cursor.execute(sql_stmt)

rows = cursor.fetchall()
column_names = [i[0] for i in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
columns_lst = df.columns
# NOTE(review): `df_append` is not read by any other statement in the visible cells
df_append = pd.DataFrame(df, columns=columns_lst)
# Candidate offsets (in days) by which a synthetic row is moved into the past
days_list = [7, 14, 21, 28, 35, 42]
# Candidate numbers of synthetic rows per template row.
# Note: this rebinds `rows`, which held the fetched result set above.
rows = [5, 7, 8]

# Define the sentiment values and their probabilities
sentiment_values = ['Negative', 'Positive', 'Neutral']
probabilities = [0.7, 0.2, 0.1]  # weights: 70% Negative, 20% Positive, 10% Neutral

# NOTE(review): `text` is not read by any other statement in the visible cells
text = 'billing'

# No random seed is set, so the generated rows differ from run to run.
for idx, row in df.iterrows():
    for i in range(random.choice(rows)):

        days = random.choice(days_list)

        # Rows whose mined topic mentions 'billing' or 'issue' get a randomly drawn
        # sentiment (weighted towards Negative) and are marked as not satisfied
        if ('billing' in row['mined_topic'].lstrip().lower()) or ('issue' in row['mined_topic'].lstrip().lower()):

            row['sentiment'] = random.choices(sentiment_values, probabilities)[0] #'Negative'
            row['satisfied'] = 'No'
            # NOTE(review): `row` is modified in place and reused by the next inner
            # iteration, so the date offsets accumulate — confirm this is intended
            row['EndTime'] = pd.to_datetime(row['EndTime']) - pd.to_timedelta(f"{days} days")
            row['StartTime'] = pd.to_datetime(row['StartTime']) - pd.to_timedelta(f"{days} days")
            row['EndTime'] = row['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
            row['StartTime'] = row['StartTime'].strftime('%Y-%m-%d %H:%M:%S')
            row['ConversationId'] = str(uuid.uuid4())

        else:
            # All other rows only get a new id and shifted dates
            row['ConversationId'] = str(uuid.uuid4())
            row['EndTime'] = pd.to_datetime(row['EndTime']) - pd.to_timedelta(f"{days} days")
            row['StartTime'] = pd.to_datetime(row['StartTime']) - pd.to_timedelta(f"{days} days")
            row['EndTime'] = row['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
            row['StartTime'] = row['StartTime'].strftime('%Y-%m-%d %H:%M:%S')

        # Write the new row to the processed_data table
        cursor.execute(f"INSERT INTO processed_data (ConversationId, EndTime, StartTime, Content, summary, satisfied, sentiment, topic, key_phrases, complaint, mined_topic) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (row['ConversationId'], row['EndTime'], row['StartTime'], row['Content'], row['summary'], row['satisfied'], row['sentiment'], row['topic'], row['key_phrases'], row['complaint'], row['mined_topic']))
        # add to search index 

# All synthetic rows are committed together
conn.commit()

In [None]:
# cursor.execute('DROP TABLE IF EXISTS km_processed_data')
# conn.commit()

# create_processed_data_sql = """CREATE TABLE km_processed_data (
#                 ConversationId varchar(255) NOT NULL PRIMARY KEY,
#                 StartTime varchar(255),
#                 EndTime varchar(255),
#                 Content varchar(max),
#                 summary varchar(max),
#                 satisfied varchar(255),
#                 sentiment varchar(255),
#                 keyphrases nvarchar(max),
#                 complaint varchar(255), 
#                 topic varchar(255)
#             );"""
# cursor.execute(create_processed_data_sql)
# conn.commit()
# sql_stmt = 'SELECT * FROM processed_data'
# Select the original (non-synthetic) conversations from `processed_data`, renaming
# `key_phrases` to `keyphrases` and `mined_topic` to `topic` for the target table
sql_stmt = f'''select ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, 
key_phrases as keyphrases, complaint, mined_topic as topic from processed_data WHERE ConversationId IN ({conversationId_vals})'''

cursor.execute(sql_stmt)

rows = cursor.fetchall()
column_names = [i[0] for i in cursor.description]
df = pd.DataFrame(rows, columns=column_names)
# df.rename(columns={'mined_topic': 'topic'}, inplace=True)
# print(df.columns)
# Copy each selected row into `km_processed_data`; committed together after the loop
for idx, row in df.iterrows():
    # row['ConversationId'] = str(uuid.uuid4())
    cursor.execute(f"INSERT INTO km_processed_data (ConversationId, StartTime, EndTime, Content, summary, satisfied, sentiment, keyphrases, complaint, topic) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", (row['ConversationId'], row['StartTime'], row['EndTime'], row['Content'], row['summary'], row['satisfied'], row['sentiment'], row['keyphrases'], row['complaint'], row['topic']))
conn.commit()


In [None]:
# Read the key phrases of the original conversations back from the SQL table
sql_stmt = f'SELECT ConversationId,key_Phrases,sentiment, mined_topic as topic, StartTime FROM processed_data WHERE ConversationId IN ({conversationId_vals})'
cursor.execute(sql_stmt)
rows = cursor.fetchall()
column_names = [i[0] for i in cursor.description]
# df = pd.DataFrame(rows, columns=column_names)
df = spark.createDataFrame(pd.DataFrame(rows, columns=column_names))
# Split the comma-separated phrases and emit one row per phrase
df_keyPhrases = df.withColumn('keyPhrases', split(df['key_Phrases'], ','))
df_keyPhrases = df_keyPhrases.withColumn('keyPhrase', explode(df_keyPhrases['keyPhrases']))
df_keyPhrases = df_keyPhrases.select('ConversationId', 'keyPhrase', 'sentiment','topic', 'StartTime')
# This overwrites the table that the earlier key-phrase cell wrote from km_processed_data_cu2
df_keyPhrases.write.format('delta').mode('overwrite').option("overwriteSchema", "true").saveAsTable('km_processed_data_keyphrases_cu2')

In [None]:
# cursor.execute('DROP TABLE IF EXISTS processed_data_key_phrases')
# conn.commit()

# create_processed_data_sql = """CREATE TABLE processed_data_key_phrases (
#                 ConversationId varchar(255),
#                 key_phrase varchar(500), 
#                 sentiment varchar(255),
#                 topic varchar(255)
#             );"""
# cursor.execute(create_processed_data_sql)
# conn.commit()

# Load the exploded key phrases (trimmed and lower-cased) for the original conversations
sql_stmt = f'''select ConversationId, lower(trim(keyPhrase)) as keyPhrase,
                 sentiment, topic, StartTime from km_processed_data_keyphrases_cu2 WHERE ConversationId IN ({conversationId_vals})'''
df = spark.sql(sql_stmt).toPandas()
# Plain tuples in column order, matching the placeholders of the insert statement
rows = list(df.itertuples(index=False, name=None))
# print(rows)

# Generate the SQL query for insertion
insert_query = f"INSERT INTO processed_data_key_phrases (ConversationId, key_phrase, sentiment,topic, StartTime) VALUES (%s, %s, %s, %s, %s)"

# # Perform the bulk insert
# cursor.executemany(insert_query, rows)

# Insert in batches of `chunk_size` rows instead of one single executemany call
chunk_size = 1000
for i in range(0, len(rows), chunk_size):
    cursor.executemany(insert_query, rows[i:i + chunk_size])

# Commit once, then release the cursor and connection opened in the connection cell
conn.commit()
cursor.close()
conn.close()

In [None]:
# cursor.execute('DROP TABLE IF EXISTS processed_data_key_phrases')
# conn.commit()

# create_processed_data_sql = """CREATE TABLE processed_data_key_phrases (
#                 ConversationId varchar(255),
#                 key_phrase varchar(500), 
#                 sentiment varchar(255),
#                 topic varchar(255)
#             );"""
# cursor.execute(create_processed_data_sql)
# conn.commit()

# # sql_stmt = 'SELECT ConversationId,key_Phrases,sentiment, mined_topic as topic FROM processed_data'
# # cursor.execute(sql_stmt)
# # rows = cursor.fetchall()
# # column_names = [i[0] for i in cursor.description]
# # df = pd.DataFrame(rows, columns=column_names)
# # # print(df.columns)
# # for idx, row in df.iterrows():
# #     key_phrases = row['key_Phrases'].split(',')
# #     for key_phrase in key_phrases:
# #         key_phrase = key_phrase.strip()
# #         cursor.execute(f"INSERT INTO processed_data_key_phrases (ConversationId, key_phrase, sentiment,topic) VALUES (%s,%s,%s,%s)", (row['ConversationId'], key_phrase, row['sentiment'],row['topic']))

# sql_stmt = '''select ConversationId, lower(trim(keyPhrase)) as keyPhrase, sentiment, topic from km_processed_data_keyphrases_cu2'''
# df = spark.sql(sql_stmt).toPandas()

# # # sql_stmt = 'SELECT * FROM processed_data'
# # # cursor.execute(sql_stmt)
# # # rows = cursor.fetchall()
# # # column_names = [i[0] for i in cursor.description]
# # # df = pd.DataFrame(rows, columns=column_names)
# # # df.rename(columns={'mined_topic': 'topic'}, inplace=True)
# # # print(df.columns)
# for idx, row in df.iterrows():
#     # row['ConversationId'] = str(uuid.uuid4())
#     cursor.execute(f"INSERT INTO processed_data_key_phrases (ConversationId, key_phrase, sentiment, topic) VALUES (%s,%s,%s,%s)", (row['ConversationId'], row['keyPhrase'].strip(),row['sentiment'],row['topic']))
# conn.commit()
# cursor.close()
# conn.close()

In [None]:
# %%sql
# select * from km_processed_data_keyphrases_cu2 order by topic

In [None]:
# %%sql
# select lower(trim(keyPhrase)), sentiment,count(*) from km_processed_data_keyphrases_cu2
# -- where trim(keyPhrase) not in ('account number','Contoso Incorporated','phone number')
# group by lower(trim(keyPhrase)), sentiment
# order by count(*) desc

In [None]:
# %%sql
# select mined_topic, sentiment,count(*) from km_processed_data_cu2
# group by mined_topic, sentiment
# order by mined_topic

In [None]:
# Directory paths used when moving audio files after processing:
# `input_dir` is where the .wav files are read from, `processed_dir` is where they are moved to
input_dir = '/lakehouse/default/Files/cu_audio_files_all/'
processed_dir = '/lakehouse/default/Files/cu_audio_files_processed/'
# failed_folder = '/lakehouse/default/Files/data/conversation_failed/'

In [None]:
# # This cell creates new folders within the specified base path in the lakehouse. 
# The purpose is to create corresponding folders so files can be moved as they are processed.

import os 

# Root directory in the lakehouse under which the folders are created
base_path = '/lakehouse/default/Files/'

# Folders that receive files once they have been processed
folders = ['cu_audio_files_processed']

# Make sure each folder exists; a failure is reported but does not stop the loop
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    try:
        os.makedirs(folder_path, exist_ok=True)
    except Exception as e:
        print(f'Failed to create the folder {folder_path}. Error: {e}')
    else:
        print(f'Folder created at: {folder_path}')

In [None]:
# Move input files to processed directory

import os
import shutil

# Names of all .wav files currently in the input directory
wav_files = list(filter(lambda name: name.endswith('.wav'), os.listdir(input_dir)))

print(wav_files)

# Move each .wav file from the input directory to the processed directory
for file_name in wav_files:
    source_path = os.path.join(input_dir, file_name)
    target_path = os.path.join(processed_dir, file_name)
    shutil.move(source_path, target_path)