In [None]:
%pip install azure-ai-inference

In [None]:
import openai
import json
import ast
import base64
import pandas as pd
import tiktoken

In [None]:
# Name of the Azure Key Vault passed to get_secrets_from_kv when reading secrets.
# NOTE(review): the value looks like a placeholder that is substituted before the
# notebook is run — confirm with whatever deploys this notebook.
key_vault_name = 'kv_to-be-replaced'
# Name of the table whose distinct `topic` values are queried for topic mining.
table_name = 'processed_data'

In [None]:
from trident_token_library_wrapper import PyTridentTokenLibrary as tl
def get_secrets_from_kv(kv_name, secret_name):
    """Read one secret from an Azure Key Vault.

    Args:
        kv_name: Key Vault name; it is interpolated into
            ``https://<kv_name>.vault.azure.net/``.
        secret_name: Name of the secret to fetch.

    Returns:
        Whatever ``tl.get_secret_with_token`` returns for that secret.
    """
    vault_url = f'https://{kv_name}.vault.azure.net/'
    # The token is requested for the "keyvault" audience from the notebook runtime.
    token = mssparkutils.credentials.getToken("keyvault")
    return tl.get_secret_with_token(vault_url, secret_name, token)

In [None]:
# GPT-4o-mini
# api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
# api_type = "azure"
# api_version = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION")
# endpoint = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")

# Phi-3
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

# Resolve the inference endpoint URL and API key from Key Vault, then build the
# chat-completions client that the functions below call as `client`.
endpoint = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-INFERENCE-ENDPOINT")
api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-INFERENCE-KEY")
client = ChatCompletionsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(api_key),
)

In [None]:
# Pull the distinct topic values of the configured table into pandas.
sql_stmt = f'SELECT distinct topic FROM {table_name}'
df = spark.sql(sql_stmt).toPandas()

# A NULL topic comes back as None/NaN and would make str.join raise TypeError,
# so drop missing values and coerce the rest to str before building the
# comma-separated corpus that is sent to the model.
topics_str = ', '.join(df['topic'].dropna().astype(str).tolist())


def call_gpt4(topics_str1):
    """Ask the chat model to derive topics from the given text and parse the reply.

    The request is sent through the module-level ``client`` as a single user
    message (per the notebook's own comments this is the Phi-3 path, despite
    the function name).

    Args:
        topics_str1: Text corpus to analyse; it is embedded verbatim in the prompt.

    Returns:
        The object produced by ``json.loads`` on the model reply after Markdown
        code fences have been removed. The prompt asks for a ``topics`` node
        whose entries carry ``label`` and ``description``.

    Raises:
        json.JSONDecodeError: If the cleaned reply is not valid JSON.
    """
    topic_prompt = f"""
        You are a data analysis assistant specialized in natural language processing and topic modeling. 
        Your task is to analyze the given text corpus and identify distinct topics present within the data.
        {topics_str1}
        1. Identify the key topics in the text using topic modeling techniques. 
        2. Choose the right number of topics based on data. Try to keep it up to 8 topics.
        3. Assign a clear and concise label to each topic based on its content.
        4. Provide a brief description of each topic along with its label.
        5. Add parental controls, billing issues like topics to the list of topics if the data includes calls related to them.
        
        If the input data is insufficient for reliable topic modeling, indicate that more data is needed rather than making assumptions. 
        Ensure that the topics and labels are accurate, relevant, and easy to understand.

        Return the topics and their labels in JSON format.Always add 'topics' node and 'label', 'description' attriubtes in json.
        Do not return anything else.
        """

    completion = client.complete(
        messages=[UserMessage(content=topic_prompt)],
        max_tokens=500,
        temperature=0,
        top_p=1,
    )

    reply_text = completion.choices[0].message.content
    # The model may wrap its JSON in Markdown fences; remove the opening
    # "```json" marker first, then any remaining bare fence.
    for fence in ("```json", "```"):
        reply_text = reply_text.replace(fence, '')
    return json.loads(reply_text)

# Function to count the number of tokens in a string using tiktoken
def count_tokens(text, encoding='gpt-4'):
    """Return the number of tokens `text` encodes to.

    Args:
        text: String to tokenize.
        encoding: Value handed to ``tiktoken.encoding_for_model`` — i.e. a
            model name, despite the parameter being called ``encoding``.

    Returns:
        Length of the token sequence produced by the tokenizer's ``encode``.
    """
    return len(tiktoken.encoding_for_model(encoding).encode(text))

# Function to split a comma-separated string into chunks that fit within max_tokens
def split_data_into_chunks(text, max_tokens=2000, encoding='gpt-4'):
    """Split a comma-separated string into chunks that fit a token budget.

    Items are obtained by splitting `text` on commas and stripping surrounding
    whitespace; blank items are ignored. Items are packed greedily, in order,
    into chunks whose summed per-item token counts do not exceed `max_tokens`.
    Only the items themselves are counted — the ``', '`` separators used to
    join a chunk are not included in the budget.

    Args:
        text: Comma-separated input string.
        max_tokens: Maximum summed item token count per chunk.
        encoding: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        List of chunk strings, each being its items joined with ``', '``.
        Empty or all-blank input yields an empty list. A single item that on
        its own exceeds `max_tokens` is emitted as its own (oversized) chunk.
    """
    print("\n Split data input:", text)
    tokenizer = tiktoken.encoding_for_model(encoding)

    all_chunks = []
    current_chunk = []
    current_token_count = 0

    for item in text.split(','):
        item = item.strip()
        if not item:
            # Skip blanks produced by empty input or consecutive commas.
            continue

        item_token_count = len(tokenizer.encode(item))

        # Flush only when there is something to flush; otherwise an oversized
        # first item would produce a spurious empty chunk.
        if current_chunk and current_token_count + item_token_count > max_tokens:
            all_chunks.append(', '.join(current_chunk))
            current_chunk = []
            current_token_count = 0

        current_chunk.append(item)
        current_token_count += item_token_count

    if current_chunk:
        all_chunks.append(', '.join(current_chunk))
    return all_chunks


# Token budget applied to each chunk of the topic corpus.
# NOTE(review): the original comment cites a 4096-token GPT-4 limit, while the
# model calls in this notebook go through `client` — confirm this budget suits
# the deployed model.
max_tokens = 3096

# Pre-split the comma-separated topic corpus into chunks within that budget.
chunks = split_data_into_chunks(text=topics_str, max_tokens=max_tokens)

def reduce_data_until_fits(topics_str, max_tokens):
    """Summarise a topic corpus with the model, reducing it first if it is too large.

    If `topics_str` fits within `max_tokens` (measured with ``count_tokens``),
    it is sent to ``call_gpt4`` directly. Otherwise it is split into chunks of
    at most `max_tokens`, each chunk is summarised on its own, the resulting
    topic labels are joined into a new, shorter corpus, and the procedure is
    repeated on that corpus.

    Args:
        topics_str: Comma-separated topic corpus.
        max_tokens: Token budget for a single model call.

    Returns:
        The parsed result of the final ``call_gpt4`` call.

    Raises:
        RuntimeError: If no chunk produced any label, or if a reduction pass
            did not make the corpus smaller (which would otherwise recurse
            without end).
    """
    # Compare tokens with tokens; the character length is not the budget unit.
    total_tokens = count_tokens(topics_str)
    if total_tokens <= max_tokens:
        return call_gpt4(topics_str)

    # Chunk with the caller's budget rather than the splitter's default.
    chunks = split_data_into_chunks(topics_str, max_tokens)
    reduced_data = []

    for idx, chunk in enumerate(chunks):
        print(f"Processing chunk {idx + 1}/{len(chunks)}...")
        try:
            # Use this chunk's own result, not any module-level variable.
            result = call_gpt4(chunk)
            reduced_data.extend(str(topic['label']) for topic in result['topics'])
        except Exception as e:
            # Best effort: a failing chunk is reported and skipped.
            print(f"Error processing chunk {idx + 1}: {str(e)}")

    if not reduced_data:
        raise RuntimeError("No topic labels were extracted from any chunk; cannot reduce the input.")

    combined_data = ", ".join(reduced_data)
    if count_tokens(combined_data) >= total_tokens:
        raise RuntimeError("Reduction pass did not shrink the input; aborting to avoid endless recursion.")
    return reduce_data_until_fits(combined_data, max_tokens)

# Send the whole topic corpus to the model in a single call; both names are
# bound to the same parsed reply.
topics_object = call_gpt4(topics_str)
res = topics_object


In [None]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Two nullable string columns, one per attribute requested from the model.
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ('label', 'description')]
)
df = spark.createDataFrame(topics_object['topics'], schema)
# Replace the table (and its schema) on every run. The table name is spelled
# 'mined_topcis' here and where it is read back, so it is kept as is.
(
    df.write
    .format('delta')
    .mode('overwrite')
    .option("overwriteSchema", "true")
    .saveAsTable('mined_topcis')
)
# display(df)

In [None]:
def get_mined_topic_mapping(input_text, list_of_topics):
    """Ask the chat model which predefined topic matches a piece of text.

    Args:
        input_text: Text to classify; embedded verbatim in the prompt.
        list_of_topics: Predefined topics, embedded verbatim in the prompt
            (the caller in this notebook passes a comma-separated string).

    Returns:
        The raw content of the first choice of the model reply, unmodified.
    """
    prompt = f'''You are a data analysis assistant to help find topic from a given text {input_text} 
             and a list of predefined topics {list_of_topics}.  
             Always find the topic from {list_of_topics}. Do not add new topics.
            Only return topic and nothing else.'''

    # Single user message, deterministic sampling settings.
    completion = client.complete(
        messages=[UserMessage(content=prompt)],
        max_tokens=500,
        temperature=0,
        top_p=1,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Read the mined topics back and flatten their labels into one comma-separated
# string, which is what get_mined_topic_mapping receives as the topic list.
df_topics = (
    spark
    .sql('select * from mined_topcis')
    .toPandas()
)
mined_topics_list = df_topics['label'].tolist()
mined_topics = ", ".join(mined_topics_list)

In [None]:
# Load the table named by `table_name` (the same table the distinct topics were
# read from) into pandas.
sql_stmt = f'select * from {table_name}'
df_processed_data = spark.sql(sql_stmt).toPandas()
counter = 0  # kept from the original cell; nothing in this cell reads it

# Map every row's topic to one of the mined topics. Each model call is a remote
# request, so the answer is cached per distinct topic value instead of being
# requested again for every row that repeats it.
mined_topic_by_topic = {}
mined_topic_values = []
for topic in df_processed_data['topic']:
    if topic not in mined_topic_by_topic:
        mined_topic_by_topic[topic] = get_mined_topic_mapping(topic, mined_topics)
    mined_topic_values.append(mined_topic_by_topic[topic])
df_processed_data['mined_topic'] = mined_topic_values

# Write the enriched rows back over the source table, replacing its schema.
df = spark.createDataFrame(df_processed_data)
df.write.format('delta').mode('overwrite').option("overwriteSchema", "true").saveAsTable(table_name)