In [None]:
import pandas as pd
import json

# Define the path to your Excel file
file_path = "path_to_your_excel_file.xlsx"  # Replace with the actual path to your file

# Load the Excel file
data_dict = pd.read_excel(file_path, sheet_name=0)  # Adjust the sheet name if needed

# Define the required columns for the flattened JSON
columns_to_extract = {
    "Variable Name": "Variable Name",               # Column E
    "Extracted CRF Name": "Extracted CRF Name",     # Column B
    "Matched HEAL Core CRF": "Matched HEAL Core CRF",  # Column C
    "Match Confidence": "Match Confidence",         # Column D
    "Short Description": "Short Description",       # Column G
    "PV Description": "PV Description",            # Column J
    "Data Type": "Data Type"                        # Column K
}

# Extract and rename the required columns
flattened_df = data_dict[list(columns_to_extract.values())].rename(columns=columns_to_extract)

# Set the main entry key as "Variable Name"
flattened_dict = flattened_df.set_index("Variable Name").to_dict(orient="index")

# Save the flattened JSON file
output_json_path = "Flattened_DataDictionary_By_VariableName.json"  # Adjust the output path if needed
with open(output_json_path, "w") as file:
    json.dump(flattened_dict, file, indent=4)

print(f"Flattened JSON file saved to: {output_json_path}")


In [None]:
# talk to assistant via webapp 

In [29]:
import os
import asyncio
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from openai import AssistantEventHandler

In [30]:
# Load environment variables from .env file
load_dotenv()

# Set the OpenAI API key from the environment variable
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set!")
client = OpenAI(api_key=api_key)

In [31]:
async def get_or_create_assistant(client, name, instructions):
    # Fetch the list of existing assistants (no await needed)
    assistants = client.beta.assistants.list()
    
    # Search for an assistant by name
    for assistant in assistants:
        if assistant.name == name:  # Use dot notation instead of subscript
            print(f"[INFO] Using existing assistant: {assistant.id}")
            return assistant
    
    # Create a new assistant if not found
    print("[INFO] Creating a new assistant...")
    assistant = client.beta.assistants.create(
        name=name,
        instructions=instructions,
        tools=[{"type": "file_search"}]
    )
    print(f"[INFO] New assistant created with ID: {assistant['id']}")
    return assistant


In [33]:
# Main function to test thread creation and response
async def main():
    client = OpenAI(api_key="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")  # Replace with your API key

    # Create or get the assistant
    assistant = await get_or_create_assistant(
        client,
        name="Test-Assistant",
        instructions="You are a helpful assistant. Answer user questions clearly and concisely."
    )
    print(f"[INFO] Assistant ID: {assistant.id}")
    
    # Create a new thread
    try:
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "assistant",
                    "content": "You are a helpful assistant. Answer user questions clearly and accurately."
                },
                {
                    "role": "user",
                    "content": "What is the capital of France?"
                }
            ]
        )
        print(f"Thread created with ID: {thread.id}")
    except Exception as e:
        print(f"[ERROR] Failed to create thread: {e}")
        return

    # Fetch all thread messages
    try:
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        for message in messages:
            print(f"[DEBUG] Thread message: {message.role} - {message.content}")
    except Exception as e:
        print(f"[ERROR] Failed to fetch thread messages: {e}")
        return

    # Send another message and fetch the assistant's response
    try:
        response = client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content="Can you confirm the capital of France?"
        )
        print(f"[DEBUG] Assistant's response: {response.content}")
    except Exception as e:
        print(f"[ERROR] Failed to send message or fetch response: {e}")

# Run the main function
asyncio.run(main())

[INFO] Using existing assistant: asst_LZR0DZ1lZNsHK7d15kdwndwd
[INFO] Assistant ID: asst_LZR0DZ1lZNsHK7d15kdwndwd
Thread created with ID: thread_IFmb4qtgfCd0kjNMMlEk1d1G
[DEBUG] Thread message: user - [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]
[DEBUG] Thread message: assistant - [TextContentBlock(text=Text(annotations=[], value='You are a helpful assistant. Answer user questions clearly and accurately.'), type='text')]
[DEBUG] Assistant's response: [TextContentBlock(text=Text(annotations=[], value='Can you confirm the capital of France?'), type='text')]


In [20]:
response = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="What is the capital of France?"
)
print(f"[DEBUG] Assistant's response: {response.content}")

# Fetch and print all messages in the thread for troubleshooting
thread_messages = client.beta.threads.messages.list(thread_id=thread.id)
for message in thread_messages:
    print(f"[DEBUG] Thread message: {message.role} - {message.content}")


[DEBUG] Assistant's response: [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]
[DEBUG] Thread message: user - [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]
[DEBUG] Thread message: user - [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]


In [5]:
async def get_or_create_vector_store(client, name, file_path):
    # Fetch the list of existing vector stores (no await needed for list)
    vector_stores = client.beta.vector_stores.list()
    
    # Iterate through existing vector stores
    for store in vector_stores:
        if store.name == name:  # Compare store.name with the provided name
            print(f"[INFO] Using existing vector store: {store.id}")
            return store.id  # Return the ID of the matched vector store
    
    # Create a new vector store if not found
    print("[INFO] Creating a new vector store...")
    vector_store = await client.beta.vector_stores.create(name=name)
    
    # Upload files to the new vector store
    file_streams = [open(file_path, "rb")]
    file_batch = await client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
    print(f"[INFO] Vector store created with ID: {vector_store.id}")
    print(f"[INFO] File batch status: {file_batch.status}")
    return vector_store.id


In [6]:
async def update_assistant_with_knowledge_base(client, assistant_id, vector_store_id):
    print(f"[DEBUG] Linking vector store {vector_store_id} to assistant {assistant_id}...")  # Add debug log
    client.beta.assistants.update(
        assistant_id=assistant_id,
        tool_resources={
            "file_search": {"vector_store_ids": [vector_store_id]}
        }
    )
    print(f"[INFO] Assistant {assistant_id} linked to vector store {vector_store_id}")


In [7]:
# Function to process variables
async def process_variables(df, client, assistant_id, thread_id):
    for _, row in df.iterrows():
        variable_name = row["Variable / Field Name"]
        module = row["Form Name"]
        title = row["Field Label"]
        description = row.get("Description", "")
        var_type = row.get("Field Type", "")
        encodings = row.get("Choices, Calculations, OR Slider Labels", "")

        prompt = f"""
        You are a HEAL CDE identification assistant. Use the HEAL CDE knowledge base to analyze the following variable and determine if it matches any HEAL CDE. Provide a detailed explanation based on the HEAL CDE knowledge base.
        - **Variable Name**: {variable_name}
        - **Module**: {module}
        - **Title**: {title}
        - **Description**: {description}
        - **Type**: {var_type}
        - **Encodings**: {encodings}
        
        Respond based on the HEAL CDE knowledge base.
        """
        try:
            # Send message to the assistant
            client.beta.threads.messages.create(
                thread_id=thread_id,
                role="user",
                content=prompt
            )
            print(f"[DEBUG] Successfully sent message for variable: {variable_name}")

            # Fetch the latest messages in the thread
            thread_messages = client.beta.threads.messages.list(thread_id=thread_id)
            latest_message = None
            for message in thread_messages:
                latest_message = message  # Iterate to get the most recent message

            if latest_message:
                print(f"[DEBUG] Assistant's response for {variable_name}: {latest_message.content}")
            else:
                print(f"[ERROR] No response received for variable '{variable_name}'")
        
        except Exception as e:
            print(f"[ERROR] Failed to process variable '{variable_name}': {e}")

In [8]:
# Event handler class
class EventHandler(AssistantEventHandler):
    def on_text_created(self, text) -> None:
        print(f"\nassistant > {text}")

In [11]:
# Main function
async def main():
    client = OpenAI(api_key=api_key)

    # Create or get assistant
    assistant = await get_or_create_assistant(
        client,
        "Test-Assistant",
        "You are a helpful assistant. Answer user questions clearly and concisely.",
        "gpt-4o"
    )

    # Create a test thread
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    )

    # Get the thread ID
    thread_id = thread.id
    print(f"Thread created with ID: {thread_id}")

    # Fetch the assistant's response
    try:
        response = client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content="What is the capital of France?"
        )
        print(f"[DEBUG] Assistant's response: {response.content}")
    except Exception as e:
        print(f"[ERROR] Failed to get response: {e}")

# Run the script
await main()


[INFO] Using existing assistant: asst_LZR0DZ1lZNsHK7d15kdwndwd
Thread created with ID: thread_5IYmz4Eu72JZKf6XBQ598V9D
[DEBUG] Assistant's response: [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]


In [10]:
import nest_asyncio
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv

nest_asyncio.apply()

# Load environment variables
load_dotenv()

# Main function
async def main():
    client = OpenAI(api_key=api_key)
    
    # Define paths and names
    vector_store_name = "All HEAL CDEs KB"
    assistant_name = "CDE-detective"
    knowledge_base_file = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_flattened.json"
    data_dictionary_path = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\SAMPLE_HDP00980_iHOPEAim1_DataDictionary_2024-12-13_enhanced.xlsx"

    # Check file paths
    if not os.path.exists(knowledge_base_file):
        print(f"[ERROR] Knowledge base file not found: {knowledge_base_file}")
        return
    if not os.path.exists(data_dictionary_path):
        print(f"[ERROR] Data dictionary file not found: {data_dictionary_path}")
        return

    # Load data dictionary
    try:
        df = pd.read_excel(data_dictionary_path)
    except Exception as e:
        print(f"[ERROR] Failed to load data dictionary: {e}")
        return

    # Create or get assistant and vector store
    assistant = await get_or_create_assistant(
        client, 
        assistant_name, 
        "You are a HEAL CDE identification assistant. Use the provided knowledge base to analyze variables and determine if they match any HEAL CDE.",
        "gpt-4o"
    )
    vector_store_id = await get_or_create_vector_store(client, vector_store_name, knowledge_base_file)
    
    # Link assistant to vector store
    try:
        await update_assistant_with_knowledge_base(client, assistant.id, vector_store_id)
    except Exception as e:
        print(f"[ERROR] Failed to link vector store: {e}")
        return
    
    # Create a new thread
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": "Analyze variables from the data dictionary to determine HEAL CDE matches."
            }
        ],
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}}
    )

    # Get the thread ID
    thread_id = thread.id
    print(f"Thread created with ID: {thread_id}")

    # Test query to check file search functionality
    try:
        response = client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content="Find any information about the variable 'age' in the HEAL CDE knowledge base."
        )
        print(f"[DEBUG] File search test query response: {response.content}")
    except Exception as e:
        print(f"[ERROR] Failed to perform file search test query: {e}")
        return

    # Process variables
    print("[INFO] Processing variables...")
    try:
        await process_variables(df, client, assistant.id, thread_id)
    except Exception as e:
        print(f"[ERROR] Failed to process variables: {e}")
        return
    print("[INFO] Finished processing variables.")

# Run the script
await main()

[INFO] Using existing assistant: asst_1323thTbBBESBJYw3JlOTiiP
[INFO] Using existing vector store: vs_22m2znc8T2bqfwkup3qKKCWG
[DEBUG] Linking vector store vs_22m2znc8T2bqfwkup3qKKCWG to assistant asst_1323thTbBBESBJYw3JlOTiiP...
[INFO] Assistant asst_1323thTbBBESBJYw3JlOTiiP linked to vector store vs_22m2znc8T2bqfwkup3qKKCWG
Thread created with ID: thread_iI3hPJe40GCRyKqcuUxDgrHr
[DEBUG] File search test query response: [TextContentBlock(text=Text(annotations=[], value="Find any information about the variable 'age' in the HEAL CDE knowledge base."), type='text')]
[INFO] Processing variables...
[DEBUG] Successfully sent message for variable: exclusion_total
[DEBUG] Assistant's response for exclusion_total: [TextContentBlock(text=Text(annotations=[], value='Analyze variables from the data dictionary to determine HEAL CDE matches.'), type='text')]
[DEBUG] Successfully sent message for variable: nrs
[DEBUG] Assistant's response for nrs: [TextContentBlock(text=Text(annotations=[], value='A