In [233]:
import os
import asyncio
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from openai import AssistantEventHandler


In [234]:
# Load environment variables from .env file
load_dotenv()

# Set the OpenAI API key from the environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai.api_key)

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set!")
openai.api_key = api_key
client = OpenAI(api_key=openai.api_key)

In [216]:
async def get_or_create_assistant(client, name, instructions, model):
    # Fetch the list of existing assistants (no await needed)
    assistants = client.beta.assistants.list()
    
    # Search for an assistant by name
    for assistant in assistants:
        if assistant.name == name:  # Use dot notation instead of subscript
            print(f"[INFO] Using existing assistant: {assistant.id}")
            return assistant
    
    # Create a new assistant if not found
    print("[INFO] Creating a new assistant...")
    assistant = client.beta.assistants.create(
        name=name,
        instructions=instructions,
        model=model,
        tools=[{"type": "file_search"}]
    )
    print(f"[INFO] New assistant created with ID: {assistant['id']}")
    return assistant


In [217]:
async def get_or_create_vector_store(client, name, file_path):
    # Fetch the list of existing vector stores (no await needed for list)
    vector_stores = client.beta.vector_stores.list()
    
    # Iterate through existing vector stores
    for store in vector_stores:
        if store.name == name:  # Compare store.name with the provided name
            print(f"[INFO] Using existing vector store: {store.id}")
            return store.id  # Return the ID of the matched vector store
    
    # Create a new vector store if not found
    print("[INFO] Creating a new vector store...")
    vector_store = await client.beta.vector_stores.create(name=name)
    
    # Upload files to the new vector store
    file_streams = [open(file_path, "rb")]
    file_batch = await client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
    print(f"[INFO] Vector store created with ID: {vector_store.id}")
    print(f"[INFO] File batch status: {file_batch.status}")
    return vector_store.id


In [218]:
async def update_assistant_with_knowledge_base(client, assistant_id, vector_store_id):
    print(f"[DEBUG] Linking vector store {vector_store_id} to assistant {assistant_id}...")  # Add debug log
    client.beta.assistants.update(
        assistant_id=assistant_id,
        tool_resources={
            "file_search": {"vector_store_ids": [vector_store_id]}
        }
    )
    print(f"[INFO] Assistant {assistant_id} linked to vector store {vector_store_id}")


In [219]:
# Function to process variables
async def process_variables(df, client, assistant_id, thread_id):
    for _, row in df.iterrows():
        variable_name = row["Variable / Field Name"]
        module = row["Form Name"]
        title = row["Field Label"]
        description = row.get("Description", "")
        var_type = row.get("Field Type", "")
        encodings = row.get("Choices, Calculations, OR Slider Labels", "")

        prompt = f"""
        You are a HEAL CDE identification assistant. Use the HEAL CDE knowledge base to analyze the following variable and determine if it matches any HEAL CDE. Provide a detailed explanation based on the HEAL CDE knowledge base.
        - **Variable Name**: {variable_name}
        - **Module**: {module}
        - **Title**: {title}
        - **Description**: {description}
        - **Type**: {var_type}
        - **Encodings**: {encodings}
        
        Respond based on the HEAL CDE knowledge base.
        """
        try:
            # Send message to the assistant
            client.beta.threads.messages.create(
                thread_id=thread_id,
                role="user",
                content=prompt
            )
            print(f"[DEBUG] Successfully sent message for variable: {variable_name}")

            # Fetch the latest messages in the thread
            thread_messages = client.beta.threads.messages.list(thread_id=thread_id)
            latest_message = None
            for message in thread_messages:
                latest_message = message  # Iterate to get the most recent message

            if latest_message:
                print(f"[DEBUG] Assistant's response for {variable_name}: {latest_message.content}")
            else:
                print(f"[ERROR] No response received for variable '{variable_name}'")
        
        except Exception as e:
            print(f"[ERROR] Failed to process variable '{variable_name}': {e}")

In [220]:
# Event handler class
class EventHandler(AssistantEventHandler):
    def on_text_created(self, text) -> None:
        print(f"\nassistant > {text}")

In [223]:
import nest_asyncio
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv

nest_asyncio.apply()

# Load environment variables
load_dotenv()

# Main function
async def main():
    client = OpenAI(api_key=openai.api_key)
    
    # Define paths and names
    vector_store_name = "All HEAL CDEs KB"
    assistant_name = "CDE-detective"
    knowledge_base_file = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_flattened.json"
    data_dictionary_path = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\SAMPLE_HDP00980_iHOPEAim1_DataDictionary_2024-12-13_enhanced.xlsx"

    # Check file paths
    if not os.path.exists(knowledge_base_file):
        print(f"[ERROR] Knowledge base file not found: {knowledge_base_file}")
        return
    if not os.path.exists(data_dictionary_path):
        print(f"[ERROR] Data dictionary file not found: {data_dictionary_path}")
        return

    # Load data dictionary
    try:
        df = pd.read_excel(data_dictionary_path)
    except Exception as e:
        print(f"[ERROR] Failed to load data dictionary: {e}")
        return

    # Create or get assistant and vector store
    assistant = await get_or_create_assistant(
        client, 
        assistant_name, 
        "You are a HEAL CDE identification assistant. Use the provided knowledge base to analyze variables and determine if they match any HEAL CDE.",
        "gpt-4o"
    )
    vector_store_id = await get_or_create_vector_store(client, vector_store_name, knowledge_base_file)
    
    # Link assistant to vector store
    try:
        await update_assistant_with_knowledge_base(client, assistant.id, vector_store_id)
    except Exception as e:
        print(f"[ERROR] Failed to link vector store: {e}")
        return
    
    # Create a new thread
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": "Analyze variables from the data dictionary to determine HEAL CDE matches."
            }
        ],
        tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}}
    )

    # Get the thread ID
    thread_id = thread.id
    print(f"Thread created with ID: {thread_id}")

    # Test query to check file search functionality
    try:
        response = client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content="Find any information about the variable 'age' in the HEAL CDE knowledge base."
        )
        print(f"[DEBUG] File search test query response: {response.content}")
    except Exception as e:
        print(f"[ERROR] Failed to perform file search test query: {e}")
        return

    # Process variables
    print("[INFO] Processing variables...")
    try:
        await process_variables(df, client, assistant.id, thread_id)
    except Exception as e:
        print(f"[ERROR] Failed to process variables: {e}")
        return
    print("[INFO] Finished processing variables.")

# Run the script
await main()

[INFO] Using existing assistant: asst_1323thTbBBESBJYw3JlOTiiP
[INFO] Using existing vector store: vs_22m2znc8T2bqfwkup3qKKCWG
[DEBUG] Linking vector store vs_22m2znc8T2bqfwkup3qKKCWG to assistant asst_1323thTbBBESBJYw3JlOTiiP...
[INFO] Assistant asst_1323thTbBBESBJYw3JlOTiiP linked to vector store vs_22m2znc8T2bqfwkup3qKKCWG
Thread created with ID: thread_hxjJEtosfBVPkPWrR4SY71E1
[DEBUG] File search test query response: [TextContentBlock(text=Text(annotations=[], value="Find any information about the variable 'age' in the HEAL CDE knowledge base."), type='text')]
[INFO] Processing variables...
[DEBUG] Successfully sent message for variable: exclusion_total
[DEBUG] Assistant's response for exclusion_total: [TextContentBlock(text=Text(annotations=[], value='Analyze variables from the data dictionary to determine HEAL CDE matches.'), type='text')]
[DEBUG] Successfully sent message for variable: nrs
[DEBUG] Assistant's response for nrs: [TextContentBlock(text=Text(annotations=[], value='A

In [232]:
async def simple_test(client):
    # Create a thread with detailed system instructions
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "assistant",
                "content": (
                    "You are an AI assistant. Your goal is to answer user questions clearly, concisely, "
                    "and accurately. If the user asks 'What is the capital of France?', your response should be 'The capital of France is Paris.'"
                )
            }
        ]
    )
    
    thread_id = thread.id
    print(f"Thread created with ID: {thread_id}")

    # Send a simple test query
    try:
        response = client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content="What is the capital of France?"
        )
        print(f"[DEBUG] Assistant's response: {response.content}")
    except Exception as e:
        print(f"[ERROR] Failed to get response: {e}")


async def inspect_thread(client, thread_id):
    thread_messages = client.beta.threads.messages.list(thread_id=thread_id)
    for message in thread_messages:
        print(f"[DEBUG] Message Role: {message.role}, Content: {message.content}")

await inspect_thread(client, thread_id)



NameError: name 'thread_id' is not defined

In [231]:
async def main():
    client = OpenAI(api_key=openai.api_key)

    # Run the simple test to validate responses
    await simple_test(client)

# Run the script
await main()


Thread created with ID: thread_a6DrAlbvO9osstf93gE4AL1i
[DEBUG] Assistant's response: [TextContentBlock(text=Text(annotations=[], value='What is the capital of France?'), type='text')]


In [20]:
import openai
import json
import os
import asyncio
import configparser
from openai import OpenAI
from dotenv import load_dotenv

# openai: The OpenAI SDK to interact with their APIs.
# json: For working with JSON data.
# os: Access environment variables and file paths.
# asyncio: Enables asynchronous operations for better performance.
# configparser: Reads configuration files like config.ini.
# OpenAI: A specific OpenAI class for client interactions.


In [21]:
# Load environment variables from .env file
load_dotenv()

# Set the OpenAI API key from the environment variable
openai.api_key = os.getenv("OPENAI_API_KEY")
# Initialize the OpenAI client with your API key
client = OpenAI(api_key=openai.api_key)

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set!")
openai.api_key = api_key
client = OpenAI(api_key=openai.api_key)



In [526]:
# Creates an assistant named "CDE-detective" with specific instructions for identifying HEAL CDE variables.
assistant = client.beta.assistants.create(
name="CDE-detective",
instructions="You are a helpful HEAl CDE Identification assistant to see if a variable is a HEAL CDE based on the files provided to you. The goal is to determine whether each name, which represents a variable, matches a HEAL CDE variable from a pre-defined list of HEAL CDE variables stored in your knowledge base.",
model="gpt-4o",
tools=[{"type": "file_search"}]
)

In [None]:
# Create a vector store
vector_store = client.beta.vector_stores.create(name="All HEAL CDEs KB")

# Ready the files for upload to OpenAI
file_paths = [r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_flattened.json"]
file_streams = [open(path, "rb") for path in file_paths]

# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
vector_store_id=vector_store.id, files=file_streams
)

# You can print the status and the file counts of the batch to see the result of this operation.
print(file_batch.status)
print(file_batch.file_counts)

In [None]:
# Updating the Assistant to Use the Vector Store
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)
print(f"[DEBUG] Assistant updated with tool resources: {assistant.tool_resources}")


In [None]:
# Upload the user provided file to OpenAI
message_file = client.files.create(
file=open(r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\SAMPLE_DataDictionary_ForTesting.json", "rb"), purpose="assistants"
)

# Create a thread and attach the file to the message
thread = client.beta.threads.create(
messages=[
  {
    "role": "user",
    "content": "Analyze the contents of each name nested under its respective module in the provided JSON file. The goal is to determine whether each name, which represents a variable, matches a HEAL CDE variable from a pre-defined list of HEAL CDE variables stored in your knowledge base.",
    # Attach the new file to the message.
    "attachments": [
      { "file_id": message_file.id, "tools": [{"type": "file_search"}] }
    ],
  }
]
)

# The thread now has a vector store with that file in its tool resources.
print(thread.tool_resources.file_search)

In [None]:
import pandas as pd

# Load the data dictionary (CSV or Excel)
file_path = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\SAMPLE_HDP00980_iHOPEAim1_DataDictionary_2024-12-13_enhanced.xlsx"  # Change to .xlsx if necessary
df = pd.read_excel(file_path)  # Or use pd.read_excel(file_path)

# Define column mappings (customize for your data dictionary)
column_mappings = {
    "variable": "Variable / Field Name",         # Replace "name" with your actual column name for variables
    "module": "Form Name",
    "heal_crf_match": "Matched HEAL Core CRF",
    "title": "Field Label",
    "description": "",
    "type": "Field Type",
    "encodings": "Choices, Calculations, OR Slider Labels",
}

# Validate that all columns exist in the DataFrame
optional_columns = ["description", "heal_crf_match", "type", "encodings"]

for key, col in column_mappings.items():
    if col not in df.columns and key not in optional_columns:
        raise ValueError(f"Column '{col}' for '{key}' not found in the file.")
    elif col not in df.columns:
        print(f"Warning: Optional column '{col}' for '{key}' is missing. Defaulting to blank values.")
        df[col] = ""  # Create an empty column with blank values for optional fields


In [530]:
async def process_variables(df, client, assistant_id, thread_id):
    for _, row in df.iterrows():
        variable_name = row[column_mappings["variable"]]
        module = row[column_mappings["module"]]
        title = row[column_mappings["title"]]
        description = row[column_mappings["description"]]
        var_type = row[column_mappings["type"]]
        encodings = row[column_mappings["encodings"]]

        # Construct the prompt
        prompt = f"""
        Analyze the following variable:
        - **Variable Name**: {variable_name}
        - **Module**: {module}
        - **Title**: {title}
        - **Description**: {description}
        - **Type**: {var_type}
        - **Encodings**: {encodings}
        
        Do not summarize or review the entire reference document. Instead, determine if this specific variable matches any HEAL CDE and explain your reasoning.
        """
        
        # Send message to the assistant
        try:
            await client.beta.threads.messages.create(
                thread_id=thread_id,
                messages=[{"role": "user", "content": prompt}],
            )
            print(f"[DEBUG] Successfully sent message for variable: {variable_name}")
        except Exception as e:
            print(f"[ERROR] Failed to process variable '{variable_name}': {e}")


In [531]:
async def main():
    # Initialize the OpenAI client
    client = OpenAI(api_key=openai_api_key)

    # Create the assistant
    assistant = client.beta.assistants.create(
        name="CDE-detective",
        instructions="You are a helpful HEAL CDE Identification assistant to see if a variable is a HEAL CDE based on the All_HEALPAINCDEsDD_flattened.json provided to you.",
        model="gpt-4o",
        tools=[{"type": "file_search"}]
    )

    # Create a new thread
    thread = await client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": "This thread will analyze variables from the data dictionary to determine HEAL CDE matches."
            }
        ],
        assistant_id=assistant.id,
    )

    # Get the thread ID
    thread_id = thread.id
    print(f"Thread created with ID: {thread_id}")

    # Pass the DataFrame to the processing function
    print("[DEBUG] Calling process_variables...")
    await process_variables(df, client, assistant.id, thread_id=thread_id)
    print("[DEBUG] Finished processing variables.")

In [None]:
# Handling Assistant Events
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI

client = OpenAI(api_key='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')

class EventHandler(AssistantEventHandler):
  @override
  def on_text_created(self, text) -> None:
      print(f"\nassistant > ", end="", flush=True)

  @override
  def on_tool_call_created(self, tool_call):
      print(f"\nassistant > {tool_call.type}\n", flush=True)

  @override
  def on_message_done(self, message) -> None:
    try:
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))
    except AttributeError as e:
        print(f"Error processing message: {e}")

# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.

with client.beta.threads.runs.stream(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Analyze the variable and its description to determine whether it matches a HEAL CDE variable from the HEAL CDE variables stored in your knowledge base.",
  event_handler=EventHandler(),
) as stream:
  stream.until_done()

In [None]:
# Define an asynchronous function to process a variable using OpenAI's chat completion
async def extract_variable_metadata(variable_name, module_name, file_id):
    instructions = """
    Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

    Response Format:
    For each variable entry, provide the following metadata in a flat JSON format:
    {
      "variable_name": "<original variable name from the input file>",
      "module_name": "<original module name from the input file>",
      "Matched_HEAL_Core_CRF": "<HEAL_CRF_match name from input file>", 
      "standards_mapping_type": "<your chosen identification of whether or not this is a CDE match>",
      "heal_cde_name": "<best match for HEAL CDE from the reference All_HEALPAINCDEsDD_flattened.json file>",
      "variable_cde_name": "<the variable name as listed in the reference All_HEALPAINCDEsDD_flattened.json file for the respected HEAL CDE match>"
    }
    """

    prompt = f"""
    Variable Name: {variable_name}
    Module Name: {module_name}

    Using the provided file, generate the metadata for this variable based on the instructions above.
    """

    # Make the asynchronous API call
    response = await openai.ChatCompletion.acreate(
        model="gpt-4",
        messages=[
            {"role": "system", "content": instructions},
            {"role": "user", "content": prompt},
        ],
        tools=[
            {
                "type": "file",
                "file_id": file_id  # Use the actual file ID
            }
        ],
        max_tokens=500,
        temperature=0.5,
    )

    # Extract the response content
    content = response["choices"][0]["message"]["content"].strip()
    return content

# Main async function to run the process
async def main():
    variable_name = "tapstobaccoproductscl"
    module_name = "tobacco_alcohol_prescription_medication_substance"
    file_id = "file-Ev35xCQexzkLoLMN4RNAwR"  # Replace with the actual file ID

    # Call the extract function
    metadata = await extract_variable_metadata(variable_name, module_name, file_id)
    print("Extracted Metadata:", metadata)

# Run the main function
asyncio.run(main())

# ARCHIVED BELOW 

In [None]:
import openai
import asyncio
import os
import pandas as pd  # For data handling, like reading from Excel
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
import json
import configparser  # For reading configuration files
import re

import nest_asyncio
nest_asyncio.apply()

In [None]:
# Load configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# Debugging: Print sections and keys
print(f"Config sections: {config.sections()}")  # Should include 'OpenAI'
if 'OpenAI' in config:
    print(f"Keys in OpenAI section: {config['OpenAI']}")  # Should include 'api_key'

# Retrieve the API key
openai_api_key = config.get('OpenAI', 'api_key', fallback=None)
print(f"Retrieved API key: {openai_api_key}")  # Debugging: Check the key value
# Raise an error if the API key is missing
if not openai_api_key:
    raise ValueError("API key not found. Please set it in the 'config.ini' file under the [OpenAI] section.")

# Use the API key with OpenAI
client = AsyncOpenAI(api_key=openai_api_key)
print("Client initialized successfully.")

In [356]:
# Retrieve file paths and openai configs
json_input_file = config['Files']['json_input_file']
json_output_file = config['Files']['json_output_file']
akshay_assistant_id = config['OpenAI']['akshay_assistant_id']
variable_matching = config['Instructions']['variable_matching']
akshay_vs_id = config['OpenAI']['akshay_vs_id']
akshay_kb_file_id = config['OpenAI']['akshay_kb_file_id']
akshay_kbdo_file_id = config['OpenAI']['akshay_kbdo_file_id']


In [357]:
# Define Event Handler for debugging tool usage
class EventHandler(AssistantEventHandler):
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > Tool called: {tool_call.type}")

    def on_text_created(self, text):
        print(f"\nassistant > {text}")

    def on_message_done(self, message):
        print(f"Assistant completed the message: {message.content[0].text}")


In [None]:
# Define the assistant creation synchronously
def create_assistant():
    instructions = config.get("Instructions", "variable_matching", fallback=None)
    if not instructions:
        raise ValueError("Instructions not found in config.ini.")

    # Run the coroutine to create the assistant
    return asyncio.run(client.beta.assistants.create(
        model="gpt-4o-mini-2024-07-18",
        instructions=instructions,
        name="CDE ID Python",
        tools=[{"type": "file_search"}],
        tool_resources={
            "file_search": {
                "vector_store_ids": [akshay_vs_id]
            }
        }
    ))

# Call the create_assistant function
assistant = create_assistant()
print(f"Assistant created with tools: {assistant.tools}")


In [359]:
async def fetch_vector_stores():
    vector_stores = []
    async for store in client.beta.vector_stores.list():
        vector_stores.append(store)
    return vector_stores

async def setup_vector_store():
    try:
        # Fetch the target vector store ID from the config
        target_id = config['OpenAI']['akshay_vs_id']
        print(f"Target vector store ID: {target_id}")  # Debugging

        # Fetch the vector stores
        vector_stores = await client.beta.vector_stores.list()

        # Access the `data` attribute to get the list of VectorStore objects
        vector_store_list = vector_stores.data  # This contains the actual VectorStore objects

        # Iterate over the list of VectorStore objects
        for store in vector_store_list:
            print(f"Checking vector store: {store.id}")  # Debugging
            if store.id == target_id:
                print(f"✅ Using existing vector store: {store.id}")
                return store.id

        # If we reach here, the target vector store wasn't found
        raise ValueError("Vector store not found!")

    except Exception as e:
        print(f"Error accessing vector store: {e}")
        return None







In [360]:
# Define the function to send variable names to the assistant
async def analyze_variables(client, assistant, modules, instructions):
    """
    Asynchronously analyzes variable names by sending them to the assistant for matching with HEAL CDE variables.
    """
    results = {}

    for module_name, variable_names in modules.items():
        module_results = []
        for variable_name in variable_names:
            print(f"Processing variable '{variable_name}' in module '{module_name}'")

            prompt = f"""
            Module Name: {module_name}
            Variable Name: {variable_name}

            {instructions}
            """

            try:
                # Step 1: Create a thread
                thread = await client.beta.threads.create(
                    messages=[{"role": "user", "content": prompt}]
                )
                print(f"Thread created: {thread.id}")

                # Step 2: Poll the run until completion
                await client.beta.threads.runs.create_and_poll(
                    thread_id=thread.id,
                    assistant_id=assistant.id
                )
                print(f"Run completed for thread: {thread.id}")

                # Step 3: Retrieve the updated thread to access messages
                updated_thread = await client.beta.threads.retrieve(thread.id)
                if updated_thread.messages:
                    # Get the last assistant message
                    content = updated_thread.messages[-1].content.strip()
                else:
                    content = "No response received"

                module_results.append({
                    "variable_name": variable_name,
                    "response": content
                })

            except Exception as e:
                print(f"Error processing variable '{variable_name}' in module '{module_name}': {e}")
                module_results.append({
                    "variable_name": variable_name,
                    "response": f"Error: {e}"
                })

        results[module_name] = module_results

    return results

In [None]:
async def main():
    try:
        # Load the JSON input file
        json_input_file = config['Files']['json_input_file']
        with open(json_input_file, 'r') as file:
            data = json.load(file)
        print("JSON input file loaded successfully.")

        # Organize variables by module
        modules = {module_name: [variable['name'] for variable in variables] for module_name, variables in data.items()}

        # Define instructions
        instructions = config.get("Instructions", "variable_matching", fallback=None)
        if not instructions:
            raise ValueError("Instructions not found in config.ini.")

        # Setup vector store (awaiting setup)
        vector_store_id = await setup_vector_store()  # This is fine
        if not vector_store_id:
            print("Failed to set up vector store. Exiting...")
            return

        # Analyze variables asynchronously
        results = await analyze_variables(client, assistant, modules, instructions)

        # Save the results to a JSON file
        json_output_file = config['Files']['json_output_file']
        with open(json_output_file, 'w') as file:
            json.dump(results, file, indent=4)
        print(f"Results saved to {json_output_file}")

    except Exception as e:
        print(f"Error in main process: {e}")

# Run the async main function
import asyncio
asyncio.run(main())


In [None]:
# Read and parse the JSON file
with open(json_input_file, 'r') as file:
    data = json.load(file)

# Organize the variable names and "Matched HEAL Core CRF" by module
modules = {}
for module_name, variables in data.items():
    modules[module_name] = [
        {
            'name': variable['name'],
            'matched_crf': variable.get('Matched HEAL Core CRF', 'Not Available')  # Default to 'Not Available' if key doesn't exist
        }
        for variable in variables
    ]

# Display the loaded modules with variable names and matched CRF for debugging
for module, variable_details in modules.items():
    print(f"Module: {module}")
    for detail in variable_details:
        print(f"  Variable: {detail['name']}, Matched HEAL Core CRF: {detail['matched_crf']}")

In [None]:
# Step 2: Load the JSON output from a file
with open(json_output_file, 'r') as file:  # Replace with your actual file name
    assistant_responses = json.load(file)

# Debugging: Print the structure of the loaded data to understand the format
print(assistant_responses)


In [None]:
# Step 3: Extract relevant data from the JSON
data = []

for module_name, variables in assistant_responses.items():
    for variable_entry in variables:
        variable_name = variable_entry.get('variable_name', 'Unknown')
        response = variable_entry.get('response', '')

        # Use regex to extract fields from the response
        match = re.search(r'"standards_mapping_type":\s*"([^"]+)"', response)
        standards_mapping_type = match.group(1) if match else 'No Match'

        match = re.search(r'"heal_cde_name":\s*"([^"]+)"', response)
        heal_cde_name = match.group(1) if match else 'N/A'

        match = re.search(r'"variable_cde_name":\s*"([^"]+)"', response)
        variable_cde_name = match.group(1) if match else 'N/A'

        # Append data to the list
        data.append({
            'Module Name': module_name,
            'Variable Name': variable_name,
            'Standards Mapping Type': standards_mapping_type,
            'HEAL CDE Name': heal_cde_name,
            'Variable CDE Name': variable_cde_name
        })

# Debugging: Print the first few rows of extracted data
print(data[:5])


In [None]:
# Step 4: Convert extracted data into a Pandas DataFrame
df = pd.DataFrame(data)

# Debugging: Display the DataFrame
print(df.head())



In [None]:
# Step 5: Export the DataFrame to an Excel file
output_file = 'SAMPLE_DataDictionary_ForTesting_varmatch.xlsx'  # Name your file as you like
df.to_excel(output_file, index=False)

print(f"Data successfully exported to {output_file}")
