In [8]:
import openai
import asyncio
import os
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
import configparser  # For reading configuration files
import json
import re

import nest_asyncio
nest_asyncio.apply()

In [None]:
# Load configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# Debugging: show which sections and option names were found.
# Only option *names* are printed here -- never their values -- so that
# credentials cannot end up in the saved notebook output.
print(f"Config sections: {config.sections()}")  # Should include 'OpenAI'
if 'OpenAI' in config:
    print(f"Keys in OpenAI section: {list(config['OpenAI'])}")  # Should include 'api_key'

# Retrieve the API key (None when the section or the option is missing)
openai_api_key = config.get('OpenAI', 'api_key', fallback=None)
# Report only whether a key was found; the key itself is a secret and must
# not be echoed into the cell output.
print(f"API key loaded: {'yes' if openai_api_key else 'no'}")

# Raise an error if the API key is missing
if not openai_api_key:
    raise ValueError("API key not found. Please set it in the 'config.ini' file under the [OpenAI] section.")

# Use the API key with OpenAI.
# This is the asynchronous client: its API methods return coroutines that
# have to be awaited by the cells that use `client`.
client = AsyncOpenAI(api_key=openai_api_key)
print("Client initialized successfully.")

In [10]:
# Pull the file locations, the prompt text and the OpenAI resource IDs out of
# the parsed config. Each lookup raises KeyError when its section or option is
# absent from config.ini.
_files_section = config['Files']
json_input_file = _files_section['json_input_file']
json_output_file = _files_section['json_output_file']

_openai_section = config['OpenAI']
akshay_assistant_id = _openai_section['akshay_assistant_id']
variable_matching = config['Instructions']['variable_matching']
akshay_vs_id = _openai_section['akshay_vs_id']
akshay_kb_file_id = _openai_section['akshay_kb_file_id']
akshay_kbdo_file_id = _openai_section['akshay_kbdo_file_id']

In [None]:
# Read and parse the JSON file.
# The encoding is given explicitly: without it open() falls back to the
# locale's codec, which mis-decodes non-ASCII characters in a UTF-8 JSON file.
with open(json_input_file, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Organize the variable names by module: each top-level key of the JSON is a
# module name whose value is a list of objects carrying a 'name' entry.
modules = {
    module_name: [variable['name'] for variable in variables]
    for module_name, variables in data.items()
}

# Display the loaded modules and variable names for debugging
for module, variable_names in modules.items():
    print(f"Module: {module}, Variables: {variable_names}")

In [None]:
#Define the assistant
instructions = config.get("Instructions", "variable_matching", fallback=None)
if not instructions:
    raise ValueError("Instructions not found in config.ini.")

assistant = client.beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions=instructions,
    name="CDE ID Python",
    tools=[{"type": "file_search"}]
)


In [15]:
#Store reference files. These will be permanently accessible to the assistant.
vector_store = client.beta.vector_stores.create(name="CDE Files")
file_paths = [
    r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_JSON.json"
]
file_streams = [open(path, "rb") for path in file_paths]
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=akshay_vs_id, files=file_streams
)

In [16]:
# Define the async function to send variable names to the assistant
async def analyze_variables(client, assistant, modules):
    """
    Sends each variable name to the chat model and collects the raw replies per module.

    One chat-completion request is issued per variable, one after another.
    The prompt combines the module name, the variable name and the
    module-level `instructions` string, which must be defined before this
    function is called.

    Args:
        client: Client whose `chat.completions.create(...)` call is awaited.
        assistant: Accepted for interface compatibility; not used here because
            requests go through the chat-completions endpoint.
        modules: A dictionary where keys are module names and values are lists of variable names.

    Returns:
        A dictionary mapping each module name to a list of
        {"variable_name": ..., "response": ...} dictionaries, in input order.
        A module with no variables maps to an empty list.
    """
    results = {}

    for module_name, variable_names in modules.items():
        module_results = []
        for variable_name in variable_names:
            # Construct a prompt for each variable
            prompt = f"""
            Module Name: {module_name}
            Variable Name: {variable_name}

            {instructions}
            """
            # Call the model
            response = await client.chat.completions.create(
                model="gpt-4o-mini-2024-07-18",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.5
            )

            # The message content may be None; treat that as an empty reply
            # instead of raising and losing every result collected so far.
            content = (response.choices[0].message.content or "").strip()
            module_results.append({
                "variable_name": variable_name,
                "response": content
            })

        # Store results for the module
        results[module_name] = module_results

    return results

In [None]:
async def main():
    # Load the JSON input file
    json_input_file = config['Files']['json_input_file']
    with open(json_input_file, 'r') as file:
        data = json.load(file)

    # Organize variables by module
    modules = {module_name: [variable['name'] for variable in variables] for module_name, variables in data.items()}

    # Analyze variables with the assistant
    results = await analyze_variables(client, assistant, modules)

    # Save the results to a JSON file
    json_output_file = config['Files']['json_output_file']
    with open(json_output_file, 'w') as file:
        json.dump(results, file, indent=4)

    print(f"Results saved to {json_output_file}")

# Run the async function
await main()


In [None]:
# akshay original
# Creates a second assistant (same display name as the one defined earlier)
# and rebinds `assistant` to it. A synchronous client is used here, so the
# call returns the created assistant directly.
# The client is given the key loaded from config.ini so that this assistant is
# created with the same credentials as every other resource in the notebook,
# instead of depending on an environment variable being set.
assistant = openai.Client(api_key=openai_api_key).beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions="""Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

                Response Format:
                For each entry in the module, provide the following in a nested JSON format:
                {
                  "module_name": "<module name>",
                  "entries": [
                    {
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "cde_name": "<CDE name>",
                      "variable_cde_name": "<variable name>"
                    },
                    ...
                  ]
                }

                Guidelines:
                1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify Standards Mapping Label: The CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CDE name'.
                4. Specify Variable Name: Provide the matched variable name derived from the ALL_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "cde_name": "Birth date",
                      "variable_cde_name":"BRTHDTC"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "cde_name": "Age",
                      "variable_cde_name":"Age"
                    }
                  ]
                }""",
    name="CDE ID Python",
    tools=[{"type": "file_search"}]
)

In [None]:
#Store reference files. These will be permanently accessible to the assistant.
vector_store = client.beta.vector_stores.create(name="CDE Files")
file_paths = [r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_JSON.json", r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\desiredoutcome.json"]
file_streams = [open(path, "rb") for path in file_paths]
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store.id, files=file_streams
)

In [None]:
#Update the assistant to intake reference files.
assistant = client.beta.assistants.update(
  assistant_id=assistant.id,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

In [None]:
#Files that are going to be used in the message. See "attachments" in the block below.
sample_output_file = client.files.create(
    file=open(r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\out\HDP00125_DataDictionary_2023-08-22_2024-12-09_varmatch.json", "rb"),
    purpose='assistants'
)

In [None]:
#Create message thread
thread = client.beta.threads.create(
  messages=[
    {
      "role": "user",
      "content": """Adjust the sample output provided according to the instructions to produce a file that looks like the desired outcome file.
      The sample output is a json file that contains modules with information that you need to parse through.
      I want you to determine whether each entry in the module matches with a CDE. The CDE list is found in the All_HEALPAINCDEsDD_JSON.
      You should output text that is in the same format as the desired outcome, which is another json file. Here are the instructions:
      Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

        Response Format:
        For each entry in the module, provide the following in a nested JSON format:
        {
          "module_name": "<module name>",
          "entries": [
            {
              "name": "<entry name>",
              "standards_mapping_type": "<CDE mapping type>",
              "cde_name": "<CDE name>",
              "variable_cde_name":"<variable name>"
            },
            ...
          ]
        }

        Guidelines:
                1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify Standards Mapping Label: The CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CDE name'.
                4. Specify Variable Name: Provide the matched variable name derived from the ALL_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "cde_name": "Birth date",
                      "variable_cde_name":"BRTHDTC"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "cde_name": "Age",
                      "variable_cde_name":"Age"
                    }
                  ]
                }""",
      # Attach the new file to the message.
      "attachments": [
        { "file_id": sample_output_file.id, "tools": [{"type": "file_search"}] }
      ],
    }
  ]
)

In [None]:
#Execute message thread. DO NOT CHANGE (for now)
# From here on `client` is rebound from the AsyncOpenAI instance created at the
# top of the notebook to a synchronous OpenAI client; the event handler and
# the streaming run below use this synchronous client.
# NOTE(review): OpenAI() is constructed without the api_key read from
# config.ini -- presumably it picks up credentials from the environment;
# confirm this is the same account that owns the thread and assistant.
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
 
client = OpenAI()
 
class EventHandler(AssistantEventHandler):
    """Prints streamed assistant output and, once a message is done, its file citations.

    Uses the module-level `client` to look up the names of cited files.
    """

    @override
    def on_text_created(self, text) -> None:
        """Print the "assistant > " prefix; the `text` argument itself is not used."""
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        """Print the type of the tool call that was just created."""
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        """Print the message text with annotations replaced by [n] markers, then the cited file names.

        Only the first content part of the message is read.
        NOTE(review): assumes that first part is a text part -- confirm.
        """
        # print a citation to the file searched
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            # Replace the annotation's text in the message (in place) with a
            # numbered marker that matches the citation list printed below.
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            # Only annotations carrying a file_citation produce a citation
            # line; the file name is fetched through the module-level client.
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))


# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.
# The run executes `assistant` on `thread`; both must already hold the created
# objects, since their `.id` attributes are read here.

with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    # NOTE(review): run-level instructions are passed here in addition to the
    # ones set when the assistant was created -- confirm how the API combines them.
    instructions="Please parse through the whole sample output file.",
    event_handler=EventHandler(),
) as stream:
    # Consume the stream to its end; output is printed by the EventHandler callbacks.
    stream.until_done()