In [8]:
import openai
import json
import os
import asyncio
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load settings from a local .env file (if present) so OPENAI_API_KEY can be
# looked up in the environment below.
load_dotenv()

api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail early with a clear message instead of at the first API call.
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")

# Synchronous OpenAI client used by the following cells.
client = openai.Client(api_key=api_key)

# Define the assistant.
# Created through the shared `client` (built from the validated API key above)
# rather than a second, implicitly configured `openai.Client()`.
# The `instructions` text is the assistant's standing prompt; the file_search
# tool is enabled so reference files can be attached to it later.
assistant = client.beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions="""Objective: Process the given modules and their variables, and produce JSON output as per the specified format. Ensure that all modules and their entries are included in the output. Analyze the contents of each name nested under its respective module in the provided JSON file. The goal is to determine whether each name, which represents a variable, matches a HEAL CDE variable from a pre-defined list of HEAL CDE variables stored in the HEAL CDE JSON file.

                Response Format:
                For each variable entry in the module, provide the following in a nested JSON format:
                {
                  "module_name": "<module name>",
                  "entries": [
                    {
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "heal_cde_name": "<CDE name>",
                      "variable_cde_name": "<Variable name>"
                    },
                    ...
                  ]
                }

                Guidelines:
                1. Match Variable Names to HEAL CDE Variables: compare each variable name in the provided JSON file against the HEAL CDE variable list from the HEAL CDE JSON file. Determine if the variable matches a HEAL CDE variable.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify the HEAL CDE name: Provide the CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
                4. Specify Variable CDE name: Provide the variable name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "heal_cde_name": "Birth date",
                      "variable_cde_name": "BRTHDTC"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "heal_cde_name": "Age",
                      "variable_cde_name": "Age"
                    }
                  ]
                }""",
    name="CDE ID Python",
    tools=[{"type": "file_search"}]
)

In [9]:
#Store reference files in a vector store that the assistant's file_search tool can use.
vector_store = client.beta.vector_stores.create(name="CDE Files")
file_paths = [
    r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_JSON.json"
]

# Open each reference file, upload the whole batch, and always close the
# handles afterwards -- including when a later open() or the upload itself
# fails. Leaving them open keeps the files locked for the life of the kernel.
file_streams = []
try:
    for path in file_paths:
        file_streams.append(open(path, "rb"))
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )
finally:
    for stream in file_streams:
        stream.close()

In [10]:
# Point the assistant's file_search tool at the vector store created above so
# the uploaded reference files are searchable during runs.
file_search_resources = {"vector_store_ids": [vector_store.id]}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources={"file_search": file_search_resources},
)

In [4]:
# Load the input JSON file.
# The encoding is given explicitly: without it open() falls back to the
# platform locale (e.g. cp1252 on Windows), which can mis-decode UTF-8 JSON.
with open(r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\HDP00125_DataDictionary_2023-08-22_2024-12-09_enhanced_2modsonly.json", "r", encoding="utf-8") as file:
    input_data = json.load(file)

# Print all modules and their variables
print("Modules and Variables in Input JSON:")

for module_name, variable_list in input_data.items():  # Top-level keys are module names
    print(f"Module: {module_name}")

    # Check if the module contains a list of variables (each one a dict)
    if isinstance(variable_list, list) and all(isinstance(v, dict) for v in variable_list):
        # Extract variable names; str() keeps join() from failing when a
        # "name" value is not a string (e.g. null or a number in the JSON).
        variable_names = [str(variable.get("name", "Unnamed Variable")) for variable in variable_list]
        print(f"  Variables: {', '.join(variable_names)}")
    else:
        print(f"  Warning: Variables for module {module_name} are not in the expected list format.")


Modules and Variables in Input JSON:
Module: tobacco_alcohol_prescription_medication_substance
  Variables: tapstobaccoproductscl, tapsalcoholusemalescl, tapsalcoholusefemalescl, tapsdrugusescl, tapsprescriptionmedusescl
Module: sleep_duration
  Variables: sleepnighthourdur, sleepnightmindur


In [11]:
#Files that are going to be used in the message. See "attachments" in the block below.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of staying open for the life of the kernel.
with open(r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\in\HDP00125_DataDictionary_2023-08-22_2024-12-09_enhanced_2modsonly.json", "rb") as input_file:
    sample_output_file = client.files.create(
        file=input_file,
        purpose='assistants'
    )

In [12]:
# Create the message thread.
# The user prompt, the file attachment and the message dict are built as
# separate pieces and then combined into a thread seeded with that one message.
user_prompt = """Adjust the sample output provided according to the instructions to produce a file that looks like the response format in JSON structure.
      The provided json file contains modules and variables that you need to parse through.
      Please process THE ENTIRE FILE and produce the final JSON output as per the specified format. Ensure that all modules and their entries are included in the output.
      I want you to determine whether each variable entry in the module matches with a CDE. The CDE list is found in the All_HEALPAINCDEsDD_JSON.
      You should output text that is in the same format as the example response format, provided below. Here are the instructions:
      Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

        Response Format:
                For each variable entry in the module, provide the following in a nested JSON format:
                {
                  "module_name": "<module name>",
                  "entries": [
                    {
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "heal_cde_name": "<CDE name>",
                      "variable_cde_name": "<Variable name>"
                    },
                    ...
                  ]
                }

                Guidelines:
                1. Match Variable Names to HEAL CDE Variables: compare each variable name in the provided JSON file against the HEAL CDE variable list from the HEAL CDE JSON file. Determine if the variable matches a HEAL CDE variable.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify the HEAL CDE name: Provide the CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
                4. Specify Variable CDE name: Provide the variable name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "heal_cde_name": "Birth date",
                      "variable_cde_name": "BRTHDTC"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "heal_cde_name": "Age",
                      "variable_cde_name": "Age"
                    }
                  ]
                }"""

# The file uploaded in the previous cell is attached for the file_search tool.
input_attachment = {"file_id": sample_output_file.id, "tools": [{"type": "file_search"}]}

user_message = {
    "role": "user",
    "content": user_prompt,
    "attachments": [input_attachment],
}

thread = client.beta.threads.create(messages=[user_message])

In [13]:
#Execute message thread. DO NOT CHANGE (for now)
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
 
# Rebuild the client with the key validated in the first cell, so this cell
# uses the same explicit credentials as the earlier `client`.
client = OpenAI(api_key=api_key)
 
class EventHandler(AssistantEventHandler):
    """Stream handler that echoes assistant activity to stdout.

    Prints a marker when the assistant starts producing text or calls a tool,
    and, once a message is complete, prints its text with each annotation
    replaced by a numbered marker followed by the cited file names.
    """

    @override
    def on_text_created(self, text) -> None:
        """Print the prefix that marks the start of assistant text output."""
        print("\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        """Print the type of the tool call the assistant has just started."""
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        """Print the finished message with numbered file citations.

        Messages that have no content, or whose first content block carries no
        text (no ``text`` attribute), are skipped instead of raising.
        """
        if not message.content:
            return
        message_content = getattr(message.content[0], "text", None)
        if message_content is None:
            return

        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            # Swap the raw annotation text for a compact "[n]" marker.
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            # Only file citations are resolved to a file name; this costs one
            # files.retrieve call per citation.
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))


# Use the stream SDK helper with the EventHandler class to create the Run and
# stream the response.
#
# The per-run text is passed as `additional_instructions` so it is appended to
# the assistant's own instructions. Passing it as `instructions` overrides the
# assistant's instructions for the run, which would discard the response
# format and guidelines defined when the assistant was created.
with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    additional_instructions="Please parse through the whole sample output file.",
    event_handler=EventHandler(),
) as stream:
    # Block until the run has finished streaming.
    stream.until_done()


assistant > file_search


assistant > To transform the provided data into the required JSON format according to the instructions, we will proceed with the following steps:

1. Extract the necessary details from both the data dictionary JSON file and the CDE JSON file.
2. Compare each variable name from the data dictionary with the variable names from the CDE list to determine the standards mapping type (HEAL CDE Match, Potential HEAL CDE Match, or No CDE match).
3. For matched entries, retrieve the corresponding CDE name and variable name.
4. Format the output into the specified JSON structure.

Below is the structured JSON output based on the given instructions:

```json
{
  "modules": [
    {
      "module_name": "tobacco_alcohol_prescription_medication_substance",
      "entries": [
        {
          "name": "tapstobaccoproductscl",
          "standards_mapping_type": "HEAL CDE Match",
          "heal_cde_name": "Tobacco Alcohol Prescription medications and other Substance (TAPS)