In [1]:
import asyncio
import json
import os
from contextlib import ExitStack

import openai
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Let python-dotenv populate the process environment before the key is read.
load_dotenv()

# Fail fast with a clear message when the key is missing or empty, rather than
# letting the first API call fail later.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")

# Client used by the following cells for assistant, vector store and file calls.
client = openai.Client(api_key=api_key)

#Define the assistant
# Reuse the `client` built above with the validated API key instead of
# constructing a second, implicitly configured openai.Client(); this keeps every
# call in the notebook on the same credentials and connection settings.
# The instructions describe the JSON response format, the mapping guidelines and
# an example output; file_search is enabled so reference files can be attached.
assistant = client.beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions="""Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

                Response Format:
                For each entry in the module, provide the following in a nested JSON format:
                {
                  "module_name": "<module name>",
                  "entries": [
                    {
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "standards_mapping_label": "<CRF name>",
                      "confidence_level": "<confidence level>"
                    },
                    ...
                  ]
                }

                Guidelines:
                1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify Standards Mapping Label: The CRF name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
                4. Specify Confidence Level:
                - High confidence level: When the match is clear and direct.
                - Medium confidence level: When the match is reasonable but may require further verification.
                - Low confidence level: When the match is uncertain or ambiguous.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "standards_mapping_label": "Demographics",
                      "confidence_level": "High"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "standards_mapping_label": "Demographics",
                      "confidence_level": "High"
                    }
                  ]
                }""",
    name="CDE ID Python",
    tools=[{"type": "file_search"}]
)

In [7]:
#Store reference files. These will be permanently accessible to the assistant.
vector_store = client.beta.vector_stores.create(name="CDE Files")
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable data directory so the notebook runs elsewhere.
file_paths = [r"C:\Users\asman\Desktop\HEAL\All_HEALPAINCDEsDD_JSON.json", r"C:\Users\asman\Desktop\HEAL\desiredoutcome.json"]
# ExitStack closes every handle that was opened, even if a later open() or the
# upload raises; the original list comprehension left the files open.
with ExitStack() as open_files:
    file_streams = [open_files.enter_context(open(path, "rb")) for path in file_paths]
    # upload_and_poll is awaited inside the `with` block so the streams are
    # still open while the SDK reads them.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

In [3]:
#Update the assistant to intake reference files.
# Point the assistant's file_search tool at the vector store created earlier.
file_search_resources = {
    "file_search": {"vector_store_ids": [vector_store.id]},
}
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=file_search_resources,
)

In [4]:
#Files that are going to be used in the message. See "attachments" in the block below.
# Open the file in a context manager so the handle is closed as soon as the
# upload call returns; the original inline open() was never closed.
with open(r'C:\Users\asman\Desktop\HEAL\SAMPLE_DataDictionary_output.json', "rb") as sample_output_stream:
    sample_output_file = client.files.create(
        file=sample_output_stream,
        purpose='assistants'
    )

In [5]:
#Create message thread
# Text of the single user message: the task description followed by the
# response format, the mapping guidelines and an example output.
user_prompt = """Adjust the sample output provided according to the instructions to produce a file that looks like the desired outcome file.
      The sample output is a json file that contains modules with information that you need to parse through.
      I want you to determine whether each entry in the module matches with a CDE. The CDE list is found in the All_HEALPAINCDEsDD_JSON.
      You should output text that is in the same format as the desired outcome, which is another json file. Here are the instructions:
      Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

        Response Format:
        For each entry in the module, provide the following in a nested JSON format:
        {
          "module_name": "<module name>",
          "entries": [
            {
              "name": "<entry name>",
              "standards_mapping_type": "<CDE mapping type>",
              "standards_mapping_label": "<CRF name>",
              "confidence_level": "<confidence level>"
            },
            ...
          ]
        }

        Guidelines:
        1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
        2. Determine Standards Mapping Type:
        - HEAL CDE Match: When the entry matches a CDE directly.
        - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
        - No CDE match: When the entry does not correspond to any CDE.
        3. Specify Standards Mapping Label: The CRF name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
        4. Specify Confidence Level:
        - High confidence level: When the match is clear and direct.
        - Medium confidence level: When the match is reasonable but may require further verification.
        - Low confidence level: When the match is uncertain or ambiguous.

        Example Output:
        {
          "module_name": "baseline_assessment",
          "entries": [
            {
              "name": "birth",
              "standards_mapping_type": "HEAL CDE Match",
              "standards_mapping_label": "Demographics",
              "confidence_level": "High"
            },
            {
              "name": "age_at_enrollment",
              "standards_mapping_type": "HEAL CDE Match",
              "standards_mapping_label": "Demographics",
              "confidence_level": "High"
            }
          ]
        }
      
      """

# Attach the uploaded sample file to the message and expose it to file_search.
sample_attachment = {"file_id": sample_output_file.id, "tools": [{"type": "file_search"}]}

# The thread starts with exactly one user message carrying the prompt and file.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": user_prompt,
            "attachments": [sample_attachment],
        }
    ]
)

In [6]:
#Execute message thread. DO NOT CHANGE (for now)
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
 
# NOTE(review): this rebinds the module-level `client` to a new OpenAI()
# instance created without an explicit api_key argument, replacing the client
# built earlier with the validated key. The code in this cell that references
# `client` (file retrieval for citations, the streamed run) uses this instance.
# Presumably the SDK falls back to the OPENAI_API_KEY environment variable here
# — confirm, or reuse the earlier client once this cell may be changed.
client = OpenAI()
 
class EventHandler(AssistantEventHandler):
    """Streaming callbacks that echo an assistant run to standard output.

    Prints a prompt prefix when text starts, the type of each tool call, and,
    once a message is complete, its text with annotations replaced by numbered
    markers followed by the list of cited file names.
    """

    @override
    def on_text_created(self, text) -> None:
        """Print the assistant prefix (no trailing newline) when text begins."""
        print("\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        """Print the type of the tool call that was just created."""
        print("\nassistant > {}\n".format(tool_call.type), flush=True)

    @override
    def on_message_done(self, message) -> None:
        """Print the finished message text and one line per file citation.

        Each annotation's text inside the message is replaced, in place, by
        its ``[index]`` marker; annotations carrying a ``file_citation`` also
        produce a ``[index] <filename>`` footnote, looked up via the
        module-level ``client``.
        """
        # Only the first content part of the message is inspected.
        text_block = message.content[0].text
        footnotes = []
        for number, note in enumerate(text_block.annotations):
            marker = f"[{number}]"
            text_block.value = text_block.value.replace(note.text, marker)

            citation = getattr(note, "file_citation", None)
            if not citation:
                # Annotation without a file citation: marker only, no footnote.
                continue
            source_file = client.files.retrieve(citation.file_id)
            footnotes.append(f"{marker} {source_file.filename}")

        print(text_block.value)
        print("\n".join(footnotes))


# Create the run through the SDK's streaming helper; the EventHandler instance
# receives the events and prints the response as it arrives.
run_options = {
    "thread_id": thread.id,
    "assistant_id": assistant.id,
    "instructions": "Please parse through the whole sample output file.",
    "event_handler": EventHandler(),
}

with client.beta.threads.runs.stream(**run_options) as stream:
    # Block until the streamed run has finished.
    stream.until_done()


assistant > file_search


assistant > Based on the provided sample output and the CDE list from the All_HEALPAINCDEsDD_JSON, I have structured the information into the desired JSON format as instructed. Below is the adjusted output:

```json
[
    {
        "module_name": "baseline_assessment",
        "entries": [
            {
                "name": "Date of Birth",
                "standards_mapping_type": "HEAL CDE Match",
                "standards_mapping_label": "Birth date",
                "confidence_level": "High"
            },
            {
                "name": "Age at Enrollment",
                "standards_mapping_type": "HEAL CDE Match",
                "standards_mapping_label": "Age",
                "confidence_level": "Medium"
            },
            {
                "name": "Gender",
                "standards_mapping_type": "HEAL CDE Match",
                "standards_mapping_label": "Gender identification type",
                "confidence_level": "High"