In [21]:
import openai
import json
import os
import asyncio

from dotenv import load_dotenv
from openai import AsyncOpenAI

In [22]:
# Load settings from a local .env file (if any) before reading the key.
load_dotenv()

api_key = os.environ.get('OPENAI_API_KEY')

# Fail fast when the key is missing or empty; otherwise build the shared client.
if api_key:
    client = openai.Client(api_key=api_key)
else:
    raise ValueError('API key not found. Please set the OPENAI_API_KEY environment variable.')

In [23]:
# Root folder of the project on the local machine (kept as a plain string
# because later cells concatenate these values into other strings).
local_dir = r"C:/Users/lmaefos/Code Stuffs/CDE_detective/"

# Reference files that get uploaded to the assistant's vector store.
CDE_list_file = f'{local_dir}CDE_ID_detective_revamp/KnowledgeBase/All_HEALPAINCDEsDD_JSON.json'
desired_outcome_file = f'{local_dir}CDE_ID_detective_revamp/desiredoutcome.json'

In [24]:
# file(s) to be used in message

# Upload the sample data dictionary. The local file is opened in a `with`
# block so its handle is closed as soon as the upload call returns (the
# original left the handle open for the lifetime of the kernel).
with open(local_dir + 'SAMPLE_DataDictionary_ForTesting.json', 'rb') as sample_input_stream:
    sample_input_file = client.files.create(
        file = sample_input_stream,
        purpose = 'assistants'
    )

In [25]:
# store reference files to be permanently accessible to assistant
vector_store = client.beta.vector_stores.create(name='CDE Files')

file_paths = [CDE_list_file, desired_outcome_file]

# Open the reference files one by one and guarantee that every handle that
# was opened gets closed again, whether the upload succeeds, the upload
# fails, or a later file cannot be opened.
file_streams = []
try:
    for path in file_paths:
        file_streams.append(open(path, 'rb'))

    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        files = file_streams,
        vector_store_id = vector_store.id
    )
finally:
    for stream in file_streams:
        stream.close()

In [26]:
# OpenAI parameters

# instructions guide the personality of assistant and define its goals
instructions = (
    'You are expert at producing a structured JSON format '
    'that aligns with the HEAL Core Common Data Elements (CDE) categorization. '
    'You will get details how to respond in messages that are passed in.'
)

# tools
#  - code_interpreter allows assistant to write/run Python code
#  - file_search extracts appropriate data from input files and augments model responses

# tool_resources give tools access to files uploaded using file upload endpoint

# Refer to the reference files by base name only. The absolute local paths
# mean nothing to the assistant and would leak the local directory layout
# into the prompt; the uploaded copies are presumably identified by file
# name (the event handler prints `cited_file.filename`) -- TODO confirm.
desired_outcome_name = os.path.basename(desired_outcome_file)
cde_list_name = os.path.basename(CDE_list_file)

# messages defines inputs for assistant to process
# The message is assembled from four sections separated by newlines.
overview_section = (
    'Here is an overview. '
    'Adjust the sample output provided according to the specific instructions below, '
    'to produce a file that looks like the desired output. '
    'The desired output is a .json file '
    'that contains modules with information that you need to parse through. '
    f'Its format is found in the file called {desired_outcome_name} that is passed in. '
    'You are to determine whether or not each entry in the module matches with a CDE. '
    f'The CDE list is found in the file called {cde_list_name} that is also passed in.'
)

task_section = (
    'Here are specific instructions. '
    'Process each of the raw responses provided to produce a structured JSON format '
    'that aligns with the HEAL Core CDE categorization.'
)

guidelines_section = (
    'Here are some guidelines. '
    '1. Parsing raw responses: '
    'Extract and organize information from the raw response provided. '
    '2. Determine standards mapping type: '
    '(i) HEAL CDE match--when the entry matches a CDE directly. '
    '(ii) Potential HEAL CDE match--when the entry partially matches a CDE or aligns with its context. '
    '(iii) No CDE match--when the entry does not correspond to any CDE. '
    '3. Specify standards mapping label: '
    'The CRF name derived from the CDE list or indicate "No CRF match". '
    '4. Specify Confidence Level: '
    '(i) High confidence level--when the match is clear and direct. '
    '(ii) Medium confidence level--when the match is reasonable but may require further verification. '
    '(iii) Low confidence level--when the match is uncertain or ambiguous.'
)

closing_section = (
    'Finally, instead of preparing a .json file for download, please print out the contents.'
)

message_content = '\n'.join(
    [overview_section, task_section, guidelines_section, closing_section]
)

In [27]:
# create assistant and thread

# Tools the assistant may use, and the vector store backing file_search.
assistant_tools = [{'type': 'code_interpreter'}, {'type': 'file_search'}]
assistant_tool_resources = {'file_search': {'vector_store_ids': [vector_store.id]}}

assistant = client.beta.assistants.create(
    name = 'CDE ID Python',
    model = "gpt-4o-mini-2024-07-18",
    instructions = instructions,
    tools = assistant_tools,
    tool_resources = assistant_tool_resources
)

# Single user message carrying the prompt plus the sample input file,
# which is attached for use by the file_search tool.
sample_attachment = {'file_id': sample_input_file.id, 'tools': [{'type': 'file_search'}]}
user_message = {
    'role': 'user',
    'content': message_content,
    'attachments': [sample_attachment]
}

thread = client.beta.threads.create(messages = [user_message])

In [28]:
#Execute message thread. DO NOT CHANGE (for now)
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
 
# NOTE(review): this rebinds `client`, replacing the instance that was built
# earlier with an explicit api_key. With no arguments the SDK presumably
# falls back to the OPENAI_API_KEY environment variable -- confirm before
# removing either construction.
client = OpenAI()
 
class EventHandler(AssistantEventHandler):
    """Print streamed assistant activity to stdout.

    Announces when the assistant starts producing text or invokes a tool,
    and, once a message is complete, prints its text with each annotation
    replaced by a numbered marker followed by the list of cited file names.
    Relies on the module-level `client` to look up cited files.
    """

    @override
    def on_text_created(self, text) -> None:
        """Print the prompt prefix when the assistant begins a text block."""
        print("\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        """Print which tool type the assistant has just started calling."""
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        """Print each text part of the finished message with its citations.

        Parts without a `text` attribute (for example non-text output) are
        skipped instead of raising, and an empty message prints nothing.
        The message object itself is left unmodified; the marker
        substitution is done on a local copy of the text.
        """
        for part in message.content:
            text_part = getattr(part, "text", None)
            if text_part is None:
                # Not a text part -- there is nothing printable here.
                continue

            rendered = text_part.value
            citations = []
            for index, annotation in enumerate(text_part.annotations):
                # Swap the raw annotation text for a compact "[n]" marker.
                rendered = rendered.replace(annotation.text, f"[{index}]")
                if file_citation := getattr(annotation, "file_citation", None):
                    cited_file = client.files.retrieve(file_citation.file_id)
                    citations.append(f"[{index}] {cited_file.filename}")

            print(rendered)
            print("\n".join(citations))

In [29]:
# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.

# The per-run text is passed as `additional_instructions` so it is appended
# to the instructions the assistant was created with. Passing it as
# `instructions` would replace the assistant's instructions for this run.
with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    additional_instructions="Please parse through the whole sample output file.",
    event_handler=EventHandler(),
) as stream:
    # Block until the run has finished streaming.
    stream.until_done()


assistant > file_search


assistant > To adjust the sample output according to the specific requirements, I will need to process the following steps:

1. Parse the raw responses to extract relevant information.
2. Determine the standards mapping type based on matches with the CDE list.
3. Specify the mapping label and confidence levels.

### Sample Parsed Output

```json
[
    {
        "module name": "baseline_assessment",
        "name": "birth",
        "standards_mapping_type": "HEAL CDE Match",
        "heal_cde_name": "Birth date",
        "variable_cde_name": "BRTHDTC",
        "confidence_level": "High"
    },
    {
        "module name": "opioid_risk_tool",
        "name": "depression",
        "standards_mapping_type": "Potential HEAL CDE Match",
        "heal_cde_name": "Depression",
        "variable_cde_name": "depression",
        "confidence_level": "Medium"
    },
    {
        "module name": "opioid_risk_tool",
        "name": "illegal_drugs",
        "standards_mappi

In [None]:
# --> older code

In [None]:
# Define the assistant (older variant kept for reference).
# Unlike the current cells, this call builds a fresh `openai.Client()` instead
# of reusing `client`, embeds the full task description in the assistant's
# instructions, and enables only the file_search tool.
assistant = openai.Client().beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    # The prompt below is sent verbatim, including its leading indentation.
    instructions="""Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

                Response Format:
                For each entry in the module, provide the following in a nested JSON format:
                {
                  "module_name": "<module name>",
                  "entries": [
                    {
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "standards_mapping_label": "<CRF name>",
                      "confidence_level": "<confidence level>"
                    },
                    ...
                  ]
                }

                Guidelines:
                1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify Standards Mapping Label: The CRF name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
                4. Specify Confidence Level:
                - High confidence level: When the match is clear and direct.
                - Medium confidence level: When the match is reasonable but may require further verification.
                - Low confidence level: When the match is uncertain or ambiguous.

                Example Output:
                {
                  "module_name": "baseline_assessment",
                  "entries": [
                    {
                      "name": "birth",
                      "standards_mapping_type": "HEAL CDE Match",
                      "standards_mapping_label": "Demographics",
                      "confidence_level": "High"
                    },
                    {
                      "name": "age_at_enrollment",
                      "standards_mapping_type": "HEAL CDE Match",
                      "standards_mapping_label": "Demographics",
                      "confidence_level": "High"
                    }
                  ]
                }""",
    name="CDE ID Python",
    tools=[{"type": "file_search"}]             
)

In [None]:
# Update the assistant so its file_search tool reads from the reference
# vector store.
reference_resources = {"file_search": {"vector_store_ids": [vector_store.id]}}

assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources=reference_resources,
)

In [None]:
# Create message thread (older variant kept for reference).
# Builds a thread with a single user message: the full task description as
# the message text, plus the sample input file attached for file_search.
thread = client.beta.threads.create(
  messages=[
    {
      "role": "user",
      # The prompt below is sent verbatim, including its leading indentation.
      "content": """Adjust the sample output provided according to the instructions to produce a file that looks like the desired outcome file.
      The sample output is a json file that contains modules with information that you need to parse through.
      I want you to determine whether each entry in the module matches with a CDE. The CDE list is found in the All_HEALPAINCDEsDD_JSON.
      You should output text that is in the same format as the desired outcome, which is another json file. Here are the instructions:
      Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

        Response Format:
        For each entry in the module, provide the following in a nested JSON format:
        {
          "module_name": "<module name>",
          "entries": [
            {
              "name": "<entry name>",
              "standards_mapping_type": "<CDE mapping type>",
              "standards_mapping_label": "<CRF name>",
              "confidence_level": "<confidence level>"
            },
            ...
          ]
        }

        Guidelines:
        1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
        2. Determine Standards Mapping Type:
        - HEAL CDE Match: When the entry matches a CDE directly.
        - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
        - No CDE match: When the entry does not correspond to any CDE.
        3. Specify Standards Mapping Label: The CRF name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CRF match'.
        4. Specify Confidence Level:
        - High confidence level: When the match is clear and direct.
        - Medium confidence level: When the match is reasonable but may require further verification.
        - Low confidence level: When the match is uncertain or ambiguous.

        Example Output:
        {
          "module_name": "baseline_assessment",
          "entries": [
            {
              "name": "birth",
              "standards_mapping_type": "HEAL CDE Match",
              "standards_mapping_label": "Demographics",
              "confidence_level": "High"
            },
            {
              "name": "age_at_enrollment",
              "standards_mapping_type": "HEAL CDE Match",
              "standards_mapping_label": "Demographics",
              "confidence_level": "High"
            }
          ]
        }
      
      """,
      # Attach the uploaded sample input file to the message and make it
      # available to the file_search tool.
      "attachments": [
        { "file_id": sample_input_file.id, "tools": [{"type": "file_search"}] }
      ],
    }
  ]
)