In [1]:
import pandas as pd
import openai
import json
import os
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load environment variables
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise ValueError("API key not found. Please set the OPENAI_API_KEY environment variable.")

# Initialize OpenAI client
client = openai.Client(api_key=api_key)

In [2]:
# Load HEAL CDE JSON reference
heal_cde_file_path = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_JSON.json"
with open(heal_cde_file_path, "r") as f:
    heal_cde_data = json.load(f)

# Inspect the structure of the data
print(heal_cde_data.keys())  # Show top-level keys
print(heal_cde_data)  # Optionally inspect part of the data for verification

# Access the list of CDEs
try:
    cde_list = heal_cde_data["Variable Name"]  # Update this key based on actual data
    heal_cde_names = [cde["CDE Name"] for cde in cde_list]
except KeyError:
    print("Could not find the 'cdes' key in the JSON file.")
    heal_cde_names = []

# Verify the results
print(f"First 10 CDE names: {heal_cde_names[:10]}")


# Flatten HEAL CDE names for matching
heal_cde_names = [cde["name"] for cde in heal_cde_data]

dict_keys(['BPIWrstPain7dRtngScale', 'BPILstPain7dRtngScale', 'BPIAvgPain7dRtngScale', 'BPICurntPainRtngScale', 'BPIPainSeverityScore', 'BPIWorstPainRatingScl', 'BPILeastPainRatingScl', 'BPIAvgPainRatingScl', 'BPICurrentPainRatingScl', 'BRTHDTC', 'Age', 'Sex', 'GENIDENT', 'GENIDENTOTH ', 'ETHNIC', 'AI_AN', 'Asian', 'Bl_AA', 'NH_PI', 'White', 'Unkn', 'Not_Rep', 'EDULEVEL', 'EMPSTAT', 'MARISTAT', 'INCMLVL', 'DISABINSIND', 'PAINDUR', 'SurveyLanguage', 'demRUCA1', 'demRUCA2', 'AgeU', 'Ageparent', 'AgeUparent', 'ProxyRelationship', 'ProxyRelationshipOTH', 'ETHNICparent', 'AI_AN_p', 'Asian_p', 'Bl_AA_p', 'NH_PI_p', 'White_p', 'Unkn_p', 'Not_Rep_p', 'EDULEVELspouse', 'EMPSTATspouse', 'SDOH_Childcare', 'SDOH_Clothing', 'SDOH_Food', 'SDOH_Housing', 'SDOH_Internet', 'SDOH_Phone', 'SDOH_Transportation', 'SDOH_Utilities', 'SDOH_MedHlthcare', 'SDOH_Other', 'SDOH_OtherSpecify', 'SDOH_NotAnswer', 'GAD2FeelNervScale', 'GAD2NotStopWryScale', 'GAD2TotalScore', 'NIDAL2AnyAlchlScl', 'NIDAL2ExtntAlchlScl',

TypeError: string indices must be integers, not 'str'

In [None]:
# Load the input Excel/CSV file
input_file_path = r"C:\Users\lmaefos\Code Stuffs\CDE_detective\SAMPLE_DataDictionary.csv"  # Update with your input file path
variable_name_column = "name"  # Update this to match the column name for variable names in your input file
module_column = "module"  # Update this to match the column for module names if needed

In [None]:
# Load the input Excel/CSV file
df = pd.read_excel(input_file_path)  # Use pd.read_csv() for CSV files

# Convert DataFrame to list of dictionaries for processing
rows = df.to_dict(orient="records")

# Process each row
results = []
for row in rows:
    # Extract columns
    module_name = row.get(module_column, "Unknown Module")
    name = row.get(variable_name_column, "Unknown Name")

    # Matching logic
    if name in heal_cde_names:
        standards_mapping_type = "HEAL CDE Match"
        heal_cde_name = name  # Replace with the matched CDE name if needed
        variable_cde_name = "BRTHDTC"  # Replace with the actual matched variable name
    else:
        standards_mapping_type = "No CDE Match"
        heal_cde_name = "No match"
        variable_cde_name = "No match"

    # Append the processed row to the results
    results.append({
        "module_name": module_name,
        "name": name,
        "standards_mapping_type": standards_mapping_type,
        "heal_cde_name": heal_cde_name,
        "variable_cde_name": variable_cde_name
    })

In [None]:
# Output results to a JSON file
output_file_path = "output_file.json"  # Customize the output file name and location
with open(output_file_path, "w") as f:
    json.dump(results, f, indent=4)

print(f"Processed data saved to {output_file_path}")

# Optional: Save results to CSV or Excel
# pd.DataFrame(results).to_csv("output_file.csv", index=False)
# pd.DataFrame(results).to_excel("output_file.xlsx", index=False)

In [None]:
# Define the assistant with instructions
assistant = openai.Client().beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions="""Your task is to analyze variables from the input file and map them against the HEAL CDE reference. Provide results in a flat JSON format:
    [
        {
            "module_name": "<module name>",
            "name": "<entry name>",
            "standards_mapping_type": "<CDE mapping type>",
            "heal_cde_name": "<CDE name>",
            "variable_cde_name": "<Variable name>"
        },
        ...
    ]
    """,
    name="CDE ID Python",
    tools-[{"type": "file_search"}]
)

In [None]:
# Event handling logic remains unchanged
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI

class EventHandler(AssistantEventHandler):
    @override
    def on_text_created(self, text) -> None:
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))

In [None]:
# Stream the response
with client.beta.threads.runs.stream(
    assistant_id=assistant.id,
    instructions="Process the input file and match variables.",
    event_handler=EventHandler(),
) as stream:
    stream.until_done()

In [7]:
# come back to this from HERE
#Define the assistant
assistant = openai.Client().beta.assistants.create(
    model="gpt-4o-mini-2024-07-18",
    instructions="""Process the given modules and their variables, and produce JSON output as per the specified format. Ensure that all modules and their entries are included in the output. Analyze the contents of each name nested under its respective module in the attached JSON file. The goal is to determine whether each name, which represents a variable, matches a HEAL CDE variable from a pre-defined list of HEAL CDE variables stored in the HEAL CDE JSON file.

                Response Format:
                For each entry in the module, provide the following in a flat JSON format:
                [
                    {
                      "module_name": "<module name>",
                      "name": "<entry name>",
                      "standards_mapping_type": "<CDE mapping type>",
                      "heal_cde_name": "<CDE name>",
                      "variable_cde_name": "<Variable name>"
                    },
                    ...
                ]

                Guidelines:
                1. Match Variable Names to HEAL CDE Variables: compare each variable name in the provided JSON file against the HEAL CDE variable list from the HEAL CDE JSON file. Determine if the variable matches a HEAL CDE variable.
                2. Determine Standards Mapping Type:
                - HEAL CDE Match: When the entry matches a CDE directly.
                - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
                - No CDE match: When the entry does not correspond to any CDE.
                3. Specify the HEAL CDE name: Provide the CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CDE match'.
                4. Specify Variable CDE name: Provide the variable name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

                Example Output:
                {
                    "module name": "baseline_assessment",
                    "name": "birth",
                    "standards_mapping_type": "HEAL CDE Match",
                    "heal_cde_name": "Birth date",
                    "variable_cde_name": "BRTHDTC"
                },
                {
                    "module name": "opioid_risk_tool",
                    "name": "depression",
                    "standards_mapping_type": "Potential HEAL CDE Match",
                    "heal_cde_name": "Depression",
                    "variable_cde_name": "depression"
                },
                {
                    "module name": "opioid_risk_tool",
                    "name": "illegal_drugs",
                    "standards_mapping_type": "No CDE match",
                    "heal_cde_name": "No CDE match",
                    "variable_cde_name": "No variable match"
                }
                """,
    name="CDE ID Python",
    tools=[{"type": "file_search"}]             
)

In [8]:
#Store reference files. These will be permanently accessible to the assistant.
vector_store = client.beta.vector_stores.create(name="CDE Files")
heal_cde_file = [r"C:\Users\lmaefos\Code Stuffs\CDE_detective\CDE_ID_detective_revamp\KnowledgeBase\All_HEALPAINCDEsDD_JSON.json"]
with open(heal_cde_file, "r") as f:
    heal_cde_data = json.load(f)

file_streams = [open(path, "rb") for path in heal_cde_file]
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
  vector_store_id=vector_store.id, files=file_streams
)

In [9]:
#Update the assistant to intake reference files.
assistant = client.beta.assistants.update(
  assistant_id=assistant.id,
  tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

In [10]:
#Files that are going to be used in the message. See "attachments" in the block below.
sample_output_file = client.files.create(
    file=open(r"C:\Users\lmaefos\Code Stuffs\CDE_detective\SAMPLE_DataDictionary_ForTesting.json", "rb"),
    purpose='assistants'
)

In [11]:
#Create message thread
thread = client.beta.threads.create(
  messages=[
    {
      "role": "user",
      "content": """Adjust the sample output provided according to the instructions to produce a file that looks like the desired outcome file.
      The sample output is a json structure that contains modules with information that you need to parse through.
      I want you to determine whether each entry in the module matches with a CDE. The CDE list is found in the All_HEALPAINCDEsDD_JSON.
      You should output text that is in the same format as the desired outcome, which is another json file. Here are the instructions:
      Objective: Process the raw responses from the output of the HEAL CDE Detective to produce a structured JSON format that aligns with the HEAL Core Common Data Elements (CDE) categorization.

        Response Format:
        For each entry in the module, provide the following in a flat JSON format:
        [
            {
              "module_name": "<module name>",
              "name": "<entry name>",
              "standards_mapping_type": "<CDE mapping type>",
              "heal_cde_name": "<CDE name>",
              "variable_cde_name": "<variable name>"
            },
            ...
        ]

        Guidelines:
        1. Parse Raw Responses: Extract and organize information from the raw response provided by the first assistant.
        2. Determine Standards Mapping Type:
        - HEAL CDE Match: When the entry matches a CDE directly.
        - Potential HEAL CDE Match: When the entry partially matches a CDE or aligns with its context.
        - No CDE match: When the entry does not correspond to any CDE.
        3. Specify the HEAL CDE name: Provide the CDE name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No CDE match'.
        4. Specify Variable CDE name: Provide the variable name derived from the All_HEALPAINCDEsDD_JSON.json file or indicate 'No variable match'.

        Example Output:
        [
            {
              "module_name": "baseline_assessment",
              "name": "birth",
              "standards_mapping_type": "HEAL CDE Match",
              "heal_cde_name": "Birth date",
              "variable_cde_name": "BRTHDTC"
            },
            ...
      """,
      # Attach the new file to the message.
      "attachments": [
        { "file_id": sample_output_file.id, "tools": [{"type": "file_search"}] }
      ],
    }
  ]
)

In [12]:
#Execute message thread. DO NOT CHANGE (for now)
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
 
client = OpenAI()
 
class EventHandler(AssistantEventHandler):
    @override
    def on_text_created(self, text) -> None:
        print(f"\nassistant > ", end="", flush=True)

    @override
    def on_tool_call_created(self, tool_call):
        print(f"\nassistant > {tool_call.type}\n", flush=True)

    @override
    def on_message_done(self, message) -> None:
        # print a citation to the file searched
        message_content = message.content[0].text
        annotations = message_content.annotations
        citations = []
        for index, annotation in enumerate(annotations):
            message_content.value = message_content.value.replace(
                annotation.text, f"[{index}]"
            )
            if file_citation := getattr(annotation, "file_citation", None):
                cited_file = client.files.retrieve(file_citation.file_id)
                citations.append(f"[{index}] {cited_file.filename}")

        print(message_content.value)
        print("\n".join(citations))


# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.

with client.beta.threads.runs.stream(
    thread_id=thread.id,
    assistant_id=assistant.id,
    instructions="Please parse through the whole sample output file.",
    event_handler=EventHandler(),
) as stream:
    stream.until_done()


assistant > file_search


assistant > To process the provided data, I will create a structured JSON format based on the criteria you've mentioned for matching entries against the HEAL Core Common Data Elements (CDEs). Let's analyze the sample data to create the desired output JSON.

### Desired JSON Output
Here’s a sample structure for the output based on the provided instructions and the available CDE data.

```json
[
    {
        "module_name": "opioid_risk_tool",
        "name": "depression",
        "standards_mapping_type": "HEAL CDE Match", 
        "heal_cde_name": "Depression",
        "variable_cde_name": "depression"
    },
    {
        "module_name": "opioid_risk_tool",
        "name": "illegal drugs",
        "standards_mapping_type": "No CDE match", 
        "heal_cde_name": "No CDE match",
        "variable_cde_name": "No variable match"
    }
    // and so on for the other entries...
]
```

### Steps to Process the Output
1. **Extract Data:** Parse the provided sample