In [107]:
import openai
import asyncio
import os
import pandas as pd  # For data handling, like reading from Excel
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
import json
import configparser  # For reading configuration files
import re

import nest_asyncio
nest_asyncio.apply()


In [108]:
# Load configuration file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list instead of letting a missing
# file surface later as an unhelpful KeyError on the first section lookup.
config = configparser.ConfigParser()
loaded_config_files = config.read('config.ini')
if not loaded_config_files:
    raise FileNotFoundError(
        "config.ini was not found (or could not be read) in the current working directory"
    )

# Retrieve file paths and column names from config
input_file = config['Files']['input_file']            # path of the Excel data dictionary
input_worksheet = config['Files']['input_worksheet']  # sheet name passed to pd.read_excel
crf_column = config['Columns']['crf_column']          # column holding the CRF / form name
variable_column = config['Columns']['variable_column']
description_column = config['Columns']['description_column']


In [109]:
# OpenAI credentials and identifiers live in the [OpenAI] section of the config.
openai_settings = config['OpenAI']
openai_api_key = openai_settings['api_key']

# Asynchronous client shared by every API call in this notebook.
client = AsyncOpenAI(api_key=openai_api_key)

# NOTE(review): assistant_id and file_id are loaded here but are not referenced
# by any of the cells visible in this notebook - confirm whether they are still needed.
assistant_id = openai_settings['assistant_id']

# Prompt texts live in the [Instructions] section: `crf_id` is prepended to the
# step-1 (name extraction) prompt, `matching_instruction` to the step-2 prompt.
prompt_settings = config['Instructions']
matching_instruction = prompt_settings['matching_instruction']
crf_id = prompt_settings['crf_id']

file_id = openai_settings['file_id']



In [110]:
# Load the data dictionary from the configured Excel file / worksheet
data_dict_df = pd.read_excel(input_file, sheet_name=input_worksheet)

# Select only the relevant columns
data_dict_df = data_dict_df[[crf_column, variable_column, description_column]]

# Display the first few rows of the loaded data
print("Loaded Data Dictionary:")
print(data_dict_df.head())

# Group descriptions by each unique CRF and concatenate all descriptions for a
# CRF into a single space-separated string. Missing labels are dropped, and the
# remaining values are cast to str because Excel cells holding numbers are read
# as int/float and would make ' '.join raise a TypeError.
grouped_crf_df = (
    data_dict_df
    .groupby(crf_column)[description_column]
    .apply(lambda labels: ' '.join(labels.dropna().astype(str)))
    .reset_index()
)

# Rename columns for clarity; downstream code looks these names up literally
grouped_crf_df.columns = ['CRF Name', 'Descriptions']

# Display the grouped DataFrame
print("\nGrouped CRF Names with Descriptions:")
print(grouped_crf_df)

Loaded Data Dictionary:
                                       Form Name Variable / Field Name  \
0  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i1   
1  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i2   
2  adolescent_sleep_wake_scale_short_form_aswssf                awsw_1   
3  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i3   
4  adolescent_sleep_wake_scale_short_form_aswssf                awsw_2   

                                         Field Label  
0  Using the choices below, select how often the ...  
1                      When its time to go to bed...  
2  1. ...I want to stay up and do other things (f...  
3                                      In general...  
4          2. ...I am ready to go to bed at bedtime.  

Grouped CRF Names with Descriptions:
                                             CRF Name  \
0       adolescent_sleep_wake_scale_short_form_aswssf   
1    adverse_childhood_experiences_questionnaire_

In [111]:
# Step 1 of 2 in API call

def parse_extracted_crf_name(response_content):
    """
    Parse a step-1 model response into the general CRF name and its rationale.

    The response is expected to contain a ``CRF name: <name>`` field, optionally
    followed by a ``Rationale: <text>`` field either on the same line (separated
    by ``- Rationale``) or on a later line. Matching is case-insensitive.

    Args:
        response_content (str): Raw text returned by the model.

    Returns:
        dict: ``"Extracted CRF Name"`` (``"Unknown CRF"`` when the field is
        absent), ``"Rationale"`` (``"No rationale provided"`` when absent; only
        the first line of the rationale is captured) and ``"Response"`` (the
        unmodified input, kept for reference).
    """
    # Debug: Print the full response content
    print("\n--- Full Response Content ---\n", response_content, "\n--- End of Response ---\n")

    # Capture the name up to the "- Rationale" separator, the end of the line,
    # or the end of the text. A bare "-" must NOT terminate the capture,
    # otherwise hyphenated names such as "PHQ-9" are cut down to "PHQ".
    crf_name_pattern = r"CRF name\s*:\s*(.+?)(?=\s*(?:-\s*Rationale\b|\n|$))"
    crf_match = re.search(crf_name_pattern, response_content, flags=re.IGNORECASE)
    matched_crf = crf_match.group(1).strip() if crf_match else "Unknown CRF"

    # Extract rationale if available ('.' does not cross newlines, so this is
    # the remainder of the line that carries the "Rationale:" label)
    rationale_pattern = r"Rationale\s*:\s*(.+)"
    rationale_match = re.search(rationale_pattern, response_content, flags=re.IGNORECASE)
    rationale = rationale_match.group(1).strip() if rationale_match else "No rationale provided"

    return {
        "Extracted CRF Name": matched_crf,
        "Rationale": rationale,
        "Response": response_content  # Full response for reference
    }

async def extract_crf_name(client, crf_name, descriptions):
    """
    Step 1: ask the model for the general CRF name behind a form.

    The prompt is the module-level ``crf_id`` instruction text followed by the
    form name and its concatenated field descriptions.

    Args:
        client: Asynchronous OpenAI client exposing ``chat.completions.create``.
        crf_name: Form name as it appears in the data dictionary.
        descriptions: Concatenated field labels for that form.

    Returns:
        dict: The result of ``parse_extracted_crf_name`` for the model's reply.
    """
    prompt = f"{crf_id}\n\nCRF Name: {crf_name}\nDescriptions: {descriptions}"
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    # The SDK types message.content as optional (it can be None, e.g. for a
    # refusal); fall back to an empty string so the parser returns its
    # "Unknown CRF" defaults instead of crashing on None.strip().
    content = (response.choices[0].message.content or "").strip()
    return parse_extracted_crf_name(content)

In [112]:
# Step 2 of 2 in API call

def parse_heal_crf_match(response_content):
    """
    Parse a step-2 model response into the matched HEAL Core CRF and confidence.

    The CRF name is read from a ``HEAL Core CRF Match: <name>`` field. The name
    ends at the end of its line, at the end of the text, or at a
    whitespace-preceded dash that introduces another labelled field
    (e.g. `` - Confidence Level:``). A hyphen inside the name itself
    ("PHQ-9") is kept. Matching is case-insensitive.

    Args:
        response_content (str): Raw text returned by the model.

    Returns:
        dict: ``"Matched CRF"`` (``"No CRF match"`` when the field is absent),
        ``"Confidence"`` (one of the four recognised levels, or
        ``"No Confidence Score"``) and ``"Response"`` (the unmodified input).
    """
    # Extract HEAL Core CRF match. Stopping on any bare "-" would truncate
    # hyphenated names, and stopping only on "-" or end-of-text would find no
    # match at all when the next field starts on a new line without a bullet.
    crf_pattern = (
        r"HEAL Core CRF Match\s*:\s*(.+?)"
        r"(?=\s+-\s*[A-Za-z][A-Za-z ]*:|\s*\n|\s*$)"
    )
    crf_match = re.search(crf_pattern, response_content, flags=re.IGNORECASE)
    matched_crf = crf_match.group(1).strip() if crf_match else "No CRF match"

    # Extract confidence level (restricted to the four expected phrases)
    confidence_pattern = (
        r"Confidence\s*Level:\s*"
        r"(High Confidence|Medium Confidence|Low Confidence|No Match)"
    )
    confidence_match = re.search(confidence_pattern, response_content, flags=re.IGNORECASE)
    confidence_level = confidence_match.group(1).strip() if confidence_match else "No Confidence Score"

    return {
        "Matched CRF": matched_crf,
        "Confidence": confidence_level,
        "Response": response_content  # Full response for reference
    }

# Async function to call OpenAI API with the new version syntax
async def get_crf_match_from_openai(client, extracted_crf_name, descriptions):
    """
    Step 2: ask the model to match an extracted CRF name to a HEAL Core CRF.

    The prompt is the module-level ``matching_instruction`` text followed by
    the extracted name and the form's concatenated field descriptions.

    Args:
        client: Asynchronous OpenAI client exposing ``chat.completions.create``.
        extracted_crf_name: CRF name produced by step 1.
        descriptions: Concatenated field labels for that form.

    Returns:
        dict: The result of ``parse_heal_crf_match`` for the model's reply.
    """
    prompt = f"{matching_instruction}\n\nCRF Name: {extracted_crf_name}\nDescriptions: {descriptions}"
    response = await client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.7,
    )
    # The SDK types message.content as optional (it can be None, e.g. for a
    # refusal); fall back to an empty string so the parser returns its
    # "No CRF match" defaults instead of crashing on None.strip().
    content = (response.choices[0].message.content or "").strip()
    return parse_heal_crf_match(content)

In [113]:
# Asynchronous function to process all CRFs concurrently
async def analyze_crfs(client, crf_df, max_concurrency=8):
    """
    Processes each CRF and gathers results asynchronously in two steps.
    Step 1: Extract the general CRF name.
    Step 2: Match the extracted name to a HEAL Core CRF.

    All step-1 calls complete before any step-2 call starts. Within each step
    the calls run concurrently, but at most ``max_concurrency`` requests are in
    flight at once so that a large data dictionary does not send every request
    simultaneously and trip API rate limits.

    Args:
        client: Asynchronous OpenAI client passed through to the API helpers.
        crf_df (pd.DataFrame): One row per CRF with the columns
            ``'CRF Name'`` and ``'Descriptions'``.
        max_concurrency (int): Upper bound on simultaneous API requests.

    Returns:
        pd.DataFrame: One row per input CRF, in input order, with the original
        name, the step-1 extraction (name, rationale, full response) and the
        step-2 match (CRF, confidence, full response). The columns are present
        even when ``crf_df`` is empty.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
        Exception: Any error raised by an individual API call propagates.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    result_columns = [
        "Original CRF Name",
        "Extracted CRF Name",
        "Rationale",
        "Full Response (Extracted CRF)",
        "Matched HEAL Core CRF",
        "Match Confidence",
        "Match Full Response",
    ]

    crf_names = crf_df["CRF Name"].tolist()
    description_texts = crf_df["Descriptions"].tolist()

    limiter = asyncio.Semaphore(max_concurrency)

    async def _bounded(api_call, *args):
        # The request coroutine is only created once a slot is free, so nothing
        # is left un-awaited if an earlier request fails.
        async with limiter:
            return await api_call(client, *args)

    # Step 1: Extract CRF names (regardless of HEAL Core)
    extracted_names = await asyncio.gather(*(
        _bounded(extract_crf_name, crf_name, descriptions)
        for crf_name, descriptions in zip(crf_names, description_texts)
    ))

    # Step 2: Match extracted CRF names with HEAL Core CRFs.
    # gather() preserves input order, so results line up with the rows by position.
    matches = await asyncio.gather(*(
        _bounded(get_crf_match_from_openai, extracted["Extracted CRF Name"], descriptions)
        for extracted, descriptions in zip(extracted_names, description_texts)
    ))

    # Format the results
    results = [
        {
            "Original CRF Name": crf_name,
            "Extracted CRF Name": extracted["Extracted CRF Name"],
            "Rationale": extracted["Rationale"],
            "Full Response (Extracted CRF)": extracted["Response"],  # Full response of extraction
            "Matched HEAL Core CRF": match.get("Matched CRF", "No Match"),
            "Match Confidence": match.get("Confidence", "No Confidence"),
            "Match Full Response": match.get("Response"),  # Full response of matching
        }
        for crf_name, extracted, match in zip(crf_names, extracted_names, matches)
    ]

    # Convert results to DataFrame; explicit columns keep the schema stable for empty input
    return pd.DataFrame(results, columns=result_columns)

In [114]:
# Main function to execute the analysis
async def main():
    """
    Run the full pipeline: load the data dictionary, group field descriptions
    per CRF, run the two-step analysis and save the results to Excel.

    Reads its inputs (file paths, worksheet and column names, client) from the
    module-level configuration variables.

    Raises:
        KeyError: If ``output_file`` is missing from the ``[Files]`` config
            section, or a configured column is not in the worksheet.
    """
    # Resolve the output path first so a missing config key fails immediately,
    # not after every API call has already been made and paid for.
    output_file = config['Files']['output_file']

    # Load the data dictionary from Excel file
    data_dict_df = pd.read_excel(input_file, sheet_name=input_worksheet)

    # Select only the relevant columns
    data_dict_df = data_dict_df[[crf_column, variable_column, description_column]]

    # Display the first few rows of the loaded data for confirmation
    print("Loaded Data Dictionary:")
    print(data_dict_df.head())

    # Group descriptions by each unique CRF and concatenate descriptions.
    # Missing labels are dropped and the rest cast to str, because numeric
    # Excel cells would otherwise make ' '.join raise a TypeError.
    grouped_crf_df = (
        data_dict_df
        .groupby(crf_column)[description_column]
        .apply(lambda labels: ' '.join(labels.dropna().astype(str)))
        .reset_index()
    )
    grouped_crf_df.columns = ['CRF Name', 'Descriptions']

    # Display grouped data to verify it's ready for analysis
    print("\nGrouped CRF Names with Descriptions:")
    print(grouped_crf_df.head())

    # Run the two-step analyze_crfs process and get the results
    results_df = await analyze_crfs(client, grouped_crf_df)

    # Save the results to an Excel file
    results_df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}")

# Run the main function in a Jupyter-compatible way.
# A bare top-level `await` is only valid inside an IPython/Jupyter cell; a plain
# Python script would have to use asyncio.run(main()) instead.
await main()  # Keep this as the last statement: it kicks off the whole pipeline


Loaded Data Dictionary:
                                       Form Name Variable / Field Name  \
0  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i1   
1  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i2   
2  adolescent_sleep_wake_scale_short_form_aswssf                awsw_1   
3  adolescent_sleep_wake_scale_short_form_aswssf               awsw_i3   
4  adolescent_sleep_wake_scale_short_form_aswssf                awsw_2   

                                         Field Label  
0  Using the choices below, select how often the ...  
1                      When its time to go to bed...  
2  1. ...I want to stay up and do other things (f...  
3                                      In general...  
4          2. ...I am ready to go to bed at bedtime.  

Grouped CRF Names with Descriptions:
                                           CRF Name  \
0     adolescent_sleep_wake_scale_short_form_aswssf   
1  adverse_childhood_experiences_questionnaire_aceq  