In [41]:
import openai
import asyncio
import os
import pandas as pd  # For data handling, like reading from Excel
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
import json
import configparser  # For reading configuration files
import re

import nest_asyncio
nest_asyncio.apply()


In [42]:
# Load configuration file
config = configparser.ConfigParser()
config.read('config.ini')

# Retrieve file paths and column names from config
input_file = config['Files']['input_file']
input_worksheet = config['Files']['input_worksheet']
crf_column = config['Columns']['crf_column']
variable_column = config['Columns']['variable_column']
description_column = config['Columns']['description_column']


In [43]:
# Set OpenAI API key
openai_api_key = config['OpenAI']['api_key']
client = AsyncOpenAI(api_key=openai_api_key)# Retrieve OpenAI settings from the config file

assistant_id = config['OpenAI']['assistant_id']
crf_id = config['Instructions']['crf_id']
matching_instruction = config['Instructions']['matching_instruction']
file_id = config['OpenAI']['file_id']

In [44]:
# Load the data dictionary from Excel file
data_dict_df = pd.read_excel(input_file, sheet_name=input_worksheet)

# Select only the relevant columns
data_dict_df = data_dict_df[[crf_column, variable_column, description_column]]

# Display the first few rows of the loaded data
print("Loaded Data Dictionary:")
print(data_dict_df.head())

# Group descriptions by each unique CRF
# Concatenate all descriptions for each CRF into a single string
grouped_descriptions_df = (
    data_dict_df.groupby(crf_column)[description_column]
    .apply(lambda x: ' '.join(x.dropna()))
    .reset_index()
)

# Group variable names by each unique CRF
# Concatenate all variable names for each CRF into a comma-separated string
grouped_variables_df = (
    data_dict_df.groupby(crf_column)[variable_column]
    .apply(lambda x: ', '.join(x.dropna()))
    .reset_index()
)

# Merge grouped descriptions and variable names on CRF Name
grouped_crf_df = grouped_descriptions_df.merge(
    grouped_variables_df, on=crf_column, how='left'
)

# Rename columns for clarity
grouped_crf_df.columns = ['CRF Name', 'Descriptions', 'Variable Names']

# Display the grouped DataFrame with descriptions and variables
print("\nGrouped CRF Names with Descriptions and Variable Names:")
print(grouped_crf_df.head())

Loaded Data Dictionary:
  Refined CRF Name              name  \
0    Screening Log         record_id   
1    Screening Log         subjectid   
2    Screening Log        oboe_group   
3    Screening Log  study_identifier   
4    Screening Log       sc_birthdat   

                                         description  
0                                          Record ID  
1                                  OBOE Study Number  
2          What group was the subject enrolled into?  
3  Study Identifier (hidden, raw value from oboe ...  
4                                      Date of Birth  

Grouped CRF Names with Descriptions and Variable Names:
                        CRF Name  \
0          Extensive Travel Form   
1       Maternal Medical History   
2  Preauthorization Request Form   
3                  Screening Log   
4      Travel Authorization Form   

                                        Descriptions  \
0  SECTION C. FORM COMPLETION: 1. Initials of sta...   
1  SECTION D. MATER

In [45]:
# Step 1 of 2 in API call

def parse_extracted_crf_name(response_content):
    """
    Parse the response to extract the full CRF name, including descriptors like "Short Form" or abbreviations in parentheses.
    """
    # Debug: Print the full response content
    print("\n--- Full Response Content ---\n", response_content, "\n--- End of Response ---\n")

    # Modify the regex to capture the entire CRF name, including phrases like "Short Form"
    crf_name_pattern = r"(?i)CRF name\s*:\s*([\w\s-]+(?:\(.+?\))?)"
    crf_match = re.search(crf_name_pattern, response_content)
    matched_crf = re.sub(r'-\s*Rationale$', '', crf_match.group(1).strip()) if crf_match else "Unknown CRF"

    # Extract rationale if available
    rationale_pattern = r"(?i)Rationale\s*:\s*(.+)"
    rationale_match = re.search(rationale_pattern, response_content)
    rationale = rationale_match.group(1).strip() if rationale_match else "No rationale provided"

    return {
        "Extracted CRF Name": matched_crf,
        "Rationale": rationale,
        "Response": response_content  # Full response for reference
    }

async def extract_crf_name(client, crf_name, descriptions):
    prompt = f"{crf_id}\n\nCRF Name: {crf_name}\nDescriptions: {descriptions}"
    response = await client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
    )
    content = response.choices[0].message.content.strip()
    return parse_extracted_crf_name(content)

In [46]:
# Step 2 of 2 in API call

def parse_heal_crf_match(response_content):
    """
    Parse the response to extract a matched HEAL Core CRF name and confidence level.
    This is used for Step 2.
    """
    # Extract HEAL Core CRF match
    crf_match = re.search(r"(?i)HEAL Core CRF Match:\s*(.+?)(?=\s*-|$)", response_content, re.DOTALL)
    matched_crf = crf_match.group(1).strip() if crf_match else "No CRF match"

    # Extract confidence level
    confidence_match = re.search(r"(?i)Confidence\s*[Ll]evel:\s*(High Confidence|Medium Confidence|Low Confidence|No Match)", response_content)
    confidence_level = confidence_match.group(1).strip() if confidence_match else "No Confidence Score"

    return {
        "Matched CRF": matched_crf,
        "Confidence": confidence_level,
        "Response": response_content  # Full response for reference
    }

# Async function to call OpenAI API with the new version syntax
async def get_crf_match_from_openai(client, extracted_crf_name, descriptions):
    prompt = f"{matching_instruction}\n\nCRF Name: {extracted_crf_name}\nDescriptions: {descriptions}"
    response = await client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
    )
    content = response.choices[0].message.content.strip()
    return parse_heal_crf_match(content)

In [None]:
# Asynchronous function to process all CRFs concurrently
async def analyze_crfs(client, crf_df, extracted_names):
    # Match extracted CRF names with HEAL Core CRFs
    step2_tasks = []
    for idx, extracted in enumerate(extracted_names):
        extracted_crf_name = extracted["Extracted CRF Name"]
        descriptions = crf_df.iloc[idx]["Descriptions"]
        task = get_crf_match_from_openai(client, extracted_crf_name, descriptions)
        step2_tasks.append(task)

    matches = await asyncio.gather(*step2_tasks)

    # Format and return results as before
    results = []
    for idx, (extracted, match) in enumerate(zip(extracted_names, matches)):
        original_crf_name = crf_df.iloc[idx]["CRF Name"]
        results.append({
            "Original CRF Name": original_crf_name,
            "Refined CRF Name": crf_df.iloc[idx]["Refined CRF Name"],
            "Extracted CRF Name": extracted["Extracted CRF Name"],
            "Rationale": extracted["Rationale"],
            "Full Response (Extracted CRF)": extracted["Response"],
            "Matched HEAL Core CRF": match.get("Matched CRF", "No Match"),
            "Match Confidence": match.get("Confidence", "No Confidence"),
            "Match Full Response": match.get("Response")
        })

    return pd.DataFrame(results)

In [51]:
# Main function to execute the analysis
async def main():
    # Step 1: Fancy Pivot Table Output

    # Load the data dictionary from Excel file
    data_dict_df = pd.read_excel(input_file, sheet_name=input_worksheet)
    
    # Select only the relevant columns
    data_dict_df = data_dict_df[[crf_column, variable_column, description_column]]
    
    # Display the first few rows of the loaded data for confirmation
    print("Loaded Data Dictionary:")
    print(data_dict_df.head())

    # Group descriptions by each unique CRF and concatenate descriptions
    grouped_descriptions_df = (
        data_dict_df.groupby(crf_column)[description_column]
        .apply(lambda x: ' '.join(x.dropna()))
        .reset_index()
    )

    # Group variable names by each unique CRF and concatenate variable names
    grouped_variables_df = (
        data_dict_df.groupby(crf_column)[variable_column]
        .apply(lambda x: ', '.join(x.dropna()))
        .reset_index()
    )

    # Merge descriptions and variable names into one DataFrame
    grouped_crf_df = grouped_descriptions_df.merge(
        grouped_variables_df, on=crf_column, how='left'
    )

    # Rename columns for clarity
    grouped_crf_df.columns = ['CRF Name', 'Descriptions', 'Variable Names']
    
    # Display grouped data to verify it’s ready for analysis
    print("\nGrouped CRF Names with Descriptions and Variable Names:")
    print(grouped_crf_df.head())
    
    # Run the two-step analyze_crfs process and get the results
    results_df = await analyze_crfs(client, grouped_crf_df)

    # Debug: Print columns in results_df before merge
    print("\nColumns in results_df:")
    print(results_df.columns.tolist())

    # Step 2: Create the Enhanced Output File

    # Load the input file again to ensure all original columns are included
    full_input_df = pd.read_excel(input_file, sheet_name=input_worksheet)

    # Merge the Input File with the Results File on the Original CRF Name (Form Name)
    enhanced_output_df = full_input_df.copy()  # Start with the original input file
    enhanced_output_df = enhanced_output_df.merge(
        results_df[['Original CRF Name', 'Extracted CRF Name', 'Matched HEAL Core CRF', 'Match Confidence']],
        how='left',  # Preserve all rows from the input file
        left_on=crf_column,  # Match using the original CRF Name column from the input file
        right_on='Original CRF Name'  # Match to the Results File column
    )

    # Debug: Print columns in enhanced_output_df after merge
    print("\nColumns in enhanced_output_df after merge:")
    print(enhanced_output_df.columns.tolist())

    # Optional: Reorganize the new columns to appear next to the Form Name column
    form_name_col_index = list(enhanced_output_df.columns).index(crf_column)  # Find the index of the Form Name column
    for new_col in ['Extracted CRF Name', 'Matched HEAL Core CRF', 'Match Confidence']:
        if new_col in enhanced_output_df.columns:
            # Move each new column to the right of the Form Name column
            col_data = enhanced_output_df.pop(new_col)
            enhanced_output_df.insert(form_name_col_index + 1, new_col, col_data)
            form_name_col_index += 1  # Adjust index for the next new column
        else:
            print(f"Warning: Column '{new_col}' not found in enhanced_output_df. Skipping.")


    # Save everything in a **single** Excel file with two sheets
    output_file = config['Files']['output_file']
    
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        results_df.to_excel(writer, sheet_name="Metadata", index=False)  # First sheet
        enhanced_output_df.to_excel(writer, sheet_name="EnhancedDD", index=False)  # Second sheet

    print(f"Results saved to {output_file} with two sheets: 'Metadata' and 'EnhancedDD'.")

# Run the main function in a Jupyter-compatible way
await main()  # Place this at the end to kick off execution

Loaded Data Dictionary:
  Refined CRF Name              name  \
0    Screening Log         record_id   
1    Screening Log         subjectid   
2    Screening Log        oboe_group   
3    Screening Log  study_identifier   
4    Screening Log       sc_birthdat   

                                         description  
0                                          Record ID  
1                                  OBOE Study Number  
2          What group was the subject enrolled into?  
3  Study Identifier (hidden, raw value from oboe ...  
4                                      Date of Birth  

Grouped CRF Names with Descriptions and Variable Names:
                        CRF Name  \
0          Extensive Travel Form   
1       Maternal Medical History   
2  Preauthorization Request Form   
3                  Screening Log   
4      Travel Authorization Form   

                                        Descriptions  \
0  SECTION C. FORM COMPLETION: 1. Initials of sta...   
1  SECTION D. MATER

TypeError: analyze_crfs() missing 1 required positional argument: 'extracted_names'