In [None]:
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
import asyncio  # For asynchronous task management
import pandas as pd  # For data handling, like reading from Excel
import configparser  # For reading configuration files
import os  # To handle environment variables

import re
import nest_asyncio



In [None]:
# Load configuration file
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. An empty list therefore means config.ini was not loaded;
# fail here with a clear message instead of a later KeyError on a section name.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found or could not be read from the current working directory"
    )

# Retrieve file paths and column names from config
# [Files]   -> workbook path and worksheet name of the data dictionary
# [Columns] -> names of the CRF, variable and description columns in that sheet
input_file = config['Files']['input_file']
input_worksheet = config['Files']['input_worksheet']
crf_column = config['Columns']['crf_column']
variable_column = config['Columns']['variable_column']
description_column = config['Columns']['description_column']


In [None]:
# Retrieve OpenAI settings from the config file
openai_api_key = config['OpenAI']['api_key']
assistant_id = config['OpenAI']['assistant_id']
matching_instruction = config['Instructions']['matching_instruction']
file_id = config['OpenAI']['file_id']

# Set OpenAI API key
client = AsyncOpenAI(api_key=openai_api_key)
completion = await client.chat.completions.create(model="gpt-4o", messages=[{"role": "user", "content": "Hello world"}])

In [None]:
# Load the data dictionary from Excel file
data_dict_df = pd.read_excel(input_file, sheet_name=input_worksheet)

# Select only the relevant columns
data_dict_df = data_dict_df[[crf_column, variable_column, description_column]]

# Display the first few rows of the loaded data
print("Loaded Data Dictionary:")
print(data_dict_df.head())

# Group descriptions by each unique CRF
# Concatenate all non-missing descriptions for each CRF into a single
# space-separated string. Each value is passed through str() first because
# str.join raises TypeError on non-string items (e.g. a numeric Excel cell).
grouped_crf_df = (
    data_dict_df.groupby(crf_column)[description_column]
    .apply(lambda x: ' '.join(map(str, x.dropna())))
    .reset_index()
)

# Rename columns for clarity
grouped_crf_df.columns = ['CRF Name', 'Descriptions']

# Display the grouped DataFrame
print("\nGrouped CRF Names with Descriptions:")
print(grouped_crf_df)

In [None]:
# Function to parse OpenAI's response with flexible patterns
def parse_crf_response(response_content):
    """
    Extract the matched HEAL Core CRF name and the confidence level from a model reply.

    The reply is searched (case-insensitively) for bullet lines of the form
    ``- HEAL Core CRF Match: <name>`` and ``- Confidence Level: <level>``.
    Markdown bold markers (``**``) around the label or around the value are
    tolerated for both fields.

    Args:
        response_content (str): Raw text returned by the model.

    Returns:
        dict: Three keys:
            "Matched CRF": the captured name, or "No CRF match" if the line is absent.
            "Confidence": one of the recognised levels as written in the reply,
                or "No Confidence Score" if the line is absent.
            "Response": the unmodified ``response_content``.
    """
    # Debug: Print the full response content
    print("\n--- Full Response Content ---\n", response_content, "\n--- End of Response ---\n")

    # Attempt to extract CRF Match (value runs to the end of its line)
    crf_match = re.search(r"-\s*\**HEAL Core CRF Match\**\s*:\s*\**(.+?)\**\s*(?=\n|$)", response_content, re.IGNORECASE)
    matched_crf = crf_match.group(1).strip() if crf_match else "No CRF match"
    print("Matched CRF:", matched_crf)  # Debug: Print matched CRF

    # Attempt to extract Confidence Level.
    # The `\**\s*` after the colon accepts bold markers before the value, so
    # "**Confidence Level**: **High Confidence**" and
    # "**Confidence Level:** High Confidence" are recognised just like the
    # equivalent forms already accepted by the CRF pattern above.
    confidence_match = re.search(r"-\s*\**Confidence\s*[Ll]evel\**\s*:\s*\**\s*(High Confidence|Medium Confidence|Low Confidence|No Match)", response_content, re.IGNORECASE)
    confidence_level = confidence_match.group(1).strip() if confidence_match else "No Confidence Score"
    print("Confidence Level:", confidence_level)  # Debug: Print confidence level

    # Return results
    return {"Matched CRF": matched_crf, "Confidence": confidence_level, "Response": response_content}

# Async function to call OpenAI API with the new version syntax
async def get_crf_match_from_openai(client, crf_name, descriptions, model="gpt-4o", temperature=0.7):
    """
    Asynchronously asks the chat model to match one CRF against the HEAL Core CRFs.

    The prompt is the module-level ``matching_instruction`` followed by the CRF
    name and its concatenated descriptions.

    Args:
        client: Async OpenAI client exposing ``chat.completions.create``.
        crf_name: Name of the CRF to match.
        descriptions: Concatenated variable descriptions for that CRF.
        model (str): Chat model identifier. Defaults to "gpt-4o".
        temperature (float): Sampling temperature passed to the API. Defaults to 0.7.

    Returns:
        dict: The result of ``parse_crf_response`` applied to the reply text.
    """
    # Construct the user prompt: instruction first, then the CRF-specific details
    prompt = f"{matching_instruction}\n\nCRF Name: {crf_name}\nDescriptions: {descriptions}"

    # Send the request to the chat completions endpoint
    response = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
    )

    # Extract response content. `message.content` can be None, so fall back to an
    # empty string; the parser then reports "no match" instead of raising
    # AttributeError on `.strip()`.
    content = (response.choices[0].message.content or "").strip()

    # Print the raw API response for debugging
    print("\n--- Raw API Response ---\n", content, "\n--- End of Response ---\n")

    # Parse and return the structured response
    return parse_crf_response(content)


In [None]:
# Asynchronous function to process all CRFs concurrently
async def analyze_crfs(client, crf_df, max_concurrency=8):
    """
    Matches every CRF in ``crf_df`` and collects the results into a DataFrame.

    Requests run concurrently, but at most ``max_concurrency`` are in flight at
    once. A request that raises is recorded as an error row rather than aborting
    the whole batch, so results from the other CRFs are not lost.

    Args:
        client: Async OpenAI client forwarded to ``get_crf_match_from_openai``.
        crf_df (pd.DataFrame): Frame with 'CRF Name' and 'Descriptions' columns.
        max_concurrency (int): Upper bound on simultaneous requests. Defaults to 8.

    Returns:
        pd.DataFrame: One row per input row, in input order, with columns
        "CRF Name", "Matched CRF", "Confidence" and "Full Response". The columns
        are present even when ``crf_df`` is empty.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    # Bounds the number of simultaneous API calls
    limiter = asyncio.Semaphore(max_concurrency)

    async def _match_one(crf_name, descriptions):
        # Runs a single match under the concurrency limit; converts a failure
        # into a result-shaped dict so one bad request cannot sink the batch.
        async with limiter:
            try:
                return await get_crf_match_from_openai(client, crf_name, descriptions)
            except Exception as exc:
                print(f"Request failed for CRF {crf_name!r}: {exc!r}")
                return {
                    "Matched CRF": "Error",
                    "Confidence": "No Confidence Score",
                    "Response": f"Error: {exc!r}",
                }

    crf_names = crf_df['CRF Name'].tolist()
    descriptions_list = crf_df['Descriptions'].tolist()

    # gather() returns results in the order the awaitables were passed in,
    # so responses line up with crf_names
    responses = await asyncio.gather(
        *(_match_one(name, text) for name, text in zip(crf_names, descriptions_list))
    )

    # Format the results
    results = [
        {
            "CRF Name": name,
            "Matched CRF": response.get("Matched CRF", "No Match"),
            "Confidence": response.get("Confidence", "No Confidence"),
            "Full Response": response.get("Response"),
        }
        for name, response in zip(crf_names, responses)
    ]

    # Explicit columns keep the output schema stable for empty input
    return pd.DataFrame(
        results,
        columns=["CRF Name", "Matched CRF", "Confidence", "Full Response"],
    )

# Run the analysis and save to Excel
output_file = config['Files']['output_file']

# Run analyze_crfs with await and save results
results_df = await analyze_crfs(client, grouped_crf_df)
results_df.to_excel(output_file, index=False)
print(f"Results saved to {output_file}")
