In [239]:
import openai
import asyncio
import os
import pandas as pd  # For data handling, like reading from Excel
from openai import AsyncOpenAI  # Asynchronous client from the new OpenAI SDK
from rapidfuzz import fuzz
import json
import configparser  # For reading configuration files
import re

import nest_asyncio
nest_asyncio.apply()

In [240]:
# Load configuration file
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing or misplaced config
# fails here with a clear message instead of as a KeyError on the first lookup.
if not config.read('config_prestep.ini'):
    raise FileNotFoundError(
        "Could not read 'config_prestep.ini' (looked in " + os.getcwd() + ")"
    )

# Retrieve file paths and column names from config
input_file = config['Files']['input_file']                    # workbook holding the data dictionary
input_worksheet = config['Files']['input_worksheet']          # sheet name passed to pd.read_excel
crf_column = config['Columns']['crf_column']                  # column with the original CRF / form name
variable_column = config['Columns']['variable_column']        # column with the variable name
description_column = config['Columns']['description_column']  # column with the variable description


In [241]:
# Set OpenAI API key
# The key is read from the [OpenAI] section of the config file loaded above,
# so it is never hard-coded in the notebook itself.
openai_api_key = config['OpenAI']['api_key']
client = AsyncOpenAI(api_key=openai_api_key)

# Retrieve OpenAI settings from the config file
# NOTE(review): assistant_id, healmatch_id, harmonizer_id and
# form_harmonizer_prompt are not referenced by any cell visible in this
# notebook (the harmonizer step reads config["Instructions"]["form_harmonizer"]
# directly) — confirm whether they are still needed.
assistant_id = config['OpenAI']['assistant_id']
# Instruction text prepended to every prestep prompt.
crf_id = config['Instructions']['crf_id_prestep']
healmatch_id = config['OpenAI']['healmatch_id']
# System prompt used for the HEAL core CRF matching step.
matching_instruction = config['Instructions']['matching_instruction']
harmonizer_id = config['OpenAI']['harmonizer_id']
form_harmonizer_prompt = config['Instructions']['form_harmonizer']

In [242]:
# Read the configured worksheet and keep only the three columns the pipeline
# works with: CRF name, variable name and description.
data_dict_df = pd.read_excel(
    input_file,
    sheet_name=input_worksheet,
)[[crf_column, variable_column, description_column]]

# Preview the first few rows so the reader can sanity-check the load.
print("Loaded Data Dictionary:", data_dict_df.head(), sep="\n")

# One plain dict per row of the data dictionary.
records = data_dict_df.to_dict('records')

# Sample payload built from the first row, keyed by the names used in prompts.
first_row = records[0]
payload = {
    field: first_row[column]
    for field, column in (
        ("crf_name", crf_column),
        ("variable_name", variable_column),
        ("description", description_column),
    )
}

Loaded Data Dictionary:
         section              name  \
0  screening_log         record_id   
1  screening_log         subjectid   
2  screening_log        oboe_group   
3  screening_log  study_identifier   
4  screening_log       sc_birthdat   

                                         description  
0                                          Record ID  
1                                  OBOE Study Number  
2          What group was the subject enrolled into?  
3  Study Identifier (hidden, raw value from oboe ...  
4                                      Date of Birth  


In [243]:
def parse_llm_json(full, crf_name):
    """
    Parse the prestep model reply into ``(refined_name, rationale, full)``.

    Parameters
    ----------
    full : str
        Raw text returned by the model; expected to be a JSON object with
        ``crf_name`` and ``rationale`` keys.
    crf_name : Any
        Original CRF name, returned unchanged whenever no usable refined
        name can be extracted.

    Returns
    -------
    tuple
        ``(refined_name, rationale, full)``. ``full`` is always the untouched
        input so downstream steps can reuse the raw reply.

    Never raises for malformed replies: invalid JSON, a non-object JSON value,
    or a missing/null/non-string/blank ``crf_name`` all fall back to the
    original ``crf_name``.
    """
    text = full.strip() if isinstance(full, str) else full

    # Models sometimes wrap the JSON in a markdown code fence despite being
    # told not to; unwrap it so a valid payload is not discarded.
    if isinstance(text, str):
        fenced = re.match(r"```(?:json)?\s*(.*?)\s*```$", text, flags=re.DOTALL | re.IGNORECASE)
        if fenced:
            text = fenced.group(1)

    try:
        data = json.loads(text)
    except (json.JSONDecodeError, TypeError):
        print("\n[ERROR] JSON decode error. Full response was:\n", full)
        return crf_name, "", full

    # json.loads also accepts arrays, strings, numbers...; only objects have keys.
    if not isinstance(data, dict):
        print("\n[ERROR] Expected a JSON object. Full response was:\n", full)
        return crf_name, "", full

    refined = data.get("crf_name")
    refined = refined.strip() if isinstance(refined, str) else ""

    rationale = data.get("rationale")
    rationale = "" if rationale is None else str(rationale).strip()

    # A blank or unusable refined name is worse than keeping the original one.
    return (refined or crf_name), rationale, full

def normalize_name(name):
    """
    Normalize a CRF name for fuzzy comparison.

    Lowercases the name, drops generic whole words ("form", "log",
    "assessment", "information", "status", and the simple plurals of the
    first three), removes punctuation and collapses whitespace.

    Generic words are removed only when they stand alone, i.e. are not
    surrounded by letters or digits. Without that guard the pattern also
    matches inside real words, e.g. "psychological" would become
    "psychoical" and "performance" would become "perance". An underscore
    counts as a separator, so "screening_log" still normalizes to "screening".
    """
    name = name.lower()
    name = re.sub(
        r'(?<![a-z0-9])(?:forms?|logs?|assessments?|information|status)(?![a-z0-9])',
        '',
        name,
    )  # Remove generic words (whole words only)
    name = re.sub(r'[^a-z0-9\s]', '', name)  # Remove punctuation
    name = re.sub(r'\s+', ' ', name)         # Collapse whitespace
    return name.strip()

def auto_cluster_names(names, threshold=70):
    """
    Cluster similar names using fuzzy matching and pick one canonical name per cluster.

    A name joins the first existing cluster containing any member whose
    normalized form scores ``fuzz.ratio >= threshold`` against it; otherwise
    it starts a new cluster. The canonical name of a cluster is its most
    frequent member in ``names`` (the earliest member wins ties).

    Parameters
    ----------
    names : iterable of str
        Names to cluster; duplicates are allowed and count towards frequency.
    threshold : int, default 70
        Minimum ``fuzz.ratio`` score for two names to share a cluster.

    Returns
    -------
    dict
        ``{original_name: canonical_name}``; empty when ``names`` is empty.
    """
    names = list(names)
    name_counts = pd.Series(names, dtype=object).value_counts().to_dict()
    # Cluster each distinct name once, preserving first-seen order.
    distinct_names = list(dict.fromkeys(names))
    normalized_names = {name: normalize_name(name) for name in distinct_names}

    clusters = []
    for name in distinct_names:
        for cluster in clusters:
            if any(fuzz.ratio(normalized_names[name], normalized_names[c]) >= threshold for c in cluster):
                cluster.append(name)
                break
        else:
            # No existing cluster was similar enough.
            clusters.append([name])

    mapping = {}
    for cluster in clusters:
        # Pick the member itself. Series.idxmax() would return the positional
        # index label (0, 1, ...) rather than the name, mapping every name to
        # an integer. max() keeps the first member on ties.
        canonical = max(cluster, key=lambda member: name_counts.get(member, 0))
        for name in cluster:
            mapping[name] = canonical
    return mapping

In [244]:
# Helper set 1 API call
async def refine_crf_name_with_variables(client, variable_names, crf_name, descriptions):
    """
    Ask the chat model for a concise, unique CRF name.

    The prompt combines the configured prestep instruction (`crf_id`) with the
    variable names, the original form name and the descriptions, and asks for
    a JSON reply with `crf_name` and `rationale`.

    Returns whatever `parse_llm_json` produces for the reply: the refined
    name, the rationale and the raw reply text.
    """
    prompt_parts = [
        f"{crf_id}\n\n",
        "Please respond in JSON with keys ",
        "`crf_name` and `rationale` only. Do not wrap in markdown.\n\n",
        f"Variable names: {variable_names}\n",
        f"Original form name: {crf_name}\n",
        f"Descriptions: {descriptions}",
    ]
    user_message = {"role": "user", "content": "".join(prompt_parts)}

    completion = await client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[user_message],
        temperature=0.5,
    )
    raw_reply = completion.choices[0].message.content.strip()
    print("\n--- Full Prestep Response ---\n", raw_reply, "\n--- End ---\n")

    # Turn the JSON reply into (refined name, rationale, raw reply).
    return parse_llm_json(raw_reply, crf_name)

async def refine_with_retry(client, var, crf, desc, tries=5):
    """
    Call `refine_crf_name_with_variables` with exponential backoff.

    Parameters
    ----------
    client : AsyncOpenAI client forwarded to the prestep call.
    var, crf, desc : variable name, original CRF name and description for one row.
    tries : int, default 5
        Maximum number of attempts. The wait between attempts starts at 1s
        and doubles each time.

    Returns
    -------
    tuple
        The prestep result on success, otherwise the fallback
        ``(crf, "", "")``. A 3-tuple is always returned, even when
        ``tries < 1``, because the caller unpacks results with ``zip(*results)``.
    """
    fallback = (crf, "", "")
    backoff = 1
    for attempt in range(1, tries + 1):
        try:
            return await refine_crf_name_with_variables(client, var, crf, desc)
        except Exception as e:
            msg = str(e).lower()
            # Prefer the structured fields the SDK attaches to HTTP errors;
            # keep the message sniffing as a fallback for other error types.
            is_rate = (
                getattr(e, "status_code", None) == 429
                or getattr(e, "code", None) == "rate_limit_exceeded"
                or ("rate limit" in msg)
                or ("429" in msg)
            )

            if is_rate:
                print(f"[rate limit] prestep attempt {attempt}, sleeping {backoff}s")
            else:
                print(f"[warning] prestep failed attempt {attempt}: {e}")

            if attempt == tries:
                return fallback

            await asyncio.sleep(backoff)
            backoff *= 2

    # Only reached when tries < 1 and no attempt was made.
    return fallback

# Run the prestep for every row, dispatching the rows of each chunk concurrently
async def run_prestep(client, df, chunk_size=50):
    """
    Refine the CRF name of every row in `df`, one chunk of rows at a time.

    All rows of a chunk are sent concurrently through `refine_with_retry`,
    followed by a one-second pause before the next chunk. The results are
    written to three new columns on `df` itself ("Refined CRF Name",
    "Rationale", "Full Response"), and that same frame is returned.
    """
    refined_names, rationales, raw_replies = [], [], []

    total_rows = len(df)
    for offset in range(0, total_rows, chunk_size):
        window = df.iloc[offset:offset + chunk_size]

        pending = []
        for _, record in window.iterrows():
            pending.append(
                refine_with_retry(
                    client,
                    record[variable_column],
                    record[crf_column],
                    record[description_column],
                )
            )

        outcomes = await asyncio.gather(*pending)
        batch_names, batch_rationales, batch_raw = zip(*outcomes)
        refined_names.extend(batch_names)
        rationales.extend(batch_rationales)
        raw_replies.extend(batch_raw)

        # Brief pause between chunks to smooth out the request rate.
        await asyncio.sleep(1)

    df["Refined CRF Name"] = refined_names
    df["Rationale"] = rationales
    df["Full Response"] = raw_replies
    return df

async def main():
    """
    Prestep-only driver: refine the CRF names in the module-level
    `data_dict_df` and print a short preview.

    NOTE(review): a later cell in this notebook defines `main` again, so on a
    top-to-bottom run this definition is replaced by that one.
    """
    # Step 1: refine each CRF name (run_prestep iterates over the rows itself).
    prestep_result = await run_prestep(client, data_dict_df)

    # The result now carries a "Refined CRF Name" column; show it beside the original.
    print("After prestep, here’s a sample:")
    preview_columns = [crf_column, "Refined CRF Name"]
    print(prestep_result[preview_columns].head())


In [245]:
# 1) Define the function schema
# Function-calling schema handed to the chat model: a single `mapping` object
# whose keys are original form names and whose values are harmonized labels.
harmonize_function = dict(
    name="harmonize_crf_names",
    description="Map each original CRF name to a single harmonized label",
    parameters=dict(
        type="object",
        properties=dict(
            mapping=dict(
                type="object",
                description="Keys are the original form names; values are the harmonized labels",
                additionalProperties=dict(type="string"),
            ),
        ),
        required=["mapping"],
    ),
)

# 2) Harmonizer helper with debug prints and fallbacks
def batcher(seq, size=20):
    """Lazily split `seq` into consecutive slices of at most `size` items."""
    yield from (seq[start:start + size] for start in range(0, len(seq), size))

async def _request_harmonizer_message(client, batch, tries=5):
    """Send one batch to the harmonizer model; return its reply message, or None after `tries` failed attempts."""
    # Build the request once, outside the retry loop, so configuration or
    # serialization errors surface immediately instead of being retried.
    system_prompt = config["Instructions"]["form_harmonizer"]
    user_payload = json.dumps(batch)

    attempts = max(tries, 1)
    backoff = 1
    for attempt in range(1, attempts + 1):
        try:
            # NOTE(review): `functions` / `function_call` are the legacy
            # function-calling parameters of the Chat Completions API; consider
            # migrating to `tools` / `tool_choice`.
            response = await client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_payload}
                ],
                functions=[harmonize_function],
                function_call={"name": "harmonize_crf_names"},
                temperature=0
            )
            return response.choices[0].message
        except Exception as e:
            print(f"[Harmonizer] request failed attempt {attempt}: {e}")
            if attempt == attempts:
                return None
            await asyncio.sleep(backoff)
            backoff *= 2
    return None


def _parse_harmonizer_mapping(choice):
    """Extract the original -> harmonized dict from a model message; return {} when absent or malformed."""
    function_call = getattr(choice, "function_call", None)
    if not function_call:
        print("[Harmonizer] No function_call detected")
        return {}

    print(f"[Harmonizer] Function called: {function_call.name}")
    raw_args = function_call.arguments
    print("[Harmonizer] Raw function_call.arguments:", raw_args)
    try:
        args = json.loads(raw_args)
    except (json.JSONDecodeError, TypeError) as e:
        print("[Harmonizer] JSON decode error:", e)
        return {}

    # Only a JSON object can act as a mapping; a list or scalar would crash
    # later on `.items()`.
    if not isinstance(args, dict):
        print("[Harmonizer] Arguments are not a JSON object; ignoring.")
        return {}

    # Accept both {"mapping": {...}} and a bare {original: harmonized} object.
    nested = args.get("mapping")
    return nested if isinstance(nested, dict) else args


async def harmonize_crf_names_step(client, refined_df, batch_size=20, tries=5):
    """
    Harmonize the refined CRF names with the chat model, then fuzzy-cluster them.

    Parameters
    ----------
    client : AsyncOpenAI client used for the harmonizer requests.
    refined_df : DataFrame with "Refined CRF Name" and "Rationale" columns.
        It is modified in place and also returned.
    batch_size : int, default 20
        Number of unique (name, rationale) entries sent per request.
    tries : int, default 5
        Attempts per batch, with exponential backoff. A batch that still
        fails falls back to the identity mapping, so one transient API error
        does not abort the whole pipeline.

    Returns
    -------
    DataFrame
        `refined_df` with "Canonical CRF Name" (model mapping; names the model
        did not map keep their refined name) and "Final Canonical CRF Name"
        (result of `auto_cluster_names` over the canonical names). Both
        columns are present on every return path.
    """
    # Build and dedupe the payload
    seen = set()
    unique_entries = []
    for orig, rat in zip(refined_df["Refined CRF Name"], refined_df["Rationale"]):
        key = (orig, rat)
        if key not in seen:
            seen.add(key)
            unique_entries.append({"original": orig, "rationale": rat})

    if not unique_entries:
        refined_df["Canonical CRF Name"] = refined_df["Refined CRF Name"]
        # Keep the output schema identical to the normal path.
        refined_df["Final Canonical CRF Name"] = refined_df["Refined CRF Name"]
        print("[Harmonizer] No entries to harmonize, using identity mapping.")
        return refined_df

    combined_mapping = {}

    # Batch and harmonize!
    for batch_num, batch in enumerate(batcher(unique_entries, size=batch_size), start=1):
        print(f"\n[Harmonizer] Sending batch {batch_num} of {len(batch)}:")
        for e in batch:
            print("   ", e)

        choice = await _request_harmonizer_message(client, batch, tries=tries)

        mapping = {}
        if choice is None:
            print("[Harmonizer] Request failed after retries for this batch.")
        else:
            print("\n[Harmonizer] Raw model message:")
            print(choice)
            mapping = _parse_harmonizer_mapping(choice)

        # Fallback to identity if mapping is empty for this batch
        if not mapping:
            print("[Harmonizer] Empty mapping for this batch; defaulting to identity.")
            mapping = {e['original']: e['original'] for e in batch}

        print("\n[Harmonizer] Parsed mapping (original → harmonized):")
        for orig, canon in mapping.items():
            print(f"    '{orig}' -> '{canon}'")

        # Combine batch mapping into all mappings (later batches win on conflicts)
        combined_mapping.update(mapping)

    # Apply the combined mapping
    refined_df["Canonical CRF Name"] = refined_df["Refined CRF Name"].map(lambda x: combined_mapping.get(x, x))

    # The model occasionally returns a list of labels; flatten it to one string.
    for col in ["Refined CRF Name", "Canonical CRF Name"]:
        refined_df[col] = refined_df[col].apply(lambda x: ", ".join(x) if isinstance(x, list) else x)

    print("\n[Harmonizer] Final Canonical CRF Name results:")
    print(
        refined_df[["Refined CRF Name", "Canonical CRF Name"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # Auto-cluster for final canonical name
    print("\n[Auto-Cluster] Clustering Canonical CRF Names for final deduplication...")

    # Missing names cannot be normalized; drop them here so they simply stay
    # missing in the final column after .map().
    unique_canonicals = refined_df["Canonical CRF Name"].dropna().unique()
    auto_map = auto_cluster_names(unique_canonicals, threshold=70)

    refined_df["Final Canonical CRF Name"] = refined_df["Canonical CRF Name"].map(auto_map)

    print("\n[Auto-Cluster] Clustered Canonical CRF Names:")
    print(refined_df[["Canonical CRF Name", "Final Canonical CRF Name"]].drop_duplicates().reset_index(drop=True))

    return refined_df

In [246]:
# Helper set 3 API call
async def match_heal_core_crf(client, full_prestep_response):
    """
    Ask the chat model which HEAL core CRF matches one prestep output.

    Parameters
    ----------
    client : AsyncOpenAI client.
    full_prestep_response : str
        Raw prestep reply for one row, embedded verbatim in the user message.

    Returns
    -------
    tuple of str
        ``(match, confidence, rationale)``. Missing or null fields default to
        "No CRF match", "Low Confidence" and "" respectively; non-string
        values (for example a numeric confidence) are converted with `str()`.

    Raises
    ------
    ValueError
        If the reply has no text content or is not a JSON object
        (`json.JSONDecodeError`, raised for invalid JSON, is a subclass).
        These propagate so that `match_with_retry` can retry the request.
    """
    # build a user message that includes the JSON‐only instruction
    user_content = (
        f"Prestep output:\n{full_prestep_response}\n\n"
        "Please respond in strict JSON with keys "
        "\"heal_core_crf\", \"confidence\", and \"rationale\". "
        "Do not wrap in markdown or add any extra fields."
    )

    resp = await client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {"role": "system", "content": matching_instruction},
            {"role":   "user", "content": user_content}
        ],
        temperature=0.3
    )
    content = resp.choices[0].message.content
    if content is None:
        # The SDK types `content` as optional; fail with a readable message.
        raise ValueError("HEAL-match reply contained no text content")
    full = content.strip()
    print("\n--- HEAL-Match Response ---\n", full, "\n--- End ---\n")

    # Unwrap a markdown code fence if the model added one anyway.
    fenced = re.match(r"```(?:json)?\s*(.*?)\s*```$", full, flags=re.DOTALL | re.IGNORECASE)
    data = json.loads(fenced.group(1) if fenced else full)
    if not isinstance(data, dict):
        raise ValueError("HEAL-match reply was not a JSON object")

    def _field(key, default):
        # A null or missing field gets the default; anything else becomes text,
        # so a numeric confidence no longer crashes on .strip().
        value = data.get(key)
        return default if value is None else str(value).strip()

    match     = _field("heal_core_crf", "No CRF match")
    conf      = _field("confidence",    "Low Confidence")
    rationale = _field("rationale",     "")
    return match, conf, rationale

async def match_with_retry(client, full_response, tries=5):
    """
    Call `match_heal_core_crf` with exponential backoff.

    Parameters
    ----------
    client : AsyncOpenAI client forwarded to the matching call.
    full_response : str
        Raw prestep reply for one row.
    tries : int, default 5
        Maximum number of attempts. The wait between attempts starts at 1s
        and doubles each time.

    Returns
    -------
    tuple of str
        The match result on success, otherwise the fallback
        ``("No CRF match", "Low Confidence", "")``. A 3-tuple is always
        returned, even when ``tries < 1``, because the caller unpacks results
        with ``zip(*results)``.
    """
    fallback = ("No CRF match", "Low Confidence", "")
    backoff = 1
    for attempt in range(1, tries + 1):
        try:
            return await match_heal_core_crf(client, full_response)
        except Exception as e:
            msg = str(e).lower()
            # Prefer the structured fields the SDK attaches to HTTP errors;
            # keep the message sniffing as a fallback for other error types.
            is_rate = (
                getattr(e, "status_code", None) == 429
                or getattr(e, "code", None) == "rate_limit_exceeded"
                or "rate limit" in msg
                or "429" in msg
            )

            if is_rate:
                print(f"[rate limit] match attempt {attempt}, sleeping {backoff}s")
            else:
                print(f"[warning] match failed attempt {attempt}: {e}")

            if attempt == tries:
                return fallback

            await asyncio.sleep(backoff)
            backoff *= 2

    # Only reached when tries < 1 and no attempt was made.
    return fallback

# Loop over prestep outputs
async def run_heal_match(client, df, chunk_size=50):
    """
    Match every row's prestep output against the HEAL core CRFs, chunk by chunk.

    Each chunk's "Full Response" values are sent concurrently through
    `match_with_retry`, followed by a one-second pause. The results are
    written to three new columns on `df` itself ("HEAL Core CRF Match",
    "Confidence Level", "Match Rationale"), and that same frame is returned.
    """
    matched, confidences, match_rationales = [], [], []

    row_count = len(df)
    for offset in range(0, row_count, chunk_size):
        window = df.iloc[offset:offset + chunk_size]

        pending = []
        for _, record in window.iterrows():
            pending.append(match_with_retry(client, record["Full Response"]))

        outcomes = await asyncio.gather(*pending)
        batch_matches, batch_confs, batch_rationales = zip(*outcomes)
        matched.extend(batch_matches)
        confidences.extend(batch_confs)
        match_rationales.extend(batch_rationales)

        # Brief pause between chunks.
        await asyncio.sleep(1)

    df["HEAL Core CRF Match"] = matched
    df["Confidence Level"] = confidences
    df["Match Rationale"] = match_rationales
    return df

In [247]:
# Orchestrator
async def main():
    """
    Run the whole pipeline and write the results to Excel.

    Steps: load the input worksheet, refine the CRF names (prestep),
    harmonize them, match each row against the HEAL core CRFs, then save two
    sheets ("Metadata" and "EnhancedDD") to the configured output file.

    Returns
    -------
    DataFrame
        The enhanced data dictionary that was written to the "EnhancedDD"
        sheet, so the caller's `final_df = asyncio.run(main())` receives the
        data instead of None.
    """
    # Load the full input file (all original columns)
    full_input_df = pd.read_excel(input_file, sheet_name=input_worksheet)

    # Extract just the columns we need for prestep (copy: the steps add columns in place)
    data_dict_df = full_input_df[[crf_column, variable_column, description_column]].copy()

    # Prestep: get Refined CRF Name, Rationale, Full Response
    refined_df = await run_prestep(client, data_dict_df, chunk_size=50)

    # Harmonize the refined names via your new assistant
    refined_df = await harmonize_crf_names_step(client, refined_df, batch_size=20)

    # Merge prestep outputs back into the full DataFrame, using Canonical.
    # The join is index-aligned; refined_df keeps full_input_df's index.
    # NOTE(review): the harmonizer step also produces "Final Canonical CRF Name"
    # (the fuzzy-clustered label), which is not carried into the output here —
    # confirm whether "Canonical CRF Name" is the intended column.
    enhanced_df = full_input_df.join(
        refined_df[["Canonical CRF Name", "Rationale", "Full Response"]]
    )

    # HEAL-Core matching: adds three new columns
    final_df = await run_heal_match(client, enhanced_df, chunk_size=50)

    # --- Harmonize Confidence Level for No CRF match ---
    # Replace any 'Confidence Level' with 'No CRF match' where 'HEAL Core CRF Match' is 'No CRF match'
    mask = final_df["HEAL Core CRF Match"] == "No CRF match"
    final_df.loc[mask, "Confidence Level"] = "No CRF match"

    # Build a *unique* metadata table:
    metadata_df = (
        refined_df
        [[crf_column, "Canonical CRF Name", "Rationale", "Full Response"]]
        .drop_duplicates(subset=[crf_column, "Canonical CRF Name"])
        .reset_index(drop=True)
    )

    # Save to Excel:
    output_file = config["Files"]["output_file"]
    with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
        # Metadata sheet: one row per (section, canonical CRF) combo
        metadata_df.to_excel(writer, sheet_name="Metadata", index=False)
        # EnhancedDD sheet: the full original + all new columns
        final_df.to_excel(writer, sheet_name="EnhancedDD", index=False)

    print(f"Results saved to {output_file} with sheets 'Metadata' and 'EnhancedDD'")

    return final_df

if __name__ == "__main__":
    # asyncio is already imported in the notebook's first cell, so it is not
    # re-imported here. nest_asyncio.apply() (also in the first cell) is what
    # lets asyncio.run() work while the notebook's own event loop is running.
    final_df = asyncio.run(main())



--- Full Prestep Response ---
 {"crf_name":"Eligibility","rationale":"Variable relates to eligibility criteria, matching the Eligibility CRF section A."} 
--- End ---


--- Full Prestep Response ---
 {"crf_name":"Screening Log","rationale":"Variable pertains to subject identification during initial enrollment, matching the Screening Log form used for recording screening details."} 
--- End ---


--- Full Prestep Response ---
 {"crf_name":"Screening Log","rationale":"The variable 'languages' relates to initial participant information typically captured during screening, matching the 'Screening Log' form."} 
--- End ---


--- Full Prestep Response ---
 {"crf_name":"Eligibility","rationale":"Variable 'cietestcd5' is from the eligibility section, indicating it belongs to the Eligibility CRF."} 
--- End ---


--- Full Prestep Response ---
 {"crf_name":"Eligibility Form","rationale":"The variable pertains to mother's race information collected during eligibility assessment, matching the Eli