In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import json
from openai import OpenAI
import csv
import re
from random import randint
import warnings
import openai
import yaml
warnings.filterwarnings('ignore')

In [2]:
def load_config(yaml_path="P4-config.yaml"):
    """Read a YAML file and return its parsed contents.

    Parameters
    ----------
    yaml_path : str
        Path of the YAML file to read; defaults to "P4-config.yaml".

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the file. The rest of the
        notebook indexes the result like a nested dict.
    """
    with open(yaml_path, mode="r", encoding="utf-8") as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed


# Notebook-wide settings, loaded once from the default config file.
config = load_config()


In [5]:
# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# OpenAI client shared by the cells below. Passing the key explicitly is
# optional: OPENAI_API_KEY is also what the client falls back to by default.
client_gpt = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [6]:
# MODEL CHOICE
# Selects which model's config sections are used below (inputs, prompts, outputs).
model = "llama" # either "biomistral" or "llama"

In [13]:
# Resolve the config sections that belong to the selected model.
# "biomistral" and "llama" are the only model names this notebook handles;
# fail right here with a clear message instead of a NameError further down.
if model not in ("biomistral", "llama"):
    raise ValueError(
        f"Unsupported model {model!r}; expected 'biomistral' or 'llama'"
    )

# Top-level config keys are suffixed with the model name.
input_file_path = f"input_file_paths_{model}"
prompts_dir = f"prompts_dir_{model}"
output_dir = f"output_file_paths_{model}"

# Folders holding the ChatGPT judge prompts, one per evaluation set.
chatgpt_prompts_dir = config[prompts_dir]["chatgpt_prompt_dir"]
basic_prompt_path = os.path.join(chatgpt_prompts_dir, "basic_realworld_test")
onco_prompt_path = os.path.join(chatgpt_prompts_dir, "onco_realworld_test")
reviewer_prompt_path = os.path.join(chatgpt_prompts_dir, "reviewer_realworld_test")


In [9]:

# Every evaluation set stores its judge prompts under the same file name.
PROMPTS_FILENAME = "pdsqi_input_to_llm_as_a_judge_zero_shot.csv"

# Build the paths with os.path.join, matching how the prompt folders
# themselves were assembled, instead of concatenating a hard-coded '/'.
basic_filepath = os.path.join(basic_prompt_path, PROMPTS_FILENAME)
onco_filepath = os.path.join(onco_prompt_path, PROMPTS_FILENAME)
reviewer_filepath = os.path.join(reviewer_prompt_path, PROMPTS_FILENAME)

# Prompt tables; the judging loop reads their "record_id" and "prompt" columns.
df_basic_prompts = pd.read_csv(basic_filepath)
df_onco_prompts = pd.read_csv(onco_filepath)
df_reviewer_prompts = pd.read_csv(reviewer_filepath)

    


In [10]:
# Preview the first rows of the basic prompt table.
df_basic_prompts.head()

Unnamed: 0,record_id,prompt
0,0,Here is your new role and persona:\n Yo...
1,1,Here is your new role and persona:\n Yo...
2,2,Here is your new role and persona:\n Yo...
3,3,Here is your new role and persona:\n Yo...
4,4,Here is your new role and persona:\n Yo...


In [14]:

# Input tables for the selected model, one CSV per evaluation set.
_input_paths = config[input_file_path]
df_basic = pd.read_csv(_input_paths["basic_realworld_test"])
df_onco = pd.read_csv(_input_paths["onco_realworld_test"])
df_reviewer = pd.read_csv(_input_paths["reviewer_realworld_test"])

# add index columns
def add_index_column(df):
    """Return a copy of ``df`` whose index has become a ``record_id`` column.

    The caller's frame is left untouched, so re-running a cell that calls
    this function on the same frame gives the same result. The index is
    renamed before being reset, so the new column is called ``record_id``
    even when the incoming index already carries a name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose current index should be exposed as a column.

    Returns
    -------
    pandas.DataFrame
        New frame with ``record_id`` as its first column and a fresh
        default integer index.

    Raises
    ------
    ValueError
        Raised by pandas if ``df`` already has a ``record_id`` column,
        rather than silently producing two columns with the same name.
    """
    return df.rename_axis("record_id").reset_index()

# Give each of the three input tables an explicit record_id column.
df_basic, df_onco, df_reviewer = map(
    add_index_column,
    (df_basic, df_onco, df_reviewer),
)

In [15]:
# Preview the basic input table after the record_id column was added.
df_basic.head()

Unnamed: 0,record_id,Drug,Receptor,PDB_ID,generated_report,prompt_tier
0,0,Metformin,Acetyl-CoA carboxylase 2,"3FF6,3TDC,2X24,3JRX,3JRW,2HJW,4HQ6,5KKN,3GLK,3...",")<Output>\n{{\n ""Mechanism"": ""<Metformin targ...",basic
1,1,Pioglitazone,Peroxisome proliferator-activated receptor gamma,"3E00,3DZY,3DZU,7QB1,6L89,6K0T,6AD9,5HZC,5F9B,5...",")<Output format>\n{{\n ""Mechanism"": ""<Pioglit...",basic
2,2,Alogliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Aloglip...",basic
3,3,Linagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Linagli...",basic
4,4,Sitagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Sitagli...",basic


In [16]:
def create_scorefile(df, name):
    """Build an empty score sheet for one evaluation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``record_id``, ``Drug`` and
        ``generated_report``.
    name : str
        Label of the evaluation set. It does not influence the result and
        is kept only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with ``record_id``, ``Drug``, ``Summary``
        (copied from ``generated_report``) and the four rating columns
        ``accurate``, ``organized``, ``comprehensible`` and ``succinct``,
        all initialised to ``None``.

    Raises
    ------
    KeyError
        If ``df`` lacks one of the required columns.
    """
    # The rating columns start empty; they are filled in later from the
    # judge's replies.
    return pd.DataFrame({
        "record_id": df["record_id"],
        "Drug": df["Drug"],
        "Summary": df["generated_report"],
        "accurate": None,
        "organized": None,
        "comprehensible": None,
        "succinct": None,
    })

# One empty score sheet per evaluation set.
basic_scorefile = create_scorefile(df=df_basic, name="basic")
onco_scorefile = create_scorefile(df=df_onco, name="onco")
reviewer_scorefile = create_scorefile(df=df_reviewer, name="reviewer")

# Quick look at the basic sheet before any ratings are filled in.
basic_scorefile.head()
    

Unnamed: 0,record_id,Drug,Summary,accurate,organized,comprehensible,succinct
0,0,Metformin,")<Output>\n{{\n ""Mechanism"": ""<Metformin targ...",,,,
1,1,Pioglitazone,")<Output format>\n{{\n ""Mechanism"": ""<Pioglit...",,,,
2,2,Alogliptin,")<Output format>\n{{\n ""Mechanism"": ""<Aloglip...",,,,
3,3,Linagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Linagli...",,,,
4,4,Sitagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Sitagli...",,,,


In [17]:
def call_o4mini_chat(
    prompt: str,
    model_name: str = "o4-mini",
    max_completion_tokens: int = 6000,
) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    The prompt is expected to already contain all instructions (it ends
    with ``OUTPUT:``), so no system message is added.

    Parameters
    ----------
    prompt : str
        Full prompt text to send.
    model_name : str
        Chat model to call; defaults to "o4-mini".
    max_completion_tokens : int
        Upper bound passed to the API for the completion; defaults to 6000.

    Returns
    -------
    str
        Text content of the first choice. If the API returns no content
        (``None``), an empty string is returned so the result is always a
        ``str``.

    Raises
    ------
    ValueError
        If ``prompt`` is ``None``, NaN, or an empty / whitespace-only
        string. Raised before any request is made.
    """
    is_missing = prompt is None or (isinstance(prompt, float) and pd.isna(prompt))
    # An empty or whitespace-only string carries nothing to judge either.
    is_blank = isinstance(prompt, str) and not prompt.strip()
    if is_missing or is_blank:
        raise ValueError("Prompt is empty or NaN")

    resp = client_gpt.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": prompt}
        ],
        max_completion_tokens=max_completion_tokens,
    )

    # message.content is optional in the API response; normalise None to "".
    content = resp.choices[0].message.content
    return content if content is not None else ""

In [22]:
# Rating columns copied from the judge's reply into the score sheet.
_SCORE_COLUMNS = ("accurate", "organized", "comprehensible", "succinct")


def _parse_scores(raw):
    """Decode the judge's reply into a dict; raise ValueError if impossible."""
    if not isinstance(raw, str):
        raise ValueError(f"expected a string reply, got {type(raw).__name__}")

    text = raw.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError as first_error:
        # Replies are sometimes wrapped in a Markdown code fence or padded
        # with extra text; retry on the outermost {...} span.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise first_error
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(f"expected a JSON object, got {type(parsed).__name__}")
    return parsed


def extract_rating(df, scoreFile):
    """Ask the judge model to rate every prompt and collect the scores.

    For each row of ``df`` the prompt is sent through ``call_o4mini_chat``,
    the reply is parsed as JSON, and any of the keys ``accurate``,
    ``organized``, ``comprehensible`` and ``succinct`` present in it are
    written to the matching ``record_id`` row of the score sheet. A row
    that fails at any stage is logged and skipped; the loop never aborts.

    Parameters
    ----------
    df : pandas.DataFrame
        Prompt table with the columns ``record_id`` and ``prompt``.
    scoreFile : pandas.DataFrame
        Score sheet with a ``record_id`` column. It is not modified; an
        updated copy is returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(updated_scores, raw_df, errors_df)``:

        * ``updated_scores`` - ``scoreFile`` with the parsed ratings filled in.
        * ``raw_df`` - columns ``record_id`` and ``raw_output``, one row per
          reply received.
        * ``errors_df`` - columns ``index``, ``record_id``, ``error`` and
          ``raw_output``, one row per failed prompt. ``raw_output`` is empty
          when the failure happened before a reply was received.

        ``raw_df`` and ``errors_df`` keep these columns even when they have
        no rows, so the CSV files written from them always have a header.

    Raises
    ------
    ValueError
        If ``scoreFile`` has no ``record_id`` column.
    """
    if "record_id" not in scoreFile.columns:
        raise ValueError("scoreFile is missing 'record_id' column")

    raw_outputs = []
    errors = []

    # Index on record_id so each update is a direct label lookup.
    scoreFile_indexed = scoreFile.set_index("record_id")

    for i, row in tqdm(df.iterrows(), total=len(df)):
        record_id = row["record_id"]
        prompt = row["prompt"]

        # Skip empty / NaN prompts without calling the model.
        if prompt is None or (isinstance(prompt, float) and pd.isna(prompt)):
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": "Empty or NaN prompt",
            })
            continue

        # Only the request itself is reported as an API failure.
        try:
            raw = call_o4mini_chat(prompt)
        except Exception as e:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"API call failed: {e}",
            })
            continue

        raw_outputs.append({"record_id": record_id, "raw_output": raw})

        try:
            scores = _parse_scores(raw)
        except ValueError as je:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"JSON parse failed: {je}",
                "raw_output": raw,
            })
            continue

        # Assigning through .loc with an unknown label would silently append
        # a new row to the score sheet, so report it instead.
        if record_id not in scoreFile_indexed.index:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": "record_id not found in scoreFile",
                "raw_output": raw,
            })
            continue

        try:
            for col in _SCORE_COLUMNS:
                if col in scores:
                    scoreFile_indexed.loc[record_id, col] = scores[col]
        except Exception as e:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"Score update failed: {e}",
                "raw_output": raw,
            })

    updated_scores = scoreFile_indexed.reset_index()
    raw_df = pd.DataFrame(raw_outputs, columns=["record_id", "raw_output"])
    errors_df = pd.DataFrame(
        errors, columns=["index", "record_id", "error", "raw_output"]
    )

    return updated_scores, raw_df, errors_df


In [23]:
# Rate each evaluation set. Every call returns the updated score sheet,
# the raw replies, and a log of the rows that failed.
basic_scores, basic_raw, basic_errors = extract_rating(
    df=df_basic_prompts, scoreFile=basic_scorefile
)
onco_scores, onco_raw, onco_errors = extract_rating(
    df=df_onco_prompts, scoreFile=onco_scorefile
)
reviewer_scores, reviewer_raw, reviewer_errors = extract_rating(
    df=df_reviewer_prompts, scoreFile=reviewer_scorefile
)


100%|██████████| 29/29 [02:24<00:00,  4.98s/it]
100%|██████████| 29/29 [02:20<00:00,  4.85s/it]
100%|██████████| 29/29 [02:40<00:00,  5.54s/it]


In [24]:
# Root folder for the ratings, plus one sub-folder per evaluation set.
output_path = config[output_dir]["chatgpt_ratings_output_dir"]
basic_output_path = os.path.join(output_path, "basic_realworld_test")
onco_output_path = os.path.join(output_path, "onco_realworld_test")
reviewer_output_path = os.path.join(output_path, "reviewer_realworld_test")

for _directory in (
    output_path,
    basic_output_path,
    onco_output_path,
    reviewer_output_path,
):
    os.makedirs(_directory, exist_ok=True)

# (frame, destination folder, file name) for every CSV to write:
# ratings, raw outputs and errors for each of the three evaluation sets.
_exports = [
    (basic_scores, basic_output_path, "chatgpt_ratings_basic_realworld_test.csv"),
    (basic_raw, basic_output_path, "chatgpt_raw_outputs_basic_realworld_test.csv"),
    (basic_errors, basic_output_path, "chatgpt_errors_basic_realworld_test.csv"),
    (onco_scores, onco_output_path, "chatgpt_ratings_onco_realworld_test.csv"),
    (onco_raw, onco_output_path, "chatgpt_raw_outputs_onco_realworld_test.csv"),
    (onco_errors, onco_output_path, "chatgpt_errors_onco_realworld_test.csv"),
    (reviewer_scores, reviewer_output_path, "chatgpt_ratings_reviewer_realworld_test.csv"),
    (reviewer_raw, reviewer_output_path, "chatgpt_raw_outputs_reviewer_realworld_test.csv"),
    (reviewer_errors, reviewer_output_path, "chatgpt_errors_reviewer_realworld_test.csv"),
]

for _frame, _folder, _filename in _exports:
    _frame.to_csv(os.path.join(_folder, _filename), index=False)