In [1]:
import os
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import json
from openai import OpenAI
import csv
import re
from random import randint
import warnings
import openai
import yaml
warnings.filterwarnings('ignore')

In [21]:
def load_config(yaml_path="P4-config.yaml"):
    with open(yaml_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
    
config = load_config()

In [2]:
load_dotenv()


client_deepseek = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com"
)



In [5]:
deepseek_prompt_dir = config["prompts_dir"]["deepseek_prompt_dir"]
basic_prompt_path = os.path.join(deepseek_prompt_dir, "basic_realworld_test")
onco_prompt_path = os.path.join(deepseek_prompt_dir, "onco_realworld_test")
reviewer_prompt_path = os.path.join(deepseek_prompt_dir, "reviewer_realworld_test")

basic_filepath = basic_prompt_path +  '/pdsqi_input_to_llm_as_a_judge_zero_shot.csv'
onco_filepath = onco_prompt_path +  '/pdsqi_input_to_llm_as_a_judge_zero_shot.csv'
reviewer_filepath = reviewer_prompt_path +  '/pdsqi_input_to_llm_as_a_judge_zero_shot.csv'

df_basic_prompts = pd.read_csv(basic_filepath)
df_onco_prompts = pd.read_csv(onco_filepath)
df_reviewer_prompts = pd.read_csv(reviewer_filepath)

In [6]:
df_basic_prompts.head()

Unnamed: 0,record_id,prompt
0,0,Here is your new role and persona:\n Yo...
1,1,Here is your new role and persona:\n Yo...
2,2,Here is your new role and persona:\n Yo...
3,3,Here is your new role and persona:\n Yo...
4,4,Here is your new role and persona:\n Yo...


In [7]:
df_basic = pd.read_csv(config["input_file_paths"]["basic_realworld_test"])
df_onco = pd.read_csv(config["input_file_paths"]["onco_realworld_test"])
df_reviewer = pd.read_csv(config["input_file_paths"]["reviewer_realworld_test"])

# add index columns
def add_index_column(df):
    df.reset_index(inplace=True)
    df.rename(columns={"index": "record_id"}, inplace=True)
    return df

df_basic = add_index_column(df_basic)
df_onco = add_index_column(df_onco)
df_reviewer = add_index_column(df_reviewer)

In [8]:
df_basic.head()

Unnamed: 0,record_id,Drug,Receptor,PDB_ID,generated_report,prompt_tier
0,0,Metformin,Acetyl-CoA carboxylase 2,"3FF6,3TDC,2X24,3JRX,3JRW,2HJW,4HQ6,5KKN,3GLK,3...","{\n ""Mechanism"": ""Metformin is an oral anti-h...",basic
1,1,Pioglitazone,Peroxisome proliferator-activated receptor gamma,"3E00,3DZY,3DZU,7QB1,6L89,6K0T,6AD9,5HZC,5F9B,5...","<Output>\n{\n ""Mechanism"": ""Pioglitazone is a...",basic
2,2,Alogliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...","<Output>\n{\n ""Mechanism"": ""Alogliptin is a D...",basic
3,3,Linagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...","<Output>\n{\n ""Mechanism"": ""Linagliptin is a ...",basic
4,4,Sitagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...","{\n ""Mechanism"": ""Sitagliptin is a DPP-4 inhi...",basic


In [9]:
def create_scorefile(df, name):
    scoreFile_name = f"scoreFile_{name}"
    scoreFile_name = pd.DataFrame({
        "record_id": df["record_id"],
        "Drug": df["Drug"],
        "Summary": df["generated_report"],
        "accurate": None,
        "organized": None,
        "comprehensible": None,
        "succinct": None,
    })
    scoreFile_name.head()

    return scoreFile_name

basic_scorefile = create_scorefile(df_basic, "basic")
onco_scorefile = create_scorefile(df_onco, "onco")
reviewer_scorefile = create_scorefile(df_reviewer, "reviewer")

basic_scorefile.head()
    

Unnamed: 0,record_id,Drug,Summary,accurate,organized,comprehensible,succinct
0,0,Metformin,"{\n ""Mechanism"": ""Metformin is an oral anti-h...",,,,
1,1,Pioglitazone,"<Output>\n{\n ""Mechanism"": ""Pioglitazone is a...",,,,
2,2,Alogliptin,"<Output>\n{\n ""Mechanism"": ""Alogliptin is a D...",,,,
3,3,Linagliptin,"<Output>\n{\n ""Mechanism"": ""Linagliptin is a ...",,,,
4,4,Sitagliptin,"{\n ""Mechanism"": ""Sitagliptin is a DPP-4 inhi...",,,,


In [11]:
def call_deepseek_r1(prompt: str) -> str:
    """
    Call DeepSeek-R1 (deepseek-reasoner) and return the raw text output.
    Raises an error if anything goes wrong or if the model returns empty text.
    """
    if prompt is None or str(prompt).strip() == "":
        raise ValueError("Prompt is empty or blank")

    try:
        resp = client_deepseek.chat.completions.create(
            model="deepseek-reasoner",   # R1 reasoning model
            messages=[{"role": "user", "content": prompt}],
            max_tokens=6000,
            temperature=1.0,
        )
    except Exception as e:
        # Do NOT silently swallow this – let it be visible
        raise RuntimeError(f"DeepSeek API call failed: {e}")

    # Inspect choices
    if not resp.choices or resp.choices[0].message is None:
        raise RuntimeError(f"DeepSeek returned no choices: {resp}")

    content = resp.choices[0].message.content
    if content is None:
        raise RuntimeError(f"DeepSeek returned None content: {resp}")

    content = str(content)
    if not content.strip():
        raise RuntimeError(f"DeepSeek returned EMPTY content: {repr(content)}\nFull response: {resp}")

    return content

def parse_deepseek_output(text):
    text = text.strip()
    # extract JSON between <think> and </think>
    match = re.search(r'<think>(.*?)</think>', text, re.DOTALL)
    if not match:
        return text  # return original text if no match
    
    json_part = match.group(1).strip()
    return json.loads(json_part)

In [None]:
def extract_rating(df, scoreFile):
    raw_outputs = []
    errors = []

    # Make updates easier/faster by indexing scoreFile on record_id
    if "record_id" not in scoreFile.columns:
        raise ValueError("scoreFile is missing 'record_id' column")
    
    scoreFile_indexed = scoreFile.set_index("record_id")

    for i, row in tqdm(df.iterrows(), total=len(df)):
        record_id = row["record_id"]
        prompt = row["prompt"]

        # handle empty / NaN prompts
        if prompt is None or (isinstance(prompt, float) and pd.isna(prompt)):
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": "Empty or NaN prompt"
            })
            continue  # IMPORTANT: skip calling the model for this row

        try:
            raw = call_deepseek_r1(prompt)
            parse = parse_deepseek_output(raw)
            raw_outputs.append({"record_id": record_id, "raw_output": parse})

            # Try to parse JSON
            try:
                scores = json.loads(raw)
            except Exception as je:
                errors.append({
                    "index": i,
                    "record_id": record_id,
                    "error": f"JSON parse failed: {je}",
                    "raw_output": raw,
                })
                continue

            # Update scoreFile_indexed with parsed scores
            for col in ["accurate", "organized", "comprehensible", "succinct"]:
                if col in scores:
                    scoreFile_indexed.loc[record_id, col] = scores[col]

        except Exception as e:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"API call failed: {e}",
            })

    # --- turn everything into DataFrames and RETURN them ---
    updated_scores = scoreFile_indexed.reset_index()
    raw_df = pd.DataFrame(raw_outputs)
    errors_df = pd.DataFrame(errors)

    return updated_scores, raw_df, errors_df


In [25]:
basic_scores, basic_raw, basic_errors = extract_rating(
    df_basic_prompts,
    basic_scorefile
)

onco_scores, onco_raw, onco_errors = extract_rating(
    df_onco_prompts,
    onco_scorefile

)

reviewer_scores, reviewer_raw, reviewer_errors = extract_rating(
    df_reviewer_prompts,
    reviewer_scorefile
)


100%|██████████| 29/29 [38:18<00:00, 79.26s/it]
100%|██████████| 29/29 [37:28<00:00, 77.54s/it]
100%|██████████| 29/29 [35:06<00:00, 72.64s/it]


In [None]:
output_path = config["output_file_paths"]["deepseek_ratings_output_dir"]
os.makedirs(output_path, exist_ok=True)

# create output directories for each evaluation set
basic_output_path = os.path.join(output_path, "basic_realworld_test")
os.makedirs(basic_output_path, exist_ok=True)
onco_output_path = os.path.join(output_path, "onco_realworld_test")
os.makedirs(onco_output_path, exist_ok=True)
reviewer_output_path = os.path.join(output_path, "reviewer_realworld_test")
os.makedirs(reviewer_output_path, exist_ok=True)

# Save basic results
basic_scores.to_csv(os.path.join(basic_output_path, "deepseek_ratings_basic_realworld_test.csv"), index=False)
basic_raw.to_csv(os.path.join(basic_output_path, "deepseek_raw_outputs_basic_realworld_test.csv"), index=False)
basic_errors.to_csv(os.path.join(basic_output_path, "deepseek_errors_basic_realworld_test.csv"), index=False)

# Save onco results
onco_scores.to_csv(os.path.join(onco_output_path, "deepseek_ratings_onco_realworld_test.csv"), index=False)
onco_raw.to_csv(os.path.join(onco_output_path, "deepseek_raw_outputs_onco_realworld_test.csv"), index=False)
onco_errors.to_csv(os.path.join(onco_output_path, "deepseek_errors_onco_realworld_test.csv"), index=False)

# Save reviewer results
reviewer_scores.to_csv(os.path.join(reviewer_output_path, "deepseek_ratings_reviewer_realworld_test.csv"), index=False)
reviewer_raw.to_csv(os.path.join(reviewer_output_path, "deepseek_raw_outputs_reviewer_realworld_test.csv"), index=False)
reviewer_errors.to_csv(os.path.join(reviewer_output_path, "deepseek_errors_reviewer_realworld_test.csv"), index=False)

In [32]:
onco_raw = pd.read_csv(os.path.join(onco_output_path, "deepseek_raw_outputs_onco_realworld_test.csv"))
basic_raw = pd.read_csv(os.path.join(basic_output_path, "deepseek_raw_outputs_basic_realworld_test.csv"))
reviewer_raw = pd.read_csv(os.path.join(reviewer_output_path, "deepseek_raw_outputs_reviewer_realworld_test.csv"))

def clean_rawdf(raw_df):
    # parse raw output column using parse_deepseek_output
    raw_df['parsed_output'] = raw_df['raw_output'].apply(parse_deepseek_output)
    # update rubric columns based on parsed_output
    for rubric in ["accurate", "organized", "comprehensible", "succinct"]:
        raw_df[rubric] = raw_df['parsed_output'].apply(lambda x: x.get(rubric) if isinstance(x, dict) else None)
    # drop parsed_output column
    raw_df.drop(columns=['parsed_output'], inplace=True)
    # drop raw_output column

    # in raw_df replace all "" with ' 
    raw_df.replace({r'^"$': "'"}, regex=True, inplace=True)

    # remove all \n
    raw_df.replace({r'\n': ' '}, regex=True, inplace=True)
    return raw_df

basic_raw = clean_rawdf(basic_raw)
onco_raw = clean_rawdf(onco_raw)
reviewer_raw = clean_rawdf(reviewer_raw)

basic_raw.head()

Unnamed: 0,record_id,raw_output,accurate,organized,comprehensible,succinct
0,0,"{""accurate"":5,""organized"":5,""comprehensible"":5...",,,,
1,1,"{""accurate"": 5, ""organized"": 5, ""comprehensibl...",,,,
2,2,"<think>{""accurate"": 5, ""organized"": 5, ""compre...",5.0,5.0,5.0,5.0
3,3,"<think>{""accurate"": 2, ""organized"": 5, ""compre...",2.0,5.0,5.0,5.0
4,4,"<think>{""accurate"": 5, ""organized"": 5, ""compre...",5.0,5.0,5.0,5.0


In [37]:
def expand_rawdf(raw_df):
    raw_df_expanded = raw_df['raw_output'].str.split(',', expand=True)
    raw_df_expanded.head()

    # remove all non-numeric characters from each cell in raw_df_expanded and rename columns to accurate, organized, comprehensible, succinct
    for i, col in enumerate(raw_df_expanded.columns):
        raw_df_expanded[col] = raw_df_expanded[col].str.replace(r'[^0-9.]', '', regex=True)
        if i == 0:
            raw_df_expanded.rename(columns={col: 'accurate'}, inplace=True)
        elif i == 1:
            raw_df_expanded.rename(columns={col: 'organized'}, inplace=True)
        elif i == 2:
            raw_df_expanded.rename(columns={col: 'comprehensible'}, inplace=True)
        elif i == 3:
            raw_df_expanded.rename(columns={col: 'succinct'}, inplace=True)

    # add record_id column back to raw_df_expanded
    raw_df_expanded['record_id'] = raw_df['record_id']
    return raw_df_expanded

basic_raw_expanded = expand_rawdf(basic_raw)
onco_raw_expanded = expand_rawdf(onco_raw)
reviewer_raw_expanded = expand_rawdf(reviewer_raw)
basic_raw_expanded.head()

Unnamed: 0,accurate,organized,comprehensible,succinct,record_id
0,5,5,5,3,0
1,5,5,5,5,1
2,5,5,5,5,2
3,2,5,5,5,3
4,5,5,5,5,4


In [38]:
def replace_scorefile(raw_df, scoreFile):
    for i, row in scoreFile.iterrows():
        record_id = row['record_id']
        matching_row = raw_df[raw_df['record_id'] == record_id]
        if not matching_row.empty:
            scoreFile.at[i, 'accurate'] = matching_row['accurate'].values[0]
            scoreFile.at[i, 'organized'] = matching_row['organized'].values[0]
            scoreFile.at[i, 'comprehensible'] = matching_row['comprehensible'].values[0]
            scoreFile.at[i, 'succinct'] = matching_row['succinct'].values[0]
    return scoreFile

basic_scorefile = replace_scorefile(basic_raw_expanded, basic_scorefile)
onco_scorefile = replace_scorefile(onco_raw_expanded, onco_scorefile)
reviewer_scorefile = replace_scorefile(reviewer_raw_expanded, reviewer_scorefile)

In [39]:
basic_scorefile.head()

Unnamed: 0,record_id,Drug,Summary,accurate,organized,comprehensible,succinct
0,0,Metformin,"{\n ""Mechanism"": ""Metformin is an oral anti-h...",5,5,5,3
1,1,Pioglitazone,"<Output>\n{\n ""Mechanism"": ""Pioglitazone is a...",5,5,5,5
2,2,Alogliptin,"<Output>\n{\n ""Mechanism"": ""Alogliptin is a D...",5,5,5,5
3,3,Linagliptin,"<Output>\n{\n ""Mechanism"": ""Linagliptin is a ...",2,5,5,5
4,4,Sitagliptin,"{\n ""Mechanism"": ""Sitagliptin is a DPP-4 inhi...",5,5,5,5


In [41]:
# save cleaned scorefiles
basic_scorefile.to_csv(os.path.join(basic_output_path, "deepseek_cleaned_ratings_basic_realworld_test.csv"), index=False)
onco_scorefile.to_csv(os.path.join(onco_output_path, "deepseek_cleaned_ratings_onco_realworld_test.csv"), index=False)
reviewer_scorefile.to_csv(os.path.join(reviewer_output_path, "deepseek_cleaned_ratings_reviewer_realworld_test.csv"), index=False)

# save raw outputs
basic_raw_expanded.to_csv(os.path.join(basic_output_path, "deepseek_raw_outputs_cleaned_basic_realworld_test.csv"), index=False)
onco_raw_expanded.to_csv(os.path.join(onco_output_path, "deepseek_raw_outputs_cleaned_onco_realworld_test.csv"), index=False)
reviewer_raw_expanded.to_csv(os.path.join(reviewer_output_path, "deepseek_raw_outputs_cleaned_reviewer_realworld_test.csv"), index=False)