In [1]:
import ast
import csv
import json
import os
import re
import warnings
from random import randint

import openai
import pandas as pd
import yaml
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
def load_config(yaml_path="P4-config.yaml"):
    """Open ``yaml_path`` as UTF-8 text and return what ``yaml.safe_load`` parses from it."""
    with open(yaml_path, "r", encoding="utf-8") as config_file:
        settings = yaml.safe_load(config_file)
    return settings


# Parsed project configuration, indexed by section name in the cells below.
config = load_config()

In [3]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail fast when the key is absent. Passing api_key=None makes the OpenAI client
# fall back to OPENAI_API_KEY, which would send the wrong credential to DeepSeek.
if not os.getenv("DEEPSEEK_API_KEY"):
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; define it in the environment or in the .env file"
    )

# OpenAI-compatible client pointed at the DeepSeek endpoint.
client_deepseek = OpenAI(
    api_key=os.getenv("DEEPSEEK_API_KEY"),
    base_url="https://api.deepseek.com"
)



In [4]:
# MODEL CHOICE: decides which config sections (inputs, prompts, outputs) the next cell uses.
model = "llama" # or "biomistral"

In [10]:
# Map the chosen model to the names of the config sections holding its paths.
if model == "biomistral":
    input_file_path = "input_file_paths_biomistral"
    prompts_dir = "prompts_dir_biomistral"
    output_dir = "output_file_paths_biomistral"
elif model == "llama":
    input_file_path = "input_file_paths_llama"
    prompts_dir = "prompts_dir_llama"
    output_dir = "output_file_paths_llama"
else:
    # Without this branch an unknown value only surfaces later as a NameError.
    raise ValueError(f"Unsupported model {model!r}; expected 'biomistral' or 'llama'")

# One prompt folder per evaluation set, all under the DeepSeek prompt directory.
deepseek_prompt_dir = config[prompts_dir]["deepseek_prompt_dir"]
basic_prompt_path = os.path.join(deepseek_prompt_dir, "basic_realworld_test")
onco_prompt_path = os.path.join(deepseek_prompt_dir, "onco_realworld_test")
reviewer_prompt_path = os.path.join(deepseek_prompt_dir, "reviewer_realworld_test")

# Every folder contains a prompt CSV with the same file name.
basic_filepath = os.path.join(basic_prompt_path, 'pdsqi_input_to_llm_as_a_judge_zero_shot.csv')
onco_filepath = os.path.join(onco_prompt_path, 'pdsqi_input_to_llm_as_a_judge_zero_shot.csv')
reviewer_filepath = os.path.join(reviewer_prompt_path, 'pdsqi_input_to_llm_as_a_judge_zero_shot.csv')

df_basic_prompts = pd.read_csv(basic_filepath)
df_onco_prompts = pd.read_csv(onco_filepath)
df_reviewer_prompts = pd.read_csv(reviewer_filepath)

In [11]:
# Preview the first rows of the basic-set prompt table.
df_basic_prompts.head()

Unnamed: 0,record_id,prompt
0,0,Here is your new role and persona:\n Yo...
1,1,Here is your new role and persona:\n Yo...
2,2,Here is your new role and persona:\n Yo...
3,3,Here is your new role and persona:\n Yo...
4,4,Here is your new role and persona:\n Yo...


In [12]:
# Read the input CSV of each evaluation set from the selected config section.
_input_paths = config[input_file_path]
df_basic = pd.read_csv(_input_paths["basic_realworld_test"])
df_onco = pd.read_csv(_input_paths["onco_realworld_test"])
df_reviewer = pd.read_csv(_input_paths["reviewer_realworld_test"])

# add index columns
def add_index_column(df):
    """
    Store the frame's current index values in a leading ``record_id`` column.

    The frame is modified in place (its index becomes a fresh 0..n-1 range) and
    the same object is returned, as before.

    The column is inserted directly instead of using reset_index + rename:
    when the frame already has a column called "index", reset_index emits
    "level_0" and the rename would relabel the pre-existing data column.

    Raises:
        ValueError: if the frame already contains a ``record_id`` column.
    """
    index_values = df.index  # capture before the index is replaced
    df.reset_index(drop=True, inplace=True)
    df.insert(0, "record_id", index_values)
    return df

# Add the record_id column to each evaluation set's frame.
df_basic, df_onco, df_reviewer = (
    add_index_column(frame) for frame in (df_basic, df_onco, df_reviewer)
)

In [13]:
# Preview the basic-set inputs, now carrying the record_id column.
df_basic.head()

Unnamed: 0,record_id,Drug,Receptor,PDB_ID,generated_report,prompt_tier
0,0,Metformin,Acetyl-CoA carboxylase 2,"3FF6,3TDC,2X24,3JRX,3JRW,2HJW,4HQ6,5KKN,3GLK,3...",")<Output>\n{{\n ""Mechanism"": ""<Metformin targ...",basic
1,1,Pioglitazone,Peroxisome proliferator-activated receptor gamma,"3E00,3DZY,3DZU,7QB1,6L89,6K0T,6AD9,5HZC,5F9B,5...",")<Output format>\n{{\n ""Mechanism"": ""<Pioglit...",basic
2,2,Alogliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Aloglip...",basic
3,3,Linagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Linagli...",basic
4,4,Sitagliptin,Dipeptidyl peptidase 4,"2QTB,2QT9,2BGR,2JID,3F8S,2QJR,3W2T,3VJM,3VJL,3...",")<Output format>\n{{\n ""Mechanism"": ""<Sitagli...",basic


In [14]:
def create_scorefile(df, name):
    """
    Build an empty score sheet for one evaluation set.

    Args:
        df: frame providing the ``record_id``, ``Drug`` and ``generated_report`` columns.
        name: label of the evaluation set. Kept so existing calls still work;
            it does not affect the result.

    Returns:
        A new DataFrame with ``record_id``, ``Drug``, ``Summary`` (copied from
        ``generated_report``) and the four rubric columns initialised to None.
    """
    return pd.DataFrame({
        "record_id": df["record_id"],
        "Drug": df["Drug"],
        "Summary": df["generated_report"],
        "accurate": None,
        "organized": None,
        "comprehensible": None,
        "succinct": None,
    })

# One empty score sheet per evaluation set.
basic_scorefile, onco_scorefile, reviewer_scorefile = (
    create_scorefile(frame, tier)
    for frame, tier in ((df_basic, "basic"), (df_onco, "onco"), (df_reviewer, "reviewer"))
)

basic_scorefile.head()
    

Unnamed: 0,record_id,Drug,Summary,accurate,organized,comprehensible,succinct
0,0,Metformin,")<Output>\n{{\n ""Mechanism"": ""<Metformin targ...",,,,
1,1,Pioglitazone,")<Output format>\n{{\n ""Mechanism"": ""<Pioglit...",,,,
2,2,Alogliptin,")<Output format>\n{{\n ""Mechanism"": ""<Aloglip...",,,,
3,3,Linagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Linagli...",,,,
4,4,Sitagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Sitagli...",,,,


In [15]:
def call_deepseek_r1(
    prompt: str,
    model_name: str = "deepseek-reasoner",
    max_tokens: int = 6000,
    temperature: float = 1.0,
) -> str:
    """
    Send one user message through ``client_deepseek`` and return the reply text.

    Args:
        prompt: text sent as the single user message.
        model_name: model identifier passed to the API (default: the R1 reasoning model).
        max_tokens: ``max_tokens`` value passed to the API.
        temperature: ``temperature`` value passed to the API.

    Returns:
        The message content of the first choice, as a string.

    Raises:
        ValueError: if ``prompt`` is None or blank.
        RuntimeError: if the API call fails, or the response has no choices,
            no content, or only whitespace content.
    """
    if prompt is None or str(prompt).strip() == "":
        raise ValueError("Prompt is empty or blank")

    try:
        resp = client_deepseek.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
    except Exception as e:
        # Do NOT silently swallow this; chain the cause so the traceback keeps it.
        raise RuntimeError(f"DeepSeek API call failed: {e}") from e

    # Inspect choices
    if not resp.choices or resp.choices[0].message is None:
        raise RuntimeError(f"DeepSeek returned no choices: {resp}")

    content = resp.choices[0].message.content
    if content is None:
        raise RuntimeError(f"DeepSeek returned None content: {resp}")

    content = str(content)
    if not content.strip():
        raise RuntimeError(f"DeepSeek returned EMPTY content: {repr(content)}\nFull response: {resp}")

    return content

def parse_deepseek_output(text):
    """
    Return the JSON found inside a ``<think>...</think>`` block, if there is one.

    When the stripped text contains such a block, its inner text is decoded
    with ``json.loads`` and the decoded value is returned (a decoding failure
    propagates to the caller). Otherwise the stripped text itself is returned,
    so the result is either a decoded JSON value or a plain string.

    NOTE(review): a later cell defines another function with this same name;
    once that cell has run, this definition is shadowed.
    """
    stripped = text.strip()
    think_block = re.search(r'<think>(.*?)</think>', stripped, re.DOTALL)
    if think_block is None:
        # No think block: hand back the text unchanged (apart from stripping).
        return stripped
    return json.loads(think_block.group(1).strip())

In [16]:
def extract_rating(df, scoreFile):
    """
    Ask DeepSeek to rate every prompt in ``df`` and collect the rubric scores.

    Args:
        df: frame with ``record_id`` and ``prompt`` columns; one API call per row.
        scoreFile: frame with a ``record_id`` column and the rubric columns
            (``accurate``, ``organized``, ``comprehensible``, ``succinct``).
            It is not modified; an updated copy is returned.

    Returns:
        ``(updated_scores, raw_df, errors_df)``:
        - ``updated_scores``: copy of ``scoreFile`` with rubric values filled in
          for every row whose response could be decoded;
        - ``raw_df``: one row per received response (``record_id``, ``raw_output``);
        - ``errors_df``: one row per failure (``index``, ``record_id``, ``error``
          and, when a response was received, ``raw_output``).
        Both auxiliary frames always carry their columns, even when empty.

    Raises:
        ValueError: if ``scoreFile`` has no ``record_id`` column.
    """
    rubric_cols = ["accurate", "organized", "comprehensible", "succinct"]
    raw_outputs = []
    errors = []

    # Make updates easier/faster by indexing scoreFile on record_id
    if "record_id" not in scoreFile.columns:
        raise ValueError("scoreFile is missing 'record_id' column")

    scoreFile_indexed = scoreFile.set_index("record_id")

    for i, row in tqdm(df.iterrows(), total=len(df)):
        record_id = row["record_id"]
        prompt = row["prompt"]

        # handle empty / NaN prompts
        if prompt is None or (isinstance(prompt, float) and pd.isna(prompt)):
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": "Empty or NaN prompt"
            })
            continue  # IMPORTANT: skip calling the model for this row

        # Step 1: the API call. Only failures here are reported as API failures.
        try:
            raw = call_deepseek_r1(prompt)
        except Exception as e:
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"API call failed: {e}",
            })
            continue

        # Step 2: parsing. A parser failure must not discard the response,
        # so the raw text is still stored for later cleaning.
        try:
            parsed = parse_deepseek_output(raw)
        except Exception as pe:
            raw_outputs.append({"record_id": record_id, "raw_output": raw})
            errors.append({
                "index": i,
                "record_id": record_id,
                "error": f"JSON parse failed: {pe}",
                "raw_output": raw,
            })
            continue

        raw_outputs.append({
            "record_id": record_id,
            "raw_output": raw if parsed is None else parsed,
        })

        # Step 3: obtain the score mapping. Reuse the parser's result when it
        # already is a dict instead of decoding the raw text a second time.
        if isinstance(parsed, dict):
            scores = parsed
        else:
            try:
                scores = json.loads(raw)
            except Exception as je:
                errors.append({
                    "index": i,
                    "record_id": record_id,
                    "error": f"JSON parse failed: {je}",
                    "raw_output": raw,
                })
                continue
            if not isinstance(scores, dict):
                errors.append({
                    "index": i,
                    "record_id": record_id,
                    "error": f"JSON parse failed: expected an object, got {type(scores).__name__}",
                    "raw_output": raw,
                })
                continue

        # Update scoreFile_indexed with parsed scores
        for col in rubric_cols:
            if col in scores:
                scoreFile_indexed.loc[record_id, col] = scores[col]

    # --- turn everything into DataFrames and RETURN them ---
    # Explicit columns keep the schema stable (and the CSVs readable) when a list is empty.
    updated_scores = scoreFile_indexed.reset_index()
    raw_df = pd.DataFrame(raw_outputs, columns=["record_id", "raw_output"])
    errors_df = pd.DataFrame(errors, columns=["index", "record_id", "error", "raw_output"])

    return updated_scores, raw_df, errors_df


In [17]:
# Rate the three evaluation sets. extract_rating calls the API once per prompt row,
# and the progress bars recorded below show roughly 40 minutes per set.
# Each call yields the updated score sheet, the stored raw outputs and the error log.
basic_scores, basic_raw, basic_errors = extract_rating(
    df_basic_prompts,
    basic_scorefile
)

onco_scores, onco_raw, onco_errors = extract_rating(
    df_onco_prompts,
    onco_scorefile

)

reviewer_scores, reviewer_raw, reviewer_errors = extract_rating(
    df_reviewer_prompts,
    reviewer_scorefile
)


100%|██████████| 29/29 [42:30<00:00, 87.96s/it] 
100%|██████████| 29/29 [42:40<00:00, 88.28s/it] 
100%|██████████| 29/29 [40:51<00:00, 84.52s/it]


In [18]:
# Root output folder for the selected model plus one sub-folder per evaluation set.
output_path = config[output_dir]["deepseek_ratings_output_dir"]
basic_output_path = os.path.join(output_path, "basic_realworld_test")
onco_output_path = os.path.join(output_path, "onco_realworld_test")
reviewer_output_path = os.path.join(output_path, "reviewer_realworld_test")

for _directory in (output_path, basic_output_path, onco_output_path, reviewer_output_path):
    os.makedirs(_directory, exist_ok=True)

# (frame, destination folder, file name) for every CSV written by this cell.
_exports = [
    # basic results
    (basic_scores, basic_output_path, "deepseek_ratings_basic_realworld_test.csv"),
    (basic_raw, basic_output_path, "deepseek_raw_outputs_basic_realworld_test.csv"),
    (basic_errors, basic_output_path, "deepseek_errors_basic_realworld_test.csv"),
    # onco results
    (onco_scores, onco_output_path, "deepseek_ratings_onco_realworld_test.csv"),
    (onco_raw, onco_output_path, "deepseek_raw_outputs_onco_realworld_test.csv"),
    (onco_errors, onco_output_path, "deepseek_errors_onco_realworld_test.csv"),
    # reviewer results
    (reviewer_scores, reviewer_output_path, "deepseek_ratings_reviewer_realworld_test.csv"),
    (reviewer_raw, reviewer_output_path, "deepseek_raw_outputs_reviewer_realworld_test.csv"),
    (reviewer_errors, reviewer_output_path, "deepseek_errors_reviewer_realworld_test.csv"),
]
for _frame, _directory, _filename in _exports:
    _frame.to_csv(os.path.join(_directory, _filename), index=False)

In [23]:

# Names of the four score keys looked up in each parsed judge response.
RUBRICS = ["accurate", "organized", "comprehensible", "succinct"]

def _strip_think_tags(s: str) -> str:
    # Remove <think>...</think> and any similar XML-ish blocks
    s = re.sub(r"<think>.*?</think>", " ", s, flags=re.DOTALL | re.IGNORECASE)
    return s

def _extract_first_braced_object(s: str) -> str | None:
    """
    Extract the first {...} block using a simple brace-balancing scan.
    This is much safer than regex for nested braces.
    """
    if not s:
        return None

    start = s.find("{")
    if start == -1:
        return None

    depth = 0
    for i in range(start, len(s)):
        ch = s[i]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return s[start:i+1]

    # Unbalanced braces (no closing '}')
    return None

def parse_deepseek_output(raw: str):
    """
    Decode one stored judge response into ``{rubric: score}``.

    Steps: collapse whitespace, drop ``<think>`` spans, take the first
    ``{...}`` block, then decode it with (1) strict JSON, (2) JSON after
    removing trailing commas and swapping single for double quotes, and
    (3) ``ast.literal_eval`` on the original block.

    Rubric keys are matched exactly, then capitalised, then case-insensitively.
    Scores are converted to int when integral, otherwise float. Booleans,
    NaN/inf and non-numeric values become None.

    Returns:
        A dict with one entry per name in ``RUBRICS``, or None when ``raw`` is
        missing, no object can be decoded, or no rubric has a usable score.
    """
    if raw is None or (isinstance(raw, float) and pd.isna(raw)):
        return None

    s = str(raw).strip()

    # Normalize obvious formatting noise
    s = s.replace("\r", " ")
    s = re.sub(r"\s+", " ", s)

    # Remove think tags
    s = _strip_think_tags(s)

    # Try to grab the first JSON-like object
    obj_txt = _extract_first_braced_object(s)
    if obj_txt is None:
        return None

    # Only decoding errors are caught below, so a programming error (such as
    # a missing import) surfaces instead of being reported as a parse failure.
    try:
        # Attempt 1: strict JSON (JSONDecodeError is a ValueError)
        data = json.loads(obj_txt)
    except (ValueError, RecursionError):
        # Attempt 2: fix common issues then json.loads
        # remove trailing commas before } or ]
        fixed = re.sub(r",\s*([}\]])", r"\1", obj_txt)

        # convert single quotes to double quotes
        # NOTE: heuristic; it breaks on apostrophes inside values, which is
        # what attempt 3 is for.
        fixed = re.sub(r"(?<!\\)'", '"', fixed)

        try:
            data = json.loads(fixed)
        except (ValueError, RecursionError):
            # Attempt 3: python literal eval (handles single quotes naturally)
            try:
                data = ast.literal_eval(obj_txt)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None

    if not isinstance(data, dict):
        return None

    # Case-insensitive view of the keys; the first occurrence wins.
    by_lower_key = {}
    for key, value in data.items():
        by_lower_key.setdefault(str(key).strip().lower(), value)

    # Normalize keys + values
    out = {}
    for k in RUBRICS:
        v = data.get(k, data.get(k.capitalize(), None))
        if v is None:
            v = by_lower_key.get(k)

        # bool is a subclass of int; true/false is not a score.
        if v is None or isinstance(v, bool):
            out[k] = None
            continue

        # convert "5", 5, 5.0 -> float/int
        try:
            v_num = float(v)
        except (TypeError, ValueError, OverflowError):
            out[k] = None
            continue

        if v_num != v_num or abs(v_num) == float("inf"):
            # NaN or infinity is not a usable score.
            out[k] = None
        else:
            out[k] = int(v_num) if v_num.is_integer() else v_num

    # If all are missing, treat as parse fail
    if all(out[k] is None for k in RUBRICS):
        return None

    return out

def clean_rawdf(raw_df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``raw_df`` with parsed rubric columns added.

    Each ``raw_output`` value is run through ``parse_deepseek_output``. One
    column per name in ``RUBRICS`` receives the parsed value (None when
    parsing did not yield a dict), ``parse_ok`` records whether a dict was
    obtained, and ``raw_output`` is rewritten as a string with whitespace
    runs collapsed to single spaces. The input frame is left untouched.
    """
    cleaned = raw_df.copy()
    parsed_rows = cleaned["raw_output"].apply(parse_deepseek_output)

    def _is_parsed(value) -> bool:
        return isinstance(value, dict)

    for rubric in RUBRICS:
        cleaned[rubric] = parsed_rows.apply(
            lambda value, key=rubric: value.get(key) if _is_parsed(value) else None
        )

    # Debugging flag: True where a dict came back from the parser.
    cleaned["parse_ok"] = parsed_rows.apply(_is_parsed)

    # Cosmetic only; parsing above already used the untouched text.
    collapsed = cleaned["raw_output"].astype(str).str.replace(r"\s+", " ", regex=True)
    cleaned["raw_output"] = collapsed

    return cleaned


In [24]:
# Reload the raw judge outputs written earlier and parse each set into rubric columns.
onco_raw = clean_rawdf(
    pd.read_csv(os.path.join(onco_output_path, "deepseek_raw_outputs_onco_realworld_test.csv"))
)
basic_raw = clean_rawdf(
    pd.read_csv(os.path.join(basic_output_path, "deepseek_raw_outputs_basic_realworld_test.csv"))
)
reviewer_raw = clean_rawdf(
    pd.read_csv(os.path.join(reviewer_output_path, "deepseek_raw_outputs_reviewer_realworld_test.csv"))
)

basic_raw.head()


Unnamed: 0,record_id,raw_output,accurate,organized,comprehensible,succinct,parse_ok
0,0,"{""accurate"": 2, ""organized"": 5, ""comprehensibl...",2,5,5,5,True
1,1,"{'accurate': 2, 'organized': 5, 'comprehensibl...",2,5,5,5,True
2,2,"{""accurate"": 5, ""organized"": 2, ""comprehensibl...",5,2,5,2,True
3,3,"{'accurate': 4, 'organized': 5, 'comprehensibl...",4,5,4,2,True
4,4,"{'accurate': 2, 'organized': 3, 'comprehensibl...",2,3,3,1,True


In [25]:
def expand_rawdf(raw_df):
    """
    Pull the four rubric scores out of ``raw_df['raw_output']`` by key name.

    Each rubric is located with a pattern of the form ``<name>: <number>``
    (quotes optional, any letter case), so the result does not depend on the
    order of the keys or on extra comma-separated fields in the text. The
    previous comma-split assigned values purely by position.

    Args:
        raw_df: frame with ``raw_output`` and ``record_id`` columns.

    Returns:
        A new frame with columns ``accurate``, ``organized``,
        ``comprehensible``, ``succinct`` (digit strings, missing when the key
        is not found) and ``record_id``.
    """
    rubric_names = ['accurate', 'organized', 'comprehensible', 'succinct']
    raw_text = raw_df['raw_output'].astype(str)

    raw_df_expanded = pd.DataFrame(index=raw_df.index)
    for rubric in rubric_names:
        # \b keeps e.g. "inaccurate" from matching "accurate".
        pattern = rf"\b{rubric}\b[\"']?\s*[:=]\s*[\"']?(\d+(?:\.\d+)?)"
        raw_df_expanded[rubric] = raw_text.str.extract(
            pattern, flags=re.IGNORECASE, expand=False
        )

    # add record_id column back to raw_df_expanded
    raw_df_expanded['record_id'] = raw_df['record_id']
    return raw_df_expanded

# Reduce each cleaned raw frame to its rubric columns plus record_id.
basic_raw_expanded, onco_raw_expanded, reviewer_raw_expanded = (
    expand_rawdf(frame) for frame in (basic_raw, onco_raw, reviewer_raw)
)
basic_raw_expanded.head()

Unnamed: 0,accurate,organized,comprehensible,succinct,record_id
0,2,5,5,5,0
1,2,5,5,5,1
2,5,2,5,2,2
3,4,5,4,2,3
4,2,3,3,1,4


In [26]:
def replace_scorefile(raw_df, scoreFile):
    """
    Copy rubric values from ``raw_df`` into ``scoreFile``, matched on ``record_id``.

    For each row of ``scoreFile``, the first row of ``raw_df`` with the same
    ``record_id`` supplies the four rubric values; rows without a match are
    left as they are. ``scoreFile`` is updated in place and also returned.
    """
    rubric_cols = ('accurate', 'organized', 'comprehensible', 'succinct')
    for row_label, score_row in scoreFile.iterrows():
        same_record = raw_df[raw_df['record_id'] == score_row['record_id']]
        if same_record.empty:
            continue
        for col in rubric_cols:
            scoreFile.at[row_label, col] = same_record[col].values[0]
    return scoreFile

# Fill each score sheet with the rubric values extracted for its evaluation set.
basic_scorefile, onco_scorefile, reviewer_scorefile = (
    replace_scorefile(expanded, scorefile)
    for expanded, scorefile in (
        (basic_raw_expanded, basic_scorefile),
        (onco_raw_expanded, onco_scorefile),
        (reviewer_raw_expanded, reviewer_scorefile),
    )
)

In [27]:
# Preview the basic score sheet after the rubric columns were filled in.
basic_scorefile.head()

Unnamed: 0,record_id,Drug,Summary,accurate,organized,comprehensible,succinct
0,0,Metformin,")<Output>\n{{\n ""Mechanism"": ""<Metformin targ...",2,5,5,5
1,1,Pioglitazone,")<Output format>\n{{\n ""Mechanism"": ""<Pioglit...",2,5,5,5
2,2,Alogliptin,")<Output format>\n{{\n ""Mechanism"": ""<Aloglip...",5,2,5,2
3,3,Linagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Linagli...",4,5,4,2
4,4,Sitagliptin,")<Output format>\n{{\n ""Mechanism"": ""<Sitagli...",2,3,3,1


In [29]:
# save cleaned scorefiles
basic_scorefile.to_csv(os.path.join(basic_output_path, "deepseek_cleaned_ratings_basic_realworld_test.csv"), index=False)
onco_scorefile.to_csv(os.path.join(onco_output_path, "deepseek_cleaned_ratings_onco_realworld_test.csv"), index=False)
reviewer_scorefile.to_csv(os.path.join(reviewer_output_path, "deepseek_cleaned_ratings_reviewer_realworld_test.csv"), index=False)

# save raw outputs
basic_raw_expanded.to_csv(os.path.join(basic_output_path, "deepseek_raw_outputs_cleaned_basic_realworld_test.csv"), index=False)
onco_raw_expanded.to_csv(os.path.join(onco_output_path, "deepseek_raw_outputs_cleaned_onco_realworld_test.csv"), index=False)
reviewer_raw_expanded.to_csv(os.path.join(reviewer_output_path, "deepseek_raw_outputs_cleaned_reviewer_realworld_test.csv"), index=False)