In [6]:
%load_ext autoreload
%autoreload 2

%load_ext dotenv
%dotenv

In [7]:
from datasets import load_dataset, DatasetDict

# Load the six PatronusAI test sets by their Hugging Face Hub repository ids.
# Each result is kept in its own module-level name; the evaluation function
# defined later in this notebook reads the "test" split of whichever one it is given.
covidqa_dataset_test = load_dataset('PatronusAI/covidqa-test')
ragtruth_dataset_test = load_dataset('PatronusAI/RAGTruth-test')
drop_dataset_test = load_dataset('PatronusAI/drop-test')
pubmedqa_dataset_test = load_dataset('PatronusAI/pubmedqa-test')
financebench_dataset_test = load_dataset('PatronusAI/financebench-test')
halueval_dataset_test = load_dataset('PatronusAI/halueval-test')

In [9]:
# Show the container type and the split/feature summary of one loaded dataset.
print(type(drop_dataset_test), drop_dataset_test, sep="\n")

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    test: Dataset({
        features: ['messages', 'LABEL', '_id'],
        num_rows: 1000
    })
})


In [1]:
import os
import json

import pandas as pd

from openai import OpenAI
from tqdm import tqdm

# OpenAI client shared by every request made in this notebook. The key is read
# from the OPENAI_API_KEY environment variable (`.get` yields None when it is unset).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Sampling temperature passed to every chat-completion request.
TEMP = 0

In [11]:
def _get_openai_response(
    model_name: str,
    user_prompt: str | None = None,
    system_prompt: str | None = None,
    assistant_prompt: str | None = None,
) -> str:
    messages = []
    if user_prompt:
        messages.append({"role": "user", "content": user_prompt})
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    if assistant_prompt:
        messages.append({"role": "assistant", "content": assistant_prompt})

    response = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=TEMP,
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [12]:
def _calculate_accuracy(df: pd.DataFrame):
    df['correct'] = df['score'] == df['label']
    correct_predictions = df['correct'].sum()
    total_predictions = len(df)
    accuracy = correct_predictions / total_predictions
    return accuracy
    

In [13]:
def evaluate_test_with_gpt(
    ds: DatasetDict,
    model_name: str,
    part: int | None = None,
    parts: int | None = None,
    sample: int | None = None,
):
    """Score rows of ``ds["test"]`` with an OpenAI model and collect the results.

    For each row, the first message whose role is ``"user"`` is sent to the
    model via ``_get_openai_response``; the ``"SCORE"`` field of the JSON reply
    is recorded next to the row's ``"LABEL"``.

    Args:
        ds: Dataset dictionary; only its ``"test"`` split is read.
        model_name: Name of the chat model to query.
        part: 1-based index of the contiguous chunk to evaluate. The whole
            split is used when either ``part`` or ``parts`` is None or 0.
        parts: Number of equal chunks the split is divided into.
        sample: Evaluate only the first ``sample`` rows of the selected chunk;
            None, 0 or a negative value means no limit.

    Returns:
        DataFrame with columns ``message`` (list of the row's user messages),
        ``response`` (raw model reply), ``label`` and ``score``. ``score`` is
        ``"ERROR"`` when the reply is not JSON or has no ``"SCORE"`` key.

    Raises:
        ValueError: If ``part`` is not within ``1..parts``.
    """
    test_data = ds["test"]

    # Optional chunking: take the part-th of `parts` contiguous slices.
    if part and parts:
        if not 1 <= part <= parts:
            raise ValueError(f"part must be between 1 and parts ({parts}), got {part}")
        n_rows = len(test_data)
        frac_start = (part - 1) / parts
        frac_end = part / parts
        id_start = int(frac_start * n_rows)
        id_end = int(frac_end * n_rows)
        test_data = test_data.select(list(range(id_start, id_end)))

    # Trim up front so exactly `sample` rows are sent to the API (the previous
    # in-loop break fired only after row `sample` had already been evaluated).
    if sample and sample > 0:
        n_keep = min(sample, len(test_data))
        test_data = test_data.select(list(range(n_keep)))

    records = []
    # Wrapping the dataset directly lets tqdm show a total and an ETA.
    for row in tqdm(test_data):
        user_prompts = [message for message in row["messages"] if message["role"] == "user"]
        response = _get_openai_response(model_name=model_name, user_prompt=user_prompts[0]["content"])

        try:
            score = json.loads(response)["SCORE"]
        except (json.JSONDecodeError, KeyError, TypeError):
            # Invalid JSON, missing "SCORE", or a non-string / non-object reply.
            score = "ERROR"

        records.append(
            {
                "message": user_prompts,
                "response": response,
                "label": row["LABEL"],
                "score": score,
            }
        )

    # Explicit columns keep the schema stable even when no rows were evaluated.
    df = pd.DataFrame(records, columns=["message", "response", "label", "score"])
    return df

In [14]:
# OpenAI model to evaluate; it also names the sub-directory used for score files.
model_name = "gpt-4o"
# Directory the per-dataset score CSVs are written to (relative path).
main_path = f"../data/models/scores/{model_name}"

In [15]:
# drop_df = evaluate_test_with_gpt(ds=drop_dataset_test, model_name=model_name)
# drop_df.to_csv(f"{main_path}/drop.csv")
# 
# drop_accuracy = _calculate_accuracy(df=drop_df)
# drop_pass_accuracy = _calculate_accuracy(df=drop_df[drop_df["label"] == "PASS"])
# drop_fail_accuracy = _calculate_accuracy(df=drop_df[drop_df["label"] == "FAIL"])
# 
# print(f"DROP accuracy: {drop_accuracy}")
# print(f"DROP pass accuracy: {drop_pass_accuracy}")
# print(f"DROP fail accuracy: {drop_fail_accuracy}")

In [16]:
# financebench_df = evaluate_test_with_gpt(ds=financebench_dataset_test, model_name=model_name)
# financebench_df.to_csv(f"{main_path}/financebench.csv")
# 
# financebench_accuracy = _calculate_accuracy(df=financebench_df)
# financebench_pass_accuracy = _calculate_accuracy(df=financebench_df[financebench_df["label"] == "PASS"])
# financebench_fail_accuracy = _calculate_accuracy(df=financebench_df[financebench_df["label"] == "FAIL"])
# 
# print(f"Financebench accuracy: {financebench_accuracy}")
# print(f"Financebench pass accuracy: {financebench_pass_accuracy}")
# print(f"Financebench fail accuracy: {financebench_fail_accuracy}")

In [17]:
# ragtruth_df = evaluate_test_with_gpt(ds=ragtruth_dataset_test, model_name=model_name)
# ragtruth_df.to_csv(f"{main_path}/ragtruth.csv")
# 
# ragtruth_accuracy = _calculate_accuracy(df=ragtruth_df)
# ragtruth_pass_accuracy = _calculate_accuracy(df=ragtruth_df[ragtruth_df["label"] == "PASS"])
# ragtruth_fail_accuracy = _calculate_accuracy(df=ragtruth_df[ragtruth_df["label"] == "FAIL"])
# 
# print(f"Ragtruth accuracy: {ragtruth_accuracy}")
# print(f"Ragtruth pass accuracy: {ragtruth_pass_accuracy}")
# print(f"Ragtruth fail accuracy: {ragtruth_fail_accuracy}")

In [18]:
# pubmedqa_df = evaluate_test_with_gpt(ds=pubmedqa_dataset_test, model_name=model_name)
# pubmedqa_df.to_csv(f"{main_path}/pubmedqa.csv")
# 
# pubmedqa_accuracy = _calculate_accuracy(df=pubmedqa_df)
# pubmedqa_pass_accuracy = _calculate_accuracy(df=pubmedqa_df[pubmedqa_df["label"] == "PASS"])
# pubmedqa_fail_accuracy = _calculate_accuracy(df=pubmedqa_df[pubmedqa_df["label"] == "FAIL"])
# 
# print(f"PubmedQA accuracy: {pubmedqa_accuracy}")
# print(f"PubmedQA pass accuracy: {pubmedqa_pass_accuracy}")
# print(f"PubmedQA fail accuracy: {pubmedqa_fail_accuracy}")

In [19]:
# Evaluate chunk `part` of `parts` of the CovidQA test split (1 of 1 = the whole split).
part = 1
parts = 1
covidqa_df = evaluate_test_with_gpt(ds=covidqa_dataset_test, model_name=model_name, part=part, parts=parts)

# Make sure the output directory exists before writing; otherwise to_csv fails
# and the results of the long evaluation run above are lost.
os.makedirs(main_path, exist_ok=True)
covidqa_df.to_csv(f"{main_path}/covidqa_{part}_{parts}.csv")

# Overall accuracy, then accuracy restricted to each ground-truth class.
# The filtered frames are copied so the helper never operates on a slice of covidqa_df.
covidqa_accuracy = _calculate_accuracy(df=covidqa_df)
covidqa_pass_accuracy = _calculate_accuracy(df=covidqa_df[covidqa_df["label"] == "PASS"].copy())
covidqa_fail_accuracy = _calculate_accuracy(df=covidqa_df[covidqa_df["label"] == "FAIL"].copy())

print(f"CovidQA accuracy: {covidqa_accuracy}")
print(f"CovidQA pass accuracy: {covidqa_pass_accuracy}")
print(f"CovidQA fail accuracy: {covidqa_fail_accuracy}")

469it [21:53,  2.80s/it]


KeyboardInterrupt: 

In [None]:
# halueval_df = evaluate_test_with_gpt(ds=halueval_dataset_test, model_name=model_name)
# halueval_df.to_csv(f"{main_path}/halueval.csv")
# 
# halueval_accuracy = _calculate_accuracy(df=halueval_df)
# halueval_pass_accuracy = _calculate_accuracy(df=halueval_df[halueval_df["label"] == "PASS"])
# halueval_fail_accuracy = _calculate_accuracy(df=halueval_df[halueval_df["label"] == "FAIL"])
# 
# print(f"HaluEval accuracy: {halueval_accuracy}")
# print(f"HaluEval pass accuracy: {halueval_pass_accuracy}")
# print(f"HaluEval fail accuracy: {halueval_fail_accuracy}")