In [1]:
import ast
import os

from pathlib import Path
from datasets import load_dataset

import pandas as pd
%load_ext autoreload
%autoreload 2

%load_ext dotenv
%dotenv

In [2]:
# NOTE(review): "force_redownload" presumably bypasses the local cache on every
# run -- confirm against the datasets documentation before relying on it.
hallucination_evaluation_benchmark_ds = load_dataset(
    'PatronusAI/hallucination-evaluation-benchmark',
    download_mode="force_redownload",
)

Downloading readme:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.51M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14900 [00:00<?, ? examples/s]

In [42]:
# Distinct values of the "source_ds" column in the test split, sorted.
# sorted() accepts any iterable, so the set needs no intermediate list().
source_ds_list = sorted(set(hallucination_evaluation_benchmark_ds["test"]["source_ds"]))
source_ds_list

['DROP', 'FinanceBench', 'RAGTruth', 'covidQA', 'halueval', 'pubmedQA']

In [21]:
# Ten-row preview of the test split, displayed to inspect its columns.
ds = (
    hallucination_evaluation_benchmark_ds["test"]
    .select(range(10))
)
ds

Dataset({
    features: ['id', 'passage', 'question', 'answer', 'label', 'source_ds'],
    num_rows: 10
})

In [61]:
def rename_keys(example):
    """Return a new one-key dict exposing the example's ``passage`` as ``contexts``.

    The input ``example`` is only read; this function does not remove the
    ``passage`` key itself.
    """
    passage = example["passage"]
    return dict(contexts=passage)

def convert_to_list(example):
    """Wrap the ``contexts`` value in a one-element list, modifying ``example`` in place.

    The text is not split: the whole original value becomes the single item of
    the list. The same ``example`` object is returned.
    """
    wrapped = [example['contexts']]
    example['contexts'] = wrapped
    return example

def filter_by_label(example, source_ds_value):
    """Return whether the example's ``source_ds`` field equals ``source_ds_value``.

    Despite the name, the comparison is on ``source_ds``, not on ``label``.
    """
    origin = example['source_ds']
    return origin == source_ds_value


In [64]:
# Work on the full test split from here on.
ds = hallucination_evaluation_benchmark_ds["test"]

# Two separate map passes: first expose "passage" as "contexts" (dropping the
# old column), then wrap each "contexts" value in a one-element list.
renamed_ds = (
    ds
    .map(rename_keys, remove_columns=["passage"])
    .map(convert_to_list)
)
renamed_ds[0]

Map:   0%|          | 0/14900 [00:00<?, ? examples/s]

Map:   0%|          | 0/14900 [00:00<?, ? examples/s]

{'id': 'd3fb4c3c-d21b-480a-baa0-98d6d0d17c1d',
 'question': 'Which team scored the longest field goal kick of the game?',
 'answer': "['Rams', 'second', 'Marc Bulger', 'Kevin Curtis']",
 'label': 'FAIL',
 'source_ds': 'DROP',
 'contexts': ['Hoping to rebound from the road loss to the Chargers, the Rams went home for Week 9, as they fought the Kansas City Chiefs in a "Show Me State Showdown". The Chiefs struck first as RB Larry Johnson got a 1-yard TD run for the only score of the period.  In the second quarter, things got worse for the Rams as QB Damon Huard completed a 3-yard TD pass to TE Tony Gonzalez, while kicker Lawrence Tynes nailed a 42-yard field goal.  St. Louis got on the board with RB Steven Jackson getting a 2-yard TD run, yet Huard and Gonzalez hooked up with each other again on a 25-yard TD strike.  Rams kicker Jeff Wilkins made a 41-yard field goal to end the half.  In the third quarter, QB Marc Bulger completed a 2-yard TD pass to WR Kevin Curtis for the only score of 

In [66]:
# One filtered view per source dataset. The generator is consumed in order by
# the tuple unpacking, so the filters run in the same sequence as the names.
(
    drop_ds,
    financebench_ds,
    ragtruth_ds,
    covidqa_ds,
    halueval_ds,
    pubmedqa_ds,
) = (
    renamed_ds.filter(filter_by_label, fn_kwargs={'source_ds_value': source_name})
    for source_name in ("DROP", "FinanceBench", "RAGTruth", "covidQA", "halueval", "pubmedQA")
)

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14900 [00:00<?, ? examples/s]

In [131]:
import numpy as np

def replace_values(row, benchmark: float):
    """Turn one scored row into a verdict string.

    Returns "ERROR" when any field of ``row`` is missing, otherwise "PASS" when
    ``row['faithfulness']`` is strictly greater than ``benchmark`` and "FAIL"
    when it is less than or equal to it.
    """
    has_missing = row.isna().any()
    if has_missing:
        return "ERROR"
    score = row['faithfulness']
    if score <= benchmark:
        return 'FAIL'
    if score > benchmark:
        return 'PASS'
    
def get_accuracy_and_correct_match(df: pd.DataFrame) -> tuple[int, float]:
    """Count the rows whose ``score`` equals ``label`` and report their share.

    Prints the accuracy as a percentage with two decimals.

    Args:
        df: frame with ``label`` and ``score`` columns.

    Returns:
        ``(n_correct, accuracy)`` where ``n_correct`` is the number of matching
        rows and ``accuracy`` is ``n_correct / len(df)``. For a frame with no
        rows the accuracy is undefined, so NaN is returned instead of raising
        ZeroDivisionError.
    """
    # Vectorised count; int() keeps the result a plain Python int.
    n_correct = int((df["label"] == df["score"]).sum())
    n_rows = len(df)
    accuracy = n_correct / n_rows if n_rows else float("nan")
    print(f"Accuracy: {accuracy * 100:.2f}%")

    return n_correct, accuracy
    
def calculate_accuracies(df: pd.DataFrame):
    """Print overall, PASS-only and FAIL-only accuracy for a scored frame.

    Rows whose ``score`` is "ERROR" are counted and reported, but are left out
    of every accuracy figure. The PASS and FAIL subsets are selected by
    ``label``. Nothing is returned; all results are printed.
    """
    error_rows = len(df[df["score"].eq("ERROR")])
    new_df = df[df["score"].ne("ERROR")].copy()
    pass_df = new_df[new_df["label"].eq("PASS")]
    fail_df = new_df[new_df["label"].eq("FAIL")]
    all_rows = len(new_df)
    all_pass_rows = len(pass_df)
    all_fail_rows = len(fail_df)

    # Nothing left to score once ERROR rows are removed.
    if all_rows == 0:
        print("DataFrame is empty. ")
        return

    n_correct, accuracy = get_accuracy_and_correct_match(new_df)
    print(f"All rows: {all_rows}")
    print(f"Error rows: {error_rows}")
    print(f"Correct examples: {n_correct}   Accuracy: {accuracy}")

    # Both "empty" notices are emitted before any per-label report.
    if all_pass_rows == 0:
        print("PASS DataFrame is empty.")
    if all_fail_rows == 0:
        print("FAIL DataFrame is empty.")

    if all_pass_rows:
        n_pass_correct, pass_accuracy = get_accuracy_and_correct_match(pass_df)
        print(f"All PASS rows: {all_pass_rows}")
        print(
            f"Correct PASS examples: {n_pass_correct}   PASS Accuracy: {pass_accuracy}"
        )
    if all_fail_rows:
        n_fail_correct, fail_accuracy = get_accuracy_and_correct_match(fail_df)
        print(f"All FAIL rows: {all_fail_rows}")
        print(
            f"Correct FAIL examples: {n_fail_correct}   FAIL Accuracy: {fail_accuracy}"
        )


In [137]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness

def transform_ds(ds, filename: str, benchmark: float, save_dir=None):
    """Score a dataset with the ragas ``faithfulness`` metric, label it and save it.

    Steps: run ``evaluate`` with ``faithfulness``, convert the result to a
    DataFrame, add a ``score`` column via ``replace_values``, write the frame
    to ``<save_dir>/<filename>`` as CSV, then print accuracies.

    Args:
        ds: dataset passed straight to ``evaluate``.
        filename: name of the CSV file written inside ``save_dir``.
        benchmark: threshold forwarded to ``replace_values``.
        save_dir: output directory. When omitted, the original hard-coded
            location is used so existing calls keep writing to the same place.

    Returns:
        The scored DataFrame, including the added ``score`` column.
    """
    if save_dir is None:
        save_dir = "/home/bartosz-mielczarek/patronus/repositories/hallucination-evaluator-finetuning/data/models/ragas"
    save_path = Path(save_dir)
    # Create (or fail on) the output folder before running the evaluation, so
    # an unusable location cannot discard scores that were already computed.
    save_path.mkdir(parents=True, exist_ok=True)

    score_ds = evaluate(ds, metrics=[faithfulness])
    score_df = score_ds.to_pandas()
    score_df["score"] = score_df.apply(replace_values, axis=1, benchmark=benchmark)

    score_df.to_csv(save_path / filename, index=False)

    calculate_accuracies(df=score_df)

    return score_df

In [139]:
benchmark = 0.5

# Score the DROP subset; the CSV name embeds the threshold as 100*benchmark.
score_drop_df = transform_ds(
    ds=drop_ds,
    benchmark=benchmark,
    filename=f"drop_{100*benchmark}.csv",
)

Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from 

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Accuracy: 59.59%
All rows: 636
Error rows: 364
Correct examples: 379   Accuracy: 0.5959119496855346
Accuracy: 36.22%
All PASS rows: 323
Correct PASS examples: 117   PASS Accuracy: 0.3622291021671827
Accuracy: 83.71%
All FAIL rows: 313
Correct FAIL examples: 262   FAIL Accuracy: 0.8370607028753994


In [143]:
benchmark = 0.5

# Score the FinanceBench subset; the CSV name embeds the threshold as 100*benchmark.
score_financebench_df = transform_ds(
    ds=financebench_ds,
    benchmark=benchmark,
    filename=f"financebench_{100*benchmark}.csv",
)

Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from 

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Accuracy: 57.42%
All rows: 559
Error rows: 441
Correct examples: 321   Accuracy: 0.5742397137745975
Accuracy: 30.82%
All PASS rows: 279
Correct PASS examples: 86   PASS Accuracy: 0.30824372759856633
Accuracy: 83.93%
All FAIL rows: 280
Correct FAIL examples: 235   FAIL Accuracy: 0.8392857142857143


In [144]:
benchmark = 0.5

# Score the RAGTruth subset; the CSV name embeds the threshold as 100*benchmark.
score_ragtruth_df = transform_ds(
    ds=ragtruth_ds,
    benchmark=benchmark,
    filename=f"ragtruth_{100*benchmark}.csv",
)

Evaluating:   0%|          | 0/900 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.


Flattening the indices:   0%|          | 0/900 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/900 [00:00<?, ? examples/s]

Accuracy: 75.76%
All rows: 883
Error rows: 17
Correct examples: 669   Accuracy: 0.7576443941109853
Accuracy: 84.25%
All PASS rows: 724
Correct PASS examples: 610   PASS Accuracy: 0.8425414364640884
Accuracy: 37.11%
All FAIL rows: 159
Correct FAIL examples: 59   FAIL Accuracy: 0.3710691823899371


In [146]:
benchmark = 0.5

# Score the covidQA subset; the CSV name embeds the threshold as 100*benchmark.
score_covidqa_df = transform_ds(
    ds=covidqa_ds,
    benchmark=benchmark,
    filename=f"covidqa_{100*benchmark}.csv",
)

Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s]

No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from the answer.
No statements were generated from 

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1000 [00:00<?, ? examples/s]

Accuracy: 75.00%
All rows: 812
Error rows: 188
Correct examples: 609   Accuracy: 0.75
Accuracy: 72.86%
All PASS rows: 409
Correct PASS examples: 298   PASS Accuracy: 0.7286063569682152
Accuracy: 77.17%
All FAIL rows: 403
Correct FAIL examples: 311   FAIL Accuracy: 0.771712158808933


In [None]:
benchmark = 0.5

# halueval is scored on the first 1000 rows of a shuffle seeded with 42.
score_halueval_df = transform_ds(
    ds=halueval_ds.shuffle(seed=42).select(range(0, 1000)),
    benchmark=benchmark,
    filename=f"halueval_{100*benchmark}.csv",
)
# Kept for reference, still disabled as in the original cell:
# score_pubmedqa_df = transform_ds(ds=pubmedqa_ds, benchmark=benchmark, filename=f"pubmedqa_{100*benchmark}.csv")