In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each cell
# runs, so edits to local .py files take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import module
from dotenv import load_dotenv
from model_list import models
import pandas as pd
import os 

# Load variables from a .env file (if present) before looking up the tokens.
load_dotenv()

# Each value is None when the corresponding environment variable is not set.
hf_api_key = os.environ.get('HF_TOKEN')  # <insert your own huggingface token here>
openai_api_key = os.environ.get('OPENAI_API_KEY_TEAM')  # <insert your own openai token here>

In [3]:
# CT-Pub evaluation (match results) and generation (model outputs) tables.
eval_csv = '../data/hidden_data/biobert_embed/CT-Pub-With-Examples-Corrected-biobert-alleval.csv'
gen_csv = '../data/hidden_data/biobert_embed/CT-Pub-With-Examples-Corrected-biobert-allgen.csv'

data_pub_eval = pd.read_csv(eval_csv)
data_pub_gen = pd.read_csv(gen_csv)

# Print (rows, columns) for each table, evaluation first.
for frame in (data_pub_eval, data_pub_gen):
    print(frame.shape)

(103, 3)
(103, 14)


In [4]:
# Preview the first two rows of the generation table.
data_pub_gen.head(n=2)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe...",,
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."


In [5]:
# Preview the first two rows of the evaluation (matches) table.
data_pub_eval.head(n=2)

Unnamed: 0,NCTId,gpt4o_ts_gen_matches,llama3_70b_it_ts_gen_matches
0,NCT00000620,,
1,NCT00126737,"{\n ""matched_features"": [\n [""Age"", ...","{\n ""matched_features"": [\n [""Age"", ..."


## Example Hallucination Calculation Check 

In [6]:

# Hand-built example used to sanity-check module.calculate_hallucination.
# `Gender` (reference) and `Inflammation` (candidate) are negative hallucinations:
# they appear neither in the matched pairs nor in the remaining-feature lists.
reference_features = [
    '`Age`', '`Blood Pressure`', '`Height`', '`Gender`',
    '`Previous Medication`', '`Race`', '`Ethnicity`',
]
candidate_features = [
    '`Age`', '`Systolic Blood Pressure`', '`Diastolic Blood Pressure`',
    '`Body Mass Index`', '`Race`', '`Inflammation`',
]

# Each pair is [reference feature, candidate feature].
matched_pairs = [
    ["`Age`", "`Age`"],
    # Positive hallucination on both sides; only one is counted per matched pair.
    # NOTE(review): "`bogus 2" has no closing backtick -- confirm whether that is intentional.
    ["`bogus1`", "`bogus 2"],
    # Positive hallucination: 'body mass index' does not exist in the reference list.
    ["`body mass index`", "`Body Mass Index`"],
    # Multi-match hallucination in reference (one of the matches counts as correct).
    ["`Blood Pressure`", "`Systolic Blood Pressure`"],
    ["`Blood Pressure`", "`Diastolic Blood Pressure`"],
    # Multi-match hallucination in candidate (one of the matches counts as correct).
    ["`Race`", "`Race`"],
    ["`Ethnicity`", "`Race`"],
    # Positive hallucination: 'patient height' does not exist in the candidate list.
    ["`Height`", "`patient height`"],
]

matched_results = {
    "matched_features": matched_pairs,
    "remaining_reference_features": ["`Previous Medication`"],
    "remaining_candidate_features": [],
}

module.calculate_hallucination(reference_features, candidate_features, matched_results)

(3, 2, 2, 3)

## Calculate for whole CT-Pub Dataset and save in dataframe

In [7]:
# Start the results table from the trial identifiers and groups; the copy keeps
# later column additions from touching data_pub_gen.
pub_hallucination_results = data_pub_gen[['NCTId', 'TrialGroup']].copy()

In [8]:
import json 
ref_column_name = 'Paper_BaselineMeasures_Corrected'

# These trials were used as examples for 3-shot generation, so they are not scored.
avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783']

# One entry per generation model:
# (generation column in data_pub_gen, matches column in data_pub_eval, result column).
generation_specs = [
    ('gpt4o_rag_ts_gen', 'gpt4o_ts_gen_matches', 'gpt4o_ts_gen_hal'),
    ('llama3_70b_it_rag_ts_gen', 'llama3_70b_it_ts_gen_matches', 'llama3_70b_it_ts_gen_hal'),
]


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _hallucination_metrics(reference_features, candidate_features, matches):
    """
    Score one generation against its reference feature list.

    Parameters:
    reference_features (list): Features extracted from the reference column.
    candidate_features (list): Features extracted from the model generation.
    matches (dict): Parsed JSON match result for this trial and model.

    Returns:
    tuple: The four values returned by module.calculate_hallucination (the last one is
    used as the number of correct matches), followed by adjusted precision, recall and F1.
    """
    hallucination = module.calculate_hallucination(reference_features, candidate_features, matches)
    correct_matches = hallucination[3]
    # An empty candidate or reference list yields a score of 0 instead of ZeroDivisionError.
    precision = _safe_ratio(correct_matches, len(candidate_features))
    recall = _safe_ratio(correct_matches, len(reference_features))
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return (hallucination[0], hallucination[1], hallucination[2], correct_matches, precision, recall, f1)


# Create the result columns up front so they exist even if every row ends up skipped.
for _, _, result_column in generation_specs:
    if result_column not in pub_hallucination_results.columns:
        pub_hallucination_results[result_column] = None

for index, row_gen in data_pub_gen.iterrows():
    if row_gen['NCTId'] in avoid_ids:
        continue

    row_eval = data_pub_eval[data_pub_eval['NCTId'] == row_gen['NCTId']]
    if row_eval.empty:
        print(f"Missing NCTId {row_gen['NCTId']} in data_pub_eval")
        continue
    reference_features = module.extract_elements_v2(row_gen[ref_column_name])

    for generation_column, matches_column, result_column in generation_specs:
        raw_matches = row_eval[matches_column].values[0]
        # A missing matches cell is read as NaN, which json.loads cannot parse;
        # report it and leave the result cell empty rather than aborting the whole run.
        if pd.isna(raw_matches):
            print(f"Missing {matches_column} for NCTId {row_gen['NCTId']}")
            continue

        candidate_features = module.extract_elements_v2(row_gen[generation_column])
        pub_hallucination_results.at[index, result_column] = _hallucination_metrics(
            reference_features, candidate_features, json.loads(raw_matches)
        )


In [9]:
# Display the per-trial result tuples; rows for the skipped example trials stay empty.
pub_hallucination_results

Unnamed: 0,NCTId,TrialGroup,gpt4o_ts_gen_hal,llama3_70b_it_ts_gen_hal
0,NCT00000620,hypertension,,
1,NCT00126737,obesity,"(0, 0, 0, 5, 0.625, 0.4166666666666667, 0.5)","(0, 0, 0, 5, 0.45454545454545453, 0.4166666666..."
2,NCT00283686,hypertension,"(0, 0, 0, 3, 0.375, 0.07894736842105263, 0.130...","(0, 0, 0, 3, 0.375, 0.07894736842105263, 0.130..."
3,NCT00329030,cancer,"(0, 0, 0, 6, 0.46153846153846156, 0.75, 0.5714...","(0, 0, 0, 5, 0.7142857142857143, 0.625, 0.6666..."
4,NCT00360334,diabetes,"(0, 0, 0, 7, 1.0, 0.7777777777777778, 0.875000...","(0, 0, 0, 6, 1.0, 0.6666666666666666, 0.8)"
...,...,...,...,...
98,NCT03890588,chronic kidney disease,"(0, 0, 0, 8, 0.8, 0.8, 0.8000000000000002)","(0, 0, 0, 8, 0.8, 0.8, 0.8000000000000002)"
99,NCT03987919,diabetes,"(0, 0, 0, 4, 0.6666666666666666, 0.28571428571...","(0, 0, 0, 8, 0.8, 0.5714285714285714, 0.666666..."
100,NCT04280783,hypertension,,
101,NCT04392375,hypertension,"(1, 12, 0, 6, 0.5454545454545454, 0.2222222222...","(0, 0, 0, 8, 0.5333333333333333, 0.29629629629..."


# Workshop Hallucination Record Generation


In [12]:
import pandas as pd

def transform_to_long_format(pub_hallucination_results, model_columns=None):
    """
    Transforms the given DataFrame containing model results into a long format.

    Each input row holds one metrics tuple per model column. The output has one row per
    (trial, model) pair with the tuple expanded into separate metric columns.

    Parameters:
    pub_hallucination_results (pd.DataFrame): Must contain 'NCTId', 'TrialGroup' and the
        model result columns. It is not modified.
    model_columns (list[str] | None): Result columns to unpivot. Defaults to
        ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal'].

    Returns:
    pd.DataFrame: Columns 'NCTId', 'TrialGroup', 'Generation Model' followed by
        'Positive Hallucination', 'Negative Hallucination', 'Multi-match Hallucination',
        'Correct Matches', 'Precision', 'Recall', 'F1'. Entries that are not a tuple of
        the expected length produce empty metric values. An empty input yields an empty
        frame with the same columns.
    """
    metric_names = ['Positive Hallucination', 'Negative Hallucination', 'Multi-match Hallucination',
                    'Correct Matches', 'Precision', 'Recall', 'F1']
    if model_columns is None:
        model_columns = ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal']

    # Unpivot: one row per (trial, model), with the raw tuple in 'Metrics'.
    long_format = pub_hallucination_results.melt(
        id_vars=['NCTId', 'TrialGroup'],
        value_vars=list(model_columns),
        var_name='Generation Model',
        value_name='Metrics'
    )

    # Replace anything that is not a well-formed metrics tuple with an all-None placeholder.
    placeholder = (None,) * len(metric_names)
    metric_rows = [
        entry if isinstance(entry, tuple) and len(entry) == len(metric_names) else placeholder
        for entry in long_format['Metrics']
    ]

    # Passing `columns` explicitly keeps the metric columns present even with zero rows.
    expanded = pd.DataFrame(metric_rows, index=long_format.index, columns=metric_names)

    # Both frames share the melted index, so the join lines rows up one-to-one.
    return long_format.drop(columns=['Metrics']).join(expanded)



In [15]:
# Export only trials that have results for every model. Incomplete rows are dropped here,
# rather than relying on another cell having already reassigned pub_hallucination_results,
# so the CSV is the same whatever order the cells are run in.
long_data = transform_to_long_format(pub_hallucination_results.dropna())
long_data.to_csv('../workshop_results/CT_Pub_rag_hallucination_results.csv', index=False)
long_data.head(5)

Unnamed: 0,NCTId,TrialGroup,Generation Model,Positive Hallucination,Negative Hallucination,Multi-match Hallucination,Correct Matches,Precision,Recall,F1
0,NCT00126737,obesity,gpt4o_ts_gen_hal,0,0,0,5,0.625,0.416667,0.5
1,NCT00283686,hypertension,gpt4o_ts_gen_hal,0,0,0,3,0.375,0.078947,0.130435
2,NCT00329030,cancer,gpt4o_ts_gen_hal,0,0,0,6,0.461538,0.75,0.571429
3,NCT00360334,diabetes,gpt4o_ts_gen_hal,0,0,0,7,1.0,0.777778,0.875
4,NCT00395746,diabetes,gpt4o_ts_gen_hal,0,0,0,5,0.625,1.0,0.769231


In [16]:
# (rows, columns) of the long-format table: one row per (trial, generation model) pair.
long_data.shape

(200, 10)

# Score calculation (Avg)

In [10]:
# Average adjusted precision, recall and F1 over all scored trials, one column per model.
models = ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal']

# Keep only rows without missing values (the example trials skipped during scoring
# have no metric tuples), then report how many trials remain.
pub_hallucination_results = pub_hallucination_results.dropna()
print(pub_hallucination_results.shape)

adjusted_scores = pd.DataFrame({"Metric": ["Adjusted Precision", "Adjusted Recall", "Adjusted F1"]})

# Positions of precision, recall and F1 inside each 7-element metrics tuple.
metric_positions = (4, 5, 6)
for model in models:
    per_trial_metrics = pub_hallucination_results[model]
    adjusted_scores[model] = [
        per_trial_metrics.apply(lambda metrics, pos=pos: metrics[pos]).mean()
        for pos in metric_positions
    ]

adjusted_scores

(100, 4)


Unnamed: 0,Metric,gpt4o_ts_gen_hal,llama3_70b_it_ts_gen_hal
0,Adjusted Precision,0.531134,0.538396
1,Adjusted Recall,0.542683,0.495542
2,Adjusted F1,0.494834,0.480023


# Grouped Score Calculation by TrialGroup

In [11]:
# Mean precision, recall and F1 per TrialGroup for each generation model.

# Metric-tuple column for each model, mapped to the prefix used in the output row names.
score_prefix_by_column = {
    'gpt4o_ts_gen_hal': 'gpt4o_rag_three_shot',
    'llama3_70b_it_ts_gen_hal': 'llama3_rag_three_shot',
}
# Position of each score inside the 7-element metrics tuple.
score_positions = {'precision': 4, 'recall': 5, 'f1': 6}


def _mean_scores_for_group(group):
    """Return the mean of every (model, score) combination for one TrialGroup."""
    return pd.Series({
        f'{prefix}_{score}': group[column].apply(lambda metrics, pos=pos: metrics[pos]).mean()
        for column, prefix in score_prefix_by_column.items()
        for score, pos in score_positions.items()
    })


metric_columns = list(score_prefix_by_column)
grouped_scores = (
    pub_hallucination_results
    # Rows without metric tuples cannot be indexed; drop them here so this cell does
    # not depend on an earlier cell having filtered the frame.
    .dropna(subset=metric_columns)
    # Selecting the metric columns keeps the grouping column out of the applied frame,
    # which avoids the pandas deprecation warning for apply on grouping columns.
    .groupby('TrialGroup')[metric_columns]
    .apply(_mean_scores_for_group)
    .reset_index()
)

grouped_scores.T

  grouped_scores = pub_hallucination_results.groupby('TrialGroup').apply(


Unnamed: 0,0,1,2,3,4
TrialGroup,cancer,chronic kidney disease,diabetes,hypertension,obesity
gpt4o_rag_three_shot_precision,0.403782,0.594078,0.601417,0.473141,0.493742
gpt4o_rag_three_shot_recall,0.583516,0.499542,0.600824,0.49217,0.478992
gpt4o_rag_three_shot_f1,0.446131,0.516523,0.556035,0.415888,0.462235
llama3_rag_three_shot_precision,0.390035,0.611941,0.621254,0.493118,0.475433
llama3_rag_three_shot_recall,0.497815,0.472811,0.555641,0.474923,0.418772
llama3_rag_three_shot_f1,0.400876,0.507324,0.550469,0.432416,0.427039
