In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import module
from dotenv import load_dotenv
from model_list import models
import pandas as pd
import os 

# Populate the process environment (presumably from a local .env file via python-dotenv),
# then read the API credentials from it so that no secret is hard-coded in the notebook.
load_dotenv()
hf_api_key = os.environ.get('HF_TOKEN')  # Hugging Face token; None when HF_TOKEN is not set
openai_api_key = os.environ.get('OPENAI_API_KEY_TEAM')  # OpenAI key; None when OPENAI_API_KEY_TEAM is not set

In [3]:
# Load the evaluation (match) results and the generation results for CT-Repo,
# then report the shape of each frame as a quick sanity check.
data_repo_eval, data_repo_gen = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-alleval.csv',
        '../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-allgen.csv',
    )
)
for loaded_frame in (data_repo_eval, data_repo_gen):
    print(loaded_frame.shape)

(1693, 3)
(1693, 12)


In [4]:
data_repo_gen.head(2)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...",,
1,NCT00003901,Prognostic Study of Metastases in Patients Wit...,Inclusion Criteria:\n\n1. Patient must be ≥ 18...,RATIONALE: Prognostic testing for early signs ...,"Lung Cancer,","immunohistochemistry staining method, biopsy, ...",Overall Survival in Lymph Nodes Examined Patie...,cancer,"Age, Continuous, Gender, Race/Ethnicity, Custo...","`Age, Continuous`, `Gender`, `Race/Ethnicity, ...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."


In [5]:
data_repo_eval.head(2)

Unnamed: 0,NCTId,gpt4o_ts_gen_matches,llama3_70b_it_ts_gen_matches
0,NCT00000620,,
1,NCT00003901,"{\n ""matched_features"": [\n [""Age, C...","{\n ""matched_features"": [\n [""Age, C..."


## Example Hallucination Calculation Check 

In [6]:

# Hand-built sanity check for module.calculate_hallucination.
# `Gender` (reference) and `Inflammation` (candidate) appear neither in the matched pairs nor in
# the remaining-feature lists below, so they are the negative hallucinations of this example.
reference_features = ['`Age`', '`Blood Pressure`', '`Height`', '`Gender`', '`Previous Medication`', '`Race`', '`Ethnicity`']
candidate_features = ['`Age`', '`Systolic Blood Pressure`', '`Diastolic Blood Pressure`', '`Body Mass Index`', '`Race`', "`Inflammation`"]

# Matcher output in the same JSON shape that is later parsed from the *_matches columns:
# a list of [reference, candidate] pairs plus the unmatched leftovers on each side.
matched_results =   {
                        "matched_features": [
                            ["`Age`", "`Age`"], # genuine match: present in both feature lists
                            ["`bogus1`", "`bogus 2"], # positive hallucination: neither label exists in the inputs, but the pair is counted once. NOTE(review): the second label has no closing backtick - confirm this is intended
                            ["`body mass index`", "`Body Mass Index`"], # positive hallucination: 'body mass index' does not exist in the reference list
                            ["`Blood Pressure`", "`Systolic Blood Pressure`"], # multi-match on the reference side; one of the repeated matches is counted as correct
                            ["`Blood Pressure`", "`Diastolic Blood Pressure`"], # multi-match on the reference side (`Blood Pressure` is reused)
                            ["`Race`","`Race`"], # multi-match on the candidate side; one of the repeated matches is counted as correct
                            ["`Ethnicity`", "`Race`"], # multi-match on the candidate side (`Race` is reused)
                            ["`Height`", "`patient height`"] # positive hallucination: 'patient height' does not exist in the candidate list
                        ],
                        "remaining_reference_features": ["`Previous Medication`"],
                        "remaining_candidate_features": []
                    }

# The recorded result for this example is (3, 2, 2, 3). The long-format export further down labels
# these four positions as positive, negative and multi-match hallucinations and correct matches.
module.calculate_hallucination(reference_features, candidate_features, matched_results)

(3, 2, 2, 3)

## Calculate for whole CT-repo Dataset and save in dataframe 

Note: trial NCT03923933 has no RAG generation results (it could not be processed during generation), so it is skipped in the calculation below.

In [7]:
# Start the result table from the trial identifiers and their trial groups;
# the per-model hallucination columns are added to this frame afterwards.
repo_hallucination_results = data_repo_gen[['NCTId', 'TrialGroup']].copy()

In [8]:
import json 
ref_column_name = 'API_BaselineMeasures_Corrected'

# Trials excluded from scoring. Defined once here instead of on every loop iteration.
avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783'] #these were used as examples for 3-shot generation
no_rag_id = 'NCT03923933' #there is one ID for which there is no RAG results

# One entry per generation model:
# (generation column in data_repo_gen, matches column in data_repo_eval, result column to fill)
generation_runs = [
    ('gpt4o_rag_ts_gen', 'gpt4o_ts_gen_matches', 'gpt4o_ts_gen_hal'),
    ('llama3_70b_it_rag_ts_gen', 'llama3_70b_it_ts_gen_matches', 'llama3_70b_it_ts_gen_hal'),
]


def _hallucination_record(hallucination, n_candidate, n_reference):
    """
    Builds the 7-tuple stored per trial and model.

    Parameters:
    hallucination (tuple): Result of module.calculate_hallucination; position 3 is used as the
        number of correct matches (the first three positions are copied through unchanged).
    n_candidate (int): Number of generated (candidate) features.
    n_reference (int): Number of reference features.

    Returns:
    tuple: The four hallucination counts followed by adjusted precision, recall and F1.
    """
    correct = hallucination[3]
    # An empty feature list would otherwise raise ZeroDivisionError; score it as 0 instead.
    precision = correct / n_candidate if n_candidate else 0.0
    recall = correct / n_reference if n_reference else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return (hallucination[0], hallucination[1], hallucination[2], correct, precision, recall, f1)


# Create the (object-typed) result columns up front so tuples can be stored cell by cell.
for _, _, result_column in generation_runs:
    if result_column not in repo_hallucination_results.columns:
        repo_hallucination_results[result_column] = None

for index, row_gen in data_repo_gen.iterrows():
    nct_id = row_gen['NCTId']
    if nct_id in avoid_ids or nct_id == no_rag_id:
        continue

    row_eval = data_repo_eval[data_repo_eval['NCTId'] == nct_id]
    if row_eval.empty:
        print(f"Missing NCTId {nct_id} in data_repo_eval")
        continue
    reference_features = module.extract_elements_v2(row_gen[ref_column_name])

    #calculate adjusted precision, recall and f1 for each three shot generation model
    for generation_column, matches_column, result_column in generation_runs:
        candidate = module.extract_elements_v2(row_gen[generation_column])
        matches = json.loads(row_eval[matches_column].values[0])
        hallucination = module.calculate_hallucination(reference_features, candidate, matches)
        repo_hallucination_results.at[index, result_column] = _hallucination_record(
            hallucination, len(candidate), len(reference_features)
        )


In [9]:
repo_hallucination_results

Unnamed: 0,NCTId,TrialGroup,gpt4o_ts_gen_hal,llama3_70b_it_ts_gen_hal
0,NCT00000620,hypertension,,
1,NCT00003901,cancer,"(1, 0, 0, 6, 0.6666666666666666, 0.54545454545...","(0, 0, 0, 5, 0.5555555555555556, 0.45454545454..."
2,NCT00005879,cancer,"(0, 0, 0, 5, 0.25, 0.4166666666666667, 0.3125)","(0, 0, 0, 5, 0.4166666666666667, 0.41666666666..."
3,NCT00005908,cancer,"(0, 0, 0, 3, 0.16666666666666666, 0.5, 0.25)","(0, 0, 0, 3, 0.25, 0.5, 0.3333333333333333)"
4,NCT00006110,cancer,"(0, 0, 0, 3, 0.21428571428571427, 0.4285714285...","(0, 0, 0, 3, 0.2, 0.42857142857142855, 0.27272..."
...,...,...,...,...
1688,NCT05204134,diabetes,"(0, 0, 0, 5, 0.45454545454545453, 0.7142857142...","(0, 0, 0, 5, 0.5, 0.7142857142857143, 0.588235..."
1689,NCT05289869,obesity,"(3, 0, 0, 6, 0.1935483870967742, 0.66666666666...","(0, 0, 0, 5, 0.20833333333333334, 0.5555555555..."
1690,NCT05387889,hypertension,"(0, 0, 0, 5, 0.5, 0.625, 0.5555555555555556)","(0, 0, 0, 4, 0.4, 0.5, 0.4444444444444445)"
1691,NCT05451329,hypertension,"(0, 0, 0, 3, 0.42857142857142855, 0.5, 0.46153...","(0, 0, 0, 5, 0.7142857142857143, 0.83333333333..."


# Workshop Hallucination Record Generation


In [14]:
def transform_to_long_format(repo_hallucination_results, model_columns=None):
    """
    Transforms the given DataFrame containing model results into a long format.

    Each model column holds one 7-tuple per trial; the result has one row per
    (trial, model) pair with the tuple expanded into seven named metric columns.
    Entries that are not a 7-tuple (e.g. trials that were never scored) are kept
    as rows whose metric columns are all missing. The input frame is not modified.

    Parameters:
    repo_hallucination_results (pd.DataFrame): The DataFrame with 'NCTId', 'TrialGroup'
        and one column of metric tuples per model.
    model_columns (list of str, optional): Names of the model result columns to unpivot.
        Defaults to ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal'].

    Returns:
    pd.DataFrame: Transformed DataFrame in the long format with detailed metrics.
    """
    metric_names = ['Positive Hallucination', 'Negative Hallucination', 'Multi-match Hallucination',
                    'Correct Matches', 'Precision', 'Recall', 'F1']
    if model_columns is None:
        model_columns = ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal']

    # Melt so that every (trial, model) pair becomes one row holding its metric tuple
    long_format = repo_hallucination_results.melt(
        id_vars=['NCTId', 'TrialGroup'],
        value_vars=list(model_columns),
        var_name='Generation Model',
        value_name='Metrics'
    )

    # Ensure that every entry is a tuple with one value per metric name
    n_metrics = len(metric_names)
    placeholder = (None,) * n_metrics
    metrics = long_format['Metrics'].apply(
        lambda x: x if isinstance(x, tuple) and len(x) == n_metrics else placeholder
    )

    # Expand the tuples into their respective columns. Passing `columns` explicitly keeps
    # all metric columns present even when the input has no rows.
    expanded = pd.DataFrame(metrics.tolist(), index=long_format.index, columns=metric_names)

    # Replace the original 'Metrics' column with the expanded metric columns
    return pd.concat([long_format.drop(columns=['Metrics']), expanded], axis=1)

In [15]:
# Export only trials that were actually scored. Dropping unscored rows here (rather than relying
# on a later cell having already done so) makes this cell give the same file on a fresh
# top-to-bottom run as in the recorded run.
os.makedirs('../workshop_results', exist_ok=True)  # to_csv does not create missing directories
long_data = transform_to_long_format(repo_hallucination_results.dropna())
long_data.to_csv('../workshop_results/CT_Repo_rag_hallucination_results.csv', index=False)
long_data.head(5)

Unnamed: 0,NCTId,TrialGroup,Generation Model,Positive Hallucination,Negative Hallucination,Multi-match Hallucination,Correct Matches,Precision,Recall,F1
0,NCT00003901,cancer,gpt4o_ts_gen_hal,1,0,0,6,0.666667,0.545455,0.6
1,NCT00005879,cancer,gpt4o_ts_gen_hal,0,0,0,5,0.25,0.416667,0.3125
2,NCT00005908,cancer,gpt4o_ts_gen_hal,0,0,0,3,0.166667,0.5,0.25
3,NCT00006110,cancer,gpt4o_ts_gen_hal,0,0,0,3,0.214286,0.428571,0.285714
4,NCT00006392,cancer,gpt4o_ts_gen_hal,0,0,0,5,0.333333,0.625,0.434783


In [16]:
long_data.shape

(3378, 10)

# Score calculation (Avg)

In [10]:
#calculate average precision, recall and f1 for each model
#average over all examples, save in separate dataframe
adjusted_scores = pd.DataFrame()
adjusted_scores["Metric"] = ["Adjusted Precision", "Adjusted Recall", "Adjusted F1"]
# Named `hal_columns` rather than `models` so that the `models` object imported from
# model_list at the top of the notebook is not overwritten.
hal_columns = ['gpt4o_ts_gen_hal', 'llama3_70b_it_ts_gen_hal']

#remove None values from the dataframe (trials that were skipped during scoring);
#the grouped score calculation below relies on this filtered frame
repo_hallucination_results = repo_hallucination_results.dropna()
print(repo_hallucination_results.shape)

# Positions 4, 5 and 6 of each stored tuple hold precision, recall and f1, in the
# same order as the rows of the "Metric" column.
for hal_column in hal_columns:
    adjusted_scores[hal_column] = [
        repo_hallucination_results[hal_column].apply(lambda x, position=position: x[position]).mean()
        for position in (4, 5, 6)
    ]

adjusted_scores

(1689, 4)


Unnamed: 0,Metric,gpt4o_ts_gen_hal,llama3_70b_it_ts_gen_hal
0,Adjusted Precision,0.44912,0.476169
1,Adjusted Recall,0.627009,0.622964
2,Adjusted F1,0.491735,0.512057


# Grouped Score Calculation by TrialGroup

In [11]:
# Group by TrialGroup and calculate mean precision, recall, and F1 scores for each model
grouped_scores = repo_hallucination_results.groupby('TrialGroup').apply(
    lambda x: pd.Series({
        'gpt4o_rag_three_shot_precision': x['gpt4o_ts_gen_hal'].apply(lambda y: y[4]).mean(),
        'gpt4o_rag_three_shot_recall': x['gpt4o_ts_gen_hal'].apply(lambda y: y[5]).mean(),
        'gpt4o_rag_three_shot_f1': x['gpt4o_ts_gen_hal'].apply(lambda y: y[6]).mean(),
        'llama3_rag_three_shot_precision': x['llama3_70b_it_ts_gen_hal'].apply(lambda y: y[4]).mean(),
        'llama3_rag_three_shot_recall': x['llama3_70b_it_ts_gen_hal'].apply(lambda y: y[5]).mean(),
        'llama3_rag_three_shot_f1': x['llama3_70b_it_ts_gen_hal'].apply(lambda y: y[6]).mean(),
    })
).reset_index()

grouped_scores.T

  grouped_scores = repo_hallucination_results.groupby('TrialGroup').apply(


Unnamed: 0,0,1,2,3,4
TrialGroup,cancer,chronic kidney disease,diabetes,hypertension,obesity
gpt4o_rag_three_shot_precision,0.384212,0.437643,0.484228,0.488191,0.470128
gpt4o_rag_three_shot_recall,0.595799,0.616221,0.657947,0.645322,0.61751
gpt4o_rag_three_shot_f1,0.437765,0.478461,0.526434,0.529911,0.497132
llama3_rag_three_shot_precision,0.418238,0.438108,0.524671,0.505709,0.487618
llama3_rag_three_shot_recall,0.579361,0.650757,0.657775,0.633112,0.612897
llama3_rag_three_shot_f1,0.461782,0.48816,0.557391,0.537646,0.51146
