## We are running CT_Repo Remaining 1000 Sample data using ct_repo indexed stores

In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import json 
import pandas as pd 
from dotenv import load_dotenv
from model_list import models 
import chromadb
import openai
import module 
import os 
from tqdm import tqdm 

In [44]:
load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY_TEAM')
os.environ['HF_TOKEN'] = os.getenv('HF_TOKEN')
os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')

In [45]:
# data_500 = pd.read_csv('../data/CT-Repo-With-Examples-Corrected-500-Sample.csv')
# data = pd.read_csv('../data/CT-Repo-With-Examples-Processed-Version-Corrected.csv')
data = pd.read_csv('../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-allgen.csv')

In [46]:
data.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...",,
1,NCT00003901,Prognostic Study of Metastases in Patients Wit...,Inclusion Criteria:\n\n1. Patient must be ≥ 18...,RATIONALE: Prognostic testing for early signs ...,"Lung Cancer,","immunohistochemistry staining method, biopsy, ...",Overall Survival in Lymph Nodes Examined Patie...,cancer,"Age, Continuous, Gender, Race/Ethnicity, Custo...","`Age, Continuous`, `Gender`, `Race/Ethnicity, ...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."
2,NCT00005879,LY353381 in Preventing Breast Cancer in Women ...,DISEASE CHARACTERISTICS:\n\n* Current random f...,RATIONALE: Chemoprevention therapy is the use ...,"Breast Cancer,","arzoxifene, Placebo,","Change in Masood Score, Number of Participants...",cancer,"Age, Continuous, Sex: Female, Male, Region of ...","`Age, Continuous`, `Sex: Female, Male`, `Regio...","`Age, Continuous`, `Sex: Female`, `Menopausal ...","`Age, Continuous`, `Sex: Female`, `Region of E..."


In [32]:
#main_path = '/Users/nafisneehal/Desktop/CTBench_RAG/'

# Initialize ChromaDB
# Directory containing JSON files
input_json_directory = '../data/ctrepo_json' #source of input json files
rag_json_directory = '../data/ctrepo_json' #source of example json files
vector_store_path = '../chroma_db_ctrepo_indexedby_biobert' #search the ctrepo vector store for examples 

# #initialize client 
c_db = chromadb.PersistentClient(path=vector_store_path)
#get collection
chroma_collection = c_db.get_or_create_collection("ctrepo_all_biobert") #located in df_to_json_and_indexing.ipynb file 

In [33]:
def json_file_to_rag_query(file_name, json_directory):
    with open(f"{json_directory}/{file_name}", "r") as file:
        trial_data = json.load(file)
        trial_query = f"""
        BriefTitle: {trial_data['BriefTitle']}\n
        EligibilityCriteria: {trial_data['EligibilityCriteria']}\n 
        BriefSummary: {trial_data['BriefSummary']}\n
        Conditions: {trial_data['Conditions']}\n
        Interventions: {trial_data['Interventions']}\n
        PrimaryOutcomes: {trial_data['PrimaryOutcomes']}
        """
        return trial_query

In [70]:
eval_error_ids

['NCT01364584',
 'NCT01452412',
 'NCT02337946',
 'NCT02966028',
 'NCT03197038',
 'NCT03686150',
 'NCT03923933']

# GPT-4o Generation with RAG (3 shot) 

In [71]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    filename='gpt4_rag_gen_1000.log',
    filemode='a',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    encoding='utf-8',
    level=logging.INFO,
    force=True
)

number_of_similar_trials = 3
model_name   = models['gpt-4o']
embed_model = HuggingFaceEmbedding(model_name="dmis-lab/biobert-base-cased-v1.2")
Settings.embed_model = embed_model
Settings.llm = None 

#data['gpt4o_rag_ts_gen'] = None 

for index, row in tqdm(data.iterrows()):

    #print(row['NCTId'])

    #AVOID these trials that were used as three-shot examples only
    example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']
    if row['NCTId'] in example_trials_to_avoid:
        logging.info(f"Skipping {row['NCTId']} as it was used as a three-shot example")
        continue

    if row['NCTId'] not in eval_error_ids:
        continue 
    
    file_name = row['NCTId'] + '.json'
    query_trial = json_file_to_rag_query(file_name, input_json_directory) #string query for searching rag database only

    #run rag query and find the filenames of similar trials
    similar_trials = module.query_clinical_trials_using_llamaindex(chroma_collection, vector_store_path,  
                                                                   query_trial, top_k=number_of_similar_trials,
                                                                   embed_model=embed_model)
    
    #print(similar_trials)

    #get system message, question 
    system_message, question = module.build_three_shot_prompt(row, similar_trials, rag_json_directory, 
                                                              ref_col_name='API_BaselineMeasures_Corrected') #asking ct_repo examples 

    model_query = module.system_user_template(system_message, question)

    try:
        model_response = module.run_generation_single_openai(model_query, model_name = model_name, 
                                                    openai_token = os.environ["OPENAI_API_KEY"], temperature=0.0)
    except Exception as e:
        logging.error(f"Error {e} in generating response for {row['NCTId']}")
        print(f"Error {e} in generating response for {row['NCTId']}")
        model_response = None
        continue

    data.at[index, 'gpt4o_rag_ts_gen'] = model_response
    
    # print 
    # print(system_message)
    # print(question)
    # print(model_response)
    # break

LLM is explicitly disabled. Using MockLLM.


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.58it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  6.95it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.53it/s]
1693it [00:28, 60.27it/s]


In [72]:
data.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...",,
1,NCT00003901,Prognostic Study of Metastases in Patients Wit...,Inclusion Criteria:\n\n1. Patient must be ≥ 18...,RATIONALE: Prognostic testing for early signs ...,"Lung Cancer,","immunohistochemistry staining method, biopsy, ...",Overall Survival in Lymph Nodes Examined Patie...,cancer,"Age, Continuous, Gender, Race/Ethnicity, Custo...","`Age, Continuous`, `Gender`, `Race/Ethnicity, ...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."
2,NCT00005879,LY353381 in Preventing Breast Cancer in Women ...,DISEASE CHARACTERISTICS:\n\n* Current random f...,RATIONALE: Chemoprevention therapy is the use ...,"Breast Cancer,","arzoxifene, Placebo,","Change in Masood Score, Number of Participants...",cancer,"Age, Continuous, Sex: Female, Male, Region of ...","`Age, Continuous`, `Sex: Female, Male`, `Regio...","`Age, Continuous`, `Sex: Female`, `Menopausal ...","`Age, Continuous`, `Sex: Female`, `Region of E..."


In [73]:
#get ids where llama3_70b_it_rag_ts_gen is None 
empty_lists = data[data['llama3_70b_it_rag_ts_gen'].isnull()].NCTId.values.tolist()
len(empty_lists)

3

# Llama3 Generation with RAG (3-shot) 

In [74]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    filename='llama3_rag_gen_1000.log',
    filemode='a',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    encoding='utf-8',
    level=logging.INFO,
    force=True
)

number_of_similar_trials = 3
model_name   = models['gpt-4o']
model_hf_endpoint = models['llama3_1-70b-it']

embed_model = HuggingFaceEmbedding(model_name="dmis-lab/biobert-base-cased-v1.2")
Settings.embed_model = embed_model
Settings.llm = None 

#data['llama3_70b_it_rag_ts_gen'] = None 

for index, row in tqdm(data.iterrows()):

    #print(row['NCTId'])
    #AVOID these trials that were used as three-shot examples only
    example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']
    if row['NCTId'] in example_trials_to_avoid:
        logging.info(f"Skipping {row['NCTId']} as it was used as a three-shot example")
        continue

    #run only IDs that are in empty_lists
    if row['NCTId'] not in eval_error_ids:
        continue

    file_name = row['NCTId'] + '.json'
    query_trial = json_file_to_rag_query(file_name, input_json_directory) #string query for searching rag database only

    
    #run rag query and find the filenames of similar trials
    similar_trials = module.query_clinical_trials_using_llamaindex(chroma_collection, vector_store_path,  
                                                                query_trial, top_k=number_of_similar_trials,
                                                                embed_model=embed_model)
    
    # print(similar_trials)

    #get system message, question 
    system_message, question = module.build_three_shot_prompt(row, similar_trials, rag_json_directory, 
                                                            ref_col_name='API_BaselineMeasures_Corrected') #which column to use for reference in RAG examples from ctrepo

    model_query = module.system_user_template(system_message, question)

    try:
        model_response = module.run_generation_single_hf_models(model_query, model_hf_endpoint, 
                                                        os.environ['HF_TOKEN'], temperature=0.0)
        logging.info(f"Generated response for {row['NCTId']} at index {index}")
        data.at[index, 'llama3_70b_it_rag_ts_gen'] = model_response

    #Note: Weird Bug: UnprocessableEntityError: Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 7573 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}
    except Exception as e:
        print(f"An error occurred while processing trial {row['NCTId']}: {e}")
        logging.error(f"Error {e} in generating response for {row['NCTId']} at index {index}")
        data.at[index, 'llama3_70b_it_rag_ts_gen'] = None
        continue

    #print 
    # print(system_message)
    # print(question)
    # print(model_response)
    # break

LLM is explicitly disabled. Using MockLLM.


Batches: 100%|██████████| 1/1 [00:00<00:00,  5.98it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.04it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.02it/s]
1693it [01:03, 26.86it/s]


In [75]:
data.isna().sum()

NCTId                             0
BriefTitle                        0
EligibilityCriteria               0
BriefSummary                      0
Conditions                        0
Interventions                     0
PrimaryOutcomes                   0
TrialGroup                        0
API_BaselineMeasures              0
API_BaselineMeasures_Corrected    0
gpt4o_rag_ts_gen                  3
llama3_70b_it_rag_ts_gen          3
dtype: int64

In [76]:
data.to_csv('../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-allgen.csv', index=False)

## GPT4 Evaluation

In [78]:
eval_data = pd.read_csv('../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-alleval.csv')

In [79]:
eval_error_ids

['NCT01364584',
 'NCT01452412',
 'NCT02337946',
 'NCT02966028',
 'NCT03197038',
 'NCT03686150',
 'NCT03923933']

In [80]:
import json
from tqdm import tqdm

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    filename='ctrepo_rag_eval_1000.log',
    filemode='a',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S',
    encoding='utf-8',
    level=logging.INFO,
    force=True
)

logging.info("Evaluating again with GPT4o 2000 eval max tokens for affected IDs only")

#llama_error_ids = []

for index, row in data.iterrows():

    example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']
    if row['NCTId'] in example_trials_to_avoid:
        logging.info(f"Skipping {row['NCTId']} as it was used as a three-shot example at index {index}")
        continue

   #run only IDs that are in empty_lists
    if row['NCTId'] not in eval_error_ids:
        continue
    

    print(row['NCTId'])
    qstart = module.get_question_from_row(row)

    reference_list = module.extract_elements_v2(row['API_BaselineMeasures_Corrected']) #which column to use for reference as input from ctpub
    
    candidate_gts = module.extract_elements_v2(row['gpt4o_rag_ts_gen'])
    candidate_lts = module.extract_elements_v2(row['llama3_70b_it_rag_ts_gen'])
    
    system_message_gts, question_gts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_gts,
                                                            qstart)
    
    
    system_message_lts, question_lts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_lts,
                                                            qstart)
    

    try:
        eval_model_response_gts = module.run_evaluation_with_gpt4o(system_message_gts, question_gts, os.environ["OPENAI_API_KEY"])
        eval_data.at[index, 'gpt4o_ts_gen_matches'] = eval_model_response_gts
    except Exception as e:
        logging.error(f"Error {e} in generating response for {row['NCTId']} at index {index}")
        print(f"Error {e} in generating response for {row['NCTId']} at index {index}")
        eval_data.at[index, 'gpt4o_ts_gen_matches'] = None
        continue
    
    try:
        eval_model_response_lts = module.run_evaluation_with_gpt4o(system_message_lts, question_lts, os.environ["OPENAI_API_KEY"])
        eval_data.at[index, 'llama3_70b_it_ts_gen_matches'] = eval_model_response_lts
        logging.info(f"Generated response for {row['NCTId']} at index {index}")
    except Exception as e:
        logging.error(f"Error {e} in generating response for {row['NCTId']} at index {index}")
        print(f"Error {e} in generating response for {row['NCTId']} at index {index}")
        eval_data.at[index, 'llama3_70b_it_ts_gen_matches'] = None
        continue

    # if index%50==0:
    #     eval_data.to_csv(f'../data/hidden_data/biobert_embed/no_release/CT-Repo-1000-Samples-rag-biobert-alleval_{index}.csv', index=False)
    #break

#logging.info(f"Error ids: {llama_error_ids}")
#eval_data.to_csv('../data/hidden_data/biobert_embed/CT-Repo-1000-Samples-rag-biobert-alleval.csv', index=False)
    


NCT01364584
NCT01452412
NCT02337946
NCT02966028
NCT03197038
NCT03686150
NCT03923933


In [81]:
eval_data.to_csv('../data/hidden_data/biobert_embed/CT-Repo-rag-biobert-alleval.csv', index=False)

In [82]:
print(eval_data.at[2, 'gpt4o_ts_gen_matches'])

{
    "matched_features": [
        ["Age, Continuous", "Age, Continuous"],
        ["Sex: Female, Male", "Sex: Female"],
        ["Menopause Status", "Menopausal Status"],
        ["Hormone Use", "Hormone Replacement Therapy Use"],
        ["Number relatives with Breast Cancer", "Family History of Breast Cancer"]
    ],
    "remaining_reference_features": [
        "Region of Enrollment",
        "Height",
        "Weight",
        "BMI",
        "Age at Menarche",
        "Age at First Live Birth",
        "Prior Biopsy"
    ],
    "remaining_candidate_features": [
        "BRCA1/BRCA2 Mutation Status",
        "Oral Contraceptive Use",
        "Baseline Masood Score",
        "Hemoglobin",
        "Granulocyte Count",
        "Albumin",
        "Bilirubin",
        "AST",
        "Alkaline Phosphatase",
        "Creatinine",
        "History of Deep Venous Thrombosis",
        "History of Coronary Artery Disease",
        "History of Stroke",
        "Factor V Leiden Mutation Status

## Score Calculation (Whole 1700 Trials)

In [83]:
eval_data.isna().sum()

NCTId                           0
gpt4o_ts_gen_matches            3
llama3_70b_it_ts_gen_matches    3
dtype: int64

In [84]:
eval_data.shape

(1693, 3)

In [85]:
score_df = pd.DataFrame()
score_df['NCTId'] = data['NCTId']

In [86]:
import json 

gts_sum = {'precision':0, 'recall':0, 'f1':0}
lts_sum = {'precision':0, 'recall':0, 'f1':0}

total_count = 0

eval_error_ids = []

for index, row in tqdm(eval_data.iterrows()):
    avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783'] #these were used as examples for 3-shot generation
    if row['NCTId'] in avoid_ids:
        continue
    if not row['llama3_70b_it_ts_gen_matches'] or not row['gpt4o_ts_gen_matches']:
        continue

    #print(row['NCTId'])
    
    try:
        # gts
        gts_dict = json.loads(row['gpt4o_ts_gen_matches'])
        gts_matches = gts_dict['matched_features']
        gts_remaining_references = gts_dict['remaining_reference_features']
        gts_remaining_candidates = gts_dict['remaining_candidate_features']
        gts_score = module.match_to_score(gts_matches, gts_remaining_references, gts_remaining_candidates)
        score_df.at[index, 'gpt4o_ts_gen_scores'] = json.dumps(gts_score)
        gts_sum['precision'] += gts_score['precision']
        gts_sum['recall'] += gts_score['recall']
        gts_sum['f1'] += gts_score['f1']

        # lts 
        lts_dict = json.loads(row['llama3_70b_it_ts_gen_matches'])
        lts_matches = lts_dict['matched_features']
        lts_remaining_references = lts_dict['remaining_reference_features']
        lts_remaining_candidates = lts_dict['remaining_candidate_features']
        lts_score = module.match_to_score(lts_matches, lts_remaining_references, lts_remaining_candidates)
        score_df.at[index, 'llama3_70b_it_ts_gen_scores'] = json.dumps(lts_score)
        lts_sum['precision'] += lts_score['precision']
        lts_sum['recall'] += lts_score['recall']
        lts_sum['f1'] += lts_score['f1']

        total_count += 1

    except Exception as e:
        print(f"An error occurred while processing trial {row['NCTId']}: {e}")
        eval_error_ids.append(row['NCTId'])
        continue

    #break



1693it [00:00, 17796.35it/s]

An error occurred while processing trial NCT03923933: Unterminated string starting at: line 627 column 9 (char 23154)





In [87]:
eval_error_ids

['NCT03923933']

In [88]:
score_df.head()

Unnamed: 0,NCTId,gpt4o_ts_gen_scores,llama3_70b_it_ts_gen_scores
0,NCT00000620,,
1,NCT00003901,"{""precision"": 0.7777777777777778, ""recall"": 0....","{""precision"": 0.5555555555555556, ""recall"": 0...."
2,NCT00005879,"{""precision"": 0.25, ""recall"": 0.41666666666666...","{""precision"": 0.4166666666666667, ""recall"": 0...."
3,NCT00005908,"{""precision"": 0.16666666666666666, ""recall"": 0...","{""precision"": 0.25, ""recall"": 0.5, ""f1"": 0.333..."
4,NCT00006110,"{""precision"": 0.21428571428571427, ""recall"": 0...","{""precision"": 0.2, ""recall"": 0.428571428571428..."


New Score

In [102]:
size = eval_data.shape[0] - len(eval_error_ids)
print(f"GPT-4o Three Shot Scores: Precision={gts_sum['precision']/size} Recall={gts_sum['recall']/size} F1={gts_sum['f1']/size}")
print(f"Llama-3 70B Instruct Three Shot Scores: Precision={lts_sum['precision']/size} Recall={lts_sum['recall']/size} F1={lts_sum['f1']/size}")


GPT-4o Three Shot Scores: Precision=0.45989216975047625 Recall=0.6339730760819745 F1=0.5010950848650941
Llama-3 70B Instruct Three Shot Scores: Precision=0.482190981343621 Recall=0.6252284937268767 F1=0.5163080729745275


In [90]:
size = eval_data.shape[0] - len(eval_error_ids)
print(f"GPT-4o Three Shot Scores: Precision={gts_sum['precision']/size} Recall={gts_sum['recall']/size} F1={gts_sum['f1']/size}")
print(f"Llama-3 70B Instruct Three Shot Scores: Precision={lts_sum['precision']/size} Recall={lts_sum['recall']/size} F1={lts_sum['f1']/size}")

GPT-4o Three Shot Scores: Precision=0.4574058059746241 Recall=0.6339029669924033 F1=0.4990967535520557
Llama-3 70B Instruct Three Shot Scores: Precision=0.4825286746822389 Recall=0.628152797930514 F1=0.5176417915705456
