# Run the whole benchmark on CT-Repo Dataset

In [11]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import module, secret_keys
from model_list import models
import pandas as pd

hf_api_key             = secret_keys.HF_TOKEN                   #<insert your own huggingface token here>
openai_api_key         = secret_keys.OPENAI_API_KEY_TEAM        #<insert your own openai token here>

# Load Dataset

These three IDs are three-shot example IDs in each dataset - ['NCT00000620', 'NCT01483560', 'NCT04280783']. Do not run generation or evaluation on them.

In [3]:
data_repo = pd.read_csv('data_new/CT-Repo-With-Examples-Processed-Version-Corrected.csv')
print(data_repo.shape)
data_repo.head(2)

(1693, 10)


Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O..."
1,NCT00003901,Prognostic Study of Metastases in Patients Wit...,Inclusion Criteria:\n\n1. Patient must be ≥ 18...,RATIONALE: Prognostic testing for early signs ...,"Lung Cancer,","immunohistochemistry staining method, biopsy, ...",Overall Survival in Lymph Nodes Examined Patie...,cancer,"Age, Continuous, Gender, Race/Ethnicity, Custo...","`Age, Continuous`, `Gender`, `Race/Ethnicity, ..."


## Generation using GPT-4o with 0-shot prompts (CT-Repo)

In [None]:
from tqdm import tqdm 
model_name   = models['gpt-4o']

data_repo['gpt4o_zs_gen'] = None

for index, row in tqdm(data_repo.iterrows()):

    try:
        system_message, question = module.build_zeroshot_prompt(data_repo, row)
        gen_model_query = module.system_user_template(system_message, question)
        gen_model_response = module.run_generation_single_openai(gen_model_query, model_name, 
                                                    openai_api_key, temperature=0.0)
    
    except Exception as e:
        print(f"Error {e} at index {index}")
        continue
    
    data_repo.at[index, 'gpt4o_zs_gen'] = gen_model_response

    # print(system_message)
    # print(question)
    # print(gen_model_response)

    # break 


## Generation using GPT-4o with 3-shot prompts (CT-Repo)

In [24]:
model_name   = models['gpt-4o']

data_repo['gpt4o_ts_gen'] = None

for index, row in tqdm(data_repo.iterrows()):
    system_message, question = module.build_three_shot_prompt(data_repo, row, ref_col_name='API_BaselineMeasures_Corrected')
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_openai(model_query, model_name, 
                                                    openai_api_key, temperature=0.0)
    
    data_repo.at[index, 'gpt4o_ts_gen'] = model_response



103it [09:32,  5.56s/it]


## Generation using Llama3-70B-it with 0-shot prompts (CT-Repo)

In [29]:
model_hf_endpoint = models['llama3-70b-it']

data_repo['llama3_70b_it_zs_gen'] = None

for index, row in tqdm(data_repo.iterrows()):

    system_message, question = module.build_zeroshot_prompt(data_repo, row)
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_hf_models(model_query, model_hf_endpoint, 
                                                    hf_api_key, temperature=0.0)
    
    data_repo.at[index, 'llama3_70b_it_zs_gen'] = model_response



103it [05:17,  3.08s/it]


## Generation using Llama3-70B-it with 3-shot prompts (CT-Repo)

In [11]:
from tqdm import tqdm 
model_hf_endpoint = models['llama3-70b-it']

#data_repo['llama3_70b_it_ts_gen'] = None

for index, row in tqdm(data_repo.iterrows()):

    # re_ids = ['NCT00286091', 'NCT00311155', 'NCT00769704']
    # if row['NCTId'] not in re_ids:
    #     continue

    system_message, question = module.build_three_shot_prompt(data_repo, row, ref_col_name='API_BaselineMeasures_Corrected')
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_hf_models(model_query, model_hf_endpoint, 
                                                    hf_api_key, temperature=0.0)
    
    data_repo.at[index, 'llama3_70b_it_ts_gen'] = model_response
    


1693it [00:06, 256.04it/s]


In [12]:
data_repo.to_csv('hidden_data/CT-Repo-With-Examples-Corrected-allgen.csv', index=False)

## Evaluation Code

In [14]:
data_repo = pd.read_csv('hidden_data/CT-Repo-With-Examples-Corrected-allgen.csv')

In [15]:
data_repo.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,gpt4o_zs_gen,gpt4o_ts_gen,llama3_70b_it_zs_gen,llama3_70b_it_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","`Age`, `Sex`, `Race/Ethnicity`, `Body Mass Ind...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","`Age`, `Sex`, `Race`, `Ethnicity`, `Body Mass ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O..."
1,NCT00003901,Prognostic Study of Metastases in Patients Wit...,Inclusion Criteria:\n\n1. Patient must be ≥ 18...,RATIONALE: Prognostic testing for early signs ...,"Lung Cancer,","immunohistochemistry staining method, biopsy, ...",Overall Survival in Lymph Nodes Examined Patie...,cancer,"Age, Continuous, Gender, Race/Ethnicity, Custo...","`Age, Continuous`, `Gender`, `Race/Ethnicity, ...","`Age`, `Gender`, `ECOG/Zubrod status`, `Clinic...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","`age`, `sex`, `ECOG/Zubrod status`, `histologi...","`Age, Continuous`, `Sex: Female, Male`, `Race/..."
2,NCT00005879,LY353381 in Preventing Breast Cancer in Women ...,DISEASE CHARACTERISTICS:\n\n* Current random f...,RATIONALE: Chemoprevention therapy is the use ...,"Breast Cancer,","arzoxifene, Placebo,","Change in Masood Score, Number of Participants...",cancer,"Age, Continuous, Sex: Female, Male, Region of ...","`Age, Continuous`, `Sex: Female, Male`, `Regio...","`Age`, `Sex`, `Menopausal status`, `Hemoglobin...","`Age, Continuous`, `Sex: Female`, `Race/Ethnic...","`Age`, `Sex`, `Menopausal status`, `Body Mass ...","`Age, Continuous`, `Sex: Female`, `Race/Ethnic..."


### Evaluation of Reference VS Candidate Responses

In [5]:
# eval_data_repo = pd.DataFrame()
# eval_data_repo['NCTId'] = data_repo['NCTId']

In [13]:
eval_data_repo = pd.read_csv('hidden_data/CT-Repo-With-Examples-Corrected-allgpteval.csv')

Note: eval_model_response from GPT-4o is coming in string format with JSON structure. You need to use json.loads() to use it as a json object.

In [None]:
import json
from tqdm import tqdm
import time 

for index, row in tqdm(data_repo.iterrows()):
    
    print(f"Processing NCTId {row['NCTId']}")

    # avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783'] #these were used as examples for 3-shot generation
    # if row['NCTId'] in avoid_ids:
    #     continue

    if row['NCTId']!='NCT04985383':
        continue 

    qstart = module.get_question_from_row(row)

    reference_list = module.extract_elements_v2(row['API_BaselineMeasures_Corrected'])
    
    candidate_gzs = module.extract_elements_v2(row['gpt4o_zs_gen'])
    candidate_gts = module.extract_elements_v2(row['gpt4o_ts_gen'])
    candidate_lzs = module.extract_elements_v2(row['llama3_70b_it_zs_gen'])
    candidate_lts = module.extract_elements_v2(row['llama3_70b_it_ts_gen'])
    
    system_message_gzs, question_gzs = module.build_gpt4_eval_prompt(reference_list, 
                                                            candidate_gzs, 
                                                            qstart)
    
    system_message_gts, question_gts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_gts,
                                                            qstart)
    
    system_message_lzs, question_lzs = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_lzs,
                                                            qstart)
    
    system_message_lts, question_lts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_lts,
                                                            qstart)

    try:
        eval_model_response_gzs = module.run_evaluation_with_gpt4o(system_message_gzs, question_gzs, openai_api_key)
    except Exception as e:
        eval_model_response_gzs = None
        print(f"Error {e} at index {index} with NCTId {row['NCTId']}")
        break
    try:
        eval_model_response_gts = module.run_evaluation_with_gpt4o(system_message_gts, question_gts, openai_api_key)
    except Exception as e:
        eval_model_response_gts = None
        print(f"Error {e} at index {index} with NCTId {row['NCTId']}")
        break 
    try:
        eval_model_response_lzs = module.run_evaluation_with_gpt4o(system_message_lzs, question_lzs, openai_api_key)
    except Exception as e:
        eval_model_response_lzs = None
        print(f"Error {e} at index {index} with NCTId {row['NCTId']}")
        break 
    try:
        eval_model_response_lts = module.run_evaluation_with_gpt4o(system_message_lts, question_lts, openai_api_key)
    except Exception as e:
        eval_model_response_lts = None
        print(f"Error {e} at index {index} with NCTId {row['NCTId']}")
        break 
    
    #Convert eval_model_response to a JSON string and store in the dataframe
    eval_data_repo.at[index, 'gpt4o_zs_gen_matches'] = eval_model_response_gzs 
    eval_data_repo.at[index, 'gpt4o_ts_gen_matches'] = eval_model_response_gts
    eval_data_repo.at[index, 'llama3_70b_it_zs_gen_matches'] = eval_model_response_lzs
    eval_data_repo.at[index, 'llama3_70b_it_ts_gen_matches'] = eval_model_response_lts

    # if index%20==0:
    #     time.sleep(5)
    
    # if index%100==0:
    #     eval_data_repo.to_csv(f'hidden_data/no_release/CT-Repo-With-Examples-Corrected-allgen-eval-upto-{index}.csv', index=False)
    


In [25]:
eval_data_repo.to_csv('hidden_data/CT-Repo-With-Examples-Corrected-allgpteval.csv', index=False)