# Run the Whole Benchmark on the CT-Pub Dataset

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes, so edits to local helper code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import module, secret_keys
from model_list import models
import pandas as pd

# Credentials are read from the `secret_keys` module imported above; put your own tokens there.
hf_api_key = secret_keys.HF_TOKEN  # Hugging Face token
openai_api_key = secret_keys.OPENAI_API_KEY_TEAM  # OpenAI API key

# Load Dataset

The following three NCT IDs are the three-shot examples used in each dataset: ['NCT00000620', 'NCT01483560', 'NCT04280783']. Do not run generation or evaluation on them.

In [3]:
# Read the CT-Pub table, report its (rows, columns) shape, and preview the first two rows.
ct_pub_csv_path = 'data_new/CT-Pub-With-Examples-Corrected.csv'
data_pub = pd.read_csv(ct_pub_csv_path)
print(data_pub.shape)
data_pub.head(2)

(103, 12)


Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe..."
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl..."


## Generation using GPT-4o with 0-shot prompts (CT-Pub)

In [17]:
from tqdm import tqdm

# Zero-shot generation with GPT-4o: for every row of data_pub, build the zero-shot prompt,
# call module.run_generation_single_openai once, and store its return value in `gpt4o_zs_gen`.
# NOTE(review): this loop visits every row, including the three-shot example IDs that the
# dataset notes say to exclude from generation - confirm they are filtered out downstream.
model_name = models['gpt-4o']

# Create the column up front so rows not yet processed hold None if the run is interrupted.
data_pub['gpt4o_zs_gen'] = None

# iterrows() is a generator without a length, so tqdm needs total= to show progress and an ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub), desc='gpt-4o zero-shot'):
    system_message, question = module.build_zeroshot_prompt(data_pub, row)
    gen_model_query = module.system_user_template(system_message, question)
    gen_model_response = module.run_generation_single_openai(
        gen_model_query, model_name, openai_api_key, temperature=0.0
    )

    data_pub.at[index, 'gpt4o_zs_gen'] = gen_model_response


103it [03:27,  2.01s/it]


## Generation using GPT-4o with 3-shot prompts (CT-Pub)

In [24]:
# Three-shot generation with GPT-4o: for every row of data_pub, build the three-shot prompt
# (reference answers taken from `Paper_BaselineMeasures_Corrected`), call
# module.run_generation_single_openai once, and store its return value in `gpt4o_ts_gen`.
# NOTE(review): this loop visits every row, including the three-shot example IDs that the
# dataset notes say to exclude from generation - confirm they are filtered out downstream.
model_name = models['gpt-4o']

# Create the column up front so rows not yet processed hold None if the run is interrupted.
data_pub['gpt4o_ts_gen'] = None

# iterrows() is a generator without a length, so tqdm needs total= to show progress and an ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub), desc='gpt-4o three-shot'):
    system_message, question = module.build_three_shot_prompt(
        data_pub, row, ref_col_name='Paper_BaselineMeasures_Corrected'
    )
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_openai(
        model_query, model_name, openai_api_key, temperature=0.0
    )

    data_pub.at[index, 'gpt4o_ts_gen'] = model_response



103it [09:32,  5.56s/it]


## Generation using Llama3-70B-it with 0-shot prompts (CT-Pub)

In [29]:
# Zero-shot generation with Llama3-70B-it: for every row of data_pub, build the zero-shot prompt,
# call module.run_generation_single_hf_models once, and store its return value in
# `llama3_70b_it_zs_gen`.
# NOTE(review): this loop visits every row, including the three-shot example IDs that the
# dataset notes say to exclude from generation - confirm they are filtered out downstream.
model_hf_endpoint = models['llama3-70b-it']

# Create the column up front so rows not yet processed hold None if the run is interrupted.
data_pub['llama3_70b_it_zs_gen'] = None

# iterrows() is a generator without a length, so tqdm needs total= to show progress and an ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub), desc='llama3-70b-it zero-shot'):
    system_message, question = module.build_zeroshot_prompt(data_pub, row)
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_hf_models(
        model_query, model_hf_endpoint, hf_api_key, temperature=0.0
    )

    data_pub.at[index, 'llama3_70b_it_zs_gen'] = model_response



103it [05:17,  3.08s/it]


## Generation using Llama3-70B-it with 3-shot prompts (CT-Pub)

In [33]:
# Three-shot generation with Llama3-70B-it: for every row of data_pub, build the three-shot prompt
# (reference answers taken from `Paper_BaselineMeasures_Corrected`), call
# module.run_generation_single_hf_models once, and store its return value in
# `llama3_70b_it_ts_gen`.
# NOTE(review): this loop visits every row, including the three-shot example IDs that the
# dataset notes say to exclude from generation - confirm they are filtered out downstream.
model_hf_endpoint = models['llama3-70b-it']

# Create the column up front so rows not yet processed hold None if the run is interrupted.
data_pub['llama3_70b_it_ts_gen'] = None

# iterrows() is a generator without a length, so tqdm needs total= to show progress and an ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub), desc='llama3-70b-it three-shot'):
    system_message, question = module.build_three_shot_prompt(
        data_pub, row, ref_col_name='Paper_BaselineMeasures_Corrected'
    )
    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_hf_models(
        model_query, model_hf_endpoint, hf_api_key, temperature=0.0
    )

    data_pub.at[index, 'llama3_70b_it_ts_gen'] = model_response
    


103it [05:23,  3.14s/it]


In [35]:
from pathlib import Path

# Persist data_pub (now carrying all four generation columns) without the index column.
allgen_csv = Path('hidden_data/CT-Pub-With-Examples-Corrected-allgen.csv')

# DataFrame.to_csv does not create missing directories; make sure the target folder exists
# so the generated responses are not lost to an OSError at save time.
allgen_csv.parent.mkdir(parents=True, exist_ok=True)
data_pub.to_csv(allgen_csv, index=False)

## Evaluation Code

In [4]:
# Replace the in-memory data_pub with the saved table that includes the generation columns.
allgen_csv_path = 'hidden_data/CT-Pub-With-Examples-Corrected-allgen.csv'
data_pub = pd.read_csv(allgen_csv_path)

### Evaluation of Reference VS Candidate Responses

In [5]:
# Results frame for the evaluation: starts with only the NCTId column, on data_pub's index.
eval_data_pub = data_pub[['NCTId']].copy()

Note: the `eval_model_response` returned by GPT-4o is a string containing JSON. Parse it with `json.loads()` before using it as a JSON object.

In [16]:
import json
from tqdm import tqdm

# Generation columns in data_pub to evaluate. The evaluator output for each one is written to
# eval_data_pub under '<generation column>_matches'.
generation_columns = [
    'gpt4o_zs_gen',
    'gpt4o_ts_gen',
    'llama3_70b_it_zs_gen',
    'llama3_70b_it_ts_gen',
]

# For every row, compare each model's candidate list against the reference list taken from
# `Paper_BaselineMeasures_Corrected`, using module.run_evaluation_with_gpt4o as the judge.
# NOTE(review): this loop visits every row, including the three-shot example IDs that the
# dataset notes say to exclude from evaluation - confirm they are filtered out downstream.
# iterrows() is a generator without a length, so tqdm needs total= to show progress and an ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub), desc='gpt-4o evaluation'):

    qstart = module.get_question_from_row(row)
    reference_list = module.extract_elements_v2(row['Paper_BaselineMeasures_Corrected'])

    # Same steps for each generation column, in the order listed above.
    for gen_col in generation_columns:
        candidate_list = module.extract_elements_v2(row[gen_col])

        system_message, question = module.build_gpt4_eval_prompt_with_example(
            reference_list, candidate_list, qstart
        )
        eval_model_response = module.run_evaluation_with_gpt4o(
            system_message, question, openai_api_key
        )

        # Store the evaluator response exactly as returned (no conversion is applied here).
        eval_data_pub.at[index, f'{gen_col}_matches'] = eval_model_response
    


103it [00:15,  6.60it/s]


In [90]:
from pathlib import Path

# Persist the evaluation results (NCTId plus the '*_matches' columns) without the index column.
eval_csv = Path('hidden_data/CT-Pub-With-Examples-Corrected-allgpteval-eval-prompt-edited.csv')

# DataFrame.to_csv does not create missing directories; make sure the target folder exists
# so the evaluation responses are not lost to an OSError at save time.
eval_csv.parent.mkdir(parents=True, exist_ok=True)
eval_data_pub.to_csv(eval_csv, index=False)