## Running CT-Pub trials with three-shot examples retrieved from the CT-Repo vector store (BioBERT embeddings)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json 
import pandas as pd 
from dotenv import load_dotenv
from model_list import models 
import chromadb
import openai
import module 
import os 
from tqdm import tqdm 

In [3]:
# Load secrets from the local .env file into the process environment.
load_dotenv()

# Later cells read os.environ["OPENAI_API_KEY"] and os.environ['HF_TOKEN'].
# The OpenAI key is stored under OPENAI_API_KEY_TEAM, so mirror it onto the
# name those cells use. Check each variable first: assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError.
for _target_var, _source_var in (('OPENAI_API_KEY', 'OPENAI_API_KEY_TEAM'),
                                 ('HF_TOKEN', 'HF_TOKEN')):
    if os.getenv(_source_var) is None:
        raise RuntimeError(
            f"Environment variable {_source_var} is not set; "
            "define it in the environment or in the .env file before running this notebook."
        )
    # Copy directly between environment entries so the secret is never bound
    # to a notebook-level variable.
    os.environ[_target_var] = os.environ[_source_var]

In [4]:
# Read the CT-Pub table: one row per trial, keyed by NCTId.
data_pub = pd.read_csv(filepath_or_buffer='../data/CT-Pub-With-Examples-Corrected.csv')
# Preview the first three rows as a sanity check of the load.
data_pub.head(n=3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe..."
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl..."
2,NCT00283686,HALT Progression of Polycystic Kidney Disease ...,Inclusion Criteria:\n\n* Diagnosis of ADPKD.\n...,The efficacy of interruption of the renin-angi...,"Kidney, Polycystic,","Lisinopril, Telmisartan, Placebo, Standard Blo...",Study A: Percent Annual Change in Total Kidney...,hypertension,"Age, Continuous, Sex: Female, Male, Race (NIH/...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","Age, Weight, Height, BMI, BSA, SBP, DBP, Liver...","`Age`, `Weight`, `Height`, `BMI`, `BSA`, `SBP`..."


In [6]:
# --- Data locations (relative to this notebook) ---
# JSON files for the CT-Pub trials being generated for (source of the retrieval query).
input_json_directory = '../data/ctpub_json'
# JSON files for the CT-Repo trials that serve as retrieved few-shot examples.
rag_json_directory = '../data/ctrepo_json'
# On-disk Chroma store that is searched for similar CT-Repo trials.
vector_store_path = '../chroma_db_ctrepo_indexedby_biobert'

# --- Vector store handle ---
c_db = chromadb.PersistentClient(path=vector_store_path)
# The collection is built in df_to_json_and_indexing.ipynb.
# NOTE(review): get_or_create_collection presumably creates an empty collection
# when the path or name is wrong instead of failing — confirm the store is
# populated before trusting retrieval results.
chroma_collection = c_db.get_or_create_collection(name="ctrepo_all_biobert")

In [7]:
def json_file_to_rag_query(file_name, json_directory):
    """Build the free-text retrieval query for one trial from its JSON file.

    Args:
        file_name: Name of the trial's JSON file inside ``json_directory``
            (callers pass ``NCTId + '.json'``).
        json_directory: Directory that contains ``file_name``.

    Returns:
        str: A multi-line string with the trial's BriefTitle,
        EligibilityCriteria, BriefSummary, Conditions, Interventions and
        PrimaryOutcomes, one labelled field per paragraph.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If any of the six fields is missing from the JSON object.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which differs between machines.
    with open(f"{json_directory}/{file_name}", "r", encoding="utf-8") as file:
        trial_data = json.load(file)

    # The whitespace is spelled out explicitly so the query text (and therefore
    # its embedding) stays byte-identical even if this function is re-indented.
    pad = " " * 8
    return (
        "\n"
        f"{pad}BriefTitle: {trial_data['BriefTitle']}\n\n"
        f"{pad}EligibilityCriteria: {trial_data['EligibilityCriteria']}\n \n"
        f"{pad}BriefSummary: {trial_data['BriefSummary']}\n\n"
        f"{pad}Conditions: {trial_data['Conditions']}\n\n"
        f"{pad}Interventions: {trial_data['Interventions']}\n\n"
        f"{pad}PrimaryOutcomes: {trial_data['PrimaryOutcomes']}\n"
        f"{pad}"
    )

# GPT-4o Generation with RAG (3-shot)

In [8]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# --- Run configuration ---
number_of_similar_trials = 3  # retrieved CT-Repo trials used as few-shot examples
model_name   = models['gpt-4o']
embed_model = HuggingFaceEmbedding(model_name="dmis-lab/biobert-base-cased-v1.2")
Settings.embed_model = embed_model
# LlamaIndex is used for retrieval only; generation goes through
# module.run_generation_single_openai below.
Settings.llm = None

# Trials that were used as three-shot examples only; never generate for them.
# Defined once here because the list does not change between iterations.
example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']

# Output column; rows that are skipped keep None.
data_pub['gpt4o_rag_ts_gen'] = None

# iterrows() is a generator, so tqdm needs `total` to draw a progress bar and ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub)):

    if row['NCTId'] in example_trials_to_avoid:
        continue

    file_name = row['NCTId'] + '.json'
    # String query used only for searching the RAG database.
    query_trial = json_file_to_rag_query(file_name, input_json_directory)

    # Run the RAG query to find similar trials.
    similar_trials = module.query_clinical_trials_using_llamaindex(chroma_collection, vector_store_path,
                                                                   query_trial, top_k=number_of_similar_trials,
                                                                   embed_model=embed_model)

    # Build the system message and question; the few-shot examples come from
    # ct_repo and use the API_BaselineMeasures_Corrected column as reference.
    system_message, question = module.build_three_shot_prompt(row, similar_trials, rag_json_directory,
                                                              ref_col_name='API_BaselineMeasures_Corrected')

    model_query = module.system_user_template(system_message, question)
    model_response = module.run_generation_single_openai(model_query, model_name = model_name,
                                                openai_token = os.environ["OPENAI_API_KEY"], temperature=0.0)

    data_pub.at[index, 'gpt4o_rag_ts_gen'] = model_response

No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.2. Creating a new one with mean pooling.


LLM is explicitly disabled. Using MockLLM.


103it [04:08,  2.41s/it]


In [10]:
# Preview the table with the new gpt4o_rag_ts_gen column.
data_pub.head(n=3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected,gpt4o_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe...",
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl...","`Age, Continuous`, `Sex: Female, Male`, `Race ..."
2,NCT00283686,HALT Progression of Polycystic Kidney Disease ...,Inclusion Criteria:\n\n* Diagnosis of ADPKD.\n...,The efficacy of interruption of the renin-angi...,"Kidney, Polycystic,","Lisinopril, Telmisartan, Placebo, Standard Blo...",Study A: Percent Annual Change in Total Kidney...,hypertension,"Age, Continuous, Sex: Female, Male, Race (NIH/...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","Age, Weight, Height, BMI, BSA, SBP, DBP, Liver...","`Age`, `Weight`, `Height`, `BMI`, `BSA`, `SBP`...","`Age, Continuous`, `Sex: Female, Male`, `Race ..."


# Llama3 Generation with RAG (3-shot) 

In [11]:
# --- Run configuration ---
number_of_similar_trials = 3  # retrieved CT-Repo trials used as few-shot examples
# Hugging Face endpoint for Llama-3 70B Instruct. The GPT-4o `model_name`
# assignment that used to sit here was never read in this cell and was removed.
model_hf_endpoint = models['llama3-70b-it']

embed_model = HuggingFaceEmbedding(model_name="dmis-lab/biobert-base-cased-v1.2")
Settings.embed_model = embed_model
# LlamaIndex is used for retrieval only; generation goes through
# module.run_generation_single_hf_models below.
Settings.llm = None

# Trials that were used as three-shot examples only; never generate for them.
# Defined once here because the list does not change between iterations.
example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']

# Output column; rows that are skipped or fail keep None.
data_pub['llama3_70b_it_rag_ts_gen'] = None

# iterrows() is a generator, so tqdm needs `total` to draw a progress bar and ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub)):

    if row['NCTId'] in example_trials_to_avoid:
        continue

    file_name = row['NCTId'] + '.json'
    # String query used only for searching the RAG database.
    query_trial = json_file_to_rag_query(file_name, input_json_directory)

    # Known failure mode: the endpoint rejects requests where `inputs` tokens +
    # `max_new_tokens` exceed 8192 (HTTP 422 UnprocessableEntityError). Such
    # trials are logged and their cell is left as None.
    try:
        # Run the RAG query to find similar trials.
        similar_trials = module.query_clinical_trials_using_llamaindex(chroma_collection, vector_store_path,
                                                                    query_trial, top_k=number_of_similar_trials,
                                                                    embed_model=embed_model)

        # Build the system message and question; ref_col_name selects which
        # column is used as reference in the RAG examples from ctrepo.
        system_message, question = module.build_three_shot_prompt(row, similar_trials, rag_json_directory,
                                                                ref_col_name='API_BaselineMeasures_Corrected')

        model_query = module.system_user_template(system_message, question)
        model_response = module.run_generation_single_hf_models(model_query, model_hf_endpoint,
                                                        os.environ['HF_TOKEN'], temperature=0.0)

        data_pub.at[index, 'llama3_70b_it_rag_ts_gen'] = model_response

    except Exception as e:
        print(f"An error occurred while processing trial {row['NCTId']}: {e}")

No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.2. Creating a new one with mean pooling.


LLM is explicitly disabled. Using MockLLM.


94it [04:26,  2.19s/it]

An error occurred while processing trial NCT03394027: Error code: 422 - {'error': 'Input validation error: `inputs` tokens + `max_new_tokens` must be <= 8192. Given: 9604 `inputs` tokens and 1000 `max_new_tokens`', 'error_type': 'validation'}


103it [04:48,  2.80s/it]


In [12]:
# Preview the table with both generation columns.
data_pub.head(n=3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe...",,
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."
2,NCT00283686,HALT Progression of Polycystic Kidney Disease ...,Inclusion Criteria:\n\n* Diagnosis of ADPKD.\n...,The efficacy of interruption of the renin-angi...,"Kidney, Polycystic,","Lisinopril, Telmisartan, Placebo, Standard Blo...",Study A: Percent Annual Change in Total Kidney...,hypertension,"Age, Continuous, Sex: Female, Male, Race (NIH/...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","Age, Weight, Height, BMI, BSA, SBP, DBP, Liver...","`Age`, `Weight`, `Height`, `BMI`, `BSA`, `SBP`...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Race ..."


In [13]:
# Save both models' generations to disk.
generations_csv_path = '../data/hidden_data/biobert_embed/CT-Pub-With-Examples-Corrected-biobert-allgen.csv'
# to_csv does not create missing parent directories, so make sure the folder
# exists rather than losing the generation run to an OSError at save time.
os.makedirs(os.path.dirname(generations_csv_path), exist_ok=True)
data_pub.to_csv(generations_csv_path, index=False)

## GPT-4o Evaluation

In [14]:
# Start the evaluation table with the same index as data_pub and only the trial ids;
# the evaluation loop adds one column of evaluator output per model.
eval_data_pub = data_pub[['NCTId']].copy()

In [15]:
# Show the generation table again before evaluating; data_pub has not changed since it was saved.
data_pub.head(n=3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,TrialGroup,API_BaselineMeasures,API_BaselineMeasures_Corrected,Paper_BaselineMeasures,Paper_BaselineMeasures_Corrected,gpt4o_rag_ts_gen,llama3_70b_it_rag_ts_gen
0,NCT00000620,Action to Control Cardiovascular Risk in Diabe...,Inclusion Criteria:\n\n* Diagnosed with type 2...,The purpose of this study is to prevent major ...,"Atherosclerosis, Cardiovascular Diseases, Hype...","Anti-hyperglycemic Agents, Anti-hypertensive A...",First Occurrence of a Major Cardiovascular Eve...,hypertension,"Age, Continuous, Gender, Ethnicity (NIH/OMB), ...","`Age, Continuous`, `Gender`, `Ethnicity (NIH/O...","Age, Female sex, Median duration of diabetes, ...","`Age`, `Female sex`, `Median duration of diabe...",,
1,NCT00126737,Home-Based Exercise and Weight Control Program...,Inclusion Criteria:\n\n* Male \& female 50 yea...,The purpose of this study is to determine whet...,"Chronic Diseases, Obesity, Osteoarthritis, Pain,","Weight Control Nutritional Program, Home-based...","WOMAC Function, Physical Scale SF-36v, Mental ...",obesity,"Age, Continuous, Sex: Female, Male, Race/Ethni...","`Age, Continuous`, `Sex: Female, Male`, `Race/...","Age, Duration of OA, Kellgren-Lawrence Classif...","`Age`, `Duration of OA`, `Kellgren-Lawrence Cl...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Ethni..."
2,NCT00283686,HALT Progression of Polycystic Kidney Disease ...,Inclusion Criteria:\n\n* Diagnosis of ADPKD.\n...,The efficacy of interruption of the renin-angi...,"Kidney, Polycystic,","Lisinopril, Telmisartan, Placebo, Standard Blo...",Study A: Percent Annual Change in Total Kidney...,hypertension,"Age, Continuous, Sex: Female, Male, Race (NIH/...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","Age, Weight, Height, BMI, BSA, SBP, DBP, Liver...","`Age`, `Weight`, `Height`, `BMI`, `BSA`, `SBP`...","`Age, Continuous`, `Sex: Female, Male`, `Race ...","`Age, Continuous`, `Sex: Female, Male`, `Race ..."


In [16]:
import json
from tqdm import tqdm

# Trials that were used as three-shot examples only; they are not evaluated.
# Defined once here because the list does not change between iterations.
example_trials_to_avoid = ['NCT00000620', 'NCT01483560', 'NCT04280783']

# iterrows() is a generator, so tqdm needs `total` to draw a progress bar and ETA.
for index, row in tqdm(data_pub.iterrows(), total=len(data_pub)):

    if row['NCTId'] in example_trials_to_avoid:
        continue

    # Skip trials that lack a generation from either model. pd.isna covers both
    # None (set in this session) and NaN (what None becomes after a CSV
    # round-trip); the previous `is None` test covered only the Llama column.
    if pd.isna(row['gpt4o_rag_ts_gen']) or pd.isna(row['llama3_70b_it_rag_ts_gen']):
        continue

    # Build the question only for trials that are actually evaluated.
    qstart = module.get_question_from_row(row)

    # Reference features come from the ctpub Paper_BaselineMeasures_Corrected column.
    reference_list = module.extract_elements_v2(row['Paper_BaselineMeasures_Corrected'])

    candidate_gts = module.extract_elements_v2(row['gpt4o_rag_ts_gen'])
    candidate_lts = module.extract_elements_v2(row['llama3_70b_it_rag_ts_gen'])

    system_message_gts, question_gts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_gts,
                                                            qstart)

    system_message_lts, question_lts = module.build_gpt4_eval_prompt(reference_list,
                                                            candidate_lts,
                                                            qstart)

    eval_model_response_gts = module.run_evaluation_with_gpt4o(system_message_gts, question_gts, os.environ["OPENAI_API_KEY"])
    eval_model_response_lts = module.run_evaluation_with_gpt4o(system_message_lts, question_lts, os.environ["OPENAI_API_KEY"])

    # Store the evaluator responses as returned; the scoring cell parses them
    # with json.loads.
    eval_data_pub.at[index, 'gpt4o_ts_gen_matches'] = eval_model_response_gts
    eval_data_pub.at[index, 'llama3_70b_it_ts_gen_matches'] = eval_model_response_lts
    


103it [14:10,  8.26s/it]


In [17]:
# Save the raw evaluator responses to disk.
evaluations_csv_path = '../data/hidden_data/biobert_embed/CT-Pub-With-Examples-Corrected-biobert-alleval.csv'
# to_csv does not create missing parent directories, so make sure the folder
# exists rather than losing the evaluation run to an OSError at save time.
os.makedirs(os.path.dirname(evaluations_csv_path), exist_ok=True)
eval_data_pub.to_csv(evaluations_csv_path, index=False)

In [18]:
# Preview the evaluator output; trials that were skipped have empty cells.
eval_data_pub.head(n=3)

Unnamed: 0,NCTId,gpt4o_ts_gen_matches,llama3_70b_it_ts_gen_matches
0,NCT00000620,,
1,NCT00126737,"{\n ""matched_features"": [\n [""Age"", ...","{\n ""matched_features"": [\n [""Age"", ..."
2,NCT00283686,"{\n ""matched_features"": [\n [""Age"", ...","{\n ""matched_features"": [\n [""Age"", ..."


In [19]:
# Print one evaluator response in full (row label 2) to inspect its JSON structure.
print(eval_data_pub.loc[2, 'gpt4o_ts_gen_matches'])

{
    "matched_features": [
        ["Age", "Age, Continuous"],
        ["BMI", "Body Mass Index (kg/m2)"],
        ["eGFR", "Chronic Kidney Disease Epidemiology Collaboration (CKD EPI) eGFR (ml/min/m^2)"]
    ],
    "remaining_reference_features": [
        "Weight",
        "Height",
        "BSA",
        "SBP",
        "DBP",
        "Liver cysts",
        "Liver volume",
        "height adjusted liver volume",
        "liver cyst volume",
        "height adjusted liver cyst volume",
        "liver parenchymal volume",
        "height adjusted liver parnehcymal volume",
        "total kidney volume",
        "height adjusted total kidney volume",
        "spleen volume",
        "height adjusted spleen volume",
        "serum albumin",
        "serum sodium",
        "hemoglobin",
        "WBC",
        "platelets",
        "AST",
        "ALT",
        "alkaline phosphatase",
        "bilirubin",
        "physical functioning QOL",
        "physical role QOL",
        "bodily pain

## Score Calculation

In [20]:
# Start the score table with the same index as data_pub and only the trial ids;
# the scoring loop adds one JSON-encoded score column per model.
score_df = data_pub[['NCTId']].copy()

In [21]:
import json 

# Running sums of per-trial metrics for each model (gts = GPT-4o three-shot,
# lts = Llama-3 three-shot). They are divided by total_count when reporting.
gts_sum = {'precision':0, 'recall':0, 'f1':0}
lts_sum = {'precision':0, 'recall':0, 'f1':0}

# Number of trials for which BOTH models were scored successfully.
total_count = 0

metric_names = ('precision', 'recall', 'f1')
avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783'] #these were used as examples for 3-shot generation

for index, row in eval_data_pub.iterrows():
    if row['NCTId'] in avoid_ids:
        continue

    gts_raw = row['gpt4o_ts_gen_matches']
    lts_raw = row['llama3_70b_it_ts_gen_matches']
    # Missing evaluations are NaN, and `not NaN` is False, so the old truthiness
    # test never skipped them. Check pd.isna first, then empty strings as before.
    if pd.isna(gts_raw) or pd.isna(lts_raw) or not gts_raw or not lts_raw:
        continue

    print(row['NCTId'])

    try:
        # gts
        gts_dict = json.loads(gts_raw)
        gts_score = module.match_to_score(gts_dict['matched_features'],
                                          gts_dict['remaining_reference_features'],
                                          gts_dict['remaining_candidate_features'])
        score_df.at[index, 'gpt4o_ts_gen_scores'] = json.dumps(gts_score)

        # lts
        lts_dict = json.loads(lts_raw)
        lts_score = module.match_to_score(lts_dict['matched_features'],
                                          lts_dict['remaining_reference_features'],
                                          lts_dict['remaining_candidate_features'])
        score_df.at[index, 'llama3_70b_it_ts_gen_scores'] = json.dumps(lts_score)

        # Read every metric before touching the running sums, so a missing key
        # is caught here and cannot leave the sums half-updated.
        gts_metrics = {name: gts_score[name] for name in metric_names}
        lts_metrics = {name: lts_score[name] for name in metric_names}

    except Exception as e:
        print(f"An error occurred while processing trial {row['NCTId']}: {e}")
        continue

    # Accumulate only after both models were scored. Previously the GPT-4o sums
    # were incremented before the Llama response was parsed, so a Llama failure
    # added to gts_sum without incrementing total_count and inflated its average.
    for name in metric_names:
        gts_sum[name] += gts_metrics[name]
        lts_sum[name] += lts_metrics[name]
    total_count += 1



NCT00126737
NCT00283686
NCT00329030
NCT00360334
NCT00395746
NCT00419562
NCT00441064
NCT00490529
NCT00552409
NCT00556933
NCT00568178
NCT00618072
NCT00713830
NCT00751114
NCT00781937
NCT00791479
NCT00793455
NCT00819182
NCT00863746
NCT00896181
NCT00917267
NCT00949884
NCT00962247
NCT00967668
NCT01000480
NCT01031680
NCT01101880
NCT01279109
NCT01357551
NCT01435603
NCT01441973
NCT01484873
NCT01496469
NCT01574157
NCT01592695
NCT01621178
NCT01652729
NCT01676220
NCT01686828
NCT01757847
NCT01760239
NCT01767155
NCT01768637
NCT01785849
NCT01821352
NCT01862796
NCT01973972
NCT01986881
NCT02003963
NCT02008682
NCT02038179
NCT02109029
NCT02111980
NCT02137512
NCT02214186
NCT02278471
NCT02342639
NCT02358668
NCT02409329
NCT02437084
NCT02473926
NCT02531035
NCT02572882
NCT02592421
NCT02602496
NCT02620774
NCT02623348
NCT02643966
NCT02646982
NCT02680574
NCT02692040
NCT02692560
NCT02698891
NCT02738086
NCT02776553
NCT02790437
NCT02809183
NCT02833857
NCT02834663
NCT02836873
NCT02846779
NCT02892149
NCT03014479
NCT0

In [22]:
# Preview the per-trial scores; each cell is a JSON string produced by json.dumps.
score_df.head(n=5)

Unnamed: 0,NCTId,gpt4o_ts_gen_scores,llama3_70b_it_ts_gen_scores
0,NCT00000620,,
1,NCT00126737,"{""precision"": 0.625, ""recall"": 0.4166666666666...","{""precision"": 0.45454545454545453, ""recall"": 0..."
2,NCT00283686,"{""precision"": 0.375, ""recall"": 0.0789473684210...","{""precision"": 0.375, ""recall"": 0.0789473684210..."
3,NCT00329030,"{""precision"": 0.46153846153846156, ""recall"": 0...","{""precision"": 0.7142857142857143, ""recall"": 0...."
4,NCT00360334,"{""precision"": 1.0, ""recall"": 0.777777777777777...","{""precision"": 1.0, ""recall"": 0.666666666666666..."


Old scores on the default embedding (output preserved from an earlier run, for comparison)

In [35]:
# print(f"GPT-4o Three Shot Scores: Precision={gts_sum['precision']/total_count} Recall={gts_sum['recall']/total_count} F1={gts_sum['f1']/total_count}")
# print(f"Llama-3 70B Instruct Three Shot Scores: Precision={lts_sum['precision']/total_count} Recall={lts_sum['recall']/total_count} F1={lts_sum['f1']/total_count}")


GPT-4o Three Shot Scores: Precision=0.5490141313583681 Recall=0.5489891569472068 F1=0.5151587237155687
Llama-3 70B Instruct Three Shot Scores: Precision=0.5660702514654267 Recall=0.5073770601671448 F1=0.5078170294444107


New scores on the BioBERT embedding (output preserved from an earlier run, for comparison)

In [22]:
# print(f"GPT-4o Three Shot Scores: Precision={gts_sum['precision']/total_count} Recall={gts_sum['recall']/total_count} F1={gts_sum['f1']/total_count}")
# print(f"Llama-3 70B Instruct Three Shot Scores: Precision={lts_sum['precision']/total_count} Recall={lts_sum['recall']/total_count} F1={lts_sum['f1']/total_count}")


GPT-4o Three Shot Scores: Precision=0.5591517436943357 Recall=0.5535307578557245 F1=0.5193824245300138
Llama-3 70B Instruct Three Shot Scores: Precision=0.5732227379319589 Recall=0.5148503992157587 F1=0.5111383339243238


In [23]:
# Report the mean precision / recall / F1 per model: the running sums from the
# scoring cell divided by the number of trials scored for both models.
if total_count == 0:
    # Avoid a ZeroDivisionError when nothing was scored.
    print("No trials were scored (total_count is 0); nothing to average.")
else:
    for label, sums in (("GPT-4o Three Shot Scores", gts_sum),
                        ("Llama-3 70B Instruct Three Shot Scores", lts_sum)):
        print(f"{label}: Precision={sums['precision']/total_count} Recall={sums['recall']/total_count} F1={sums['f1']/total_count}")

GPT-4o Three Shot Scores: Precision=0.5650678660346715 Recall=0.5606475864098799 F1=0.5228024148860804
Llama-3 70B Instruct Three Shot Scores: Precision=0.5746215150846026 Recall=0.5176042232709696 F1=0.5121088313144019
