In [35]:
import numpy as np 
import pandas as pd 
import re 
import os
import json 

In [33]:
def extract_elements_with_cleaning(text):
    """Split a comma-separated answer string into a cleaned list of unique items.

    Processing steps, in order:
      1. Remove every '##Answer:' tag and trim surrounding whitespace.
      2. Split on commas, keeping commas that sit inside parentheses
         (e.g. "a, b, (c, d), e" -> ['a', 'b', '(c, d)', 'e']).
      3. Drop empty items and items that are a single special character.
      4. Remove exact duplicates, keeping the first occurrence.
      5. Drop items containing a banned keyword ('Continuous', 'Customized'),
         compared case-insensitively.

    Args:
        text: The raw string to parse. Any non-string value (for example
            None, or the float NaN pandas uses for missing cells) is treated
            as "nothing to extract".

    Returns:
        list[str]: The cleaned items in their original order; an empty list
        when there is nothing to extract.
    """
    # Missing values have no '.replace' and would otherwise raise AttributeError
    if not isinstance(text, str):
        return []

    # Remove the '##Answer:' tag that some responses carry
    text = text.replace("##Answer:", "").strip()

    # Matches either a balanced, non-nested "(...)" group that starts exactly at
    # the scan position, or a run of non-comma characters.
    # Example: "a, b, (c, d), e" -> ['a', ' b', ' (c', ' d)', ' e']
    # (the leading space keeps the first alternative from matching ' (c, d)',
    # so parenthesised groups are stitched back together below)
    pattern = r'\([^()]*\)|[^,]+'
    matches = re.findall(pattern, text)

    # Re-join fragments that were split on a comma inside parentheses
    elements = []
    temp = ''
    for match in matches:
        # Keep appending only while 'temp' has an OPEN '(' still waiting for its ')'.
        # Using '>' (not '!=') stops a stray ')' such as in "1) Age" from
        # swallowing every following item into one element.
        if temp.count('(') > temp.count(')'):
            temp += ',' + match
        else:
            if temp:
                elements.append(temp.strip())
            temp = match
    if temp:  # Append the last collected item
        elements.append(temp.strip())

    # Remove empty items (e.g. from "a, , b") and lone special characters
    # Example: ["a", "", "(c, d)", "~"] -> ["a", "(c, d)"]
    special_characters = {"(", ")", "`", "~", "!", "@", "#", "$", "%", "^", "&", "*", "-", "+", "=",
                          "|", "\\", "{", "}", "[", "]", ":", ";", '"', "'", "<", ">", ",", ".", "?", "/", "_"}
    elements = [item for item in elements if item and item not in special_characters]

    # Remove duplicates while maintaining order (dicts preserve insertion order)
    # Example: ["a", "b", "(c, d)", "e", "a"] -> ["a", "b", "(c, d)", "e"]
    unique_list = list(dict.fromkeys(elements))

    # Filter out items containing any banned keyword (case insensitive)
    banned_keywords = ['Continuous', 'Customized']
    banned_lowered = [keyword.lower() for keyword in banned_keywords]
    processed_list = [
        item for item in unique_list
        if not any(keyword in item.lower() for keyword in banned_lowered)
    ]

    # Return the final processed list
    return processed_list


In [18]:
# Location of the CT-Pub CSV; try the underscore spelling first
file_path1 = "/Users/nafisneehal/Desktop/CTBench_DAR_Project/data/CT-Pub-With_Examples.csv"
if not os.path.exists(file_path1):
    # Fall back to the hyphenated spelling of the same file name
    file_path1 = "/Users/nafisneehal/Desktop/CTBench_DAR_Project/data/CT-Pub-With-Examples.csv"

# Assigned unconditionally: previously this was set only inside the fallback
# branch, so read_csv(file_path2) raised NameError whenever the first
# file_path1 existed.
file_path2 = "/Users/nafisneehal/Desktop/CTBench_DAR_Project/data/CT-Repo-With-Examples-Processed-Version.csv"

ct_pub = pd.read_csv(file_path1)
ct_repo = pd.read_csv(file_path2)

In [19]:
# Preview the first three rows of the loaded CT-Pub frame
ct_pub.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,BaselineMeasures,Paper_BaselineMeasures
0,NCT02572882,Gut Microbiome and p-Inulin in Hemodialysis,Inclusion Criteria:\n\n* Maintenance hemodialy...,"The Microbiome trial is a non-randomized, open...","End-Stage Renal Disease, Gut Microbiome Dysbio...","p-inulin,",Within Participant Variability in Microbiome C...,"Age, Categorical, Age, Continuous, Sex: Female...","Age, Sex, Race, Ethnicity, Hypertension, Diabe..."
1,NCT02623348,Use of Pedometers to Measure and Increase Walk...,Inclusion Criteria:\n\n* on hemodialysis for ≥...,Randomized controlled trial using pedometers t...,"End-stage Renal Disease,","pedometer,","Physical Activity,","Age, Continuous, Sex: Female, Male, Ethnicity ...","Age, Sex, Race, Ethnicity, BMI, HTN, DM, CAD, ..."
2,NCT01574157,Investigations of the Optimum Serum Bicarbonat...,Inclusion Criteria:\n\n* Veteran\n* Age older ...,The purpose of this study is to see if treatme...,"Chronic Renal Insufficiency, Diabetes Mellitus,","Sodium bicarbonate, Placebo,",Change in Urinary Transforming Growth Factor B...,"Age, Continuous, Sex: Female, Male, Ethnicity ...","Age, Male, White, Hispanic, Coronary artery di..."


In [20]:
# Preview the first three rows of the loaded CT-Repo frame
ct_repo.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,BaselineMeasures
0,NCT02151643,Study to Evaluate the Efficacy and Safety of P...,Inclusion Criteria:\n\n* Men or women aged 18 ...,The main purpose of this study is to see wheth...,"Hyperphosphataemia,","PT20, Placebo,",Change in Serum Phosphate Concentration From B...,"Age, Continuous, Sex: Female, Male, Ethnicity ..."
1,NCT03036150,A Study to Evaluate the Effect of Dapagliflozi...,Inclusion Criteria:\n\n* Provision of signed i...,The purpose of this study is to evaluate the e...,"Chronic Kidney Disease,","Dapagliflozin, Placebo,",Time to the First Occurrence of Any of the Com...,"Age, Continuous, Age, Customized, Age, Customi..."
2,NCT01640184,Efficacy and Safety of Ultrasonic Ablation to ...,Inclusion Criteria:\n\n* Patients with age bet...,It is difficulty for the treatment of secondar...,"Hyperparathyroidism, Disorders of Parathyroid ...","Parathyroidectomy, Active vitamin D, Ultrasoni...",Rate of Achieving the Target on Blood Intact P...,"Age, Continuous, Sex: Female, Male, Race (NIH/..."


## Process CT-Pub Dataset

In [29]:
# Clean each paper-reported baseline list and store it as one comma-separated string.
# Series.apply replaces the row-by-row iterrows()/.at loop and always creates
# the column, even when the frame has no rows.
ct_pub['Paper_BaselineMeasures_Processed'] = ct_pub['Paper_BaselineMeasures'].apply(
    lambda measures: ', '.join(extract_elements_with_cleaning(measures))
)

In [30]:
# Preview the first three rows, now including Paper_BaselineMeasures_Processed
ct_pub.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,BaselineMeasures,Paper_BaselineMeasures,Paper_BaselineMeasures_Processed
0,NCT02572882,Gut Microbiome and p-Inulin in Hemodialysis,Inclusion Criteria:\n\n* Maintenance hemodialy...,"The Microbiome trial is a non-randomized, open...","End-Stage Renal Disease, Gut Microbiome Dysbio...","p-inulin,",Within Participant Variability in Microbiome C...,"Age, Categorical, Age, Continuous, Sex: Female...","Age, Sex, Race, Ethnicity, Hypertension, Diabe...","Age, Sex, Race, Ethnicity, Hypertension, Diabe..."
1,NCT02623348,Use of Pedometers to Measure and Increase Walk...,Inclusion Criteria:\n\n* on hemodialysis for ≥...,Randomized controlled trial using pedometers t...,"End-stage Renal Disease,","pedometer,","Physical Activity,","Age, Continuous, Sex: Female, Male, Ethnicity ...","Age, Sex, Race, Ethnicity, BMI, HTN, DM, CAD, ...","Age, Sex, Race, Ethnicity, BMI, HTN, DM, CAD, ..."
2,NCT01574157,Investigations of the Optimum Serum Bicarbonat...,Inclusion Criteria:\n\n* Veteran\n* Age older ...,The purpose of this study is to see if treatme...,"Chronic Renal Insufficiency, Diabetes Mellitus,","Sodium bicarbonate, Placebo,",Change in Urinary Transforming Growth Factor B...,"Age, Continuous, Sex: Female, Male, Ethnicity ...","Age, Male, White, Hispanic, Coronary artery di...","Age, Male, White, Hispanic, Coronary artery di..."


## Process CT-Repo Dataset

In [31]:
# Clean each registry baseline list and store it as one comma-separated string.
# Series.apply replaces the row-by-row iterrows()/.at loop and always creates
# the column, even when the frame has no rows.
ct_repo['BaselineMeasures_Processed'] = ct_repo['BaselineMeasures'].apply(
    lambda measures: ', '.join(extract_elements_with_cleaning(measures))
)

In [32]:
# Preview the first three rows, now including BaselineMeasures_Processed
ct_repo.head(3)

Unnamed: 0,NCTId,BriefTitle,EligibilityCriteria,BriefSummary,Conditions,Interventions,PrimaryOutcomes,BaselineMeasures,BaselineMeasures_Processed
0,NCT02151643,Study to Evaluate the Efficacy and Safety of P...,Inclusion Criteria:\n\n* Men or women aged 18 ...,The main purpose of this study is to see wheth...,"Hyperphosphataemia,","PT20, Placebo,",Change in Serum Phosphate Concentration From B...,"Age, Continuous, Sex: Female, Male, Ethnicity ...","Age, Sex: Female, Male, Ethnicity (NIH/OMB), R..."
1,NCT03036150,A Study to Evaluate the Effect of Dapagliflozi...,Inclusion Criteria:\n\n* Provision of signed i...,The purpose of this study is to evaluate the e...,"Chronic Kidney Disease,","Dapagliflozin, Placebo,",Time to the First Occurrence of Any of the Com...,"Age, Continuous, Age, Customized, Age, Customi...","Age, Sex: Female, Male, Ethnicity (NIH/OMB), R..."
2,NCT01640184,Efficacy and Safety of Ultrasonic Ablation to ...,Inclusion Criteria:\n\n* Patients with age bet...,It is difficulty for the treatment of secondar...,"Hyperparathyroidism, Disorders of Parathyroid ...","Parathyroidectomy, Active vitamin D, Ultrasoni...",Rate of Achieving the Target on Blood Intact P...,"Age, Continuous, Sex: Female, Male, Race (NIH/...","Age, Sex: Female, Male, Race (NIH/OMB), Region..."


## Save the data files with the processed columns

In [34]:
# Write each frame (with its processed-baseline column) to its own CSV,
# leaving the DataFrame index out of the file.
for frame, destination in (
    (ct_pub, "/Users/nafisneehal/Desktop/CTBench_DAR_Project/data/CT-Pub-With-Examples-With-Processed-Baselines.csv"),
    (ct_repo, "/Users/nafisneehal/Desktop/CTBench_DAR_Project/data/CT-Repo-With-Examples-With-Processed-Baselines.csv"),
):
    frame.to_csv(destination, index=False)

## Process and save the generated responses of LLMs - CT-Pub

In [39]:
# Directory holding the per-trial JSON result files
directory_path = 'results/ct_pub/'

# Sections of each result file whose 'gen-response' is cleaned
model_keys = [
    'generated-by-gpt4-omni-zs',
    'generated-by-gpt4-omni-ts',
    'generated-by-llama3-70b-in-ts',
    'generated-by-llama3-70b-in-zs',
]

# List all entries in the directory
file_list = os.listdir(directory_path)

# Every trial file in the directory is processed and rewritten in place
for file in file_list:
    file_path = os.path.join(directory_path, file)

    # Skip hidden entries (e.g. macOS .DS_Store) and sub-directories;
    # they are not JSON result files and would make json.load fail
    if file.startswith('.') or not os.path.isfile(file_path):
        continue

    # Read the JSON file; the handle is closed before the file is reopened for writing
    with open(file_path) as f:
        data = json.load(f)

    print(data['trial_id'])

    # Clean each model's generated response and store it, as a comma-separated
    # string, next to the raw response
    for model_key in model_keys:
        processed_items = extract_elements_with_cleaning(data[model_key]['gen-response'])
        data[model_key]['processed-gen-response'] = ', '.join(processed_items)

    # Save the processed responses back into the same JSON file
    with open(file_path, 'w') as f2:
        json.dump(data, f2, indent=2)

NCT02003963
NCT00395746
NCT03028948
NCT03394027
NCT00793455
NCT02008682
NCT03546270
NCT02833857
NCT00781937
NCT00329030
NCT02592421
NCT02738086
NCT02698891
NCT00713830
NCT03141905
NCT03708770
NCT02111980
NCT01000480
NCT03890588
NCT02809183
NCT02214186
NCT01101880
NCT01686828
NCT03987919
NCT01574157
NCT02692560
NCT01279109
NCT00819182
NCT01785849
NCT00863746
NCT01621178
NCT01676220
NCT02892149
NCT01973972
NCT03674112
NCT01767155
NCT02473926
NCT02643966
NCT03371108
NCT01760239
NCT00490529
NCT03223649
NCT00917267
NCT00791479
NCT03195140
NCT01862796
NCT01652729
NCT02836873
NCT02834663
NCT02137512
NCT03210220
NCT00949884
NCT03242252
NCT01757847
NCT01031680
NCT00967668
NCT00126737
NCT00618072
NCT02109029
NCT00283686
NCT02602496
NCT00419562
NCT02278471
NCT02692040
NCT02437084
NCT02623348
NCT00896181
NCT02342639
NCT01441973
NCT03014479
NCT01986881
NCT00552409
NCT01357551
NCT02358668
NCT02776553
NCT01484873
NCT02038179
NCT04823949
NCT00441064
NCT02409329
NCT02620774
NCT01768637
NCT01496469
NCT0

## Process and save the generated responses of LLMs - CT-Repo

In [41]:
# Directory holding the per-trial JSON result files
directory_path = 'results/ct_repo/'

# Sections of each result file whose 'gen-response' is cleaned
model_keys = [
    'generated-by-gpt4-omni-zs',
    'generated-by-gpt4-omni-ts',
    'generated-by-llama3-70b-in-ts',
    'generated-by-llama3-70b-in-zs',
]

# List all entries in the directory
file_list = os.listdir(directory_path)

# Every trial file in the directory is processed and rewritten in place
for file in file_list:
    file_path = os.path.join(directory_path, file)

    # Skip hidden entries (e.g. macOS .DS_Store) and sub-directories;
    # they are not JSON result files and would make json.load fail
    if file.startswith('.') or not os.path.isfile(file_path):
        continue

    # Read the JSON file; the handle is closed before the file is reopened for writing
    with open(file_path) as f:
        data = json.load(f)

    print(data['trial_id'])

    # Clean each model's generated response and store it, as a comma-separated
    # string, next to the raw response
    for model_key in model_keys:
        processed_items = extract_elements_with_cleaning(data[model_key]['gen-response'])
        data[model_key]['processed-gen-response'] = ', '.join(processed_items)

    # Save the processed responses back into the same JSON file
    with open(file_path, 'w') as f2:
        json.dump(data, f2, indent=2)

NCT03345095
NCT00802204
NCT01232491
NCT00129389
NCT01170208
NCT02211014
NCT03036852
NCT02461589
NCT02552888
NCT00796991
NCT03426787
NCT02175212
NCT03686150
NCT05204134
NCT00258362
NCT00546052
NCT02003963
NCT03950674
NCT01989546
NCT00395746
NCT00086580
NCT01763346
NCT02010359
NCT03028948
NCT03440814
NCT03394027
NCT01159574
NCT02648204
NCT02448563
NCT01221090
NCT04884191
NCT00793455
NCT01683266
NCT05159622
NCT00602420
NCT04603560
NCT00700817
NCT02582840
NCT02750501
NCT03038620
NCT00088530
NCT02124460
NCT02248714
NCT02029976
NCT02008682
NCT03724487
NCT03818581
NCT04039503
NCT03240874
NCT04771403
NCT03164538
NCT03988920
NCT03374176
NCT01841073
NCT02337946
NCT02214017
NCT01227187
NCT02104804
NCT00515723
NCT03163992
NCT01887600
NCT00798720
NCT01535040
NCT01104870
NCT03387735
NCT02229552
NCT01556997
NCT03670602
NCT01422408
NCT02341417
NCT03546270
NCT00855166
NCT02833857
NCT02960204
NCT00682448
NCT01123980
NCT03887299
NCT02802865
NCT02107443
NCT00649389
NCT02244424
NCT03180294
NCT04591171
NCT0