### This code was for an experiment in the supplementary material

In [2]:
import module, secret_keys
from model_list import models
import pandas as pd
from time import sleep
from tqdm import tqdm
from google.cloud import firestore
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import LabelEncoder

hf_api_key             = secret_keys.HF_TOKEN                   #<insert your own huggingface token here>
openai_api_key         = secret_keys.OPENAI_API_KEY_TEAM        #<insert your own openai token here>



In [3]:
#fetching the data from Firestore database instead of using the CSV file - they are the same data
db = firestore.Client.from_service_account_json("ct-llm-firebase-key.json")

In [8]:
from sklearn.metrics import cohen_kappa_score

def get_kappa_score_among_matches(reference_list, candidate_list, shayom, bowen, kristin, gpt4):

    # print(f"Reference List: {reference_list}")
    # print(f"Candidate List: {candidate_list}")
    # print(f"Shayom: {shayom}")
    # print(f"Bowen: {bowen}")
    # print(f"Kristin: {kristin}")
    # print(f"GPT4: {gpt4}")
    

    # Mappings from reference and candidate items to numerical labels
    reference_to_index = {item: idx for idx, item in enumerate(reference_list)}
    candidate_to_index = {item: idx for idx, item in enumerate(candidate_list)}

    # print(reference_to_index)
    # print(candidate_to_index)

    # Function to encode matched pairs based on reference items
    def encode_pairs(pairs, reference_to_index, candidate_to_index):
        encoded = {}
        for ref, cand in pairs:
            if ref in reference_to_index and cand in candidate_to_index:
                encoded[reference_to_index[ref]] = candidate_to_index[cand]
        return encoded

    # Encode the annotator lists
    encoded_list_shayom = encode_pairs(shayom, reference_to_index, candidate_to_index)
    encoded_list_bowen = encode_pairs(bowen, reference_to_index, candidate_to_index)
    encoded_list_kristin = encode_pairs(kristin, reference_to_index, candidate_to_index)
    encoded_list_gpt4 = encode_pairs(gpt4, reference_to_index, candidate_to_index)

    # print(f"Encoded List Shayom: {encoded_list_shayom}")
    # print(f"Encoded List Bowen: {encoded_list_bowen}")
    # print(f"Encoded List Kristin: {encoded_list_kristin}")
    # print(f"Encoded List GPT4: {encoded_list_gpt4}")

    # Align the encoded lists by reference items
    aligned_refs = sorted(reference_to_index.values())
    aligned_list_shayom = [encoded_list_shayom.get(ref, -1) for ref in aligned_refs]
    aligned_list_bowen = [encoded_list_bowen.get(ref, -1) for ref in aligned_refs]
    aligned_list_kristin = [encoded_list_kristin.get(ref, -1) for ref in aligned_refs]
    aligned_list_gpt4 = [encoded_list_gpt4.get(ref, -1) for ref in aligned_refs]

    # print(aligned_refs)
    # print(f"Aligned List Shayom: {aligned_list_shayom}")
    # print(f"Aligned List Bowen: {aligned_list_bowen}")
    # print(f"Aligned List Kristin: {aligned_list_kristin}")
    # print(f"Aligned List GPT4: {aligned_list_gpt4}")

    # Create an inter-rater agreement table
    agreement_table = list(zip(aligned_list_shayom, aligned_list_bowen, aligned_list_kristin, aligned_list_gpt4))

    #print(agreement_table)

    # Calculate Cohen's Kappa for each pair of annotators
    kappa_shayom_bowen = cohen_kappa_score(aligned_list_shayom, aligned_list_bowen)
    kappa_shayom_kristin = cohen_kappa_score(aligned_list_shayom, aligned_list_kristin)
    kappa_shayom_gpt4 = cohen_kappa_score(aligned_list_shayom, aligned_list_gpt4)
    kappa_bowen_kristin = cohen_kappa_score(aligned_list_bowen, aligned_list_kristin)
    kappa_bowen_gpt4 = cohen_kappa_score(aligned_list_bowen, aligned_list_gpt4)
    kappa_kristin_gpt4 = cohen_kappa_score(aligned_list_kristin, aligned_list_gpt4)

    result = {
        "kappa_shayom_bowen": kappa_shayom_bowen,
        "kappa_shayom_kristin": kappa_shayom_kristin,
        "kappa_shayom_gpt4": kappa_shayom_gpt4,
        "kappa_bowen_kristin": kappa_bowen_kristin,
        "kappa_bowen_gpt4": kappa_bowen_gpt4,
        "kappa_kristin_gpt4": kappa_kristin_gpt4
    }

    #print(result)

    return agreement_table, result


In [9]:
import json
docs = db.collection('Gold-100').list_documents()
avoid_ids = ['NCT00000620', 'NCT01483560', 'NCT04280783']

dataframe = pd.DataFrame(columns=['trial_id', 'kappa_shayom_bowen', 'kappa_shayom_kristin', 'kappa_bowen_kristin', 
                                  'kappa_shayom_gpt4', 'kappa_bowen_gpt4', 'kappa_kristin_gpt4', 'agreement_table'])

for doc in docs:
    trial_id = doc.id
    if trial_id in avoid_ids:
        continue
    print(trial_id)

    ref_ref= db.collection('Gold-100').document(trial_id)
    ref_data = ref_ref.get().to_dict()
    reference_list = module.extract_elements_with_cleaning(ref_data['Paper_BaselineMeasures'])

    doc_ref = db.collection('Gold-100').document(trial_id).collection('gen-eval').document('gpt4-omni-ts')
    data = doc_ref.get().to_dict()
    candidate_list = module.extract_elements_with_cleaning(data['gen-response'])

    shayom_matches = json.loads(data['shayom-response'])['matched_features']
    bowen_matches = json.loads(data['bowen-response'])['matched_features']
    kristin_matches = json.loads(data['kristin-response'])['matched_features']
    gpt4_matches = json.loads(data['gpt4-omni-matches'])['matched_features']
    
    agreement_table, result = get_kappa_score_among_matches(reference_list, candidate_list, shayom_matches, bowen_matches, kristin_matches, gpt4_matches)

    new_df = pd.DataFrame({
        'trial_id': [trial_id],
        'kappa_shayom_bowen': [result['kappa_shayom_bowen']],
        'kappa_shayom_kristin': [result['kappa_shayom_kristin']],
        'kappa_bowen_kristin': [result['kappa_bowen_kristin']],
        'kappa_shayom_gpt4': [result['kappa_shayom_gpt4']],
        'kappa_bowen_gpt4': [result['kappa_bowen_gpt4']],
        'kappa_kristin_gpt4': [result['kappa_kristin_gpt4']],
        'agreement_table': [agreement_table]
    })

    dataframe = pd.concat([dataframe, new_df], ignore_index=True)
    
    #break 

NCT00126737


  dataframe = pd.concat([dataframe, new_df], ignore_index=True)


NCT00283686
NCT00329030
NCT00360334
NCT00395746
NCT00419562
NCT00441064
NCT00490529
NCT00552409
NCT00556933
NCT00568178
NCT00618072
NCT00713830
NCT00751114
NCT00781937
NCT00791479
NCT00793455
NCT00819182
NCT00863746
NCT00896181
NCT00917267
NCT00949884
NCT00962247
NCT00967668
NCT01000480
NCT01031680
NCT01101880
NCT01279109
NCT01357551
NCT01435603
NCT01441973
NCT01484873
NCT01496469
NCT01574157
NCT01592695
NCT01621178
NCT01652729
NCT01676220
NCT01686828
NCT01757847
NCT01760239
NCT01767155
NCT01768637
NCT01785849
NCT01821352
NCT01862796
NCT01973972
NCT01986881
NCT02003963
NCT02008682
NCT02038179
NCT02109029
NCT02111980
NCT02137512
NCT02214186
NCT02278471
NCT02342639
NCT02358668
NCT02409329
NCT02437084
NCT02473926
NCT02531035
NCT02572882
NCT02592421
NCT02602496
NCT02620774
NCT02623348
NCT02643966
NCT02646982
NCT02680574
NCT02692040
NCT02692560
NCT02698891
NCT02738086
NCT02776553
NCT02790437
NCT02809183
NCT02833857
NCT02834663
NCT02836873
NCT02846779
NCT02892149
NCT03014479
NCT03028948
NCT0

In [10]:
dataframe[['kappa_shayom_bowen',  'kappa_shayom_kristin', 'kappa_bowen_kristin', 'kappa_shayom_gpt4', 'kappa_bowen_gpt4', 'kappa_kristin_gpt4']].mean()

kappa_shayom_bowen      0.832767
kappa_shayom_kristin    0.831810
kappa_bowen_kristin     0.870561
kappa_shayom_gpt4       0.783869
kappa_bowen_gpt4        0.847636
kappa_kristin_gpt4      0.816234
dtype: float64