# **Bachelor Project Artificial Intelligence**

M Bartos (2724195)

## **Preprocessing the Train Set**

In [1]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)
#print(data[0])

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'train', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'train', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'train', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


df = pd.DataFrame(sentence_data)
print(df.head())

       PMID                                        Annotations
0   6988462  [{'token': 'A', 'P': 0, 'I': 0, 'O': 0}, {'tok...
1  18157013  [{'token': 'Minimally', 'P': 0, 'I': 0, 'O': 0...
2  17297323  [{'token': 'Combination', 'P': 0, 'I': 0, 'O':...
3  15603203  [{'token': 'Histamine', 'P': 4, 'I': 0, 'O': 0...
4  15734707  [{'token': 'The', 'P': 0, 'I': 0, 'O': 0}, {'t...


In [2]:
pd.set_option('display.max_colwidth', None)  
print(df.iloc[0])

PMID                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [3]:
annotations = df.at[0, 'Annotations']  # Access the annotations for the first row

# Loop through each annotation in the list and print it
for annotation in annotations:
    print(f"Token: {annotation['token']} - P: {annotation['P']}, I: {annotation['I']}, O: {annotation['O']}")


Token: A - P: 0, I: 0, O: 0
Token: comparative - P: 0, I: 0, O: 0
Token: trial - P: 0, I: 0, O: 0
Token: of - P: 0, I: 0, O: 0
Token: liver - P: 0, I: 0, O: 0
Token: biopsy - P: 0, I: 0, O: 0
Token: needles - P: 0, I: 0, O: 0
Token: . - P: 0, I: 0, O: 0
Token: A - P: 0, I: 0, O: 0
Token: sheathed - P: 0, I: 2, O: 0
Token: needle - P: 0, I: 2, O: 0
Token: ( - P: 0, I: 0, O: 0
Token: Tru-Cut - P: 0, I: 0, O: 0
Token: ) - P: 0, I: 0, O: 0
Token: was - P: 0, I: 0, O: 0
Token: compared - P: 0, I: 0, O: 0
Token: with - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: suction - P: 0, I: 2, O: 0
Token: biopsy - P: 0, I: 2, O: 0
Token: needle - P: 0, I: 2, O: 0
Token: ( - P: 0, I: 2, O: 0
Token: Menghini - P: 0, I: 2, O: 0
Token: ) - P: 0, I: 2, O: 0
Token: in - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: randomised - P: 0, I: 0, O: 0
Token: prospective - P: 0, I: 0, O: 0
Token: trial - P: 0, I: 0, O: 0
Token: over - P: 0, I: 0, O: 0
Token: 18 - P: 0, I: 0, O: 0
Token: months - P: 0,

In [4]:
import pandas as pd

# Assuming 'df' is your existing DataFrame
annotations_list = df.at[0, 'Annotations']  # Get the annotations for the first row
annotations_df = pd.DataFrame(annotations_list)

# Display the DataFrame to see the table
print(annotations_df)


# Sort by the 'P' values, then 'I', then 'O'
sorted_annotations_df = annotations_df.sort_values(by=['P', 'I', 'O'])

# Print the sorted DataFrame
pd.set_option('display.max_colwidth', None)  
print(sorted_annotations_df)

           token  P  I  O
0              A  0  0  0
1    comparative  0  0  0
2          trial  0  0  0
3             of  0  0  0
4          liver  0  0  0
..           ... .. .. ..
111           be  0  0  0
112         used  0  0  0
113         once  0  0  0
114         only  0  0  0
115            .  0  0  0

[116 rows x 4 columns]
           token  P  I  O
0              A  0  0  0
1    comparative  0  0  0
2          trial  0  0  0
3             of  0  0  0
4          liver  0  0  0
..           ... .. .. ..
83        needle  0  6  0
96       suction  0  6  0
97        needle  0  6  0
106     sheathed  0  6  0
107       needle  0  6  0

[116 rows x 4 columns]


## **Preprocessing the Test Set**

In [13]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)
#print(data[0])

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'test', 'gold', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


test_df = pd.DataFrame(sentence_data)
print(test_df.head())

       PMID                                        Annotations
0  22692114  [{'token': 'The', 'P': 0, 'I': 0, 'O': 0}, {'t...
1   8986845  [{'token': 'Pedantic', 'P': 0, 'I': 5, 'O': 0}...
2  19054718  [{'token': 'Timing', 'P': 0, 'I': 0, 'O': 0}, ...
3   9806121  [{'token': 'Sunbathing', 'P': 0, 'I': 0, 'O': ...
4  10715372  [{'token': 'Study', 'P': 0, 'I': 0, 'O': 0}, {...


In [4]:
annotations = test_df.at[1, 'Annotations']  # Access the annotations for the first row

# Loop through each annotation in the list and print it
for annotation in annotations:
    print(f"Token: {annotation['token']} - P: {annotation['P']}, I: {annotation['I']}, O: {annotation['O']}")

Token: Pedantic - P: 0, I: 5, O: 0
Token: speaking - P: 0, I: 5, O: 0
Token: style - P: 0, I: 0, O: 0
Token: differentiates - P: 0, I: 0, O: 0
Token: Asperger - P: 0, I: 0, O: 0
Token: syndrome - P: 0, I: 0, O: 0
Token: from - P: 0, I: 0, O: 0
Token: high-functioning - P: 0, I: 0, O: 0
Token: autism - P: 0, I: 0, O: 0
Token: . - P: 0, I: 0, O: 0
Token: Asperger - P: 0, I: 0, O: 0
Token: syndrome - P: 0, I: 0, O: 0
Token: ( - P: 0, I: 0, O: 0
Token: AS - P: 0, I: 0, O: 0
Token: ) - P: 0, I: 0, O: 0
Token: is - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: pervasive - P: 0, I: 0, O: 0
Token: developmental - P: 0, I: 0, O: 0
Token: disorder - P: 0, I: 0, O: 0
Token: recently - P: 0, I: 0, O: 0
Token: introduced - P: 0, I: 0, O: 0
Token: as - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: new - P: 0, I: 0, O: 0
Token: diagnostic - P: 0, I: 0, O: 0
Token: category - P: 0, I: 0, O: 0
Token: in - P: 0, I: 0, O: 0
Token: the - P: 0, I: 0, O: 0
Token: ICD-10 - P: 0, I: 0, O: 0
Token:

## **Training the biLSTM model**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [None]:
class BiLSTMCRF(nn.Module):
    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTMCRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # The CRF module
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence, tags=None, mask=None):
        # Get the emission scores from the BiLSTM
        embeds = self.word_embeds(sentence)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = lstm_out.view(len(sentence), -1, self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)

        # If tags are provided, compute the loss
        if tags is not None:
            loss = -self.crf(lstm_feats, tags, mask=mask, reduction='mean')
            return loss
        else:
            # Find the best path, given the features
            score, tag_seq = self.crf.decode(lstm_feats, mask=mask)
            return score, tag_seq


In [None]:
# Example usage
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# Training the model
def train_model():
    # Just for example, assume some data
    training_data = df
    word_to_ix = {}
    for sentence, tags in training_data:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    tag_to_ix = {"B-per": 0, "I-per": 1, "B-geo": 2, "I-geo": 3, "O": 4}
    model = BiLSTMCRF(len(word_to_ix), tag_to_ix, embedding_dim=5, hidden_dim=4)

    optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

    # Make sure prepare_sequence from earlier in the LSTM section is loaded
    for epoch in range(300):  # again, normally you would NOT do 300 epochs, this number is here as an example
        for sentence, tags in training_data:
            model.zero_grad()

            sentence_in = prepare_sequence(sentence, word_to_ix)
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

            loss = model(sentence_in, targets)

            loss.backward()
            optimizer.step()

    with torch.no_grad():
        precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
        print(model(precheck_sent))

train_model()


ValueError: too many values to unpack (expected 2)

## **Setting up the GPT-4 model**

### **Creating a suitable df for GPT**

In [14]:
import pandas as pd

pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    if mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    if mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_gpt_df(df):
    gpt_data = []
    for index, row in df.iterrows():
        sentence = ','.join([token['token'] for token in row['Annotations']])
        annotation = ','.join([map_annotations(token) for token in row['Annotations']])
        gpt_data.append({"sentence": sentence, "annotation": annotation})
    
    return pd.DataFrame(gpt_data)

GPT_df = create_gpt_df(test_df)
print(GPT_df.head())
#print(GPT_df.iloc[0])

                                            sentence   
0  The,acute,effects,of,fluid,intake,on,urine,spe...  \
1  Pedantic,speaking,style,differentiates,Asperge...   
2  Timing,for,delivering,individualized,patient,e...   
3  Sunbathing,and,sunbed,use,related,to,self-imag...   
4  Study,of,the,vaginal,tolerance,to,Acidform,,,a...   

                                          annotation  
0  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,P-Condition,P-Co...  
1  I-Psychological,I-Psychological,0,0,0,0,0,0,0,...  
2  0,0,0,I-Educational,I-Educational,I-Educationa...  
3  0,0,0,0,0,0,0,0,0,0,0,0,0,P-Age,0,0,0,0,0,0,0,...  
4  0,0,0,0,0,0,I-Drug,I-Drug,I-Drug,I-Drug,I-Drug...  


In [6]:
import openai
openai.api_key = "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"
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

assistant = openai.beta.assistants.create(
  name="PICO annotator",
  instructions="Given a text input, respond with one of the following PICO annotation for each word: P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other and 0 for none",
  model="gpt-3.5-turbo",
)

In [7]:
thread = openai.beta.threads.create()

In [22]:
print(GPT_df.iloc[50]["sentence"])
print(GPT_df.iloc[50]["annotation"])

Serum,bactericidal,activities,and,comparative,pharmacokinetics,of,meropenem,and,imipenem-cilastatin,.,The,pharmacokinetics,and,serum,bactericidal,activities,(,SBAs,),of,imipenem,and,meropenem,were,investigated,in,a,randomized,crossover,study,.,Twelve,healthy,male,volunteers,received,a,constant,30-min,infusion,of,either,1,g,of,imipenem,plus,1,g,of,cilastatin,or,1,g,of,meropenem,.,The,concentrations,of,the,drugs,in,serum,and,urine,were,determined,by,bioassay,and,high-pressure,liquid,chromatography,.,Pharmacokinetic,parameters,were,based,on,an,open,two-compartment,model,and,a,noncompartmental,technique,.,At,the,end,of,infusion,,,the,mean,concentrations,of,imipenem,and,meropenem,measured,in,serum,were,61.2,+/-,9.8,and,51.6,+/-,6.5,mg/liter,,,respectively,;,urinary,recoveries,were,48.6,%,+/-,8.2,%,and,60.0,%,+/-,6.5,%,of,the,dose,in,12,h,,,respectively,;,and,the,areas,under,the,concentration-time,curve,from,time,zero,to,infinity,were,96.1,+/-,14.4,and,70.5,+/-,10.3,mg.h/liter,,,respectively

In [9]:
message = openai.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=str(GPT_df.iloc[0]["sentence"])
)

In [10]:
run = openai.beta.threads.runs.create_and_poll(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Given a text input, respond with one of the following PICO annotation for each word: P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other and 0 for none"
)

In [11]:
if run.status == 'completed': 
  messages = openai.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
else:
  print(run.status)

SyncCursorPage[Message](data=[Message(id='msg_0P85n3ZsN3XRJXQK1RmVmIO3', assistant_id='asst_B38yzWFSvRHiFGYMhZSOPXs9', completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='P-Condition, I-Surgical, I-Other, I-Psychological, I-Educational, O-Physical'), type='text')], created_at=1713870091, file_ids=[], incomplete_at=None, incomplete_details=None, metadata={}, object='thread.message', role='assistant', run_id='run_BZYd3RcTHOmS8Y8G1knUUsZ6', status=None, thread_id='thread_BRV9pKiEH10N596PytZ6aV42'), Message(id='msg_XKJCQhbxqsutdi6eYvi8rBq4', assistant_id=None, completed_at=None, content=[TextContentBlock(text=Text(annotations=[], value='A comparative trial of liver biopsy needles . A sheathed needle ( Tru-Cut ) was compared with a suction biopsy needle ( Menghini ) in a randomised prospective trial over 18 months to determine whether the former offered any special advantages in routine percutaneous liver biopsy . Seventy-seven consecutive biopsies were performed b

In [7]:
import openai
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

completion = openai.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role":"system", "content": "You are a PICO annotator. Given a text input, return one of the following PICO annotation for each and every value separated by comas: 0 for none, or P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other"},
    {"role": "user", "content": GPT_df.iloc[50]["sentence"]}, # 1-shot sentence
    {"role": "user", "content": GPT_df.iloc[50]["annotation"]}, # 1-shot annotation
    {"role": "user", "content": GPT_df.iloc[0]["sentence"]} # Sentence to annotate (will be iterated)
  ]
)

print("--- INPUT ---")
print(GPT_df.iloc[0]["sentence"])
print("--- GPT ANNOTATION ---")
print(completion.choices[0].message.content)
print("--- GOLD ANNOTATION ---")
print(GPT_df.iloc[0]["annotation"])

--- INPUT ---
A,comparative,trial,of,liver,biopsy,needles,.,A,sheathed,needle,(,Tru-Cut,),was,compared,with,a,suction,biopsy,needle,(,Menghini,),in,a,randomised,prospective,trial,over,18,months,to,determine,whether,the,former,offered,any,special,advantages,in,routine,percutaneous,liver,biopsy,.,Seventy-seven,consecutive,biopsies,were,performed,by,a,single,operator,.,Although,biopsy,fragmentation,was,commoner,with,the,suction,needle,,,the,length,and,volume,of,the,largest,core,obtained,was,similar,to,results,with,the,sheathed,needle,.,Cytology,provided,useful,additional,information,with,the,Menghini,technique,.,The,suction,needle,was,repeatedly,reusable,and,considerably,cheaper,than,the,sheathed,needle,,,which,may,be,used,once,only,.
--- GPT ANNOTATION ---
O-Physical,O-Physical,O-Physical,O-Physical,O-Physical,I-Physical,O-Physical,O-Physical,O-Physical,O-Other,O-Other,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
--- 

### **Labelling the Test Set**

In [15]:
import openai
import pandas as pd

# Assuming your data is in a DataFrame called `GPT_df`
# with columns ['sentence', 'annotation']

# Set your OpenAI API key
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

# Function to get annotations from the model
def get_annotations(sentence):
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role":"system", "content": "You are a PICO annotator. Given a text input, return one of the following PICO annotation for each and every value separated by comas: 0 for none, or P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other"},
            {"role": "user", "content": GPT_df.iloc[50]["sentence"]}, # 1-shot sentence
            {"role": "user", "content": GPT_df.iloc[50]["annotation"]}, # 1-shot annotation
            {"role": "user", "content": sentence} # Sentence to annotate (will be iterated)
  ]
    )
    return completion.choices[0].message.content

results = pd.DataFrame(columns=['sentence', 'gpt_annotation', 'gold_annotation'])

#for index, row in GPT_df.iterrows(): # Remove limitation of only iterating over the first 10 elements
for index, row in GPT_df.iloc[:10].iterrows():
    gpt_annotation = get_annotations(row['sentence'])
    new_row = pd.DataFrame({
        'sentence': [row['sentence'].split(',')],
        'gpt_annotation': [gpt_annotation.split(',')],
        'gold_annotation': [row['annotation'].split(',')]
    })
    results = pd.concat([results, new_row], ignore_index=True)

print(results)
#results.to_csv('annotation_comparison.csv', index=False)


                                            sentence   
0  [The, acute, effects, of, fluid, intake, on, u...  \
1  [Pedantic, speaking, style, differentiates, As...   
2  [Timing, for, delivering, individualized, pati...   
3  [Sunbathing, and, sunbed, use, related, to, se...   
4  [Study, of, the, vaginal, tolerance, to, Acidf...   
5  [Is, a, calculated, total, hip, BMD, of, clini...   
6  [Relation, of, total, homocysteine, and, lipid...   
7  [Immunological, changes, after, minimally, inv...   
8  [Computer-navigated, versus, conventional, tot...   
9  [Enhanced, small, group, instruction, using, c...   

                                      gpt_annotation   
0  [0, 0, 0, 0, 0, 0, 0, 0, P-Sample size, 0, 0, ...  \
1  [0, 0, 0, P-Condition, 0, 0, 0, P-Condition, 0...   
2  [I-Control, 0, 0, 0, 0, 0, 0, O-Physical, 0, 0...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, I-Physical, ...   
4  [0, 0, P-Condition, 0, 0, 0, 0, 0, 0, 0, 0, 0,...   
5  [0, 0, 0, 0, 0, P-Condition, I-Drug, 0, 0, 0

### **Evaluting on the Test Set**

In [16]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

#Binarizing the gold labels
gold_labels = mlb.fit_transform(results['gold_annotation'])
predicted_labels = mlb.transform(results['gpt_annotation'])

print(classification_report(gold_labels, predicted_labels, target_names=mlb.classes_))

                   precision    recall  f1-score   support

                0       1.00      1.00      1.00        10
           I-Drug       0.00      0.00      0.00         2
    I-Educational       0.00      0.00      0.00         3
          I-Other       0.50      0.50      0.50         2
  I-Psychological       0.00      0.00      0.00         1
       I-Surgical       0.00      0.00      0.00         3
O-Adverse effects       0.00      0.00      0.00         2
         O-Mental       0.00      0.00      0.00         3
      O-Mortality       0.00      0.00      0.00         1
          O-Other       1.00      0.25      0.40         4
           O-Pain       0.00      0.00      0.00         1
       O-Physical       0.60      0.43      0.50         7
            P-Age       0.00      0.00      0.00         4
      P-Condition       0.62      0.83      0.71         6
    P-Sample size       0.67      0.57      0.62         7
            P-Sex       1.00      0.25      0.40       

  _warn_prf(average, modifier, msg_start, len(result))


### **Manual Evaluation**

In [17]:
def print_formatted_data(df):
    for idx, row in df.iterrows():
        print("\nSentence", idx+1)
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                # Strip whitespace for clean display
                g_word = gpt_annotations[word_idx].strip()
                gold_word = gold_annotations[word_idx].strip()
                print(f"{word.strip()} - GPT: {g_word}, Gold: {gold_word}")

# Print formatted data
print_formatted_data(results)


Sentence 1
The - GPT: 0, Gold: 0
acute - GPT: 0, Gold: 0
effects - GPT: 0, Gold: 0
of - GPT: 0, Gold: 0
fluid - GPT: 0, Gold: 0
intake - GPT: 0, Gold: 0
on - GPT: 0, Gold: 0
urine - GPT: 0, Gold: 0
specific - GPT: P-Sample size, Gold: 0
gravity - GPT: 0, Gold: 0
and - GPT: 0, Gold: 0
fluid - GPT: 0, Gold: 0
retention - GPT: 0, Gold: 0
in - GPT: 0, Gold: 0
a - GPT: 0, Gold: 0
mildly - GPT: 0, Gold: P-Condition
dehydrated - GPT: 0, Gold: P-Condition
state - GPT: P-Condition, Gold: P-Condition
. - GPT: 0, Gold: 0
Many - GPT: 0, Gold: 0
athletes - GPT: 0, Gold: 0
arrive - GPT: P-Sample size, Gold: 0
at - GPT: P-Sample size, Gold: 0
training - GPT: P-Condition, Gold: 0
sessions - GPT: 0, Gold: 0
and - GPT: 0, Gold: 0
competitions - GPT: 0, Gold: 0
in - GPT: 0, Gold: 0
a - GPT: 0, Gold: 0
mildly - GPT: 0, Gold: 0
hypohydrated - GPT: 0, Gold: 0
( - GPT: 0, Gold: 0
HYPO - GPT: 0, Gold: P-Condition
) - GPT: 0, Gold: 0
state - GPT: 0, Gold: 0
and - GPT: 0, Gold: 0
are - GPT: 0, Gold: 0
instruct

In [18]:
from IPython.display import display, HTML

def print_formatted_data_html(df):
    for idx, row in df.iterrows():
        html_output = f"<h3>Sentence {idx+1}</h3><div style='display: flex; flex-wrap: wrap; gap: 20px; align-items: flex-start;'>"
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                gpt_color = "red" if gpt_annotations[word_idx].strip() != gold_annotations[word_idx].strip() else "black"
                word_html = f"<div style='text-align: center;'>"
                word_html += f"<div style='color: black; font-weight: bold;'>{word.strip()}</div>"
                word_html += f"<div style='color: {gpt_color};'>GPT: {gpt_annotations[word_idx].strip()}</div>"
                word_html += f"<div style='color: black;'>Gold: {gold_annotations[word_idx].strip()}</div>"
                word_html += "</div>"
                html_output += word_html
        
        html_output += "</div>"
        display(HTML(html_output))

print_formatted_data_html(results)