# **Bachelor Project Artificial Intelligence**

M Bartos (2724195)

Vrije Universiteit Amsterdam

## **Preprocessing the Train Set**

In [10]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'train', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'train', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'train', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


df = pd.DataFrame(sentence_data)
#print(df.head())

## **Preprocessing the Test Set**

In [11]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)
#print(data[0])

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'test', 'gold', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


test_df = pd.DataFrame(sentence_data)
#print(test_df.head())

### **Inspecting the Database**

Uncomment the desiered lines. Keep commented out for clarity.

**Per PMID**

In [3]:
pd.set_option('display.max_colwidth', None)  
#print(df.iloc[0])
#print(test_df.iloc[0])

**Annotations per token**

In [4]:
annotations = df.at[0, 'Annotations']
annotations = test_df.at[0, 'Annotations']

for annotation in annotations:
    #print(f"Token: {annotation['token']} - P: {annotation['P']}, I: {annotation['I']}, O: {annotation['O']}")
    break

**Token Matrix**

In [5]:
annotations_list = df.at[0, 'Annotations']
annotations_df = pd.DataFrame(annotations_list)
sorted_annotations_df = annotations_df.sort_values(by=['P', 'I', 'O'])
#print(sorted_annotations_df)

## **Training the biLSTM model**

### **Creating a df suitable for LSTM**

In [12]:
pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    elif mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    elif mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_lstm_df(df):
    lstm_data = []
    for index, row in df.iterrows():
        tokens = [token['token'] for token in row['Annotations']]
        annotations = [map_annotations(token) for token in row['Annotations']]
        lstm_data.append({"PMID": pmid, "Tokens": tokens, "Annotations": annotations})
    
    return pd.DataFrame(lstm_data)


LSTM_df = create_lstm_df(df)
LSTM_test_df = create_lstm_df(test_df)
#print(LSTM_df.head())

### **Training the BiLSTM model**

In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def prepare_data(df):
    sentences = df['Tokens'].tolist()
    labels = df['Annotations'].tolist()
    return sentences, labels

train_sentences, train_labels = prepare_data(LSTM_df)
test_sentences, test_labels = prepare_data(LSTM_test_df)

def build_model(max_len, n_words, n_tags):
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
    model = Model(input, out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

max_len = 500 # This is a cut-off value, to speed up training but it does increase loss
n_words = len(set([token for sentence in train_sentences for token in sentence])) + 1
n_tags = len(set([annotation for labels in train_labels for annotation in labels])) + 1

model = build_model(max_len, n_words, n_tags)
model.summary()

def encode_data(sentences, labels, max_len, n_words, n_tags):
    word2idx = {w: i for i, w in enumerate(set([token for sentence in sentences for token in sentence]), 1)}
    label2idx = {l: i for i, l in enumerate(set([label for label_list in labels for label in label_list]), 1)}
    
    X = [[word2idx[token] for token in sentence] for sentence in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)
    
    y = [[label2idx[label] for label in label_list] for label_list in labels]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=n_tags-1)
    
    return X, np.array(y), word2idx, label2idx

# Encode training data
X_train, y_train, word2idx, label2idx = encode_data(train_sentences, train_labels, max_len, n_words, n_tags)
X_test, y_test, _, _ = encode_data(test_sentences, test_labels, max_len, n_words, n_tags)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 500, 50)           1887500   
                                                                 
 dropout_1 (Dropout)         (None, 500, 50)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 500, 200)          120800    
 onal)                                                           
                                                                 
 time_distributed_2 (TimeDi  (None, 500, 50)           10050     
 stributed)                                                      
                                                                 
 time_distributed_3 (TimeDi  (None, 500, 19)           969 

In [14]:
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 1.0304228067398071, Accuracy: 0.815353512763977


### **Predicting for the test data**

In [None]:
# Predicting labels
predictions = model.predict(X_test, verbose=1)

# Converting predictions from encoded back to the original labels
predicted_labels = np.argmax(predictions, axis=-1)
idx2label = {i: label for label, i in label2idx.items()}
print("Label index to name mapping:", idx2label)

def decode_predictions(predictions, sentences, idx2label):
    decoded_predictions = []
    for i, sentence in enumerate(sentences):
        decoded_sentence = []
        for j, token in enumerate(sentence):
            if j < len(predictions[i]):
                label_idx = predictions[i][j]
                label = idx2label.get(label_idx, 'O')  # O if label not found
                decoded_sentence.append((token, label))
            else:
                decoded_sentence.append((token, 'O'))  # O if label not found
        decoded_predictions.append(decoded_sentence)
    return decoded_predictions

# Raise error if label not in range
for label in predicted_labels.flatten():
    if label not in idx2label:
        print(f"Warning: Predicted label index {label} not found in idx2label mapping.")

decoded_predictions = decode_predictions(predicted_labels, test_sentences, idx2label)

def create_prediction_df(df, decoded_predictions):
    prediction_data = []
    for index, row in df.iterrows():
        pmid = row["PMID"]
        tokens = row["Tokens"]
        annotations = [label for _, label in decoded_predictions[index]]
        prediction_data.append({"PMID": pmid, "Tokens": tokens, "Predictions": annotations})
    
    return pd.DataFrame(prediction_data)

# Implementing it back into a df
prediction_df = create_prediction_df(LSTM_test_df, decoded_predictions)
#print(prediction_df.head())

# Ability to manually examine predictions
for i in range(0): # Increase range to get started
    print(f"Sentence {i+1}:")
    for token, label in decoded_predictions[i]:
        print(f"{token}: {label}")
    print("\n")

Label index to name mapping: {1: 'P-Sample size', 2: '0', 3: 'I-Drug', 4: 'P-Age', 5: 'I-Educational', 6: 'I-Physical', 7: 'P-Sex', 8: 'I-Control', 9: 'O-Mental', 10: 'O-Physical', 11: 'O-Pain', 12: 'O-Mortality', 13: 'I-Other', 14: 'O-Other', 15: 'I-Surgical', 16: 'P-Condition', 17: 'O-Adverse effects', 18: 'I-Psychological'}


### **Evaluating for LSTM**

**Absract-level Evaluation**

In [None]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

def abstract_level_evaluation(prediction_df, LSTM_test_df):
    mlb = MultiLabelBinarizer()

    # Prepare gold labels
    gold_labels = [set(row['Annotations']) for _, row in LSTM_test_df.iterrows()]
    predicted_labels = [set(row['Predictions']) for _, row in prediction_df.iterrows()]

    # Binarize the labels
    gold_labels_binarized = mlb.fit_transform(gold_labels)
    predicted_labels_binarized = mlb.transform(predicted_labels)

    # Generate the classification report
    report = classification_report(gold_labels_binarized, predicted_labels_binarized, target_names=mlb.classes_, zero_division=0)
    print(report)

    accuracy = accuracy_score(gold_labels_binarized, predicted_labels_binarized)
    print(f"Accuracy: {accuracy:.4f}")

# Assuming prediction_df and LSTM_test_df are already created and loaded as per your previous steps
abstract_level_evaluation(prediction_df, LSTM_test_df)

                   precision    recall  f1-score   support

                0       1.00      1.00      1.00        99
        I-Control       0.00      0.00      0.00        36
           I-Drug       0.66      0.63      0.65        65
    I-Educational       0.20      0.08      0.12        12
          I-Other       0.00      0.00      0.00        15
       I-Physical       0.14      0.05      0.08        19
  I-Psychological       0.04      1.00      0.08         4
       I-Surgical       0.00      0.00      0.00        16
O-Adverse effects       0.00      0.00      0.00        29
         O-Mental       0.11      0.06      0.07        18
      O-Mortality       0.00      0.00      0.00        11
          O-Other       0.33      0.02      0.04        53
           O-Pain       0.00      0.00      0.00         5
       O-Physical       0.82      0.18      0.29        79
            P-Age       0.00      0.00      0.00        30
      P-Condition       0.00      0.00      0.00       



**Token-level Evaluation**

In [None]:
from sklearn.metrics import classification_report

true_labels = y_test.flatten()
pred_labels = predicted_labels.flatten()

# Convert the labels back to their original names
true_label_names = [idx2label[idx] for idx in true_labels]
pred_label_names = [idx2label[idx] for idx in pred_labels]

report = classification_report(true_label_names, pred_label_names, labels=list(idx2label.values()), zero_division=0, digits=2)
print(report)

                   precision    recall  f1-score   support

      O-Mortality       0.00      0.00      0.00        90
        I-Control       0.00      0.00      0.00      1748
      P-Condition       0.04      0.01      0.02       676
                0       0.80      0.75      0.77     22846
       O-Physical       0.00      0.00      0.00        25
            P-Age       0.00      0.00      0.00       100
       I-Surgical       0.00      0.00      0.00       201
    I-Educational       0.00      0.00      0.00       295
         O-Mental       0.02      0.01      0.01       341
           O-Pain       0.00      0.00      0.00       120
            P-Sex       0.00      0.00      0.00        49
          O-Other       0.00      0.00      0.00       555
       I-Physical       0.00      0.02      0.01       253
O-Adverse effects       0.00      0.00      0.00       144
    P-Sample size       0.00      0.00      0.00       161
          I-Other       0.00      0.00      0.00       

## **Setting up the GPT-4 model**

### **Creating a suitable df for GPT**

In [17]:
import pandas as pd

pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    elif mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    elif mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_gpt_df(df):
    gpt_data = []
    for index, row in df.iterrows():
        sentence = '•'.join([token['token'] for token in row['Annotations']])
        annotation = '•'.join([map_annotations(token) for token in row['Annotations']])
        gpt_data.append({"sentence": sentence, "annotation": annotation})
    
    return pd.DataFrame(gpt_data)

GPT_df = create_gpt_df(test_df)
#print(GPT_df.head())

**Manual Examination of Database**

In [18]:
pd.set_option('display.max_colwidth', None)  
#print(GPT_df.iloc[0])

### **Predicting with GPT-4o**

In [50]:
import openai
import pandas as pd
from tqdm import tqdm


openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"
chunk_size = 75


def get_annotations(sentence):
    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a PICO annotator tasked with labeling tokens. For every chunk of 75 tokens separated by bullets (•), return 75 PICO annotations. Use one of the following options: 0 for none, P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other. It is crucial that you return exactly 75 labels for each chunk, ensuring that the number of labels matches the number of tokens exactly."},
            {"role": "user", "content": GPT_df.iloc[50]["sentence"][:521]}, # 75 token long 1-shot sentence
            {"role": "assistant", "content": GPT_df.iloc[50]["annotation"][:200]}, # 75 token long 1-shot annotation
            {"role": "user", "content": sentence}  # Sentence to annotate
        ]
    )
    
    gpt_annotations = completion.choices[0].message.content.split('•')

    if len(gpt_annotations) > chunk_size:
        gpt_annotations = gpt_annotations[:chunk_size]
    elif len(gpt_annotations) < chunk_size:
        gpt_annotations += ['0'] * (chunk_size - len(gpt_annotations))
    
    return '•'.join(gpt_annotations)


def process_annotations(gpt_df):
    results = pd.DataFrame(columns=['sentence', 'gpt_annotation', 'gold_annotation'])

    # Iteration over each element of the Dataframe (per PMID)
    for index, row in tqdm(GPT_df.iloc[:99].iterrows(), total=99, desc="Processing annotations"):
        sentences = row['sentence'].split('•')
        annotations = row['annotation'].split('•')
        
        temp_sentences = []
        temp_gpt_annotations = []
        temp_gold_annotations = []

        # Chuncking is taking part underneath this iteration
        for i in range(0, len(sentences), chunk_size):
            sentence_chunk = '•'.join(sentences[i:i+chunk_size])
            annotation_chunk = '•'.join(annotations[i:i+chunk_size])
            
            # Utilizing the API call to classify
            gpt_annotation = get_annotations(sentence_chunk)
            
            # Storing results
            temp_sentences.append(sentence_chunk)
            temp_gpt_annotations.append(gpt_annotation)
            temp_gold_annotations.append(annotation_chunk)

        # Merging results with gold data, to evaluate
        results = pd.concat([results, pd.DataFrame({
            'sentence': ['•'.join(temp_sentences).split("•")],
            'gpt_annotation': ['•'.join(temp_gpt_annotations).split("•")],
            'gold_annotation': ['•'.join(temp_gold_annotations).split("•")]
        })], ignore_index=True)

    return results

results = process_annotations(GPT_df)
results.to_csv('annotations_results.csv', index=True) # Saving the results for later evaluation
print(results.head())

Processing annotations: 100%|██████████| 99/99 [22:52<00:00, 13.86s/it]


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      sentence   
0                                                                                                                     [The, acute, effects, of, fluid, intake, on, urine, specific, gravity, and, fluid, retention, in, a, mildly, dehydrated, state, 

**Reloading the results for later evaluation**

In [15]:
import pandas as pd

results = pd.read_csv('annotations_results.csv', converters={'sentence': eval, 'gpt_annotation': eval, 'gold_annotation': eval})

### **Confusion Matrix for GPT-4o**

**Abstract-level Evaluation**

In [20]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

#Binarizing the gold labels
gold_labels = mlb.fit_transform(results['gold_annotation'])
predicted_labels = mlb.transform(results['gpt_annotation'])

print(classification_report(gold_labels, predicted_labels, target_names=mlb.classes_))

subset_accuracy = accuracy_score(gold_labels, predicted_labels)
print(f"Accuracy: {subset_accuracy:.4f}")

                   precision    recall  f1-score   support

                0       1.00      1.00      1.00        99
        I-Control       0.58      0.97      0.73        36
           I-Drug       0.89      0.91      0.90        65
    I-Educational       0.64      0.75      0.69        12
          I-Other       0.17      0.67      0.27        15
       I-Physical       0.61      0.58      0.59        19
  I-Psychological       0.17      0.25      0.20         4
       I-Surgical       0.50      0.69      0.58        16
O-Adverse effects       0.68      0.90      0.78        29
         O-Mental       0.65      0.72      0.68        18
      O-Mortality       0.69      1.00      0.81        11
          O-Other       0.56      0.91      0.69        53
           O-Pain       0.56      1.00      0.71         5
       O-Physical       0.89      0.65      0.75        79
            P-Age       0.76      0.87      0.81        30
      P-Condition       0.92      1.00      0.96       



**Token-level Evaluation**

In [21]:
gold_labels_flat = [label for sublist in results['gold_annotation'] for label in sublist]
predicted_labels_flat = [label for sublist in results['gpt_annotation'] for label in sublist]

min_length = min(len(gold_labels_flat), len(predicted_labels_flat))
gold_labels_flat = gold_labels_flat[:min_length]
predicted_labels_flat = predicted_labels_flat[:min_length]

mlb = MultiLabelBinarizer()
gold_labels_binarized = mlb.fit_transform([[label] for label in gold_labels_flat])
predicted_labels_binarized = mlb.transform([[label] for label in predicted_labels_flat])

print(classification_report(gold_labels_binarized, predicted_labels_binarized, target_names=mlb.classes_))

token_accuracy = accuracy_score(gold_labels_binarized, predicted_labels_binarized)
print(f"Accuracy: {token_accuracy:.4f}")

                   precision    recall  f1-score   support

                0       0.80      0.84      0.82     22917
        I-Control       0.02      0.03      0.02       120
           I-Drug       0.04      0.03      0.03       892
    I-Educational       0.01      0.00      0.01       295
          I-Other       0.00      0.00      0.00       120
       I-Physical       0.03      0.01      0.01       253
  I-Psychological       0.00      0.00      0.00        33
       I-Surgical       0.02      0.01      0.02       201
O-Adverse effects       0.00      0.00      0.00       144
         O-Mental       0.00      0.00      0.00       341
      O-Mortality       0.01      0.02      0.02        90
          O-Other       0.02      0.02      0.02       562
           O-Pain       0.00      0.00      0.00        25
       O-Physical       0.04      0.01      0.01      1756
            P-Age       0.00      0.00      0.00       100
      P-Condition       0.03      0.04      0.03       

  _warn_prf(average, modifier, msg_start, len(result))


## **Manual Evaluation**

**Text-based Evaluation**

In [19]:
def print_formatted_data(df):
    for idx, row in df.iterrows():
        print("\nSentence", idx+1)
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                g_word = gpt_annotations[word_idx].strip()
                gold_word = gold_annotations[word_idx].strip()
                print(f"{word.strip()} - GPT: {g_word}, Gold: {gold_word}")

#print_formatted_data(results)

**Visual Evaluation**

In [53]:
from IPython.display import HTML

def print_formatted_data_html(df, lstm_predictions, output_file='output.html'):
    html_output = "<html><head><title>Annotated Sentences</title></head><body>"
    
    for idx, row in df.iterrows():
        sentence_html = f"<h3>Sentence {idx+1}</h3><div style='display: flex; flex-wrap: wrap; gap: 20px; align-items: flex-start;'>"
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        lstm_annotations = [label for token, label in lstm_predictions[idx]]
        
        for word_idx, word in enumerate(sentence_words):
            gpt_annotation = gpt_annotations[word_idx].strip() if word_idx < len(gpt_annotations) else '!'
            lstm_annotation = lstm_annotations[word_idx].strip() if word_idx < len(lstm_annotations) else '!'
            gold_annotation = gold_annotations[word_idx].strip() if word_idx < len(gold_annotations) else '!'
            
            gpt_color = "red" if gpt_annotation != gold_annotation else "black"
            lstm_color = "orange" if lstm_annotation != gold_annotation else "black"
            
            word_html = f"<div style='text-align: center;'>"
            word_html += f"<div style='color: black; font-weight: bold;'>{word.strip()}</div>"
            word_html += f"<div style='color: {gpt_color};'>GPT: {gpt_annotation}</div>"
            word_html += f"<div style='color: {lstm_color};'>LSTM: {lstm_annotation}</div>"
            word_html += f"<div style='color: black;'>Gold: {gold_annotation}</div>"
            word_html += "</div>"
            sentence_html += word_html
        
        sentence_html += "</div>"
        html_output += sentence_html
    
    html_output += "</body></html>"
    
    # Saving it to a file
    with open(output_file, 'w') as f:
        f.write(html_output)
    
    # Displaing it within this jupyter notebook
    display(HTML(html_output))

print_formatted_data_html(results, decoded_predictions)

**The cell below only evaluates the results of GPT-4o.**

In [29]:
from IPython.display import display, HTML

def print_formatted_data_html(df):
    for idx, row in df.iterrows():
        html_output = f"<h3>Sentence {idx+1}</h3><div style='display: flex; flex-wrap: wrap; gap: 20px; align-items: flex-start;'>"
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                gpt_color = "red" if gpt_annotations[word_idx].strip() != gold_annotations[word_idx].strip() else "black"
                word_html = f"<div style='text-align: center;'>"
                word_html += f"<div style='color: black; font-weight: bold;'>{word.strip()}</div>"
                word_html += f"<div style='color: {gpt_color};'>GPT: {gpt_annotations[word_idx].strip()}</div>"
                word_html += f"<div style='color: black;'>Gold: {gold_annotations[word_idx].strip()}</div>"
                word_html += "</div>"
                html_output += word_html
        
        html_output += "</div>"
        display(HTML(html_output))

#print_formatted_data_html(results)