## **Importing the Evaluation Set**

In [43]:
import pandas as pd

# Load the CSV files
file1 = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Full-Lenght Papers/PMC2949396.csv'
file2 = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Full-Lenght Papers/PMC7799030.csv'
file3 = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Full-Lenght Papers/PMC10707032.csv'

# Read the CSV files into DataFrames
df1 = pd.read_csv(file1)
df2 = pd.read_csv(file2)
df3 = pd.read_csv(file3)

df1.columns = ["token", "Annotations"]
df2.columns = ["token", "Annotations"]
df3.columns = ["token", "Annotations"]

df1['PMID'] = 'PMC2949396'
df2['PMID'] = 'PMC7799030'
df3['PMID'] = 'PMC10707032'

# Display the first few rows of each DataFrame to understand their structure

# Combine the dataframes into one
combined_df = pd.concat([df1, df2, df3])

# Convert to the desired format
full_lenght_df = combined_df.groupby('PMID').apply(
    lambda x: x[['token', 'Annotations']].to_dict('records')
).reset_index(name='Annotations')

# Display the combined and formatted DataFrame
print(full_lenght_df.head())

          PMID                                        Annotations
0  PMC10707032  [{'token': 'Abstract:', 'Annotations': '0'}, {...
1   PMC2949396  [{'token': 'Abstract', 'Annotations': '0'}, {'...
2   PMC7799030  [{'token': 'Abstract', 'Annotations': '0'}, {'...


## **Importing the Train Set**

In [44]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    Tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'train', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'train', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'train', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(Tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


df = pd.DataFrame(sentence_data)
print(df.head())

       PMID                                        Annotations
0   6988462  [{'token': 'A', 'P': 0, 'I': 0, 'O': 0}, {'tok...
1  18157013  [{'token': 'Minimally', 'P': 0, 'I': 0, 'O': 0...
2  17297323  [{'token': 'Combination', 'P': 0, 'I': 0, 'O':...
3  15603203  [{'token': 'Histamine', 'P': 4, 'I': 0, 'O': 0...
4  15734707  [{'token': 'The', 'P': 0, 'I': 0, 'O': 0}, {'t...


## **Training the biLSTM model**

### **Creating a df suitable for LSTM**

In [45]:
pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    elif mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    elif mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_lstm_df(df):
    lstm_data = []
    for index, row in df.iterrows():
        tokens = [token['token'] for token in row['Annotations']]
        annotations = [map_annotations(token) for token in row['Annotations']]
        lstm_data.append({"PMID": pmid, "Tokens": tokens, "Annotations": annotations})
    
    return pd.DataFrame(lstm_data)


LSTM_df = create_lstm_df(df)
print(LSTM_df.head())

       PMID                                             Tokens   
0  18376682  [A, comparative, trial, of, liver, biopsy, nee...  \
1  18376682  [Minimally, invasive, treatment, combined, wit...   
2  18376682  [Combination, of, arteriovenous, extracorporea...   
3  18376682  [Histamine, intolerance-like, symptoms, in, he...   
4  18376682  [The, effect, of, vitamin, A-fortified, coconu...   

                                         Annotations  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, I-Physical, I-Phys...  
1  [0, 0, 0, 0, 0, I-Drug, I-Drug, I-Drug, I-Drug...  
2  [0, 0, I-Physical, I-Physical, I-Physical, I-P...  
3  [P-Condition, P-Condition, P-Condition, 0, P-C...  
4  [0, 0, 0, I-Drug, I-Drug, I-Drug, I-Drug, I-Dr...  


In [47]:
def create_lstm_df2(df):
    lstm_data = []
    for index, row in df.iterrows():
        tokens = [token['token'] for token in row['Annotations']]
        annotations = [token['Annotations'] for token in row['Annotations']]
        lstm_data.append({"PMID": row['PMID'], "Tokens": tokens, "Annotations": annotations})
    
    return pd.DataFrame(lstm_data)


LSTM_test_df = create_lstm_df2(full_lenght_df)
print(LSTM_test_df.head())

          PMID                                             Tokens   
0  PMC10707032  [Abstract:, Postpartum, hemorrhage, (PPH), rem...  \
1   PMC2949396  [Abstract, Background:, Glaucoma, surgery, is,...   
2   PMC7799030  [Abstract, Background:, The, coronavirus, dise...   

                                         Annotations  
0  [0, P-Condition, P-Condition, P-Condition, 0, ...  
1  [0, 0, I-Surgical, I-Surgical, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, P-Condition, P-Condition, P-Conditio...  


### **Training the BiLSTM model**

In [48]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def prepare_data(df):
    sentences = df['Tokens'].tolist()
    labels = df['Annotations'].tolist()
    return sentences, labels

train_sentences, train_labels = prepare_data(LSTM_df)
test_sentences, test_labels = prepare_data(LSTM_test_df)

def build_model(max_len, n_words, n_tags):
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
    model = Model(input, out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

max_len = 500 # This is a cut-off value, to speed up training but it does increase loss
n_words = len(set([token for sentence in train_sentences for token in sentence])) + 1
n_tags = len(set([annotation for labels in train_labels for annotation in labels])) + 1

model = build_model(max_len, n_words, n_tags)
model.summary()

def encode_data(sentences, labels, max_len, n_words, n_tags):
    word2idx = {w: i for i, w in enumerate(set([token for sentence in sentences for token in sentence]), 1)}
    label2idx = {l: i for i, l in enumerate(set([label for label_list in labels for label in label_list]), 1)}
    
    X = [[word2idx[token] for token in sentence] for sentence in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)
    
    y = [[label2idx[label] for label in label_list] for label_list in labels]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=n_tags-1)
    
    return X, np.array(y), word2idx, label2idx

# Encode training data
X_train, y_train, word2idx, label2idx = encode_data(train_sentences, train_labels, max_len, n_words, n_tags)
X_test, y_test, _, _ = encode_data(test_sentences, test_labels, max_len, n_words, n_tags)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding (Embedding)       (None, 500, 50)           1887500   
                                                                 
 dropout (Dropout)           (None, 500, 50)           0         
                                                                 
 bidirectional (Bidirection  (None, 500, 200)          120800    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 500, 50)           10050     
 ributed)                                                        
                                                                 
 time_distributed_1 (TimeDi  (None, 500, 19)           969   

In [49]:
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 1.3059178590774536, Accuracy: 0.6700000166893005


In [51]:
# Predicting labels
predictions = model.predict(X_test, verbose=1)

# Converting predictions from encoded back to the original labels
predicted_labels = np.argmax(predictions, axis=-1)
idx2label = {i: label for label, i in label2idx.items()}
print("Label index to name mapping:", idx2label)

def decode_predictions(predictions, sentences, idx2label):
    decoded_predictions = []
    for i, sentence in enumerate(sentences):
        decoded_sentence = []
        for j, token in enumerate(sentence):
            if j < len(predictions[i]):
                label_idx = predictions[i][j]
                label = idx2label.get(label_idx, 'O')  # O if label not found
                decoded_sentence.append((token, label))
            else:
                decoded_sentence.append((token, 'O'))  # O if label not found
        decoded_predictions.append(decoded_sentence)
    return decoded_predictions

# Raise error if label not in range
for label in predicted_labels.flatten():
    if label not in idx2label:
        print(f"Warning: Predicted label index {label} not found in idx2label mapping.")

decoded_predictions = decode_predictions(predicted_labels, test_sentences, idx2label)

def create_prediction_df(df, decoded_predictions):
    prediction_data = []
    for index, row in df.iterrows():
        pmid = row["PMID"]
        tokens = row["Tokens"]
        annotations = [label for _, label in decoded_predictions[index]]
        prediction_data.append({"PMID": pmid, "Tokens": tokens, "Predictions": annotations})
    
    return pd.DataFrame(prediction_data)

# Implementing it back into a df
prediction_df = create_prediction_df(LSTM_test_df, decoded_predictions)
#print(prediction_df.head())

# Ability to manually examine predictions
for i in range(0): # Increase range to get started
    print(f"Sentence {i+1}:")
    for token, label in decoded_predictions[i]:
        print(f"{token}: {label}")
    print("\n")

Label index to name mapping: {1: 'O-Other', 2: 'I-Educational', 3: 'I-Psychological', 4: 'P-Sample size', 5: 'O-Mental', 6: '0', 7: 'P-Condition', 8: 'I-Physical', 9: 'O-Mortality', 10: 'I-Control', 11: 'P-Age', 12: 'I-Drug', 13: 'P-Sex', 14: 'O-Pain', 15: 'O-Adverse effects', 16: 'O-Physical', 17: 'I-Surgical', 18: 'I-Other'}


**Token-level Evaluation**

In [67]:
from sklearn.metrics import classification_report, accuracy_score

true_labels = y_test.flatten()
pred_labels = predicted_labels.flatten()

# Convert the labels back to their original names
true_label_names = [idx2label[idx] for idx in true_labels]
pred_label_names = [idx2label[idx] for idx in pred_labels]

report = classification_report(true_label_names, pred_label_names, labels=list(idx2label.values()), zero_division=0, digits=2)
print(report)

token_accuracy = accuracy_score(true_label_names, pred_label_names)
print(f"Accuracy: {token_accuracy:.4f}")

                   precision    recall  f1-score   support

          O-Other       0.00      0.00      0.00         0
    I-Educational       0.00      0.00      0.00        13
  I-Psychological       0.00      0.00      0.00        46
    P-Sample size       0.00      0.00      0.00         0
         O-Mental       0.00      0.00      0.00         0
                0       0.92      0.72      0.81      1394
      P-Condition       0.00      0.00      0.00         0
       I-Physical       0.00      0.00      0.00         0
      O-Mortality       0.00      0.00      0.00        15
        I-Control       0.00      0.00      0.00         0
            P-Age       0.00      0.00      0.00         1
           I-Drug       0.00      0.00      0.00         0
            P-Sex       0.00      0.00      0.00         0
           O-Pain       0.00      0.00      0.00        15
O-Adverse effects       0.00      0.00      0.00         0
       O-Physical       0.00      0.00      0.00       

## **Setting up the GPT-4 model**

### **Creating a suitable df for GPT**

In [59]:
def create_gpt_df(df):
    gpt_data = []
    for index, row in df.iterrows():
        sentence = '•'.join([token['token'] for token in row['Annotations']])
        annotation = '•'.join([token["Annotations"] for token in row['Annotations']])
        gpt_data.append({"sentence": sentence, "annotation": annotation})
    
    return pd.DataFrame(gpt_data)

GPT_df = create_gpt_df(full_lenght_df)
print(GPT_df.head())

                                            sentence   
0  Abstract:•Postpartum•hemorrhage•(PPH)•remains•...  \
1  Abstract•Background:•Glaucoma•surgery•is•the•m...   
2  Abstract•Background:•The•coronavirus•disease•2...   

                                          annotation  
0  0•P-Condition•P-Condition•P-Condition•0•0•0•0•...  
1  0•0•I-Surgical•I-Surgical•0•0•0•0•0•0•I-Other•...  
2  0•0•0•P-Condition•P-Condition•P-Condition•P-Co...  


### **Predicting with GPT-4o**

In [61]:
import openai
import pandas as pd
from tqdm import tqdm


openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"
chunk_size = 75


def get_annotations(sentence):
    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a PICO annotator tasked with labeling tokens. For every chunk of 75 tokens separated by bullets (•), return 75 PICO annotations. Use one of the following options: 0 for none, P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other. It is crucial that you return exactly 75 labels for each chunk, ensuring that the number of labels matches the number of tokens exactly."},
            {"role": "user", "content": "Randomized•,•double-blind•,•placebo-controlled•trial•of•oral•sirolimus•for•restenosis•prevention•in•patients•with•in-stent•restenosis•:•the•Oral•Sirolimus•to•Inhibit•Recurrent•In-stent•Stenosis•(•OSIRIS•)•trial•.•BACKGROUND•Despite•recent•advances•in•interventional•cardiology•,•including•the•introduction•of•drug-eluting•stents•for•de•novo•coronary•lesions•,•the•treatment•of•in-stent•restenosis•(•ISR•)•remains•a•challenging•clinical•issue•.•Given•the•efficacy•of•systemic•sirolimus•administration•to•prevent•neointimal"}, # 75 token long 1-shot sentence
            {"role": "assistant", "content": "0•0•0•0•I-Control•0•0•0•0•0•O-Physical•0•0•0•0•P-Condition•P-Condition•0•0•0•I-Drug•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•O-Physical"}, # 75 token long 1-shot annotation
            {"role": "user", "content": sentence}  # Sentence to annotate
        ]
    )
    
    gpt_annotations = completion.choices[0].message.content.split('•')

    if len(gpt_annotations) > chunk_size:
        gpt_annotations = gpt_annotations[:chunk_size]
    elif len(gpt_annotations) < chunk_size:
        gpt_annotations += ['0'] * (chunk_size - len(gpt_annotations))
    
    return '•'.join(gpt_annotations)


def process_annotations(gpt_df):
    results = pd.DataFrame(columns=['sentence', 'gpt_annotation', 'gold_annotation'])

    # Iteration over each element of the Dataframe (per PMID)
    for index, row in tqdm(GPT_df.iloc[:99].iterrows(), total=3, desc="Processing annotations"):
        sentences = row['sentence'].split('•')
        annotations = row['annotation'].split('•')
        
        temp_sentences = []
        temp_gpt_annotations = []
        temp_gold_annotations = []

        # Chuncking is taking part underneath this iteration
        for i in range(0, len(sentences), chunk_size):
            sentence_chunk = '•'.join(sentences[i:i+chunk_size])
            annotation_chunk = '•'.join(annotations[i:i+chunk_size])
            
            # Utilizing the API call to classify
            gpt_annotation = get_annotations(sentence_chunk)
            
            # Storing results
            temp_sentences.append(sentence_chunk)
            temp_gpt_annotations.append(gpt_annotation)
            temp_gold_annotations.append(annotation_chunk)

        # Merging results with gold data, to evaluate
        results = pd.concat([results, pd.DataFrame({
            'sentence': ['•'.join(temp_sentences).split("•")],
            'gpt_annotation': ['•'.join(temp_gpt_annotations).split("•")],
            'gold_annotation': ['•'.join(temp_gold_annotations).split("•")]
        })], ignore_index=True)

    return results

results = process_annotations(GPT_df)
results.to_csv('fulllenght_annotations_results.csv', index=True) # Saving the results for later evaluation
print(results.head())

Processing annotations:   0%|          | 0/99 [00:00<?, ?it/s]

Processing annotations:   3%|▎         | 3/99 [11:41<6:13:54, 233.69s/it]

                                            sentence   
0  [Abstract:, Postpartum, hemorrhage, (PPH), rem...  \
1  [Abstract, Background:, Glaucoma, surgery, is,...   
2  [Abstract, Background:, The, coronavirus, dise...   

                                      gpt_annotation   
0  [0, P-Condition, P-Condition, 0, 0, 0, O-Morta...  \
1  [0, 0, P-Condition, I-Surgical, 0, 0, 0, I-Oth...   
2  [0, 0, P-Condition, 0, 0, P-Condition, 0, 0, P...   

                                     gold_annotation  
0  [0, P-Condition, P-Condition, P-Condition, 0, ...  
1  [0, 0, I-Surgical, I-Surgical, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, P-Condition, P-Condition, P-Conditio...  





**Token-level Evaluation**

In [65]:
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer

gold_labels_flat = [label for sublist in results['gold_annotation'] for label in sublist]
predicted_labels_flat = [label for sublist in results['gpt_annotation'] for label in sublist]

min_length = min(len(gold_labels_flat), len(predicted_labels_flat))
gold_labels_flat = gold_labels_flat[:min_length]
predicted_labels_flat = predicted_labels_flat[:min_length]

mlb = MultiLabelBinarizer()
gold_labels_binarized = mlb.fit_transform([[label] for label in gold_labels_flat])
predicted_labels_binarized = mlb.transform([[label] for label in predicted_labels_flat])

print(classification_report(gold_labels_binarized, predicted_labels_binarized, target_names=mlb.classes_))

token_accuracy = accuracy_score(gold_labels_binarized, predicted_labels_binarized)
print(f"Accuracy: {token_accuracy:.4f}")

                   precision    recall  f1-score   support

                0       0.83      0.90      0.87     10869
           I-Drug       0.02      0.03      0.03        30
    I-Educational       0.00      0.00      0.00        86
          I-Other       0.02      0.01      0.01       170
       I-Physical       0.00      0.00      0.00        98
  I-Psychological       0.05      0.01      0.01       119
       I-Surgical       0.04      0.03      0.03       120
O-Adverse effects       0.08      0.07      0.08        83
         O-Mental       0.09      0.11      0.10       149
       O-Morality       0.00      0.00      0.00         5
          O-Other       0.04      0.02      0.03       256
       O-Physical       0.13      0.04      0.06       603
            P-Age       0.04      0.03      0.03        40
      P-Condition       0.08      0.07      0.08       417
    P-Sample size       0.08      0.06      0.07        99
            P-Sex       0.00      0.00      0.00       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## **Manual Evaluation**

In [66]:
from IPython.display import HTML

def print_formatted_data_html(df, lstm_predictions, output_file='fulllenght_output.html'):
    html_output = "<html><head><title>Annotated Sentences</title></head><body>"
    
    for idx, row in df.iterrows():
        sentence_html = f"<h3>Sentence {idx+1}</h3><div style='display: flex; flex-wrap: wrap; gap: 20px; align-items: flex-start;'>"
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        lstm_annotations = [label for token, label in lstm_predictions[idx]]
        
        for word_idx, word in enumerate(sentence_words):
            gpt_annotation = gpt_annotations[word_idx].strip() if word_idx < len(gpt_annotations) else '!'
            lstm_annotation = lstm_annotations[word_idx].strip() if word_idx < len(lstm_annotations) else '!'
            gold_annotation = gold_annotations[word_idx].strip() if word_idx < len(gold_annotations) else '!'
            
            gpt_color = "red" if gpt_annotation != gold_annotation else "black"
            lstm_color = "orange" if lstm_annotation != gold_annotation else "black"
            
            word_html = f"<div style='text-align: center;'>"
            word_html += f"<div style='color: black; font-weight: bold;'>{word.strip()}</div>"
            word_html += f"<div style='color: {gpt_color};'>GPT: {gpt_annotation}</div>"
            word_html += f"<div style='color: {lstm_color};'>LSTM: {lstm_annotation}</div>"
            word_html += f"<div style='color: black;'>Gold: {gold_annotation}</div>"
            word_html += "</div>"
            sentence_html += word_html
        
        sentence_html += "</div>"
        html_output += sentence_html
    
    html_output += "</body></html>"
    
    # Saving it to a file
    with open(output_file, 'w') as f:
        f.write(html_output)
    
    # Displaing it within this jupyter notebook
    display(HTML(html_output))

print_formatted_data_html(results, decoded_predictions)