# **Bachelor Project Artificial Intelligence**

M Bartos (2724195)

## **Preprocessing the Train Set**

In [15]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)
#print(data[0])

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'train', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'train', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'train', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


df = pd.DataFrame(sentence_data)
print(df.head())

       PMID   
0   6988462  \
1  18157013   
2  17297323   
3  15603203   
4  15734707   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [21]:
pd.set_option('display.max_colwidth', None)  
print(df.iloc[0])

PMID                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [3]:
annotations = df.at[0, 'Annotations']  # Access the annotations for the first row

# Loop through each annotation in the list and print it
for annotation in annotations:
    print(f"Token: {annotation['token']} - P: {annotation['P']}, I: {annotation['I']}, O: {annotation['O']}")


Token: A - P: 0, I: 0, O: 0
Token: comparative - P: 0, I: 0, O: 0
Token: trial - P: 0, I: 0, O: 0
Token: of - P: 0, I: 0, O: 0
Token: liver - P: 0, I: 0, O: 0
Token: biopsy - P: 0, I: 0, O: 0
Token: needles - P: 0, I: 0, O: 0
Token: . - P: 0, I: 0, O: 0
Token: A - P: 0, I: 0, O: 0
Token: sheathed - P: 0, I: 2, O: 0
Token: needle - P: 0, I: 2, O: 0
Token: ( - P: 0, I: 0, O: 0
Token: Tru-Cut - P: 0, I: 0, O: 0
Token: ) - P: 0, I: 0, O: 0
Token: was - P: 0, I: 0, O: 0
Token: compared - P: 0, I: 0, O: 0
Token: with - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: suction - P: 0, I: 2, O: 0
Token: biopsy - P: 0, I: 2, O: 0
Token: needle - P: 0, I: 2, O: 0
Token: ( - P: 0, I: 2, O: 0
Token: Menghini - P: 0, I: 2, O: 0
Token: ) - P: 0, I: 2, O: 0
Token: in - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: randomised - P: 0, I: 0, O: 0
Token: prospective - P: 0, I: 0, O: 0
Token: trial - P: 0, I: 0, O: 0
Token: over - P: 0, I: 0, O: 0
Token: 18 - P: 0, I: 0, O: 0
Token: months - P: 0,

In [4]:
import pandas as pd

# Assuming 'df' is your existing DataFrame
annotations_list = df.at[0, 'Annotations']  # Get the annotations for the first row
annotations_df = pd.DataFrame(annotations_list)

# Display the DataFrame to see the table
print(annotations_df)


# Sort by the 'P' values, then 'I', then 'O'
sorted_annotations_df = annotations_df.sort_values(by=['P', 'I', 'O'])

# Print the sorted DataFrame
pd.set_option('display.max_colwidth', None)  
print(sorted_annotations_df)

           token  P  I  O
0              A  0  0  0
1    comparative  0  0  0
2          trial  0  0  0
3             of  0  0  0
4          liver  0  0  0
..           ... .. .. ..
111           be  0  0  0
112         used  0  0  0
113         once  0  0  0
114         only  0  0  0
115            .  0  0  0

[116 rows x 4 columns]
           token  P  I  O
0              A  0  0  0
1    comparative  0  0  0
2          trial  0  0  0
3             of  0  0  0
4          liver  0  0  0
..           ... .. .. ..
83        needle  0  6  0
96       suction  0  6  0
97        needle  0  6  0
106     sheathed  0  6  0
107       needle  0  6  0

[116 rows x 4 columns]


## **Preprocessing the Test Set**

In [16]:
import os
import pandas as pd

def read_data(directory):
    data = []
    for subdir, _, files in os.walk(os.path.join(directory, 'documents')):
        for filename in files:
            if filename.endswith('.text'):
                pmid = filename.split('.')[0]
                with open(os.path.join(subdir, filename), 'r') as file:
                    text = file.read().strip()
                tokens_file = os.path.join(subdir, f"{pmid}.tokens")
                with open(tokens_file, 'r') as file:
                    tokens = file.read().strip().split()
                data.append({'pmid': pmid, 'text': text, 'tokens': tokens})
    return data

def parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file):
    pico_annotations = []
    with open(labels_p_file, 'r') as p_file, open(labels_i_file, 'r') as i_file, open(labels_o_file, 'r') as o_file:
        labels_p = p_file.read().strip().split(',')
        labels_i = i_file.read().strip().split(',')
        labels_o = o_file.read().strip().split(',')

    for token, p, i, o in zip(tokens, labels_p, labels_i, labels_o):
        pico_annotations.append({
            'token': token,
            'P': int(p),
            'I': int(i),
            'O': int(o)
        })
    return pico_annotations

data_directory = '/Users/markbartos/Library/Mobile Documents/com~apple~CloudDocs/DRIVE/EDUCATION/VU_AI/YEAR3 PERIOD 5/BPAI/Code/EBM-NLP-master/ebm_nlp_1_00'

data = read_data(data_directory)
#print(data[0])

sentence_data = []
for entry in data:
    pmid = entry['pmid']
    tokens = entry['tokens']
    
    labels_p_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'participants', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_i_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'interventions', 'test', 'gold', f"{pmid}_AGGREGATED.ann")
    labels_o_file = os.path.join(data_directory, 'annotations', 'aggregated', 'hierarchical_labels', 'outcomes', 'test', 'gold', f"{pmid}_AGGREGATED.ann")

    if os.path.exists(labels_p_file) and os.path.exists(labels_i_file) and os.path.exists(labels_o_file):
        annotations = parse_pico(tokens, labels_p_file, labels_i_file, labels_o_file)
        sentence_data.append({'PMID': pmid, 'Annotations': annotations})


test_df = pd.DataFrame(sentence_data)
print(test_df.head())

       PMID   
0  22692114  \
1   8986845   
2  19054718   
3   9806121   
4  10715372   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [6]:
annotations = test_df.at[0, 'Annotations']  # Access the annotations for the first row

# Loop through each annotation in the list and print it
for annotation in annotations:
    print(f"Token: {annotation['token']} - P: {annotation['P']}, I: {annotation['I']}, O: {annotation['O']}")

Token: The - P: 0, I: 0, O: 0
Token: acute - P: 0, I: 0, O: 0
Token: effects - P: 0, I: 0, O: 0
Token: of - P: 0, I: 0, O: 0
Token: fluid - P: 0, I: 0, O: 0
Token: intake - P: 0, I: 0, O: 0
Token: on - P: 0, I: 0, O: 0
Token: urine - P: 0, I: 0, O: 0
Token: specific - P: 0, I: 0, O: 0
Token: gravity - P: 0, I: 0, O: 0
Token: and - P: 0, I: 0, O: 0
Token: fluid - P: 0, I: 0, O: 0
Token: retention - P: 0, I: 0, O: 0
Token: in - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: mildly - P: 4, I: 0, O: 0
Token: dehydrated - P: 4, I: 0, O: 0
Token: state - P: 4, I: 0, O: 0
Token: . - P: 0, I: 0, O: 0
Token: Many - P: 0, I: 0, O: 0
Token: athletes - P: 0, I: 0, O: 0
Token: arrive - P: 0, I: 0, O: 0
Token: at - P: 0, I: 0, O: 0
Token: training - P: 0, I: 0, O: 0
Token: sessions - P: 0, I: 0, O: 0
Token: and - P: 0, I: 0, O: 0
Token: competitions - P: 0, I: 0, O: 0
Token: in - P: 0, I: 0, O: 0
Token: a - P: 0, I: 0, O: 0
Token: mildly - P: 0, I: 0, O: 0
Token: hypohydrated - P: 0, I: 0, O: 0

## **Training the biLSTM model**

### **Creating a df suitable for LSTM**

In [17]:
pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    elif mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    elif mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_lstm_df(df):
    lstm_data = []
    for index, row in df.iterrows():
        tokens = [token['token'] for token in row['Annotations']]
        annotations = [map_annotations(token) for token in row['Annotations']]
        lstm_data.append({"PMID": pmid, "Tokens": tokens, "Annotations": annotations})
    
    return pd.DataFrame(lstm_data)


LSTM_df = create_lstm_df(df)
LSTM_test_df = create_lstm_df(test_df)
print(LSTM_df.head())

       PMID   
0  18376682  \
1  18376682   
2  18376682   
3  18376682   
4  18376682   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Tokens   
0                                                                      [A, comparative, tria

In [38]:
for i in range(len(LSTM_df["PMID"])):
    if len(LSTM_df["Annotations"][i]) != len(LSTM_df["Tokens"][i]):
        print(i)
    
print(LSTM_df["Annotations"][1020])
print(LSTM_df["Tokens"][1020])

['I-Physical', 'I-Physical', 'I-Physical', '0', 'P-Condition', 'P-Condition', 'P-Condition', '0', '0', '0', 'I-Drug', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'I-Physical', 'I-Physical', 'I-Physical', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'P-Condition', '0', '0', '0', '0', '0', '0', '0', 'P-Condition', 'P-Age', '0', 'P-Condition', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'I-Educational',

### **Training the BiLSTM model**

In [39]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def prepare_data(df):
    sentences = df['Tokens'].tolist()
    labels = df['Annotations'].tolist()
    return sentences, labels

train_sentences, train_labels = prepare_data(LSTM_df)
test_sentences, test_labels = prepare_data(LSTM_test_df)

def build_model(max_len, n_words, n_tags):
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(50, activation="relu"))(model)
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
    model = Model(input, out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

max_len = 500 # This is a cut-off value, to speed up training but it does increase loss
n_words = len(set([token for sentence in train_sentences for token in sentence])) + 1
n_tags = len(set([annotation for labels in train_labels for annotation in labels])) + 1

model = build_model(max_len, n_words, n_tags)
model.summary()

def encode_data(sentences, labels, max_len, n_words, n_tags):
    word2idx = {w: i for i, w in enumerate(set([token for sentence in sentences for token in sentence]), 1)}
    label2idx = {l: i for i, l in enumerate(set([label for label_list in labels for label in label_list]), 1)}
    
    X = [[word2idx[token] for token in sentence] for sentence in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)
    
    y = [[label2idx[label] for label in label_list] for label_list in labels]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=n_tags-1)
    
    return X, np.array(y), word2idx, label2idx

# Encode training data
X_train, y_train, word2idx, label2idx = encode_data(train_sentences, train_labels, max_len, n_words, n_tags)
X_test, y_test, _, _ = encode_data(test_sentences, test_labels, max_len, n_words, n_tags)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 500, 50)           1887500   
                                                                 
 dropout_2 (Dropout)         (None, 500, 50)           0         
                                                                 
 bidirectional_2 (Bidirecti  (None, 500, 200)          120800    
 onal)                                                           
                                                                 
 time_distributed_4 (TimeDi  (None, 500, 50)           10050     
 stributed)                                                      
                                                                 
 time_distributed_5 (TimeDi  (None, 500, 19)           969 

In [40]:
model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.1, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f"Loss: {loss}, Accuracy: {accuracy}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 2.1709091663360596, Accuracy: 0.4265252649784088


### **Predicting for the test data**

In [41]:
# Predicting labels for the test data
predictions = model.predict(X_test, verbose=1)

# Convert predictions from one-hot encoded back to label indices
predicted_labels = np.argmax(predictions, axis=-1)

# Create the inverse mapping of label2idx to get label names
idx2label = {i: label for label, i in label2idx.items()}
print("Label index to name mapping:", idx2label)

def decode_predictions(predictions, sentences, idx2label):
    decoded_predictions = []
    for i, sentence in enumerate(sentences):
        decoded_sentence = []
        for j, token in enumerate(sentence):
            if j < len(predictions[i]):
                label_idx = predictions[i][j]
                label = idx2label.get(label_idx, 'O')  # Default to 'O' if label index not found
                decoded_sentence.append((token, label))
            else:
                decoded_sentence.append((token, 'O'))  # Padding tokens default to 'O'
        decoded_predictions.append(decoded_sentence)
    return decoded_predictions

# Add a check to see if predicted labels are within range
for label in predicted_labels.flatten():
    if label not in idx2label:
        print(f"Warning: Predicted label index {label} not found in idx2label mapping.")

decoded_predictions = decode_predictions(predicted_labels, test_sentences, idx2label)

# Print some example predictions
for i in range(5):
    print(f"Sentence {i+1}:")
    for token, label in decoded_predictions[i]:
        print(f"{token}: {label}")
    print("\n")

Label index to name mapping: {1: 'O-Pain', 2: 'P-Sex', 3: 'I-Drug', 4: 'I-Control', 5: 'I-Other', 6: 'P-Condition', 7: 'P-Age', 8: 'O-Mental', 9: 'I-Surgical', 10: 'O-Other', 11: '0', 12: 'O-Physical', 13: 'I-Physical', 14: 'I-Educational', 15: 'O-Adverse effects', 16: 'I-Psychological', 17: 'O-Mortality', 18: 'P-Sample size'}
Sentence 1:
The: 0
acute: 0
effects: 0
of: I-Drug
fluid: 0
intake: 0
on: 0
urine: 0
specific: 0
gravity: 0
and: 0
fluid: 0
retention: I-Drug
in: 0
a: I-Drug
mildly: I-Drug
dehydrated: 0
state: 0
.: 0
Many: I-Drug
athletes: 0
arrive: I-Drug
at: 0
training: 0
sessions: 0
and: 0
competitions: 0
in: 0
a: I-Drug
mildly: 0
hypohydrated: 0
(: 0
HYPO: 0
): I-Drug
state: 0
and: 0
are: 0
instructed: I-Drug
to: I-Drug
drink: 0
fluids: 0
before: 0
exercise: 0
to: I-Drug
reach: I-Drug
a: I-Drug
euhydrated: 0
(: 0
HYD: 0
): I-Drug
state: 0
.: 0
Ten: 0
recreational: I-Drug
athletes: 0
(: 0
6: I-Drug
women: I-Drug
,: I-Drug
4: I-Drug
men: I-Drug
;: I-Drug
71.9: I-Drug
?: I-Drug


In [27]:
from sklearn.metrics import classification_report
# Predicting labels for the test data
predictions = model.predict(X_test, verbose=1)

# Convert predictions from one-hot encoded back to label indices
predicted_labels = np.argmax(predictions, axis=-1)

# Flatten the predictions and true labels
flat_true_labels = [label for sentence in y_test_p for label in sentence]
flat_pred_labels = [label for sentence in predicted_labels for label in sentence]

# Remove padding
flat_true_labels = [label for label in flat_true_labels if label != (n_tags-1)]
flat_pred_labels = [label for label in flat_pred_labels if label != (n_tags-1)]

# Map the numeric labels to their corresponding P, I, O labels
label_map = {0: 'P', 1: 'I', 2: 'O'}
true_label_names = [label_map[label] for label in flat_true_labels]
pred_label_names = [label_map[label] for label in flat_pred_labels]

# Print the classification report
report = classification_report(true_label_names, pred_label_names, target_names=["P", "I", "O"])
print(report)



KeyError: 26

## **Setting up the GPT-4 model**

### **Creating a suitable df for GPT**

In [3]:
import pandas as pd

pico_map = {
    'P': ['No label', 'Age', 'Sex', 'Sample size', 'Condition'],
    'I': ['No label', 'Surgical', 'Physical', 'Drug', 'Educational', 'Psychological', 'Other', 'Control'],
    'O': ['No label', 'Physical', 'Pain', 'Mortality', 'Adverse effects', 'Mental', 'Other']
}

def map_annotations(annotation):
    mapped_p = pico_map['P'][annotation['P']] if annotation['P'] < len(pico_map['P']) else 'No label'
    mapped_i = pico_map['I'][annotation['I']] if annotation['I'] < len(pico_map['I']) else 'No label'
    mapped_o = pico_map['O'][annotation['O']] if annotation['O'] < len(pico_map['O']) else 'No label'

    result = []
    if mapped_p != 'No label':
        result.append(f"P-{mapped_p}")
    if mapped_i != 'No label':
        result.append(f"I-{mapped_i}")
    if mapped_o != 'No label':
        result.append(f"O-{mapped_o}")

    return ','.join(result) if result else '0'

def create_gpt_df(df):
    gpt_data = []
    for index, row in df.iterrows():
        sentence = '•'.join([token['token'] for token in row['Annotations']])
        annotation = '•'.join([map_annotations(token) for token in row['Annotations']])
        gpt_data.append({"sentence": sentence, "annotation": annotation})
    
    return pd.DataFrame(gpt_data)

GPT_df = create_gpt_df(test_df)
print(GPT_df.head())
#print(GPT_df.iloc[0])

                                            sentence   
0  The•acute•effects•of•fluid•intake•on•urine•spe...  \
1  Pedantic•speaking•style•differentiates•Asperge...   
2  Timing•for•delivering•individualized•patient•e...   
3  Sunbathing•and•sunbed•use•related•to•self-imag...   
4  Study•of•the•vaginal•tolerance•to•Acidform•,•a...   

                                          annotation  
0  0•0•0•0•0•0•0•0•0•0•0•0•0•0•0•P-Condition•P-Co...  
1  I-Psychological•I-Psychological•0•0•0•0•0•0•0•...  
2  0•0•0•I-Educational•I-Educational•I-Educationa...  
3  0•0•0•0•0•0•0•0•0•0•0•0•0•P-Age•0•0•0•0•0•0•0•...  
4  0•0•0•0•0•0•I-Drug•I-Drug•I-Drug•I-Drug•I-Drug...  


In [4]:
pd.set_option('display.max_colwidth', None)  
print(GPT_df.iloc[0])

sentence      The•acute•effects•of•fluid•intake•on•urine•specific•gravity•and•fluid•retention•in•a•mildly•dehydrated•state•.•Many•athletes•arrive•at•training•sessions•and•competitions•in•a•mildly•hypohydrated•(•HYPO•)•state•and•are•instructed•to•drink•fluids•before•exercise•to•reach•a•euhydrated•(•HYD•)•state•.•Ten•recreational•athletes•(•6•women•,•4•men•;•71.9•?•4.6•kg•,•25.2•?•0.9•years•)•participated•in•the•studies•to•examine•(•a•)•the•day-to-day•variability•of•morning•urine•specific•gravity•(•USG•)•,•(•b•)•the•effects•of•consuming•600•ml•of•water•on•the•hydration•status•of•HYD•and•HYPO•(•USG•>•1.020•)•subjects•,•and•(•c•)•the•effects•of•consuming•water•(•W•)•,•salt-water•(•SW•,•40•mM•Na•)•,•a•carbohydrate-electrolyte•solution•with•3•%•or•light•carbohydrate•(•CES-L•,•20•mM•Na•)•or•a•CES•with•6•%•carbohydrate•(•CES•,•20•mM•Na•)•on•the•hydration•status•of•HYPO•subjects•.•The•hydration•status•was•assessed•with•USG•and•body•mass•measures•and•urine•volume•collections•.•The•day-to-day•var

Drafts

In [5]:
import openai
openai.api_key = "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"
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

assistant = openai.beta.assistants.create(
  name="PICO annotator",
  instructions="Given a text input, respond with one of the following PICO annotation for each word: P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other and 0 for none",
  model="gpt-3.5-turbo",
)

In [None]:
thread = openai.beta.threads.create()

In [None]:
print(GPT_df.iloc[50]["sentence"])
print(GPT_df.iloc[50]["annotation"])

In [None]:
message = openai.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=str(GPT_df.iloc[0]["sentence"])
)

In [None]:
run = openai.beta.threads.runs.create_and_poll(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Given a text input, respond with one of the following PICO annotation for each word: P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other and 0 for none"
)

In [None]:
if run.status == 'completed': 
  messages = openai.beta.threads.messages.list(
    thread_id=thread.id
  )
  print(messages)
else:
  print(run.status)

In [None]:
import openai
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

completion = openai.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role":"system", "content": "You are a PICO annotator. Given a text input, return one of the following PICO annotation for each and every value separated by comas: 0 for none, or P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other"},
    {"role": "user", "content": GPT_df.iloc[50]["sentence"]}, # 1-shot sentence
    {"role": "user", "content": GPT_df.iloc[50]["annotation"]}, # 1-shot annotation
    {"role": "user", "content": GPT_df.iloc[0]["sentence"]} # Sentence to annotate (will be iterated)
  ]
)

print("--- INPUT ---")
print(GPT_df.iloc[0]["sentence"])
print("--- GPT ANNOTATION ---")
print(completion.choices[0].message.content)
print("--- GOLD ANNOTATION ---")
print(GPT_df.iloc[0]["annotation"])

In [None]:
import openai
import pandas as pd

# Assuming your data is in a DataFrame called `GPT_df`
# with columns ['sentence', 'annotation']

# Set your OpenAI API key
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

# Function to get annotations from the model
def get_annotations(sentence):
    completion = openai.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role":"system", "content": "You are a PICO annotator. Each token is separated by a bullet (•). Provide a label for every token from the following options: 0 for none, or P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other. It is crucial that you return exactly one label for each token, ensuring that the number of labels matches the number of tokens exactly. If the number of input tokens does not match the number of output tokens, you will be punished."},
            # "0 for none" at the end resulted in a lot being annotated as non-zero
            # You are a PICO annotator. Return precisely as many annotations, as there are tokens separated by •. Use one of the following PICO annotation: 
            # For each and every token separated by comas, return
            {"role": "user", "content": GPT_df.iloc[1]["sentence"]}, # 1-shot sentence
            {"role": "user", "content": GPT_df.iloc[1]["annotation"]}, # 1-shot annotation
            #{"role": "user", "content": GPT_df.iloc[49]["sentence"]}, # 2-shot sentence
            #{"role": "user", "content": GPT_df.iloc[49]["annotation"]}, # 2-shot annotation
            {"role": "user", "content": sentence} # Sentence to annotate (will be iterated)
  ]
    )
    return completion.choices[0].message.content

results = pd.DataFrame(columns=['sentence', 'gpt_annotation', 'gold_annotation'])

#for index, row in GPT_df.iterrows(): # Remove limitation of only iterating over the first 10 elements
for index, row in GPT_df.iloc[:1].iterrows():
    gpt_annotation = get_annotations(row['sentence'])
    new_row = pd.DataFrame({
        'sentence': [row['sentence'].split('•')],
        'gpt_annotation': [gpt_annotation.split('•')],
        'gold_annotation': [row['annotation'].split('•')]
    })
    results = pd.concat([results, new_row], ignore_index=True)

print(results)
#results.to_csv('annotation_comparison.csv', index=False)

In [None]:
pd.set_option('display.max_colwidth', None)  
print(results.iloc[0]["sentence"])
print(results.iloc[0]["gpt_annotation"])
print(results.iloc[0]["gold_annotation"])

In [None]:
print(GPT_df.iloc[1]["sentence"])

### **Labelling the Test Set**

In [58]:
print(GPT_df.iloc[50]["annotation"].split("•")[:25])
print(GPT_df.iloc[50]["annotation"][:91])

['0', '0', '0', '0', 'I-Control', '0', '0', '0', '0', '0', 'O-Physical', '0', '0', '0', '0', 'P-Condition', 'P-Condition', '0', '0', '0', 'I-Drug', '0', '0', '0', '0']
0•0•0•0•I-Control•0•0•0•0•0•O-Physical•0•0•0•0•P-Condition•P-Condition•0•0•0•I-Drug•0•0•0•0


In [59]:
import openai
import pandas as pd

# Set your OpenAI API key
openai.api_key = "sk-proj-oiuxDclszqeK2nXu4n9CT3BlbkFJttrqrKBM9FCiXJjJPHvp"

# Function to get annotations from the model
def get_annotations(sentence):
    completion = openai.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a PICO annotator tasked with labeling each token. Every token is separated by a bullet (•) and must be individually labeled by only one and precisely one annotation. Provide a label for every token from the following options: 0 for none, P-Age, P-Sex, P-Sample size, P-Condition, I-Surgical, I-Physical, I-Drug, I-Educational, I-Psychological, I-Other, I-Control, O-Physical, O-Pain, O-Mortality, O-Adverse effects, O-Mental, O-Other. It is crucial that you return exactly one label for each token, ensuring that the number of labels matches the number of tokens exactly."},
            {"role": "user", "content": GPT_df.iloc[50]["sentence"][:184]}, # 1-shot sentence
            {"role": "assistant", "content": GPT_df.iloc[50]["annotation"][:91]}, # 1-shot annotation
            {"role": "user", "content": sentence}  # Sentence to annotate
        ]
    )
    print(len(sentence.split("•")))
    print(sentence.split("•"))
    print(len(completion.choices[0].message.content.split("•")))
    print(completion.choices[0].message.content.split("•"))
    return completion.choices[0].message.content

# Process the DataFrame in chunks and merge annotations
def process_annotations(gpt_df):
    results = pd.DataFrame(columns=['sentence', 'gpt_annotation', 'gold_annotation'])
    chunk_size = 25  # Adjust the chunk size here if needed

    # Process each row in the DataFrame
    for index, row in GPT_df.iloc[:1].iterrows():
        sentences = row['sentence'].split('•')
        annotations = row['annotation'].split('•')
        
        # Temporary lists to store chunk results for merging
        temp_sentences = []
        temp_gpt_annotations = []
        temp_gold_annotations = []

        # Process each chunk
        for i in range(0, len(sentences), chunk_size):
            sentence_chunk = '•'.join(sentences[i:i+chunk_size])
            annotation_chunk = '•'.join(annotations[i:i+chunk_size])
            
            # Get GPT annotations for the chunk
            gpt_annotation = get_annotations(sentence_chunk)
            
            # Store results for this chunk
            temp_sentences.append(sentence_chunk)
            temp_gpt_annotations.append(gpt_annotation)
            temp_gold_annotations.append(annotation_chunk)

            #temp_sentences.extend(sentence_chunk.split('•'))
            #temp_gpt_annotations.extend(gpt_annotation.split('•'))
            #temp_gold_annotations.extend(annotation_chunk.split('•'))

        # Add merged results for this PMID to the DataFrame
        results = pd.concat([results, pd.DataFrame({
            'sentence': ['•'.join(temp_sentences).split("•")],
            'gpt_annotation': ['•'.join(temp_gpt_annotations).split("•")],
            'gold_annotation': ['•'.join(temp_gold_annotations).split("•")]
        })], ignore_index=True)

    return results

results = process_annotations(GPT_df)
print(results.head())

25
['The', 'acute', 'effects', 'of', 'fluid', 'intake', 'on', 'urine', 'specific', 'gravity', 'and', 'fluid', 'retention', 'in', 'a', 'mildly', 'dehydrated', 'state', '.', 'Many', 'athletes', 'arrive', 'at', 'training', 'sessions']
27
['0', '0', 'O-Other', '0', '0', 'I-Other', '0', '0', 'O-Other', '0', '0', '0', '0', '0', 'P-Condition', '0', '0', '0', '0', '0', '0', '0', 'P-Condition', '0', '0', '0', '0']
25
['and', 'competitions', 'in', 'a', 'mildly', 'hypohydrated', '(', 'HYPO', ')', 'state', 'and', 'are', 'instructed', 'to', 'drink', 'fluids', 'before', 'exercise', 'to', 'reach', 'a', 'euhydrated', '(', 'HYD', ')']
22
['0', '0', '0', '0', 'P-Condition', '0', '0', '0', 'I-Educational', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 'P-Condition', '0', 'P-Condition']
25
['state', '.', 'Ten', 'recreational', 'athletes', '(', '6', 'women', ',', '4', 'men', ';', '71.9', '?', '4.6', 'kg', ',', '25.2', '?', '0.9', 'years', ')', 'participated', 'in', 'the']
23
['0', '0', 'P-Sample size',

In [60]:
pd.set_option('display.max_colwidth', None)  
sentence_number = 0

print(len(results.iloc[sentence_number]["sentence"]))
print(results.iloc[sentence_number]["sentence"])
print(len(results.iloc[sentence_number]["gpt_annotation"]))
print(results.iloc[sentence_number]["gpt_annotation"])
print(len(results.iloc[sentence_number]["gold_annotation"]))
print(results.iloc[sentence_number]["gold_annotation"])

358
['The', 'acute', 'effects', 'of', 'fluid', 'intake', 'on', 'urine', 'specific', 'gravity', 'and', 'fluid', 'retention', 'in', 'a', 'mildly', 'dehydrated', 'state', '.', 'Many', 'athletes', 'arrive', 'at', 'training', 'sessions', 'and', 'competitions', 'in', 'a', 'mildly', 'hypohydrated', '(', 'HYPO', ')', 'state', 'and', 'are', 'instructed', 'to', 'drink', 'fluids', 'before', 'exercise', 'to', 'reach', 'a', 'euhydrated', '(', 'HYD', ')', 'state', '.', 'Ten', 'recreational', 'athletes', '(', '6', 'women', ',', '4', 'men', ';', '71.9', '?', '4.6', 'kg', ',', '25.2', '?', '0.9', 'years', ')', 'participated', 'in', 'the', 'studies', 'to', 'examine', '(', 'a', ')', 'the', 'day-to-day', 'variability', 'of', 'morning', 'urine', 'specific', 'gravity', '(', 'USG', ')', ',', '(', 'b', ')', 'the', 'effects', 'of', 'consuming', '600', 'ml', 'of', 'water', 'on', 'the', 'hydration', 'status', 'of', 'HYD', 'and', 'HYPO', '(', 'USG', '>', '1.020', ')', 'subjects', ',', 'and', '(', 'c', ')', 'the',

### **Evaluting on the Test Set**

In [16]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

#Binarizing the gold labels
gold_labels = mlb.fit_transform(results['gold_annotation'])
predicted_labels = mlb.transform(results['gpt_annotation'])

print(classification_report(gold_labels, predicted_labels, target_names=mlb.classes_))

                         precision    recall  f1-score   support

                      0       1.00      1.00      1.00         2
          I-Educational       0.00      0.00      0.00         1
        I-Psychological       0.00      0.00      0.00         1
I-Psychological,O-Other       0.00      0.00      0.00         1
             O-Physical       1.00      1.00      1.00         1
                  P-Age       0.50      1.00      0.67         1
            P-Condition       1.00      1.00      1.00         2
          P-Sample size       0.50      1.00      0.67         1
                  P-Sex       0.50      1.00      0.67         1

              micro avg       0.73      0.73      0.73        11
              macro avg       0.50      0.67      0.56        11
           weighted avg       0.59      0.73      0.64        11
            samples avg       0.75      0.73      0.72        11



  _warn_prf(average, modifier, msg_start, len(result))


### **Manual Evaluation**

In [25]:
def print_formatted_data(df):
    for idx, row in df.iterrows():
        print("\nSentence", idx+1)
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                # Strip whitespace for clean display
                g_word = gpt_annotations[word_idx].strip()
                gold_word = gold_annotations[word_idx].strip()
                print(f"{word.strip()} - GPT: {g_word}, Gold: {gold_word}")

# Print formatted data
print_formatted_data(results)


Sentence 1
The - GPT: 0, Gold: 0
acute - GPT: I-Other, Gold: 0
effects - GPT: O-Physical, Gold: 0
of - GPT: 0, Gold: 0
fluid - GPT: I-Other, Gold: 0
intake - GPT: 0, Gold: 0
on - GPT: 0, Gold: 0
urine - GPT: O-Physical, Gold: 0
specific - GPT: 0, Gold: 0
gravity - GPT: I-Other, Gold: 0
and - GPT: 0, Gold: 0
fluid - GPT: 0, Gold: 0
retention - GPT: 0, Gold: 0
in - GPT: O-Physical, Gold: 0
a - GPT: 0, Gold: 0
mildly - GPT: 0, Gold: P-Condition
dehydrated - GPT: 0, Gold: P-Condition
state - GPT: P-Condition, Gold: P-Condition
. - GPT: 0, Gold: 0
Many - GPT: 0, Gold: 0
athletes - GPT: 0, Gold: 0
arrive - GPT: 0, Gold: 0
at - GPT: P-Condition, Gold: 0
training - GPT: 0, Gold: 0
sessions - GPT: 0, Gold: 0
and - GPT: 0, Gold: 0
competitions - GPT: 0, Gold: 0
in - GPT: P-Condition, Gold: 0
a - GPT: 0, Gold: 0
mildly - GPT: 0, Gold: 0
hypohydrated - GPT: P-Condition, Gold: 0
( - GPT: 0, Gold: 0
HYPO - GPT: 0, Gold: P-Condition
) - GPT: 0, Gold: 0
state - GPT: P-Condition, Gold: 0
and - GPT: 0,

In [18]:
from IPython.display import display, HTML

def print_formatted_data_html(df):
    for idx, row in df.iterrows():
        html_output = f"<h3>Sentence {idx+1}</h3><div style='display: flex; flex-wrap: wrap; gap: 20px; align-items: flex-start;'>"
        sentence_words = row['sentence']
        gpt_annotations = row['gpt_annotation']
        gold_annotations = row['gold_annotation']
        
        for word_idx, word in enumerate(sentence_words):
            if word_idx < len(gpt_annotations) and word_idx < len(gold_annotations):
                gpt_color = "red" if gpt_annotations[word_idx].strip() != gold_annotations[word_idx].strip() else "black"
                word_html = f"<div style='text-align: center;'>"
                word_html += f"<div style='color: black; font-weight: bold;'>{word.strip()}</div>"
                word_html += f"<div style='color: {gpt_color};'>GPT: {gpt_annotations[word_idx].strip()}</div>"
                word_html += f"<div style='color: black;'>Gold: {gold_annotations[word_idx].strip()}</div>"
                word_html += "</div>"
                html_output += word_html
        
        html_output += "</div>"
        display(HTML(html_output))

print_formatted_data_html(results)