# **PICO EXTRACTION** *| BERT NER*
### Participants/Problem (P), Intervention (I), Comparison (C) and Outcome (O)

In [47]:
# Intstall required libraries
!pip install datasets
!pip install transformers
!pip install seqeval



In [48]:
# Import necessary libraries
import re
import pandas as pd
import numpy as np
import itertools
from datasets import Dataset
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

In [49]:
# Create necessary functions
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

    
def replace_entity(old_entity):
    if "POPULATION" in old_entity or "SAMPLE" in old_entity:
        entity = "I-POPULATION"
    elif "PARTICIPANT" in old_entity:
        entity = "I-PARTICIPANT"
    elif "INTERVENTION" in old_entity:
        entity = "I-INTERVENTION"
    elif "COMPAR" in old_entity:
        entity = "I-COMPARISON"
    elif "OUTCOME" in old_entity:
        entity = "I-OUTCOME"
    elif bool(re.search('[a-zA-Z]', old_entity)) == False:
        entity = ""
    else:
        entity = "O"
    return(entity)

In [50]:
# Read the text data
with open("../Datasets/pubmed_data.txt") as f:
    content = f.readlines()

In [51]:
print(len(content))

369304


### **DATA PREPARATION**
Convert the text data to PICO BERT trainable data format

Convert the text file and seperate the following fields in seperate column in a dataframe

* PMID
* PICO_Entity
* AOMRC Entity
* Text

In [52]:
# Data preprocessing & manipulation
content_df = pd.DataFrame()
for i in range(10000):
    if("###" in content[i]):
        content_df.loc[i, "PMID"] = content[i].split(":")[0].replace("#", "")
    elif(content[i] == "\n"):
        pass
    else:
        content_df.loc[i, "PMID"] = content_df.loc[i-1, "PMID"]
        content_df.loc[i, "PICO_Entity"] = content[i].split("|")[0]
        content_df.loc[i, "AOMRC_Entity"] = content[i].split("|")[1]
        content_df.loc[i, "Text"] = content[i].split("|")[2]

In [53]:
# Display manipulated data | samples
content_df.head()

Unnamed: 0,PMID,PICO_Entity,AOMRC_Entity,Text
0,28628768,,,
1,28628768,OBJECTIVE,A,To test the efficacy of a pregnancy adapted ve...
2,28628768,SETTING,M,Online and telephone .\n
3,28628768,POPULATION OR SAMPLE,P,Self-referred pregnant women ( gestational wee...
4,28628768,METHODS,M,@ pregnant women ( gestational week @ ) with m...


### **DATA CLEANING**
You can there are NAN's occupied with the columns

In [54]:
# Remove NAN's
content_df = content_df[pd.notnull(content_df['Text'])]

In [55]:
# Display cleaned data | samples
content_df.head()

Unnamed: 0,PMID,PICO_Entity,AOMRC_Entity,Text
1,28628768,OBJECTIVE,A,To test the efficacy of a pregnancy adapted ve...
2,28628768,SETTING,M,Online and telephone .\n
3,28628768,POPULATION OR SAMPLE,P,Self-referred pregnant women ( gestational wee...
4,28628768,METHODS,M,@ pregnant women ( gestational week @ ) with m...
5,28628768,MAIN OUTCOME MEASURES,O,The primary outcome was depressive symptoms me...


### **DATA CONVERSION**
Convert the data with the required format and rename

In [56]:
# Datatype formating and renaming
content_df["Text"] = content_df["Text"].astype(str)
content_df["Text"] = content_df["Text"].apply(lambda x:x.strip())
content_df = content_df[["PMID", "Text", "PICO_Entity"]]
content_df.rename(columns = {'Text':'tokens', 'PICO_Entity':'ner_tags'}, inplace = True)
content_df = content_df.reset_index(drop=True)

In [57]:
# Display formatted samples
content_df.head()

Unnamed: 0,PMID,tokens,ner_tags
0,28628768,To test the efficacy of a pregnancy adapted ve...,OBJECTIVE
1,28628768,Online and telephone .,SETTING
2,28628768,Self-referred pregnant women ( gestational wee...,POPULATION OR SAMPLE
3,28628768,@ pregnant women ( gestational week @ ) with m...,METHODS
4,28628768,The primary outcome was depressive symptoms me...,MAIN OUTCOME MEASURES


### **DATA PREPROCESSING**
Convert the data required parameters that needs to train the model

In [58]:
# Calculate token length
content_df["token_len"] = content_df["tokens"].apply(lambda x: len(x.split()))

In [59]:
# Display samples
print(content_df.head())

       PMID                                             tokens  \
0  28628768  To test the efficacy of a pregnancy adapted ve...   
1  28628768                             Online and telephone .   
2  28628768  Self-referred pregnant women ( gestational wee...   
3  28628768  @ pregnant women ( gestational week @ ) with m...   
4  28628768  The primary outcome was depressive symptoms me...   

                ner_tags  token_len  
0              OBJECTIVE         29  
1                SETTING          4  
2   POPULATION OR SAMPLE         17  
3                METHODS         36  
4  MAIN OUTCOME MEASURES         18  


In [60]:
# Replace entities with required entity names
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x: replace_entity(x))

In [61]:
# Display samples
content_df.head()

Unnamed: 0,PMID,tokens,ner_tags,token_len
0,28628768,To test the efficacy of a pregnancy adapted ve...,O,29
1,28628768,Online and telephone .,O,4
2,28628768,Self-referred pregnant women ( gestational wee...,I-POPULATION,17
3,28628768,@ pregnant women ( gestational week @ ) with m...,O,36
4,28628768,The primary outcome was depressive symptoms me...,I-OUTCOME,18


In [62]:
# Iterate entities to all words/token
for i in range(len(content_df)):
    content_df.loc[i, "ner_tags"] = str([content_df["ner_tags"][i]] * content_df["token_len"][i])
    content_df.loc[i, "tokens"] = str(content_df["tokens"][i].split())

In [63]:
#content_df = pd.read_excel("df.xlsx")

In [64]:
# Converts string to list of values
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x:eval(x))
content_df["tokens"] = content_df["tokens"].apply(lambda x:eval(x))

In [65]:
# Group by PMID
content_df = content_df.groupby('PMID').agg(lambda x: x.to_list())

In [66]:
# Combine list of list
content_df["tokens"] = content_df["tokens"].apply(lambda x:list(itertools.chain.from_iterable(x)))
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x:list(itertools.chain.from_iterable(x)))

In [67]:
# Reset dataframe
content_df = content_df.reset_index(drop=True)

In [68]:
# Display preprocessed data | samples
content_df.head()

Unnamed: 0,tokens,ner_tags,token_len
0,"[Reviews, of, the, quality, of, reporting, of,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[22, 40, 22, 7, 25, 16, 15, 15, 21, 22, 26, 13..."
1,"[Fan, therapy, is, often, suggested, for, reli...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[25, 29, 23, 17, 32, 10, 50, 25, 22, 12, 9]"
2,"[To, compare, the, effects, of, nut-based, sna...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[34, 10, 6, 18, 13, 15, 52, 25, 42]"
3,"[Increase, physical, activity, in, health, car...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[26, 5, 4, 17, 41, 10, 26, 21, 17, 14, 18, 41,..."
4,"[To, investigate, the, effect, of, a, @, senso...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[25, 28, 20, 36, 30, 34, 29, 24, 30]"


In [69]:
# Save preprocessed data
content_df.to_excel("../Datasets/preprocessed_df.xlsx")

**TEST / TRAIN SPLIT**
---
Split the train data set into two sets by the ratio 8:2. and we can furthur evaluate the model performance

In [70]:
#Test train split
train, test = train_test_split(content_df, test_size=0.2, shuffle=False, random_state=1)

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

train_df = train
test_df = test

In [71]:
# Print Shape
print(len(train_df))
print(len(test_df))

518
130


In [72]:
# Label to numeric representation
label_list = ['O','I-POPULATION', 'I-PARTICIPANT','I-INTERVENTION', 'I-COMPARISON','I-OUTCOME']
label_encoding_dict = {'O': 0, 'I-POPULATION': 1, 'I-PARTICIPANT': 2, 'I-INTERVENTION': 3, 'I-COMPARISON': 4, 'I-OUTCOME': 5}

**MODEL INITIALIZATION**
---
Initialize the BERT model. Define the Task Name, Model, Tokenizer

In [73]:
# Initialize the BERT model
task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Initialize pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 

data_collator = DataCollatorForTokenClassification(tokenizer) #Some random data enhancements

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

**TOKENIZATION**
---
Tokenize and embed the dataset with pretrained BERT

In [74]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

**HYPER PARAMETER TURNING**
---
Hyperparameters - We can iterate and tune the model with these parameters for better results

In [75]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

**TRAINING MODULE**
---
Train the model with different parameters and finalize the optimal one

In [76]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, token_len. If tokens, ner_tags, token_len are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 518
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 99
  Number of trainable parameters = 66367494


  0%|          | 0/99 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, token_len. If tokens, ner_tags, token_len are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16


  0%|          | 0/9 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5536863207817078, 'eval_precision': 0.002785515320334262, 'eval_recall': 0.008438818565400843, 'eval_f1': 0.00418848167539267, 'eval_accuracy': 0.805645657975775, 'eval_runtime': 136.6961, 'eval_samples_per_second': 0.951, 'eval_steps_per_second': 0.066, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, token_len. If tokens, ner_tags, token_len are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.47596320509910583, 'eval_precision': 0.026785714285714284, 'eval_recall': 0.0759493670886076, 'eval_f1': 0.039603960396039604, 'eval_accuracy': 0.8311845616916445, 'eval_runtime': 142.843, 'eval_samples_per_second': 0.91, 'eval_steps_per_second': 0.063, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, token_len. If tokens, ner_tags, token_len are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16


  0%|          | 0/9 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.46126243472099304, 'eval_precision': 0.06497175141242938, 'eval_recall': 0.1940928270042194, 'eval_f1': 0.09735449735449735, 'eval_accuracy': 0.8391090125230959, 'eval_runtime': 148.1912, 'eval_samples_per_second': 0.877, 'eval_steps_per_second': 0.061, 'epoch': 3.0}
{'train_runtime': 6289.0713, 'train_samples_per_second': 0.247, 'train_steps_per_second': 0.016, 'train_loss': 0.45245681146178585, 'epoch': 3.0}


TrainOutput(global_step=99, training_loss=0.45245681146178585, metrics={'train_runtime': 6289.0713, 'train_samples_per_second': 0.247, 'train_steps_per_second': 0.016, 'train_loss': 0.45245681146178585, 'epoch': 3.0})

# **EVALUATION**

---

Evaluate the remaining 20 percent of training data. 
It is a good idea to take a part of training data for validation while training the model.

In [77]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tokens, ner_tags, token_len. If tokens, ner_tags, token_len are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 130
  Batch size = 16


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.46126243472099304,
 'eval_precision': 0.06497175141242938,
 'eval_recall': 0.1940928270042194,
 'eval_f1': 0.09735449735449735,
 'eval_accuracy': 0.8391090125230959,
 'eval_runtime': 152.5579,
 'eval_samples_per_second': 0.852,
 'eval_steps_per_second': 0.059,
 'epoch': 3.0}

*Save the Model*
---
Save the model to disk

In [78]:
# Save the model to disk
trainer.save_model('un-ner.model')

Saving model checkpoint to un-ner.model
Configuration saved in un-ner.model/config.json
Model weights saved in un-ner.model/pytorch_model.bin
tokenizer config file saved in un-ner.model/tokenizer_config.json
Special tokens file saved in un-ner.model/special_tokens_map.json


**PREDICTION MODULE**
---
Predict the **Unseen Data**


In [79]:
# Load the tokenizer and trained model from disk
tokenizer = AutoTokenizer.from_pretrained('./un-ner.model/')
model = AutoModelForTokenClassification.from_pretrained('./un-ner.model/', num_labels=len(label_list))

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./un-ner.model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./un-ner.model/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",

In [80]:
# Input unseen data
paragraph = '''
OBJECTIVE|A|To test the efficacy of a pregnancy adapted version of an existing 10-week ICBT-program for depression as well as assessing acceptability and adherence DESIGN: Randomised controlled trial.
SETTING|M|Online and telephone.
POPULATION OR SAMPLE|P|Self-referred pregnant women (gestational week 10-28 at intake) currently suffering from major depressive disorder.
METHODS|M|42 pregnant women (gestational week 12-28) with major depression were randomised to either treatment as usual (TAU) provided at their antenatal clinic or to ICBT as an add-on to usual care.
MAIN OUTCOME MEASURES|O|The primary outcome was depressive symptoms measured with the Montgomery-sberg depression rating scale-self report (MADRS-S). The Edinburgh Postnatal Depression Scale and measures of anxiety and sleep were used. Credibility, satisfaction, adherence and utilization were also assessed.
RESULTS|R|The ICBT group had significantly lower levels of depressive symptoms post treatment (p < 0.001, Hedges g =1.21) and were more likely to be responders (i.e. achieve a statistically reliable improvement) (RR = 0.36; p = 0.004). Measures of treatment credibility, satisfaction, utilization, and adherence were comparable to implemented ICBT for depression.
LIMITATIONS|Others|Small sample size and no long-term evaluation.
CONCLUSION|C|Pregnancy adapted ICBT for antenatal depression is feasible, acceptable and efficacious. These results need to be replicated in larger trials to validate these promising findings.
'''

In [81]:
# Tokenize the paragraph
tokens = tokenizer(paragraph)
torch.tensor(tokens['input_ids']).unsqueeze(0).size()

torch.Size([1, 332])

In [82]:
# Predictions
predictions = model.forward(input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0), attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0))
predictions = torch.argmax(predictions.logits.squeeze(), axis=1)
predictions = [label_list[i] for i in predictions]

In [83]:
print(predictions)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIPANT', 'I-PAR

In [84]:
words = tokenizer.batch_decode(tokens['input_ids'])

In [85]:
# Store the token and predicted entities in a dataframe
prediction_results = pd.DataFrame({'ner': predictions, 'words': words})

In [86]:
# Print predicted samples
print(prediction_results)

    ner      words
0     O      [CLS]
1     O  objective
2     O          |
3     O          a
4     O          |
..   ..        ...
327   O      these
328   O  promising
329   O   findings
330   O          .
331   O      [SEP]

[332 rows x 2 columns]


*Export the results*
---
Export test results to **csv**

Select the **Words, Labels** and **Confidence scores** to export csv file

In [87]:
# Export results to csv
prediction_results.to_csv("prediction_results.csv")