# **PICO EXTRACTION** *| BERT NER*
### Participants/Problem (P), Intervention (I), Comparison (C) and Outcome (O)

In [57]:
# Intstall required libraries
!pip install datasets
!pip install -U accelerate
!pip install -U transformers
!pip install seqeval
!pip install tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlo

In [58]:
# Import necessary libraries
import re
import pandas as pd
import numpy as np
import itertools
from datasets import Dataset
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch
from tqdm import tqdm

In [59]:
# Create necessary functions
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {"precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"]}

    
def replace_entity(old_entity):
    if "POPULATION" in old_entity or "SAMPLE" in old_entity:
        entity = "I-POPULATION"
    elif "PARTICIPANT" in old_entity:
        entity = "I-PARTICIPANT"
    elif "INTERVENTION" in old_entity:
        entity = "I-INTERVENTION"
    elif "COMPAR" in old_entity:
        entity = "I-COMPARISON"
    elif "OUTCOME" in old_entity:
        entity = "I-OUTCOME"
    elif bool(re.search('[a-zA-Z]', old_entity)) == False:
        entity = ""
    else:
        entity = "O"
    return(entity)

In [60]:
# Read the text data
# with open("/kaggle/input/pubmeddata/pubmed_data.txt") as f:
#     content = f.readlines()
    
with open("/kaggle/input/pubmeddata/pubmed_custom_picobert.txt") as f:
    content_custom = f.readlines()

In [61]:
# print(len(content))
print(len(content_custom))

71


### **DATA PREPARATION**
Convert the text data to PICO BERT trainable data format

Convert the text file and seperate the following fields in seperate column in a dataframe

* PMID
* PICO_Entity
* AOMRC Entity
* Text

In [62]:
print(content_custom)

['###27932540:\n', 'OBJECTIVE|A|on a reduction in the incidence of childhood obesity .\n', 'PARTICIPANTS|P|In school-age children .\n', 'INTERVENTIONS|I|what is the effect of a school-based physical activity program .\n', 'INTERVENTION AND COMPARISON|I|compared with no intervention .\n', '\n', '###27932541:\n', 'OBJECTIVE|A|on a reduction in reported incidences of bullying .\n', 'PARTICIPANTS|P|In high school children .\n', 'INTERVENTIONS|I|what is the effect of a nurse-led presentation on bullying .\n', 'INTERVENTION AND COMPARISON|I|ccompared with no intervention .\n', '\n', '###27932542:\n', 'OBJECTIVE|A|at an increased risk of developing esophageal cancer .\n', 'PARTICIPANTS|P|compared with males age @ and older .\n', 'INTERVENTIONS|I|who have a history of @ year of smoking or less .\n', 'INTERVENTION AND COMPARISON|I|who have no smoking history .\n', '\n', '###27932543:\n', 'OBJECTIVE|A|at greater risk for developing blood clots .\n', 'PARTICIPANTS|P|Are women ages @-@ .\n', 'INTE

In [63]:
# Data preprocessing & manipulation
# content_df = pd.DataFrame()
content_df_custom = pd.DataFrame()

# for i in tqdm(range(len(content))):
#     if("###" in content[i]):
#         content_df.loc[i, "PMID"] = content[i].split(":")[0].replace("#", "")
#     elif(content[i] == "\n"):
#         pass
#     else:
#         content_df.loc[i, "PMID"] = content_df.loc[i-1, "PMID"]
#         content_df.loc[i, "PICO_Entity"] = content[i].split("|")[0]
#         content_df.loc[i, "AOMRC_Entity"] = content[i].split("|")[1]
#         content_df.loc[i, "Text"] = content[i].split("|")[2]
for i in tqdm(range(len(content_custom))):
    if("###" in content_custom[i]):
        content_df_custom.loc[i, "PMID"] = content_custom[i].split(":")[0].replace("#", "")
    elif(content_custom[i] == "\n"):
        pass
    else:
        content_df_custom.loc[i, "PMID"] = content_df_custom.loc[i-1, "PMID"]
        content_df_custom.loc[i, "PICO_Entity"] = content_custom[i].split("|")[0]
        content_df_custom.loc[i, "AOMRC_Entity"] = content_custom[i].split("|")[1]
        content_df_custom.loc[i, "Text"] = content_custom[i].split("|")[2]

100%|██████████| 71/71 [00:00<00:00, 1239.10it/s]


In [64]:
# content_df.to_csv('processed_data.csv', encoding='utf-8')
# content_df_custom.to_csv('processed_data_custom.csv', encoding='utf-8')

In [65]:
# Display manipulated data | samples
content_df = pd.read_csv("/kaggle/input/pubmeddata/processed_data.csv")
content_df = content_df.drop('Unnamed: 0', axis=1)
content_df.head()
content_df_custom.head()

Unnamed: 0,PMID,PICO_Entity,AOMRC_Entity,Text
0,27932540,,,
1,27932540,OBJECTIVE,A,on a reduction in the incidence of childhood o...
2,27932540,PARTICIPANTS,P,In school-age children .\n
3,27932540,INTERVENTIONS,I,what is the effect of a school-based physical ...
4,27932540,INTERVENTION AND COMPARISON,I,compared with no intervention .\n


### **DATA CLEANING**
You can there are NAN's occupied with the columns

In [66]:
# Remove NAN's
content_df = content_df[pd.notnull(content_df['Text'])]
content_df_custom = content_df_custom[pd.notnull(content_df_custom['Text'])]

In [67]:
# Display cleaned data | samples
content_df.head()
content_df_custom.head()

Unnamed: 0,PMID,PICO_Entity,AOMRC_Entity,Text
1,27932540,OBJECTIVE,A,on a reduction in the incidence of childhood o...
2,27932540,PARTICIPANTS,P,In school-age children .\n
3,27932540,INTERVENTIONS,I,what is the effect of a school-based physical ...
4,27932540,INTERVENTION AND COMPARISON,I,compared with no intervention .\n
7,27932541,OBJECTIVE,A,on a reduction in reported incidences of bully...


### **DATA CONVERSION**
Convert the data with the required format and rename

In [68]:
# Datatype formating and renaming
content_df["Text"] = content_df["Text"].astype(str)
content_df["Text"] = content_df["Text"].apply(lambda x:x.strip())
content_df = content_df[["PMID", "Text", "PICO_Entity"]]
content_df.rename(columns = {'Text':'tokens', 'PICO_Entity':'ner_tags'}, inplace = True)
content_df = content_df.reset_index(drop=True)

# Datatype formating and renaming
content_df_custom["Text"] = content_df_custom["Text"].astype(str)
content_df_custom["Text"] = content_df_custom["Text"].apply(lambda x:x.strip())
content_df_custom = content_df_custom[["PMID", "Text", "PICO_Entity"]]
content_df_custom.rename(columns = {'Text':'tokens', 'PICO_Entity':'ner_tags'}, inplace = True)
content_df_custom = content_df_custom.reset_index(drop=True)

In [69]:
# Display formatted samples
content_df.head()
# content_df_custom.head()

Unnamed: 0,PMID,tokens,ner_tags
0,28628768,To test the efficacy of a pregnancy adapted ve...,OBJECTIVE
1,28628768,Online and telephone .,SETTING
2,28628768,Self-referred pregnant women ( gestational wee...,POPULATION OR SAMPLE
3,28628768,@ pregnant women ( gestational week @ ) with m...,METHODS
4,28628768,The primary outcome was depressive symptoms me...,MAIN OUTCOME MEASURES


### **DATA PREPROCESSING**
Convert the data required parameters that needs to train the model

In [70]:
# Calculate token length
content_df["token_len"] = content_df["tokens"].apply(lambda x: len(x.split()))

content_df_custom["token_len"] = content_df_custom["tokens"].apply(lambda x: len(x.split()))

In [71]:
# Display samples
print(content_df.head())
print(content_df_custom.head())

       PMID                                             tokens  \
0  28628768  To test the efficacy of a pregnancy adapted ve...   
1  28628768                             Online and telephone .   
2  28628768  Self-referred pregnant women ( gestational wee...   
3  28628768  @ pregnant women ( gestational week @ ) with m...   
4  28628768  The primary outcome was depressive symptoms me...   

                ner_tags  token_len  
0              OBJECTIVE         29  
1                SETTING          4  
2   POPULATION OR SAMPLE         17  
3                METHODS         36  
4  MAIN OUTCOME MEASURES         18  
       PMID                                             tokens  \
0  27932540  on a reduction in the incidence of childhood o...   
1  27932540                           In school-age children .   
2  27932540  what is the effect of a school-based physical ...   
3  27932540                    compared with no intervention .   
4  27932541  on a reduction in reported incid

In [72]:
# Replace entities with required entity names
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x: replace_entity(x))

# Replace entities with required entity names
content_df_custom["ner_tags"] = content_df_custom["ner_tags"].apply(lambda x: replace_entity(x))

In [73]:
# Display samples
print(content_df.head())
print(content_df_custom.head())

       PMID                                             tokens      ner_tags  \
0  28628768  To test the efficacy of a pregnancy adapted ve...             O   
1  28628768                             Online and telephone .             O   
2  28628768  Self-referred pregnant women ( gestational wee...  I-POPULATION   
3  28628768  @ pregnant women ( gestational week @ ) with m...             O   
4  28628768  The primary outcome was depressive symptoms me...     I-OUTCOME   

   token_len  
0         29  
1          4  
2         17  
3         36  
4         18  
       PMID                                             tokens  \
0  27932540  on a reduction in the incidence of childhood o...   
1  27932540                           In school-age children .   
2  27932540  what is the effect of a school-based physical ...   
3  27932540                    compared with no intervention .   
4  27932541  on a reduction in reported incidences of bully...   

         ner_tags  token_len  
0

In [74]:
# Iterate entities to all words/token
for i in range(len(content_df)):
    content_df.loc[i, "ner_tags"] = str([content_df["ner_tags"][i]] * content_df["token_len"][i])
    content_df.loc[i, "tokens"] = str(content_df["tokens"][i].split())

# Iterate entities to all words/token
for i in range(len(content_df_custom)):
    content_df_custom.loc[i, "ner_tags"] = str([content_df_custom["ner_tags"][i]] * content_df_custom["token_len"][i])
    content_df_custom.loc[i, "tokens"] = str(content_df_custom["tokens"][i].split())

In [75]:
content_df.head()
content_df_custom.head()

Unnamed: 0,PMID,tokens,ner_tags,token_len
0,27932540,"['on', 'a', 'reduction', 'in', 'the', 'inciden...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",10
1,27932540,"['In', 'school-age', 'children', '.']","['I-PARTICIPANT', 'I-PARTICIPANT', 'I-PARTICIP...",4
2,27932540,"['what', 'is', 'the', 'effect', 'of', 'a', 'sc...","['I-INTERVENTION', 'I-INTERVENTION', 'I-INTERV...",11
3,27932540,"['compared', 'with', 'no', 'intervention', '.']","['I-INTERVENTION', 'I-INTERVENTION', 'I-INTERV...",5
4,27932541,"['on', 'a', 'reduction', 'in', 'reported', 'in...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']",9


In [76]:
# # Converts string to list of values
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x:eval(x))
content_df["tokens"] = content_df["tokens"].apply(lambda x:eval(x))

# Converts string to list of values
content_df_custom["ner_tags"] = content_df_custom["ner_tags"].apply(lambda x:eval(x))
content_df_custom["tokens"] = content_df_custom["tokens"].apply(lambda x:eval(x))

In [77]:
# # Group by PMID
content_df = content_df.groupby('PMID').agg(lambda x: x.to_list())

# Group by PMID
content_df_custom = content_df_custom.groupby('PMID').agg(lambda x: x.to_list())

In [78]:
# # Combine list of list
content_df["tokens"] = content_df["tokens"].apply(lambda x:list(itertools.chain.from_iterable(x)))
content_df["ner_tags"] = content_df["ner_tags"].apply(lambda x:list(itertools.chain.from_iterable(x)))

# Combine list of list
content_df_custom["tokens"] = content_df_custom["tokens"].apply(lambda x:list(itertools.chain.from_iterable(x)))
content_df_custom["ner_tags"] = content_df_custom["ner_tags"].apply(lambda x:list(itertools.chain.from_iterable(x)))

In [79]:
# # Reset dataframe
content_df = content_df.reset_index(drop=True)

# Reset dataframe
content_df_custom = content_df_custom.reset_index(drop=True)

In [80]:
# # Display preprocessed data | samples
content_df.head()

content_df_custom.head()

Unnamed: 0,tokens,ner_tags,token_len
0,"[on, a, reduction, in, the, incidence, of, chi...","[O, O, O, O, O, O, O, O, O, O, I-PARTICIPANT, ...","[10, 4, 11, 5]"
1,"[on, a, reduction, in, reported, incidences, o...","[O, O, O, O, O, O, O, O, O, I-PARTICIPANT, I-P...","[9, 5, 11, 5]"
2,"[at, an, increased, risk, of, developing, esop...","[O, O, O, O, O, O, O, O, O, I-PARTICIPANT, I-P...","[9, 8, 12, 6]"
3,"[at, greater, risk, for, developing, blood, cl...","[O, O, O, O, O, O, O, O, I-PARTICIPANT, I-PART...","[8, 5, 5, 6]"
4,"[more, effective, in, detecting, breast, cance...","[O, O, O, O, O, O, O, I-PARTICIPANT, I-PARTICI...","[7, 6, 5, 8]"


In [81]:
# Save preprocessed data
content_df.to_excel("preprocessed_df.xlsx")

content_df_custom.to_excel("preprocessed_df_custom.xlsx")

**TEST / TRAIN SPLIT**
---
Split the train data set into two sets by the ratio 8:2. and we can furthur evaluate the model performance

In [82]:
#content_df = pd.read_excel("/kaggle/input/pubmeddata/preprocessed_df_complete.xlsx")

#content_df_custom = pd.read_excel("/kaggle/working/preprocessed_df_custom.xlsx")

In [83]:
content_df.head()

Unnamed: 0,tokens,ner_tags,token_len
0,"[To, determine, whether, prophylactic, treatme...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[25, 38, 9, 19, 15, 53, 49, 21, 29]"
1,"[Since, it, is, not, clear, whether, testoster...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[44, 20, 38, 17, 17, 10, 21, 7, 25, 10, 30, 24]"
2,"[The, aim, was, to, study, the, pharmacokineti...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[30, 24, 41, 22, 12, 10, 42, 13, 16, 45, 37, 2..."
3,"[To, investigate, the, significance, of, treat...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[33, 15, 29, 32, 26, 19, 35, 33, 37, 16, 38, 7]"
4,"[Dopamine, agonists, have, a, well, establishe...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[26, 34, 12, 45, 22, 45, 27, 65, 22, 47, 23, 3..."


In [84]:
#Test train split
train, test = train_test_split(content_df, test_size=0.2, shuffle=False, random_state=1)

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
content_df_custom = content_df_custom.reset_index(drop=True)
train_df = train
test_df = test

test_df_custom = content_df_custom

test_custom = test_df_custom.reset_index(drop=True)
test_df_custom = test_custom

In [85]:
# Print Shape
print(len(train_df))
print(len(test_df))

# Print Shape
print(len(test_df_custom))

19734
4934
7


In [86]:
# Label to numeric representation
label_list = ['O','I-POPULATION', 'I-PARTICIPANT','I-INTERVENTION', 'I-COMPARISON','I-OUTCOME']
label_encoding_dict = {'O': 0, 'I-POPULATION': 1, 'I-PARTICIPANT': 2, 'I-INTERVENTION': 3, 'I-COMPARISON': 4, 'I-OUTCOME': 5}

**MODEL INITIALIZATION**
---
Initialize the BERT model. Define the Task Name, Model, Tokenizer

In [87]:
# Initialize the BERT model
task = "ner" 
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Initialize pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 

data_collator = DataCollatorForTokenClassification(tokenizer) #Some random data enhancements

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

**TOKENIZATION**
---
Tokenize and embed the dataset with pretrained BERT

In [88]:
train_df

Unnamed: 0,tokens,ner_tags,token_len
0,"[To, determine, whether, prophylactic, treatme...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[25, 38, 9, 19, 15, 53, 49, 21, 29]"
1,"[Since, it, is, not, clear, whether, testoster...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[44, 20, 38, 17, 17, 10, 21, 7, 25, 10, 30, 24]"
2,"[The, aim, was, to, study, the, pharmacokineti...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[30, 24, 41, 22, 12, 10, 42, 13, 16, 45, 37, 2..."
3,"[To, investigate, the, significance, of, treat...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[33, 15, 29, 32, 26, 19, 35, 33, 37, 16, 38, 7]"
4,"[Dopamine, agonists, have, a, well, establishe...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[26, 34, 12, 45, 22, 45, 27, 65, 22, 47, 23, 3..."
...,...,...,...
19729,"[To, evaluate, the, cost, effectiveness, of, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[26, 15, 2, 27, 13, 14, 15, 15, 9, 19, 16, 44,..."
19730,"[The, objective, of, this, study, was, to, det...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[38, 30, 31, 13, 17, 17, 21, 22, 21, 26, 14]"
19731,"[hospital, falls, place, a, substantial, burde...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[10, 18, 19, 21, 9, 16, 12, 8, 15, 18, 17, 30,..."
19732,"[Optimal, bowel, preparation, is, associated, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[24, 29, 9, 10, 36, 22, 21, 29, 39, 28, 19, 17..."


In [89]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
test_dataset_custom = Dataset.from_pandas(test_df_custom)
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets_custom = test_dataset_custom.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

**HYPER PARAMETER TURNING**
---
Hyperparameters - We can iterate and tune the model with these parameters for better results

In [90]:
args = TrainingArguments(
    f"test-{task}",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

metric = load_metric("seqeval")

**TRAINING MODULE**
---
Train the model with different parameters and finalize the optimal one

In [91]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3267,0.280777,0.330155,0.645941,0.436967,0.886866
2,0.2199,0.258759,0.382363,0.64502,0.480117,0.904921
3,0.1693,0.265318,0.383734,0.648129,0.482059,0.909435




TrainOutput(global_step=1851, training_loss=0.21943413122868552, metrics={'train_runtime': 1891.0281, 'train_samples_per_second': 31.307, 'train_steps_per_second': 0.979, 'total_flos': 7724608640091936.0, 'train_loss': 0.21943413122868552, 'epoch': 3.0})

# **EVALUATION**

---

Evaluate the custom dataset

In [92]:
trainer.predict(test_dataset= test_tokenized_datasets_custom)

  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[[ 1.3793315 , -1.2613097 , -0.64545316,  0.1297017 ,
         -3.1528769 ,  1.9434631 ],
        [ 5.781609  , -2.554851  , -1.2305375 , -1.2460514 ,
         -4.5526385 ,  0.67352897],
        [ 6.077831  , -2.6164298 , -1.1976192 , -1.1274681 ,
         -4.703661  ,  0.81843615],
        ...,
        [ 1.1889987 , -2.0379605 , -1.1466179 ,  1.2493838 ,
         -3.4935138 ,  2.4941478 ],
        [ 1.2031623 , -1.9950373 , -1.1414206 ,  1.172266  ,
         -3.4921565 ,  2.5027792 ],
        [ 1.2311547 , -1.9717263 , -1.1611257 ,  1.1479452 ,
         -3.4341848 ,  2.404171  ]],

       [[ 1.8432531 , -0.68940663, -0.2307233 , -0.9598694 ,
         -3.2478917 ,  1.675049  ],
        [ 4.9784684 , -2.15573   , -1.1174445 , -2.1957538 ,
         -4.3356614 ,  2.242342  ],
        [ 5.258412  , -2.2477875 , -1.1562958 , -2.0574856 ,
         -4.492253  ,  2.469741  ],
        ...,
        [ 1.5581981 , -1.4034519 , -0.6331787 , -0.57186514,
         

In [93]:
trainer.evaluate(test_tokenized_datasets_custom)

{'eval_loss': 1.9022464752197266,
 'eval_precision': 0.14285714285714285,
 'eval_recall': 0.08333333333333333,
 'eval_f1': 0.10526315789473685,
 'eval_accuracy': 0.3526315789473684,
 'eval_runtime': 0.0853,
 'eval_samples_per_second': 82.03,
 'eval_steps_per_second': 11.719,
 'epoch': 3.0}

*Save the Model*
---
Save the model to disk

In [94]:
# Save the model to disk
trainer.save_model('un-ner.model')

**PREDICTION MODULE**
---
Predict the **Unseen Data**


In [95]:
# Load the tokenizer and trained model from disk
tokenizer = AutoTokenizer.from_pretrained('./un-ner.model/')
model = AutoModelForTokenClassification.from_pretrained('./un-ner.model/', num_labels=len(label_list))

In [96]:
# Input unseen data
paragraph = '''
OBJECTIVE|A|reduce the future risk of stroke .
PARTICIPANTS|P|For women under age @ .
INTERVENTIONS|I|does the daily use of @ mg low-dose Aspirin .
INTERVENTION AND COMPARISON|I|compared with no usage of low-dose Aspirin .
'''

In [97]:
# Tokenize the paragraph
tokens = tokenizer(paragraph)
torch.tensor(tokens['input_ids']).unsqueeze(0).size()

torch.Size([1, 59])

In [98]:
# Predictions
predictions = model.forward(input_ids=torch.tensor(tokens['input_ids']).unsqueeze(0), attention_mask=torch.tensor(tokens['attention_mask']).unsqueeze(0))
predictions = torch.argmax(predictions.logits.squeeze(), axis=1)
predictions = [label_list[i] for i in predictions]

In [99]:
print(predictions)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-OUTCOME', 'I-OUTCOME', 'O', 'I-OUTCOME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [100]:
words = tokenizer.batch_decode(tokens['input_ids'])

In [101]:
# Store the token and predicted entities in a dataframe
prediction_results = pd.DataFrame({'ner': predictions, 'words': words})

In [102]:
# Print predicted samples
print(prediction_results)

          ner          words
0           O          [CLS]
1           O      objective
2           O              |
3           O              a
4           O              |
5           O         reduce
6           O            the
7           O         future
8           O           risk
9           O             of
10          O         stroke
11          O              .
12          O   participants
13          O              |
14          O              p
15          O              |
16          O            for
17          O          women
18          O          under
19          O            age
20          O              @
21          O              .
22          O  interventions
23          O              |
24          O              i
25          O              |
26          O           does
27          O            the
28          O          daily
29          O            use
30          O             of
31          O              @
32          O             mg
33          O 

*Export the results*
---
Export test results to **csv**

Select the **Words, Labels** and **Confidence scores** to export csv file

In [103]:
label_list = ['O','I-POPULATION', 'I-PARTICIPANT','I-INTERVENTION', 'I-COMPARISON','I-OUTCOME']
O_List=[]
I_POPULATION_List=[]
I_PARTICIPANT_List=[]
I_INTERVENTION_List=[]
I_COMPARISON_List=[]
I_OUTCOME_List=[]
for ner,word in zip(predictions,words):
    if ner == 'O':
        O_List.append(word)
    elif ner == 'I-POPULATION':
        I_POPULATION_List.append(word)
    elif ner == 'I-PARTICIPANT':
        I_PARTICIPANT_List.append(word)
    elif ner == 'I-INTERVENTION':
        I_INTERVENTION_List.append(word)
    elif ner == 'I-COMPARISON':
        I_COMPARISON_List.append(word)
    elif ner == 'I-OUTCOME':
        I_OUTCOME_List.append(word)

In [104]:
print('O       '+' '.join(O_List))
print("------------------------------------------")
print('I-POPULATION        ' + ' '.join(I_POPULATION_List))
print("------------------------------------------")
print('I-PARTICIPANT       '+' '.join(I_PARTICIPANT_List))
print("------------------------------------------")
print('I-INTERVENTION       '+' '.join(I_INTERVENTION_List))
print("------------------------------------------")
print('I-COMPARISON        ' + ' '.join(I_COMPARISON_List))
print("------------------------------------------")
print('I-OUTCOME        ' + ' '.join(I_OUTCOME_List))

O       [CLS] objective | a | reduce the future risk of stroke . participants | p | for women under age @ . interventions | i | does the daily use of @ mg low - dose as ##pi ##rin . comparison i | compared with no usage of low - dose as ##pi ##rin . [SEP]
------------------------------------------
I-POPULATION        
------------------------------------------
I-PARTICIPANT       
------------------------------------------
I-INTERVENTION       
------------------------------------------
I-COMPARISON        
------------------------------------------
I-OUTCOME        intervention and |


In [105]:
# Export results to csv
prediction_results.to_csv("prediction_results.csv")