In [42]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from kaggle_secrets import UserSecretsClient
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
import wandb
import torch
import spacy
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from spacy import displacy
nlp = spacy.blank('en')
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [2]:
try:
    user_secrets = UserSecretsClient()
    secret_value_0 = user_secrets.get_secret("wandb_api")
    wandb.login(key=secret_value_0)
    anony=None
except:
    anony = "must"
    
wandb.init(project="PICO Evidence Summarization",name=f"15.PICO Evidence Summarization BioBert-Base-Cased-v1.2-Run 15")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [3]:
df=pd.read_csv('/kaggle/input/biobert-custom-dataset/PICO_Tags_Heart_Disease_Semi_Processed.csv')
print(df.head())
df_custom=pd.read_csv('/kaggle/input/biobert-custom-dataset/PICO_custom_dataset.csv')
print(df_custom.head())

                                           Sentences  \
0  methods total 107 women clinical indication in...   
1  patients her2 hr positive mbc labc randomized ...   
2      statistical significance defined p-value 0 05   
3  exclusion criteria included nephrotic proteinu...   
4  radial artery cannulation performed summarise ...   

                                            Tags  
0                 Population/Problem (element P)  
1  Intervention and Comparison (element I and C)  
2         Not Relevant to the Evidence (Label N)  
3                 Population/Problem (element P)  
4  Intervention and Comparison (element I and C)  
                                           Sentences  \
0                            In school-age children    
1  what is the effect of a school-based physical ...   
2  on a reduction in the incidence of childhood o...   
3                      compared with no intervention   
4                            In high school children   

                    

In [4]:
print(df['Tags'].value_counts())
print(df_custom['Tags'].value_counts())

Intervention and Comparison (element I and C)    760
Outcome (element O)                              653
Not Relevant to the Evidence (Label N)           648
Population/Problem (element P)                   636
Name: Tags, dtype: int64
Intervention and Comparison (element I and C)    20
Population/Problem (element P)                   14
Outcome (element O)                              12
Name: Tags, dtype: int64


In [5]:
evidence_labels = df.Tags.unique()

evidence_dict = {}
for index, evidence_labels in enumerate(evidence_labels):
    evidence_dict[evidence_labels] = index
evidence_dict

{'Population/Problem (element P)': 0,
 'Intervention and Comparison (element I and C)': 1,
 'Not Relevant to the Evidence (Label N)': 2,
 'Outcome (element O)': 3}

In [6]:
df['Tags Mapped'] = df.Tags.map(evidence_dict)
print(df)
df_custom['Tags Mapped'] = df_custom.Tags.map(evidence_dict)
print(df_custom)

                                              Sentences  \
0     methods total 107 women clinical indication in...   
1     patients her2 hr positive mbc labc randomized ...   
2         statistical significance defined p-value 0 05   
3     exclusion criteria included nephrotic proteinu...   
4     radial artery cannulation performed summarise ...   
...                                                 ...   
2692  crying grimacing reduced 82% 65% control infan...   
2693  design setting tobacco exercise diet messages ...   
2694  twenty-four-h holter recordings two-channel re...   
2695  patients assigned bmsc control restricted rand...   
2696  neonates assigned randomized manner 1 incubato...   

                                               Tags  Tags Mapped  
0                    Population/Problem (element P)            0  
1     Intervention and Comparison (element I and C)            1  
2            Not Relevant to the Evidence (Label N)            2  
3                    Po

In [7]:
X_train, X_val, y_train, y_val = train_test_split(df.index.values,\
                                                  df['Tags Mapped'].values,\
                                                  test_size=0.15,\
                                                  random_state=42,\
                                                  stratify=df['Tags Mapped'].values)

In [8]:
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'


# df_custom['data_type'] = ['not_set']*df_custom.shape[0]

# df_custom.loc[df_custom, 'data_type'] = 'val'

In [9]:
df.groupby(['Tags', 'Tags Mapped', 'data_type']).count()
df_custom.groupby(['Tags', 'Tags Mapped']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sentences
Tags,Tags Mapped,Unnamed: 2_level_1
Intervention and Comparison (element I and C),1,20
Outcome (element O),3,12
Population/Problem (element P),0,14


In [10]:
print(df.head())
print(df_custom.head())

                                           Sentences  \
0  methods total 107 women clinical indication in...   
1  patients her2 hr positive mbc labc randomized ...   
2      statistical significance defined p-value 0 05   
3  exclusion criteria included nephrotic proteinu...   
4  radial artery cannulation performed summarise ...   

                                            Tags  Tags Mapped data_type  
0                 Population/Problem (element P)            0     train  
1  Intervention and Comparison (element I and C)            1       val  
2         Not Relevant to the Evidence (Label N)            2     train  
3                 Population/Problem (element P)            0       val  
4  Intervention and Comparison (element I and C)            1     train  
                                           Sentences  \
0                            In school-age children    
1  what is the effect of a school-based physical ...   
2  on a reduction in the incidence of childhood o..

In [11]:
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2', do_lower_case=True,truncation=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

In [12]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].Sentences.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    truncation=True,
    padding='longest', 
    max_length=512, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Sentences.values, 
    add_special_tokens=True, 
    truncation=True,
    return_attention_mask=True, 
    padding='longest', 
    max_length=512, 
    return_tensors='pt'
)

encoded_data_val_custom = tokenizer.batch_encode_plus(
    df_custom.Sentences.values, 
    add_special_tokens=True, 
    truncation=True,
    return_attention_mask=True, 
    padding='longest', 
    max_length=512, 
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
tags_train = torch.tensor(df[df.data_type=='train']['Tags Mapped'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
tags_val = torch.tensor(df[df.data_type=='val']['Tags Mapped'].values)

input_ids_val_custom = encoded_data_val_custom['input_ids']
attention_masks_val_custom = encoded_data_val_custom['attention_mask']
tags_val_custom = torch.tensor(df_custom['Tags Mapped'].values)

In [13]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, tags_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, tags_val)
dataset_val_custom = TensorDataset(input_ids_val_custom, attention_masks_val_custom, tags_val_custom)

In [14]:
print(len(dataset_train), len(dataset_val), len(dataset_val_custom))

2292 405 46


In [15]:
model = BertForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.2",
                                                      num_labels=len(evidence_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.2 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [16]:
batch_size = 32 #Defining Batch Size on which model has to be trained

# Converting tensors Dataset to dataloaders so that model can be trained
dataloader_train = DataLoader(dataset_train,sampler=RandomSampler(dataset_train), batch_size=batch_size) 

dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size)

dataloader_validation_custom = DataLoader(dataset_val_custom, sampler=SequentialSampler(dataset_val_custom), batch_size=batch_size)

In [17]:
optimizer = AdamW(model.parameters(),lr=8e-7, eps=1e-8)

In [18]:
epochs = 16
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,num_training_steps=len(dataloader_train)*epochs)

In [40]:
def f1_score_func(preds, tags):
    preds_flat = np.argmax(preds, axis=1).flatten()
    tags_flat = tags.flatten()
    return f1_score(tags_flat, preds_flat, average='weighted')

def recall_func(preds, tags):
    preds_flat = np.argmax(preds, axis=1).flatten()
    tags_flat = tags.flatten()
    return recall_score(tags_flat, preds_flat, average='weighted')

def accuracy_func(preds, tags):
    preds_flat = np.argmax(preds, axis=1).flatten()
    tags_flat = tags.flatten()
    return accuracy_score(tags_flat, preds_flat)

def precision_func(preds, tags):
    preds_flat = np.argmax(preds, axis=1).flatten()
    tags_flat = tags.flatten()
    return precision_score(tags_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, tags):
    evidence_dict_inverse = {v: k for k, v in evidence_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    tags_flat = tags.flatten()

    for tag in np.unique(tags_flat):
        y_preds = preds_flat[tags_flat==tag]
        y_true = tags_flat[tags_flat==tag]
        print(f'PICO Evidence Class: {evidence_dict_inverse[tag]}')
        print(f'Accuracy: {len(y_preds[y_preds==tag])}/{len(y_true)}\n')

In [20]:
seed_val = 199
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [21]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [22]:
!mkdir models

In [23]:
loss_train_avg_list = []
val_loss_list = []
val_f1_list = []

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
    torch.save(model.state_dict(), f'models/finetuned_BioBERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    loss_train_avg_list.append(loss_train_avg)
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    val_loss_list.append(val_loss)
    tqdm.write(f'F1 validation Score (Weighted): {val_f1}')
    val_f1_list.append(val_f1)
    wandb.log({"Training loss":loss_train_avg, "Validation loss":val_loss,"F1 validation Score (Weighted)":val_f1,})

  0%|          | 0/16 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.4213563071356878
Validation loss: 1.387370879833515
F1 validation Score (Weighted): 0.19135348840455305


Epoch 2:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.3687110195557277
Validation loss: 1.3422128237210786
F1 validation Score (Weighted): 0.3275236365625524


Epoch 3:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 3
Training loss: 1.3318832003408008
Validation loss: 1.2969790880496685
F1 validation Score (Weighted): 0.49838786037125793


Epoch 4:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 4
Training loss: 1.2867270724640951
Validation loss: 1.2481159705382128
F1 validation Score (Weighted): 0.5574553872985851


Epoch 5:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 5
Training loss: 1.2337967289818659
Validation loss: 1.1917434197205763
F1 validation Score (Weighted): 0.6135720886140984


Epoch 6:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 6
Training loss: 1.1740543627076678
Validation loss: 1.1203778431965754
F1 validation Score (Weighted): 0.637691239121041


Epoch 7:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 7
Training loss: 1.1128937866952684
Validation loss: 1.0615102327786958
F1 validation Score (Weighted): 0.68257264724503


Epoch 8:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 8
Training loss: 1.0596883073449135
Validation loss: 1.0100952157607446
F1 validation Score (Weighted): 0.7191297635927955


Epoch 9:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 9
Training loss: 1.0182546178499858
Validation loss: 0.9707092046737671
F1 validation Score (Weighted): 0.7360402912268891


Epoch 10:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.9807980424828
Validation loss: 0.9376579568936274
F1 validation Score (Weighted): 0.7488324073165555


Epoch 11:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 11
Training loss: 0.9538044532140096
Validation loss: 0.9091571248494662
F1 validation Score (Weighted): 0.7509971137374316


Epoch 12:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 12
Training loss: 0.9259516853425238
Validation loss: 0.8880928342158978
F1 validation Score (Weighted): 0.756024376518558


Epoch 13:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 13
Training loss: 0.9088057196802564
Validation loss: 0.872618161714994
F1 validation Score (Weighted): 0.7635202571656033


Epoch 14:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 14
Training loss: 0.8923897966742516
Validation loss: 0.8618835027401264
F1 validation Score (Weighted): 0.7660102500432214


Epoch 15:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 15
Training loss: 0.8834636708100637
Validation loss: 0.8554420929688674
F1 validation Score (Weighted): 0.768380534884315


Epoch 16:   0%|          | 0/72 [00:00<?, ?it/s]


Epoch 16
Training loss: 0.8786453612976604
Validation loss: 0.8533623997981732
F1 validation Score (Weighted): 0.768380534884315


In [24]:
model.load_state_dict(torch.load('models/finetuned_BioBERT_epoch_8.model', map_location=torch.device('cuda')))

<All keys matched successfully>

In [25]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
F1 validation Score (Weighted),▁▃▅▅▆▆▇▇████████
Training loss,█▇▇▆▆▅▄▃▃▂▂▂▁▁▁▁
Validation loss,█▇▇▆▅▅▄▃▃▂▂▁▁▁▁▁

0,1
F1 validation Score (Weighted),0.76838
Training loss,0.87865
Validation loss,0.85336


In [26]:
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

PICO Evidence Class: Population/Problem (element P)
Accuracy: 66/96

PICO Evidence Class: Intervention and Comparison (element I and C)
Accuracy: 90/114

PICO Evidence Class: Not Relevant to the Evidence (Label N)
Accuracy: 68/97

PICO Evidence Class: Outcome (element O)
Accuracy: 67/98



In [28]:
val_loss_custom, predictions_custom, true_vals_custom = evaluate(dataloader_validation_custom)
accuracy_per_class(predictions_custom, true_vals_custom)

PICO Evidence Class: Population/Problem (element P)
Accuracy: 1/14

PICO Evidence Class: Intervention and Comparison (element I and C)
Accuracy: 0/20

PICO Evidence Class: Outcome (element O)
Accuracy: 0/12



In [43]:
val_f1_custom = f1_score_func(predictions_custom, true_vals_custom)
tqdm.write(f'F1 custom validation Score (Weighted): {val_f1_custom}')

val_recall_custom = recall_func(predictions_custom, true_vals_custom)
tqdm.write(f'Recall custom validation Score (Weighted): {val_recall_custom}')

val_accuracy_custom = accuracy_func(predictions_custom, true_vals_custom)
tqdm.write(f'Accuracy custom validation Score (Weighted): {val_accuracy_custom}')

val_precision_custom = precision_func(predictions_custom, true_vals_custom)
tqdm.write(f'Precision custom validation Score (Weighted): {val_precision_custom}')

F1 custom validation Score (Weighted): 0.04057971014492754
Recall custom validation Score (Weighted): 0.021739130434782608
Accuracy custom validation Score (Weighted): 0.021739130434782608
Precision custom validation Score (Weighted): 0.30434782608695654


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
