In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# download libraries
!pip install transformers
!pip install jupyter
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 29.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 40.5MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████

In [16]:
# inject libraries
import json
import numpy as np
import tensorflow as tf
import seaborn as sn
import torch as tr
import pandas as pd
import torch.nn as nn

import random as rn
import transformers
from tqdm import tqdm, trange
from collections import Counter
from transformers import AutoTokenizer, AutoModel, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from sklearn.metrics import precision_score, f1_score, recall_score

In [4]:
label_dict = {
    'INHIBITOR': 0,
    'PART-OF': 1,
    'SUBSTRATE': 2,
    'ACTIVATOR': 3,
    'INDIRECT-DOWNREGULATOR': 4,
    'ANTAGONIST': 5,
    'INDIRECT-UPREGULATOR': 6,
    'AGONIST': 7,
    'DIRECT-REGULATOR': 8,
    'PRODUCT-OF': 9,
    'AGONIST-ACTIVATOR': 10,
    'AGONIST-INHIBITOR': 11, 
    'SUBSTRATE_PRODUCT-OF': 12
}

In [5]:
# For Google Colab
rd_abs_split = pd.read_csv("/content/drive/MyDrive/DrugProt/drugprot_training_splitabs.tsv", sep="\t", header=None)
rd_ent = pd.read_csv("/content/drive/MyDrive/DrugProt/drugprot_training_entities.tsv", sep="\t", header=None)
rd_rel = pd.read_csv("/content/drive/MyDrive/DrugProt/drugprot_training_relations.tsv", sep="\t", header=None)

rd_ent.columns = ["pubMedId", "entityId", "entityType", "sOffset", "eOffset", "entityText"]
rd_rel.columns = ["pubMedId", "relType", "Arg1", "Arg2"]

In [5]:
# For Local Run
rd_abs_split = pd.read_csv("./training/drugprot_training_splitabs.tsv", sep="\t", header=None)
rd_ent = pd.read_csv("./training/drugprot_training_entities.tsv", sep="\t", header=None)
rd_rel = pd.read_csv("./training/drugprot_training_relations.tsv", sep="\t", header=None)

rd_ent.columns = ["pubMedId", "entityId", "entityType", "sOffset", "eOffset", "entityText"]
rd_rel.columns = ["pubMedId", "relType", "Arg1", "Arg2"]

In [6]:
def unify_abs(dataframe):
    np_matrix = dataframe.to_numpy()
    np_matrix = np_matrix[np.logical_not(pd.isnull(np_matrix))]
    np_list = list(np_matrix)
    abs_dict = dict()

    pm_id_list = list()
    for ix, val in enumerate(np_list):
        try:
            if int(val):
                pm_id_list.append(ix)
        except Exception:
            pass

    for i in range(len(pm_id_list)-1):
        sentence_list = np_list[pm_id_list[i]:pm_id_list[i+1]]
        abs_dict[int(sentence_list[0])] = sentence_list[1:]

    sentence_list = np_list[pm_id_list[-1]:]
    abs_dict[int(sentence_list[0])] = sentence_list[1:]
    return abs_dict

In [7]:
def find_sentence_index(start, sentence_list):
    for ix in range(1,len(sentence_list)):
        if int(start) <= len(" ".join(sentence_list[:ix])):
            return ix
    return ix

def calculate_prev_sentence_length(sentence_count, sentence_list):
    return len(" ".join(sentence_list[:sentence_count-1]))+1

In [8]:
class BioBertModel(nn.Module):
    def __init__(self):
        super(BioBertModel, self).__init__()
        self.model = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")
        self.linear = nn.Linear(768, 13)
    
    def forward(self, tokens, masks=None):
        output = self.model(tokens, attention_mask=masks)[0]
        output = output[:,0,:]
        output = self.linear(output)
        return output

In [9]:
device = tr.device("cuda" if tr.cuda.is_available() else "cpu")

In [10]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1")

Downloading: 100%|██████████| 49.0/49.0 [00:00<00:00, 14.1kB/s]
Downloading: 100%|██████████| 462/462 [00:00<00:00, 194kB/s]
Downloading: 100%|██████████| 213k/213k [00:00<00:00, 1.15MB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 49.2kB/s]


In [11]:
def preprocess(abstract_sentence_dict, entity_frame, relation_frame):
    prepped_list = list()
    sentence_dict = {
        "input": None,
        "mask": None,
        "label": None,
        "arg1": None,
        "arg2": None
        }
        
    for pubMedId in abstract_sentence_dict.keys():
        chem_sentence_id_dict = dict()
        non_chem_sentence_id_dict = dict()
        sentence_tagged_dict = dict(list())
        entities = entity_frame.loc[entity_frame["pubMedId"] == pubMedId]
        sentence_list = abstract_sentence_dict[pubMedId]
        chem_entities = entities.loc[entities["entityType"] == "CHEMICAL"]
        non_chem_entities = entities.loc[entities["entityType"] != "CHEMICAL"]

        for chem_ix in range(chem_entities.shape[0]):
            sOffset = chem_entities.iloc[chem_ix]["sOffset"]
            entityId = chem_entities.iloc[chem_ix]["entityId"]
            sentence_count = find_sentence_index(sOffset, sentence_list)
            chem_sentence_id_dict[entityId] = {
                "sOffset": chem_entities.iloc[chem_ix]["sOffset"], 
                "eOffset": chem_entities.iloc[chem_ix]["eOffset"],
                "sen_ct": sentence_count
                }

        for nonchem_ix in range(non_chem_entities.shape[0]):
            sOffset = non_chem_entities.iloc[nonchem_ix]["sOffset"]
            entityId = non_chem_entities.iloc[nonchem_ix]["entityId"]
            sentence_count = find_sentence_index(sOffset, sentence_list)
            non_chem_sentence_id_dict[entityId] = {
                "sOffset": non_chem_entities.iloc[nonchem_ix]["sOffset"], 
                "eOffset": non_chem_entities.iloc[nonchem_ix]["eOffset"], 
                "sen_ct": sentence_count
                }

        relations = relation_frame.loc[relation_frame["pubMedId"] == pubMedId]
        for relation_ix in range(relations.shape[0]):
            arg1 = relations.iloc[relation_ix]["Arg1"].split(":")[-1]
            arg2 = relations.iloc[relation_ix]["Arg2"].split(":")[-1]
            relType = relations.iloc[relation_ix]["relType"]

            type_arg1 = entities.loc[entities["entityId"] == arg1]["entityType"].to_string(index=False).strip()
            type_arg2 = entities.loc[entities["entityId"] == arg2]["entityType"].to_string(index=False).strip()

            if type_arg1 == "CHEMICAL":
                s1Offset = chem_sentence_id_dict[arg1]["sOffset"]
                e1Offset = chem_sentence_id_dict[arg1]["eOffset"]
                sentence_ct = chem_sentence_id_dict[arg1]["sen_ct"]
                rel_sentence = abstract_sentence_dict[pubMedId][sentence_ct-1]
                rel_sentence_offset = calculate_prev_sentence_length(sentence_ct, abstract_sentence_dict[pubMedId])
            else:
                s1Offset = non_chem_sentence_id_dict[arg1]["sOffset"]
                e1Offset = non_chem_sentence_id_dict[arg1]["eOffset"]
                sentence_ct = non_chem_sentence_id_dict[arg1]["sen_ct"]
                rel_sentence = abstract_sentence_dict[pubMedId][sentence_ct-1]
                rel_sentence_offset = calculate_prev_sentence_length(sentence_ct, abstract_sentence_dict[pubMedId])

            if type_arg2 == "CHEMICAL":
                s2Offset = chem_sentence_id_dict[arg2]["sOffset"]
                e2Offset = chem_sentence_id_dict[arg2]["eOffset"]
            else:
                s2Offset = non_chem_sentence_id_dict[arg2]["sOffset"]
                e2Offset = non_chem_sentence_id_dict[arg2]["eOffset"]

            if s2Offset < s1Offset:
                pre_arg2_text = tokenizer.encode(rel_sentence[:s2Offset-rel_sentence_offset], add_special_tokens=False)
                arg2_text = tokenizer.encode(rel_sentence[s2Offset-rel_sentence_offset:e2Offset-rel_sentence_offset], add_special_tokens=False)
                post_arg2_pre_arg1_text = tokenizer.encode(rel_sentence[e2Offset-rel_sentence_offset:s1Offset-rel_sentence_offset], add_special_tokens=False)
                arg1_text = tokenizer.encode(rel_sentence[s1Offset-rel_sentence_offset:e1Offset-rel_sentence_offset], add_special_tokens=False)
                post_arg1_text = tokenizer.encode(rel_sentence[e1Offset-rel_sentence_offset:], add_special_tokens=False)
                id_sentence = [101] + pre_arg2_text + [3] + arg2_text + [4] + post_arg2_pre_arg1_text + [1] + arg1_text + [2] + post_arg1_text + [102]
            else:
                pre_arg1_text = tokenizer.encode(rel_sentence[:s1Offset-rel_sentence_offset], add_special_tokens=False)
                arg1_text = tokenizer.encode(rel_sentence[s1Offset-rel_sentence_offset:e1Offset-rel_sentence_offset], add_special_tokens=False)
                post_arg1_pre_arg2_text = tokenizer.encode(rel_sentence[e1Offset-rel_sentence_offset:s2Offset-rel_sentence_offset], add_special_tokens=False)
                arg2_text = tokenizer.encode(rel_sentence[s2Offset-rel_sentence_offset:e2Offset-rel_sentence_offset], add_special_tokens=False)
                post_arg2_text = tokenizer.encode(rel_sentence[e2Offset-rel_sentence_offset:], add_special_tokens=False)
                id_sentence = [101] + pre_arg1_text + [1] + arg1_text + [2] + post_arg1_pre_arg2_text + [3] + arg2_text + [4] + post_arg2_text + [102]

            if len(id_sentence) < 512:
                sentence_dict = {
                    "input": id_sentence + [0] * (512 - len(id_sentence)),
                    "mask": [1] * len(id_sentence) + [0] * (512 - len(id_sentence)),
                    "label": label_dict[relType],
                    "arg1": arg1,
                    "arg2": arg2
                }
                prepped_list.append(sentence_dict)

    return prepped_list

In [12]:
abstract_sentence_dict = unify_abs(rd_abs_split)
prepped_sentence_list = preprocess(abstract_sentence_dict, rd_ent, rd_rel)

Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors


In [14]:
# Add shuffle
rn.seed(2021)
rn.shuffle(prepped_sentence_list)

training_size = round(len(prepped_sentence_list)*0.8)
training_sentences = prepped_sentence_list[:training_size]
test_sentences = prepped_sentence_list[training_size:]

In [14]:
BATCH_SIZE = 8
EPOCHS = 5

train_dataset = TensorDataset(tr.tensor([sentence["input"] for sentence in training_sentences]).to(device), tr.tensor([sentence["mask"] for sentence in training_sentences]).to(device), tr.tensor([sentence["label"] for sentence in training_sentences]).to(device))
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(tr.tensor([sentence["input"] for sentence in test_sentences]).to(device), tr.tensor([sentence["mask"] for sentence in test_sentences]).to(device), tr.tensor([sentence["label"] for sentence in test_sentences]).to(device))
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [15]:
bioBERT_model = BioBertModel()
bioBERT_model = bioBERT_model.to(device)
optimizer = tr.optim.Adam(params=bioBERT_model.parameters(), lr=1e-5, weight_decay=0.03)
tr.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433286112.0, style=ProgressStyle(descri…




In [18]:
bioBERT_model.train()
loss_func = nn.CrossEntropyLoss()
for epoch_num in trange(EPOCHS, desc="Epoch"):
    train_loss = 0.0
    for step_num, batch_data in enumerate(train_dataloader):
        inputs, masks, labels = batch_data
        output = bioBERT_model.forward(inputs, masks)
        batch_loss = loss_func(output, labels)
        train_loss += batch_loss.item()
        
        batch_loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()

Epoch: 100%|██████████| 5/5 [1:56:40<00:00, 1400.01s/it]


In [22]:
tr.save({'model_state_dict': bioBERT_model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()}, "/content/model.pt")

In [32]:
bioBERT_model.eval()
all_predicted = []
true_labels = []
with tr.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        inputs, masks, labels = tuple(t for t in batch_data)

        outputs = bioBERT_model.forward(inputs, masks)
        _, predicted = tr.max(outputs.data, 1)
        predicted = predicted.tolist()
        
        all_predicted += predicted
        true_labels += labels.tolist()

In [33]:
print(classification_report(true_labels, all_predicted))

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1121
           1       0.58      0.80      0.67       148
           2       0.63      0.76      0.69       416
           3       0.68      0.70      0.69       244
           4       0.67      0.73      0.70       266
           5       0.89      0.88      0.89       204
           6       0.77      0.66      0.71       244
           7       0.83      0.84      0.83       144
           8       0.84      0.72      0.77       504
           9       0.66      0.54      0.60       146
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00        10

    accuracy                           0.77      3455
   macro avg       0.57      0.58      0.57      3455
weighted avg       0.77      0.77      0.77      3455



  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
Counter(all_predicted)

Counter({0: 1094,
         1: 206,
         2: 502,
         3: 254,
         4: 289,
         5: 202,
         6: 210,
         7: 146,
         8: 433,
         9: 119})

In [None]:
# Run separetely
abstract_sentence_dict = unify_abs(rd_abs_split)
prepped_sentence_list = preprocess(abstract_sentence_dict, rd_ent, rd_rel)

rn.seed(2021)
rn.shuffle(prepped_sentence_list)

training_size = round(len(prepped_sentence_list)*0.8)
training_sentences = prepped_sentence_list[:training_size]
test_sentences = prepped_sentence_list[training_size:]

BATCH_SIZE = 8
EPOCHS = 5

train_dataset = TensorDataset(tr.tensor([sentence["input"] for sentence in training_sentences]).to(device), tr.tensor([sentence["mask"] for sentence in training_sentences]).to(device), tr.tensor([sentence["label"] for sentence in training_sentences]).to(device))
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(tr.tensor([sentence["input"] for sentence in test_sentences]).to(device), tr.tensor([sentence["mask"] for sentence in test_sentences]).to(device), tr.tensor([sentence["label"] for sentence in test_sentences]).to(device))
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

for model_instance in range(10):
    bioBERT_model = BioBertModel()
    bioBERT_model = bioBERT_model.to(device)
    optimizer = tr.optim.Adam(params=bioBERT_model.parameters(), lr=1e-5, weight_decay=0.03)
    tr.cuda.empty_cache()

    bioBERT_model.train()
    loss_func = nn.CrossEntropyLoss()
    for epoch_num in trange(EPOCHS, desc="Epoch"):
        train_loss = 0.0
        for step_num, batch_data in enumerate(train_dataloader):
            inputs, masks, labels = batch_data
            output = bioBERT_model.forward(inputs, masks)
            batch_loss = loss_func(output, labels)
            train_loss += batch_loss.item()
          
            batch_loss.backward()
            
            optimizer.step()
            optimizer.zero_grad()

    bioBERT_model.eval()
    all_predicted = []
    true_labels = []
    with tr.no_grad():
        for step_num, batch_data in enumerate(test_dataloader):

            inputs, masks, labels = batch_data

            outputs = bioBERT_model.forward(inputs, masks)
            _, predicted = tr.max(outputs.data, 1)
            predicted = predicted.tolist()
            
            all_predicted += predicted
            true_labels += labels.tolist()

    print("Iteration: " + str(model_instance) + " Results")
    print(classification_report(true_labels, all_predicted))

In [None]:
# Hyper-parameter tuned and randomness decreased version. RUN SEPARATELY!
abstract_sentence_dict = unify_abs(rd_abs_split)
prepped_sentence_list = preprocess(abstract_sentence_dict, rd_ent, rd_rel)

rn.seed(2021)
rn.shuffle(prepped_sentence_list)

training_size = round(len(prepped_sentence_list)*0.8)
training_sentences = prepped_sentence_list[:training_size]
test_sentences = prepped_sentence_list[training_size:]

BATCH_SIZE = 8
EPOCHS = 5

train_dataset = TensorDataset(
    tr.tensor([sentence["input"] for sentence in training_sentences]).to(device), 
    tr.tensor([sentence["mask"] for sentence in training_sentences]).to(device), 
    tr.tensor([sentence["label"] for sentence in training_sentences]).to(device)
    )
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(
    train_dataset, 
    sampler=train_sampler, 
    batch_size=BATCH_SIZE
    )

test_dataset = TensorDataset(
    tr.tensor([sentence["input"] for sentence in test_sentences]).to(device), 
    tr.tensor([sentence["mask"] for sentence in test_sentences]).to(device), 
    tr.tensor([sentence["label"] for sentence in test_sentences]).to(device)
    )
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(
    test_dataset, 
    sampler=test_sampler, 
    batch_size=BATCH_SIZE
    )

lr_list = [1e-5, 3e-5, 5e-5]
wd_list = [1e-2, 3e-2, 5e-2]

fout = open("/content/eval.json", "w")

for lr in lr_list:
    for wd in wd_list:
        for model_instance in range(10): # to overcome randomness, we've run the model 10 times and get the avg score
            bioBERT_model = BioBertModel()
            bioBERT_model = bioBERT_model.to(device)
            optimizer = tr.optim.Adam(params=bioBERT_model.parameters(), lr=lr, weight_decay=wd)
            tr.cuda.empty_cache()

            bioBERT_model.train()
            loss_func = nn.CrossEntropyLoss()
            for epoch_num in trange(EPOCHS, desc="Epoch"):
                train_loss = 0.0
                for step_num, batch_data in enumerate(train_dataloader):
                    inputs, masks, labels = batch_data
                    outputs = bioBERT_model.forward(inputs, masks)
                    batch_loss = loss_func(outputs, labels)
                    train_loss += batch_loss.item()

                    batch_loss.backward()

                    optimizer.step()
                    optimizer.zero_grad()

            bioBERT_model.eval()
            all_predicted = []
            true_labels = []
            with tr.no_grad():
                for step_num, batch_data in enumerate(test_dataloader):

                    inputs, masks, labels = batch_data
                    outputs = bioBERT_model.forward(inputs, masks)
                    _, predicted = tr.max(outputs.data, 1)
                    predicted = predicted.tolist()

                    all_predicted += predicted
                    true_labels += labels.tolist()

                ps = precision_score(true_labels, all_predicted, average="micro")
                rs = recall_score(true_labels, all_predicted, average="micro")
                f1s = f1_score(true_labels, all_predicted, average="micro")

                json.dump({ "lr": lr, "wd": wd, "model_instance": model_instance, 
                          "all_predicted": all_predicted, 
                          "true_labels": true_labels, 
                          "ps": ps, "rs": rs, "f1s": f1s }, fout)
                fout.flush()

fout.close()