In [None]:
!pip install pytorch-pretrained-bert
!pip install transformers

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 22.0MB/s eta 0:00:01[K     |█████▎                          | 20kB 1.8MB/s eta 0:00:01[K     |████████                        | 30kB 2.4MB/s eta 0:00:01[K     |██████████▋                     | 40kB 2.7MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.1MB/s eta 0:00:01[K     |███████████████▉                | 61kB 2.4MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 2.6MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 2.8MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 3.1MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 2.9MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 2.9MB/s eta 0:00:01[K     |██████████████████████

In [None]:
import os
import re
import csv

import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [None]:
# Ask TensorFlow which GPU (if any) is visible to this runtime.
device_name = tf.test.gpu_device_name()

# Fail fast when the expected GPU is missing; otherwise report it.
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
if torch.cuda.is_available():

  # Tell PyTorch to use the GPU.
  device = torch.device('cuda')

  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
  # Fall back to the CPU so that `device` is always defined after this cell.
  device = torch.device('cpu')
  print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


# Load BioBERT

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD" -O biobert_weights && rm -rf /tmp/cookies.txt

--2020-08-23 17:07:05--  https://docs.google.com/uc?export=download&confirm=7Tiu&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD
Resolving docs.google.com (docs.google.com)... 173.194.216.113, 173.194.216.102, 173.194.216.101, ...
Connecting to docs.google.com (docs.google.com)|173.194.216.113|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-0g-4o-docs.googleusercontent.com/docs/securesc/2o8j02so6bs6kuq0d29ft32lvvm3jhvr/rln557m28pdgc0s7qnt08jipjlpl4a8d/1598202375000/13799006341648886493/10210308256775560418Z/1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD?e=download [following]
--2020-08-23 17:07:05--  https://doc-0g-4o-docs.googleusercontent.com/docs/securesc/2o8j02so6bs6kuq0d29ft32lvvm3jhvr/rln557m28pdgc0s7qnt08jipjlpl4a8d/1598202375000/13799006341648886493/10210308256775560418Z/1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD?e=download
Resolving doc-0g-4o-docs.googleusercontent.com (doc-0g-4o-docs.googleusercontent.com)... 172.217.193.132, 2607:f8b0:400c:c03::84
Conne

In [None]:
!tar -xzf biobert_weights
!ls biobert_v1.1_pubmed/

bert_config.json			model.ckpt-1000000.index  vocab.txt
model.ckpt-1000000.data-00000-of-00001	model.ckpt-1000000.meta


In [None]:
!transformers-cli convert --model_type bert --tf_checkpoint biobert_v1.1_pubmed/model.ckpt-1000000 --config biobert_v1.1_pubmed/bert_config.json --pytorch_dump_output biobert_v1.1_pubmed/pytorch_model.bin

2020-08-23 17:07:19.818170: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Building PyTorch model from configuration: BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

INFO:transformers.modeling_bert:Converting TensorFlow checkpoint from /content/biobert_v1.1_pubmed/model.ckpt-1000000
INFO:transformers.modeling_bert:Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]
INFO:transformers.modeling_bert:Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]
INFO:transformers.modeling_bert:Loading TF weight bert/embedding

In [None]:
!ls biobert_v1.1_pubmed/
!mv biobert_v1.1_pubmed/bert_config.json biobert_v1.1_pubmed/config.json
!ls biobert_v1.1_pubmed/

bert_config.json			model.ckpt-1000000.meta
model.ckpt-1000000.data-00000-of-00001	pytorch_model.bin
model.ckpt-1000000.index		vocab.txt
config.json				model.ckpt-1000000.meta
model.ckpt-1000000.data-00000-of-00001	pytorch_model.bin
model.ckpt-1000000.index		vocab.txt


In [None]:
!ls 

biobert_v1.1_pubmed  biobert_weights  sample_data


# Data

In [None]:
# Mount Google Drive under /content/gdrive; the data paths used below live there.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [None]:
# Fixed sequence length: token-id and tag sequences are padded/truncated to this many positions.
MAX_LEN = 75
# Mini-batch size used by the training and validation DataLoaders.
bs = 32
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Case-preserving tokenizer built from the vocabulary file of the downloaded BioBERT checkpoint.
tokenizer = BertTokenizer(vocab_file='biobert_v1.1_pubmed/vocab.txt', do_lower_case=False)

In [None]:
# Load the tag inventory stored on Drive; the 'tags' column holds the label strings.
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/bio_ner/tags_small.csv')
tag_values = data['tags'].values
# Number of rows in the tag file; later passed to biobert_ner as the classifier output size.
# NOTE(review): tag_values is rebuilt from the training data further down -- confirm that
# both label sets have the same size, otherwise label ids and model outputs will not line up.
vocab_len = len(tag_values)
print('Vocab length:',vocab_len)

Vocab length: 74


In [None]:
# Write a copy of the tag list to the working directory and read it back to preview it.
df_tags = pd.DataFrame({'tags':tag_values})
df_tags.to_csv('tags_small.csv',index=False)
df = pd.read_csv('tags_small.csv')
# Last expression of the cell: shows the first rows of the round-tripped file.
df.head()

Unnamed: 0,tags
0,I-Cellular_component
1,E-Gene_or_gene_product
2,I-Organism_subdivision
3,I-Organism_substance
4,B-Gene_or_gene_product


In [None]:
def sent_fetch(path):
    """Read a two-column, tab-separated token/tag file into sentences.

    Every non-empty row holds a token in column 0 and its tag in column 1;
    an empty row closes the current sentence.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A ``(sentences, tags)`` tuple of parallel lists. ``sentences[i]`` is
        the list of tokens of the i-th sentence and ``tags[i]`` the list of
        tags for those tokens.

    Raises:
        ValueError: If a non-empty row has fewer than two columns.
    """
    sentences = []
    tags = []
    sent = []
    tag = []
    # newline='' is what the csv module asks for. QUOTE_NONE keeps a token
    # that starts with a double quote as a plain field instead of letting it
    # open a quoted field that swallows the following rows.
    with open(path, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                # Blank row: close the running sentence. Repeated blank rows
                # do not produce empty sentences.
                if sent:
                    sentences.append(sent)
                    tags.append(tag)
                    sent = []
                    tag = []
                continue
            if len(row) < 2:
                raise ValueError(
                    'Expected "token<TAB>tag" at {}:{}, got {!r}'.format(
                        path, reader.line_num, row))
            sent.append(row[0])
            tag.append(row[1])
    # Files that do not end with a blank row still contribute their last sentence.
    if sent:
        sentences.append(sent)
        tags.append(tag)
    return sentences, tags


def tokenize_and_label(sent, text_labels):
  """Split each word with the notebook-level tokenizer and repeat its label per piece.

  Args:
    sent: Sequence of words.
    text_labels: Sequence of labels, one per word in ``sent``.

  Returns:
    ``(pieces, piece_labels)``: the concatenated tokenizer output for all
    words and a list of the same length in which every piece carries the
    label of the word it came from.
  """
  pieces = []
  piece_labels = []
  for word, label in zip(sent, text_labels):
    word_pieces = tokenizer.tokenize(word)
    pieces += word_pieces
    # One copy of the word's label for every piece the word was split into.
    piece_labels += [label] * len(word_pieces)
  return pieces, piece_labels

In [None]:
# Walk the BioNLP data directory and collect sentences/tags from every file named 'train.tsv'.
rootdir = '/content/gdrive/My Drive/Colab Notebooks/Data/BioNLP'
sentences = []
tags = []
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file == 'train.tsv':
            path_ = os.path.join(subdir, file)
            sent, tag =sent_fetch(path_)
            sentences.extend(sent)
            tags.extend(tag)
            
# Keep only the first 20000 sentences (in the order the directory walk produced them).
sentences = sentences[0:20000]
tags = tags[0:20000]

In [None]:
# One (pieces, piece_labels) pair per sentence, pairing sentences with their tag lists in order.
tokenized_texts_and_labels = list(map(tokenize_and_label, sentences, tags))

In [None]:
# Separate the (pieces, piece_labels) pairs into two parallel lists.
tokenized_texts = [pieces for pieces, _ in tokenized_texts_and_labels]
labels = [piece_labels for _, piece_labels in tokenized_texts_and_labels]

In [None]:
len(tokenized_texts)

20000

In [None]:
# Convert tokens to vocabulary ids and force every sequence to MAX_LEN positions:
# longer sequences are cut at the end, shorter ones are right-padded with id 0.
# NOTE(review): the sequences are passed on without [CLS]/[SEP] markers -- confirm this is intended.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
for txt in tokenized_texts:
    print(txt)
    break

['The', 'C', '##d', '##c', '##6', 'protein', 'is', 'u', '##bi', '##qui', '##tina', '##ted', 'in', 'v', '##ivo', 'for', 'pro', '##te', '##oly', '##sis', 'in', 'Sa', '##cc', '##har', '##omy', '##ces', 'c', '##ere', '##vis', '##iae', '.']


In [None]:
len(tokenized_texts[9999])

64

In [None]:
import itertools

# Collect every distinct tag seen in the training sentences. The set is sorted
# because plain set iteration order for strings can change between kernel
# sessions, which would silently change the label <-> index mapping of a saved model.
tag_values = sorted(set(itertools.chain.from_iterable(tags)))
# Extra label used for padded positions; it always gets the last index.
tag_values.append("PAD")

tag2idx = {t: i for i,t in enumerate(tag_values) }

In [None]:
# Encode the per-piece labels as ids and pad/truncate them to MAX_LEN, mirroring input_ids;
# padded positions receive the id of the "PAD" label.
# Note: this rebinds `tags` (until now the per-word string tags) to the padded id matrix,
# so cells above that read `tags` will see different data if re-run after this one.
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
# 1.0 for every position whose token id is non-zero, 0.0 for positions holding the padding id 0.
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

In [None]:
# Split ids, tags and masks in a single call so the three arrays are guaranteed
# to be partitioned with the same indices (90% train / 10% validation).
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks) = train_test_split(input_ids, tags, attention_masks,
                                         random_state=2018, test_size=0.1)

# Convert every split to a tensor for use in TensorDataset.
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [None]:
# Training batches: (input ids, attention mask, tag ids), drawn in random order.
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

# Validation batches: same layout, iterated sequentially.
valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

# Model

In [None]:
# Load the converted BioBERT encoder from the local checkpoint directory.
# NOTE(review): `model` is rebound to a biobert_ner instance further down, so this
# instance appears to be unused -- confirm before removing.
model = BertModel.from_pretrained('biobert_v1.1_pubmed')

In [None]:
# Sanity check: move the first training batch to the device, print its tensor sizes, then stop.
for step,batch in enumerate(train_dataloader):
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    print(b_input_ids.size(), b_input_mask.size(), b_labels.size())
    break

torch.Size([32, 75]) torch.Size([32, 75]) torch.Size([32, 75])


In [None]:
from collections import OrderedDict

config = BertConfig.from_json_file('biobert_v1.1_pubmed/config.json')
# Full converted checkpoint, loaded straight onto the target device.
tmp_d = torch.load('biobert_v1.1_pubmed/pytorch_model.bin',map_location=device)

# Keep only the encoder weights, i.e. the entries stored under the "bert."
# prefix, and strip that prefix so the names match a bare BertModel.
# Selecting by prefix avoids depending on how many entries the checkpoint has
# or on the order in which they were saved.
bert_prefix = 'bert.'
state_dict = OrderedDict(
    (key[len(bert_prefix):], tensor)
    for key, tensor in tmp_d.items()
    if key.startswith(bert_prefix)
)

In [None]:
class biobert_ner(nn.Module):
  """Token tagger: a BERT encoder followed by dropout and a linear output layer.

  Args:
    vocab_len: Size of the linear output layer (number of classes scored per position).
    config: Configuration object used to build the encoder.
    state_dict: Weights loaded into the encoder right after it is built.
  """

  def __init__(self, vocab_len, config, state_dict):
    super().__init__()
    encoder = BertModel(config)
    encoder.load_state_dict(state_dict)
    self.bert = encoder
    self.dropout = nn.Dropout(p=0.3)
    self.output = nn.Linear(encoder.config.hidden_size, vocab_len)
    # Not called by forward(); retained so the module keeps the same attributes.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Score every position of the input.

    Returns:
      ``(scores, predictions)`` where ``scores`` is the output of the linear
      layer and ``predictions`` is its argmax over the last dimension.
    """
    # The first encoder output is indexed with [-1]; presumably this selects
    # the final layer's hidden states -- TODO confirm against the BertModel in use.
    layer_outputs, _pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    scores = self.output(self.dropout(layer_outputs[-1]))
    return scores, scores.argmax(-1)

In [None]:
# Build the tagger with vocab_len output classes on top of the extracted encoder weights.
model = biobert_ner(vocab_len,config,state_dict)
# Move the model to the selected device (as the last expression this also displays its structure).
model.to(device)

biobert_ner(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
       

In [None]:
param_optimizer = list(model.named_parameters())
# Parameters exempt from weight decay: biases and LayerNorm scales.
# 'gamma'/'beta' are kept for models that still use the older LayerNorm names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# The per-group key must be 'weight_decay': that is the option AdamW reads.
# Any other key is stored in the group but never used, leaving decay at its default.
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)
epochs = 3
# Upper bound for the gradient norm used when clipping during training.
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler (linear decay over all steps, no warm-up).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)
# Cross-entropy over all positions; no class is excluded from the loss.
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,max_grad_norm=1.0):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(scores, predictions)``; ``scores`` must be 3-D.
        data_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Loss applied to the flattened scores and flattened labels.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        max_grad_norm: Maximum gradient norm used for clipping. It is an
            explicit argument so the function does not rely on a notebook
            global; the default matches the value configured in this notebook.

    Returns:
        ``(correct, mean_loss)``: ``correct`` is the number of positions whose
        arg-max prediction equals the label, summed over all batches and
        divided by the number of batches; ``mean_loss`` is the mean of the
        per-batch losses. Every position is counted, padded ones included,
        because the attention mask is only passed to the model.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    for batch in data_loader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear gradients before the forward/backward pass so that anything
        # left over from an interrupted run cannot leak into this step.
        optimizer.zero_grad()

        logits, _ = model(b_input_ids, b_input_mask)

        _, preds = torch.max(logits, dim=2)
        # Flatten to (positions, classes) and (positions,) for the loss.
        loss = loss_fn(logits.view(-1, logits.shape[-1]), b_labels.view(-1))
        correct_predictions += torch.sum(preds == b_labels)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        scheduler.step()

    return correct_predictions.double()/len(data_loader) , np.mean(losses)

In [None]:
def model_eval(model,data_loader,loss_fn,device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``(scores, predictions)``; ``scores`` must be 3-D.
        data_loader: Yields ``(input_ids, attention_mask, labels)`` batches.
        loss_fn: Loss applied to the flattened scores and flattened labels.
        device: Device every batch tensor is moved to.

    Returns:
        ``(correct, mean_loss)``: the number of positions whose arg-max
        prediction equals the label, summed over all batches and divided by
        the number of batches, and the mean of the per-batch losses.
    """
    model = model.eval()

    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            logits, _ = model(b_input_ids, b_input_mask)
            preds = torch.max(logits, dim=2)[1]

            # Flatten to (positions, classes) and (positions,) for the loss.
            flat_logits = logits.view(-1, logits.shape[-1])
            loss = loss_fn(flat_logits, b_labels.view(-1))

            n_correct += torch.sum(preds == b_labels)
            batch_losses.append(loss.item())

    return n_correct.double()/len(data_loader) , np.mean(batch_losses)

In [None]:
%%time
from collections import defaultdict

# Per-epoch metrics, stored as plain Python/NumPy numbers.
history = defaultdict(list)

# train_epoch/model_eval return (correct positions summed over batches) / (number of batches).
# Dividing that by the average number of label positions per batch gives the
# fraction of correct positions, also when the last batch holds fewer than
# `bs` sequences. Padded positions are still part of the count.
train_normalizer = len(train_dataloader.dataset) * MAX_LEN / len(train_dataloader)
val_normalizer = len(valid_dataloader.dataset) * MAX_LEN / len(valid_dataloader)

for epoch in range(epochs):
    print(f'Epoch {epoch+1}/{epochs}')
    print('-'*10)
    train_acc,train_loss = train_epoch(model,train_dataloader,loss_fn,optimizer,device,scheduler)
    # float() detaches the value from the device tensor returned by train_epoch.
    train_acc = float(train_acc)/train_normalizer
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc,val_loss = model_eval(model,valid_dataloader,loss_fn,device)
    val_acc = float(val_acc)/val_normalizer
    print(f'val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)

    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

Epoch 1/3
----------
Train loss 0.09355648583856611 accuracy 0.9649089698046183
val loss 0.08961249308453666 accuracy 0.9589021164021163

Epoch 2/3
----------
Train loss 0.08261177434749756 accuracy 0.9658910597986976
val loss 0.08833834481617761 accuracy 0.9579034391534392

Epoch 3/3
----------
Train loss 0.07988401627429426 accuracy 0.966051657785672
val loss 0.08833834481617761 accuracy 0.9579034391534392

CPU times: user 16min 56s, sys: 10min 32s, total: 27min 28s
Wall time: 27min 33s


# Test

In [None]:
rootdir = '/content/gdrive/My Drive/Colab Notebooks/Data/BioNLP'
import os
import csv
sentences = []
tags = []
# NOTE(review): this reads 'train.tsv' again, i.e. the same files the model was
# trained on -- confirm whether a held-out file was meant to be used here.
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        if file == 'train.tsv':
            path = os.path.join(subdir, file)
            sent, tag = sent_fetch(path)
            sentences.extend(sent)
            tags.extend(tag)

# Keep only the first 10000 sentences.
sentences = sentences[0:10000]
tags = tags[0:10000]

# Indices of the sentences that contain at least one tag other than 'O'.
# Every tag of a sentence is inspected, not just the first one.
tag_list = [idx for idx, tg in enumerate(tags) if any(t != 'O' for t in tg)]

In [None]:
import random

# Pick one of the candidate sentence indices at random (no seed is set, so the choice differs per run).
test_idx = random.choice(tag_list)

# Uncomment to inspect a fixed example instead of a random one:
# test_idx = 9999
test_sentence = sentences[test_idx]
test_label = tags[test_idx]

# Print "token -- tag" pairs; note that `l` is the token and `s` is its tag.
for l,s in zip(test_sentence,test_label):
    print(l,'--',s,'\n')

Topo -- B-Gene_or_gene_product 

I -- E-Gene_or_gene_product 

expression -- O 

in -- O 

paired -- O 

primary -- O 

lymph -- O 

node -- O 

metastases -- O 

were -- O 

studied -- O 

for -- O 

concordance -- O 

. -- O 



In [None]:
# Tokenize the selected sentence word by word into one flat token list.
tokenized_sentence = []
# Reset input_ids; it is reassigned by the padding step that follows.
input_ids = []
# `sent` is a single word of the sentence here, not a whole sentence.
for sent in test_sentence:
    tokenized_sentence.extend(tokenizer.tokenize(sent))

In [None]:
# Convert the tokens to ids and pad/truncate to MAX_LEN, giving a batch with a single row.
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokenized_sentence) ],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
import nltk

In [None]:
# 1.0 for every position with a non-zero token id, 0.0 for padded positions.
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# Place the inputs on the same device as the model instead of assuming CUDA.
input_ids = torch.tensor(input_ids).to(device)
attention_masks = torch.tensor(attention_masks).to(device)

In [None]:
# Switch to evaluation mode and predict without tracking gradients.
model.eval()
with torch.no_grad():
    # Only the second model output (the arg-max class index per position) is used.
    _, y_hat = model(input_ids,attention_mask=attention_masks)
# Bring the predicted indices back to the CPU as a NumPy array.
label_indices = y_hat.to('cpu').numpy()

In [None]:
# Map the ids of the (single) input row back to token strings.
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])
new_tokens, new_labels = [], []
for token, label_idx in zip(tokens, label_indices[0]):
    if token.startswith("##"):
        # Continuation piece: glue it onto the previous token and drop its own prediction.
        new_tokens[-1] = new_tokens[-1] + token[2:]
    else:
        # Any other token starts a new entry and contributes its predicted label.
        # Padding tokens are not filtered out here.
        new_labels.append(tag_values[label_idx])
        new_tokens.append(token)

In [None]:
# Print one "label<TAB>token" line per merged token.
for token, label in zip(new_tokens, new_labels):
    print("{}\t{}".format(label, token))

O	Topo
O	I
O	expression
O	in
O	paired
O	primary
O	lymph
O	node
O	metastases
O	were
O	studied
O	for
O	concordance
O	.
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]
PAD	[PAD]


In [None]:
# Persist the fine-tuned weights to Drive. Only the state dict is written;
# NOTE(review): the label order in tag_values is not saved with it -- confirm it can be reproduced when loading.
torch.save(model.state_dict(), '/content/gdrive/My Drive/Colab Notebooks/Data/bio_ner/BIOBERT_NER_small.pt')