In [None]:
!pip install transformers

In [13]:
import os
import re
import csv

import pandas as pd
from tqdm import tqdm, trange

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW
from transformers import get_linear_schedule_with_warmup

import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [8]:
# Mount Google Drive inside the Colab runtime; the data and model files read
# by later cells live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [6]:
# Ask TensorFlow which GPU device it sees and stop the notebook unless it is
# the first GPU.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [7]:
# Choose the PyTorch device once, so that `device` is defined whether or not
# a CUDA GPU is present.
if torch.cuda.is_available():

  # tell Pytorch to use the GPU
  device = torch.device('cuda')

  print('There are %d GPU(s) available.' % torch.cuda.device_count())
  print('We will use the GPU:', torch.cuda.get_device_name(0))
else:

  # no CUDA device: fall back to the CPU instead of leaving `device` unset
  device = torch.device('cpu')

  print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


# Data

In [11]:
# Length (in word pieces) that input sequences are padded or cut to.
MAX_LEN = 75

# Case-preserving tokenizer built from the BioBERT v1.1 (PubMed) vocabulary.
tokenizer = BertTokenizer(
    vocab_file='/content/gdrive/My Drive/Colab Notebooks/Data/biobert_v1.1_pubmed/vocab.txt',
    do_lower_case=False,
)

In [22]:
# Load the tag inventory; the number of tags is later passed to the model as
# the width of its output layer.
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/Data/bio_ner/tags_small.csv')
tag_values = data.loc[:, 'tags'].values
vocab_len = tag_values.shape[0]
print('Vocab length:', vocab_len)

Vocab length: 74


In [23]:
# Write the tag list to a CSV in the working directory (without the index
# column), then read it back and preview the first rows.
df_tags = pd.DataFrame(dict(tags=tag_values))
df_tags.to_csv('tags_small.csv', index=False)
df = pd.read_csv('tags_small.csv')
df.head()

Unnamed: 0,tags
0,I-Cellular_component
1,E-Gene_or_gene_product
2,I-Organism_subdivision
3,I-Organism_substance
4,B-Gene_or_gene_product


In [39]:
def sent_fetch(path):
    """Read a two-column, tab-separated token/tag file into sentences.

    Every non-blank line holds a token in column 0 and its tag in column 1;
    blank lines separate sentences.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A ``(sentences, tags)`` tuple of parallel lists: ``sentences[i]`` is
        the list of tokens of sentence ``i`` and ``tags[i]`` is the list of
        tags for those tokens.

    Raises:
        IndexError: If a non-blank line has fewer than two tab-separated
            columns.
    """
    sentences = []
    tags = []
    sent = []
    tag = []
    # newline='' is the csv module's recommended way to open files.
    # QUOTE_NONE stops a token that is a literal double quote from being
    # parsed as the start of a quoted field spanning the following lines.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            if row:
                sent.append(row[0])
                tag.append(row[1])
            elif sent:
                # A blank line closes the current sentence; runs of blank
                # lines must not produce empty sentences.
                sentences.append(sent)
                tags.append(tag)
                sent = []
                tag = []
    # Keep the last sentence when the file does not end with a blank line.
    if sent:
        sentences.append(sent)
        tags.append(tag)
    return sentences, tags


def tokenize_and_label(sent, text_labels):
  """Tokenize each word with the notebook's `tokenizer` and repeat its label.

  Args:
    sent: Sequence of words of one sentence.
    text_labels: Sequence of labels, one per word in `sent`.

  Returns:
    A `(tokens, labels)` tuple of equal-length lists: the concatenated
    sub-word tokens of all words, and for every sub-word the label of the
    word it came from.
  """
  pieces = []
  piece_labels = []
  for token, token_label in zip(sent, text_labels):
    sub_tokens = tokenizer.tokenize(token)
    pieces += sub_tokens
    # every sub-word inherits the label of the word it was split from
    piece_labels += [token_label for _ in sub_tokens]
  return pieces, piece_labels

In [40]:
# Collect the sentences and tags of every train.tsv found below `rootdir`.
rootdir = '/content/gdrive/My Drive/Colab Notebooks/Data/BioNLP'
sentences = []
tags = []
for subdir, dirs, files in os.walk(rootdir):
    # Sorting `dirs` in place fixes the order in which os.walk descends, so
    # the truncation below keeps the same sentences on every run.
    dirs.sort()
    if 'train.tsv' in files:
        path_ = os.path.join(subdir, 'train.tsv')
        sent, tag = sent_fetch(path_)
        sentences.extend(sent)
        tags.extend(tag)

# Keep at most the first 20000 sentences (and their tags).
sentences = sentences[0:20000]
tags = tags[0:20000]

In [42]:
# Tokenize every sentence together with its tags; each result is a
# (tokens, labels) pair.
tokenized_texts_and_labels = list(map(tokenize_and_label, sentences, tags))

In [43]:
# Split the (tokens, labels) pairs into two parallel lists.
tokenized_texts = [pieces for pieces, _ in tokenized_texts_and_labels]
labels = [piece_labels for _, piece_labels in tokenized_texts_and_labels]

In [44]:
# Sanity check: how many tokenized sentences were produced.
len(tokenized_texts)

20000

In [45]:
# Convert tokens to vocabulary ids, then pad with 0 at the end / cut at the
# end so that every sequence has length MAX_LEN.
input_ids = pad_sequences(
    list(map(tokenizer.convert_tokens_to_ids, tokenized_texts)),
    maxlen=MAX_LEN,
    dtype="long",
    value=0.0,
    padding="post",
    truncating="post",
)

In [48]:
# Show the tokens of the first sentence only (nothing is printed when the
# list is empty).
for txt in tokenized_texts[:1]:
    print(txt)

['The', 'C', '##d', '##c', '##6', 'protein', 'is', 'u', '##bi', '##qui', '##tina', '##ted', 'in', 'v', '##ivo', 'for', 'pro', '##te', '##oly', '##sis', 'in', 'Sa', '##cc', '##har', '##omy', '##ces', 'c', '##ere', '##vis', '##iae', '.']


# Model

In [4]:
class biobert_ner(nn.Module):
  """BERT encoder followed by dropout and a per-position linear tagging head."""

  def __init__(self, vocab_len, config, state_dict):
    """Build the encoder and the classification head.

    Args:
      vocab_len: Number of output units of the linear head (the notebook
        passes the number of tags).
      config: Configuration object handed to `BertModel`.
      state_dict: Optional dict mapping each encoder layer to its parameter
        tensor. When given it is loaded into the encoder; when None the
        encoder keeps the weights it was constructed with.
    """
    super().__init__()
    self.bert = BertModel(config)
    if state_dict is not None:
      # dict object that maps each layer to its parameter tensor
      self.bert.load_state_dict(state_dict)
    # The head is created unconditionally: forward() needs it whether or not
    # encoder weights were loaded.
    self.dropout = nn.Dropout(p=0.3)
    self.output = nn.Linear(self.bert.config.hidden_size, vocab_len)
    # NOTE(review): not applied in forward(). With (batch, seq, classes)
    # scores, dim=1 would normalize over the sequence axis — confirm before use.
    self.softmax = nn.Softmax(dim=1)

  def forward(self, input_ids, attention_mask):
    """Score every input position and pick the best class per position.

    Args:
      input_ids: Token ids passed to the encoder as `input_ids`.
      attention_mask: Mask passed to the encoder as `attention_mask`.

    Returns:
      A `(scores, predictions)` tuple: the raw outputs of the linear head and
      their argmax over the last axis.
    """
    # assumes the encoder returns (per-layer hidden states, pooled output),
    # as pytorch_pretrained_bert's BertModel does by default — TODO confirm
    encoded_layer, _ = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    encl = encoded_layer[-1]  # last element of the encoder's first output
    out = self.dropout(encl)
    out = self.output(out)
    return out, out.argmax(-1)

In [19]:
# Read the BERT configuration that ships with the BioBERT v1.1 (PubMed) files.
config = BertConfig.from_json_file('/content/gdrive/My Drive/Colab Notebooks/Data/biobert_v1.1_pubmed/bert_config.json')
# NOTE(review): state_dict=None makes biobert_ner.__init__ skip
# load_state_dict, so no checkpoint weights are loaded into the encoder in
# this cell — confirm this is intended.
model = biobert_ner(vocab_len,config,state_dict=None)

In [34]:
# Move the model to the GPU and put it in evaluation mode. Both calls return
# the module, so the cell still ends by displaying its structure.
model.cuda().eval()

biobert_ner(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
       