In [1]:
pip install gpustat transformers seqeval[gpu]

Collecting gpustat
  Downloading gpustat-0.6.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 4.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 18.8 MB/s 
[?25hCollecting seqeval[gpu]
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
Collecting blessings>=1.6
  Downloading blessings-1.7-py3-none-any.whl (18 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.0 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K  

In [2]:
import io
import re
import sys
import csv
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import tensorflow as tf
import torch.optim as optim
import matplotlib.pyplot as plt

from tqdm import tqdm
from torch import cuda
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from transformers import AutoTokenizer, BertConfig, BertForTokenClassification, get_scheduler, BertTokenizerFast

device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
train_url = ("https://raw.githubusercontent.com/Deepmori/projects-shivani_harrison_deep/main/ner_BC2GM/train.csv")
test_url = ("https://raw.githubusercontent.com/Deepmori/projects-shivani_harrison_deep/main/ner_BC2GM/test.csv")

train_data = pd.read_csv(train_url, encoding='unicode_escape', low_memory=False)
train_data = train_data[train_data['Tag'].notna()]
train_data = train_data.fillna(method='ffill')

train_data.head()

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Immunohistochemical,O
1,Sentence: 1,staining,O
2,Sentence: 1,was,O
3,Sentence: 1,positive,O
4,Sentence: 1,for,O


In [4]:
print("# of tags: {}".format(len(train_data.Tag.unique())))

tag_count = train_data.Tag.value_counts()
tag_count

# of tags: 3


O           76517
B-B.GENE     9531
I-I.GENE     4239
Name: Tag, dtype: int64

In [5]:
tags = {}
for tag, count in zip(tag_count.index, tag_count):
    if tag != "O":
        if tag[2:5] not in tags.keys():
            tags[tag[2:5]] = count
        else:
            tags[tag[2:5]] += count
    continue

In [6]:
labels_to_ids = {k: v for v, k in enumerate(train_data.Tag.unique())}
ids_to_labels = {v: k for v, k in enumerate(train_data.Tag.unique())}

In [7]:
train_data['sentence'] = train_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
train_data['word_labels'] = train_data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))
train_data = train_data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
train_data.head()

Unnamed: 0,sentence,word_labels
0,Immunohistochemical staining was positive for ...,"O,O,O,O,O,B-B.GENE,I-I.GENE,I-I.GENE,O,O,O,O,O..."
1,Chloramphenicol acetyltransferase assays exami...,"B-B.GENE,I-I.GENE,O,O,O,O,O,B-B.GENE,O,O,O,O,O..."
2,A new DNA repair gene from Schizosaccharomyces...,"O,O,B-B.GENE,I-I.GENE,I-I.GENE,O,O,O,O,O,O,B-B..."
3,Our study also demonstrated significant increa...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,Cloning and sequencing of the upstream region ...,"O,O,O,O,O,O,O,O,B-B.GENE,O,O,O,O,O,O,O,O,O,O,O..."


In [14]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [15]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index].split(",") 
        encoding = self.tokenizer(sentence, return_offsets_mapping=True, padding='max_length', truncation=True, max_length=self.max_len)
        labels = []
        for label in  word_labels:
          if label == " " or label == "":
            pass
          else:
            labels.append(labels_to_ids[label])

        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            encoded_labels[idx] = labels[i]
            i += 1

        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [16]:
train_size = 0.8
train_dataset = train_data.sample(frac=train_size,random_state=200)
test_dataset = train_data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [17]:
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,  1057,  1012,  1055,  1012,  4584,  3571, 15332,  2343,  9395,
         16860,  1010,  2040,  2038, 12250,  4262,  2007,  2899,  1010,  2071,
          9167,  2235,  2608,  2000,  8443,  5750,  1010,  2164,  8443,  2967,
          1999,  8581,  7379,  1012,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [20]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(1.4267, device='cuda:0', grad_fn=<NllLossBackward0>)

In [21]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [22]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    tr_examples, tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    model.train()
    
    for idx, batch in enumerate(training_loader):
        # print(idx)
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()

        tr_steps += 1
        tr_examples += labels.size(0)
        
        if idx % 200==0:
            loss_step = tr_loss/tr_steps
            print("Training loss per 200 training steps: ",loss_step)
           
        flattened_targets = labels.view(-1) 
        active_logits = tr_logits.view(-1, model.num_labels) 
        flattened_predictions = torch.argmax(active_logits, axis=1) 
        

        active_accuracy = labels.view(-1) != -100 
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / tr_steps
    tr_accuracy = tr_accuracy / tr_steps
    print("Training loss epoch: ", epoch_loss)
    print("Training accuracy epoch: ", tr_accuracy)

In [23]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 200 training steps:  1.3391493558883667
Training loss per 200 training steps:  0.30963967732195535
Training loss per 200 training steps:  0.21533678710214477
Training loss per 200 training steps:  0.17719057807543884
Training loss per 200 training steps:  0.15683146748463908
Training loss epoch:  0.15696651100264056
Training accuracy epoch:  0.9513634513634514


In [24]:
def valid(model, testing_loader):
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    eval_examples, eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
            
            eval_loss += loss.item()

            eval_steps += 1
            eval_examples += labels.size(0)
        
            if idx % 200==0:
                loss_step = eval_loss/eval_steps
                print("Validation loss per 200 evaluation steps: ",loss_step)
              

            flattened_targets = labels.view(-1) 
            active_logits = eval_logits.view(-1, model.num_labels) 
            flattened_predictions = torch.argmax(active_logits, axis=1) 
            
            active_accuracy = labels.view(-1) != -100 
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / eval_steps
    eval_accuracy = eval_accuracy / eval_steps
    print("Validation Loss: ",eval_loss)
    print("Validation Accuracy: ",eval_accuracy)

    return labels, predictions

In [25]:
labels, predictions = valid(model, testing_loader)

Validation loss per 200 evaluation steps:  0.004031321965157986
Validation loss per 200 evaluation steps:  0.06446337282995865
Validation loss per 200 evaluation steps:  0.06266120459566223
Validation Loss:  0.062076066894933765
Validation Accuracy:  0.9804878048780488


In [26]:
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix
labels = [labels]
predictions = [predictions]
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

      B.GENE       0.95      0.97      0.96       207

   micro avg       0.95      0.97      0.96       207
   macro avg       0.95      0.97      0.96       207
weighted avg       0.95      0.97      0.96       207

