In [48]:
# Install the `transformers` package (shell escape) so the import cell below can load it.
!pip install transformers



In [49]:
# Mount Google Drive at /content/gdrive; the dataset, checkpoint and log paths
# configured further down all point underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [50]:
import os
import torch
import torch.nn as nn
import transformers
from transformers import BertForTokenClassification,AdamW,BertTokenizer
from transformers import get_linear_schedule_with_warmup
from torchsummary import summary


from tqdm.auto import tqdm 
import time
import pandas as pd
import numpy as np
import joblib

from sklearn import metrics
from sklearn import model_selection, preprocessing

In [51]:
# Fixed sequence length (special tokens included) that BERTDataset truncates/pads each example to.
MAX_LEN=128
# Mini-batch sizes handed to the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
# Number of passes over the training split performed by train().
EPOCHS = 1
# Where train() saves the best model state dict and where predict() loads it from.
MODEL_PATH="/content/gdrive/My Drive/bert_ner_model.bin"
# File that log() appends its messages to.
OUTPUT_LOG="/content/gdrive/My Drive/bert_ner_train.log"

# Training data: Kaggle "entity-annotated-corpus" dataset,
# https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
TRAINING_FILE="/content/gdrive/My Drive/ner_dataset.csv"

In [52]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print('We will use the GPU:',torch.cuda.get_device_name(0))
else:
  print('No GPU available, using the CPU instead')

We will use the GPU: Tesla P100-PCIE-16GB


In [53]:
# Load the BERT tokenizer.
# The resulting module-level `tokenizer` is the object that BERTDataset and
# predict() read; do_lower_case=True is passed explicitly for the uncased checkpoint.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [54]:
def log(value):
  """Append `value` plus a trailing newline to the file at OUTPUT_LOG.

  Append mode creates the file when it does not exist yet, so no separate
  existence check is required, and the `with` block closes the handle even
  when the write raises.

  Args:
    value: Text to record. It is concatenated with a newline, so it must
      be a str.
  """
  with open(OUTPUT_LOG, "a") as log_file:
    log_file.write(value + "\n")


In [55]:
#Read Data

def read_data(input_file):
    """Load the token-per-row CSV and group it into per-sentence lists.

    Args:
        input_file: Path to a CSV (read as latin-1) containing the columns
            "Sentence #", "Word", "POS" and "Tag".

    Returns:
        A tuple ``(sentences, pos, tag, enc_pos, enc_tag)``. The first three
        are parallel arrays holding one list per sentence (words, encoded POS
        ids, encoded tag ids); the last two are the fitted LabelEncoders
        needed to map ids back to label strings.
    """
    df = pd.read_csv(input_file, encoding="latin-1")
    # Gaps in the sentence-id column are forward-filled so every row inherits
    # the most recent id above it. `.ffill()` replaces the deprecated
    # `fillna(method="ffill")` spelling.
    df.loc[:, "Sentence #"] = df["Sentence #"].ffill()

    enc_pos = preprocessing.LabelEncoder()
    enc_tag = preprocessing.LabelEncoder()

    # Replace the string labels with integer class ids.
    df.loc[:, "POS"] = enc_pos.fit_transform(df["POS"])
    df.loc[:, "Tag"] = enc_tag.fit_transform(df["Tag"])

    # Build the grouping once and reuse it for all three columns so the
    # resulting arrays are guaranteed to share the same sentence order.
    by_sentence = df.groupby("Sentence #")
    sentences = by_sentence["Word"].apply(list).values
    pos = by_sentence["POS"].apply(list).values
    tag = by_sentence["Tag"].apply(list).values

    print(len(sentences))
    print(len(pos))
    print(len(tag))
    return sentences, pos, tag, enc_pos, enc_tag

In [56]:
class BERTDataset:
    """Map-style dataset turning word-level sentences into fixed-length BERT inputs.

    Every word is tokenized on its own and its POS / tag label is repeated for
    each sub-token it yields, so the label sequences stay aligned with the
    token ids.

    Note: the special tokens and the padding positions receive label 0.
    Padding is removed from the loss through the attention mask in loss_fn,
    but the two special-token positions have mask 1 and therefore do count.

    Args:
        input_text: Indexable collection of sentences, each a list of words.
        pos: Per-sentence lists of encoded POS ids, parallel to `input_text`.
        tag: Per-sentence lists of encoded tag ids, parallel to `input_text`.
        bert_tokenizer: Optional tokenizer exposing `encode`, `cls_token_id`
            and `sep_token_id`. Defaults to the notebook-level `tokenizer`.
        max_len: Optional total sequence length (special tokens included).
            Defaults to the notebook-level `MAX_LEN`, read at access time.
    """

    def __init__(self, input_text, pos, tag, bert_tokenizer=None, max_len=None):
        self.input_text = input_text
        self.pos = pos
        self.tag = tag
        # Fall back to the notebook-level tokenizer so existing callers that
        # pass only three arguments keep working unchanged.
        self.tokenizer = tokenizer if bert_tokenizer is None else bert_tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences."""
        return len(self.input_text)

    def __getitem__(self, item):
        """Encode sentence `item` into padded id / mask / label tensors.

        Returns:
            dict with the keys "ids", "mask", "token_type_ids", "target_pos"
            and "target_tag"; each value is a 1-D long tensor whose length is
            the configured maximum sequence length.
        """
        max_len = MAX_LEN if self.max_len is None else self.max_len
        text = self.input_text[item]
        pos = self.pos[item]
        tag = self.tag[item]

        ids = []
        target_pos = []
        target_tag = []

        for i, word in enumerate(text):
            # Use the tokenizer bound to this dataset instead of the global one.
            pieces = self.tokenizer.encode(word, add_special_tokens=False)
            # Repeat the word-level label once per sub-token.
            ids.extend(pieces)
            target_pos.extend([pos[i]] * len(pieces))
            target_tag.extend([tag[i]] * len(pieces))

        # Reserve two slots for the leading and trailing special tokens.
        ids = ids[:max_len - 2]
        target_pos = target_pos[:max_len - 2]
        target_tag = target_tag[:max_len - 2]

        # Ask the tokenizer for the special-token ids rather than hard-coding
        # the values of one particular vocabulary.
        ids = [self.tokenizer.cls_token_id] + ids + [self.tokenizer.sep_token_id]
        target_pos = [0] + target_pos + [0]
        target_tag = [0] + target_tag + [0]

        mask = [1] * len(ids)
        token_type_ids = [0] * len(ids)

        # Right-pad everything with zeros up to the fixed length.
        padding = [0] * (max_len - len(ids))
        ids = ids + padding
        mask = mask + padding
        token_type_ids = token_type_ids + padding
        target_pos = target_pos + padding
        target_tag = target_tag + padding

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "target_pos": torch.tensor(target_pos, dtype=torch.long),
            "target_tag": torch.tensor(target_tag, dtype=torch.long),
        }

In [57]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one optimisation pass over `data_loader`.

    For each batch the tensors are moved to `device`, gradients are reset,
    the model's third return value is used as the loss, and the optimizer
    and the scheduler each take one step.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    num_batches = len(data_loader)
    running_loss = 0
    for batch in tqdm(data_loader, total=num_batches):
        # Move every tensor of the batch dict onto the target device in place.
        for key in list(batch):
            batch[key] = batch[key].to(device)
        optimizer.zero_grad()
        _tag_out, _pos_out, batch_loss = model(**batch)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += batch_loss.item()
    return running_loss / num_batches


def eval_fn(data_loader, model, device):
    """Compute the mean per-batch loss of `model` over `data_loader`.

    The model is switched to eval mode and the whole pass runs under
    `torch.no_grad()`, so no autograd graph is built for batches that are
    never back-propagated.

    Args:
        data_loader: Iterable of dict batches whose values are tensors.
        model: Callable returning a 3-tuple whose last element is the loss.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    final_loss = 0
    with torch.no_grad():
        for data in tqdm(data_loader, total=len(data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            _, _, loss = model(**data)
            final_loss += loss.item()
    return final_loss / len(data_loader)

def loss_fn(output, target, mask, num_labels):
    """Cross-entropy over the positions where `mask` equals 1.

    `output[0]` is taken as the logits and flattened to rows of `num_labels`
    scores. Labels at positions whose mask is not 1 are swapped for the
    criterion's ignore index, so those positions contribute nothing.
    """
    criterion = nn.CrossEntropyLoss()
    flat_logits = output[0].view(-1, num_labels)
    flat_targets = target.view(-1)
    keep = mask.view(-1) == 1
    ignored = torch.tensor(criterion.ignore_index).type_as(target)
    masked_targets = torch.where(keep, flat_targets, ignored)
    return criterion(flat_logits, masked_targets)

In [58]:
class BERTNerModel(nn.Module):
    """Two independent BERT token classifiers: one for tags, one for POS.

    Both sub-models are loaded from the "bert-base-uncased" checkpoint and
    receive the same inputs; the training loss is the average of their two
    masked cross-entropy losses.
    """

    def __init__(self, num_tag, num_pos):
        super().__init__()
        self.num_tag = num_tag
        self.num_pos = num_pos
        load = transformers.BertForTokenClassification.from_pretrained
        self.bert_tag = load("bert-base-uncased",num_labels=num_tag)
        self.bert_pos = load("bert-base-uncased",num_labels=num_pos)

    def forward(self, ids, mask, token_type_ids, target_pos, target_tag):
        """Run both classifiers and return ``(tag_output, pos_output, loss)``."""
        shared_kwargs = {
            "attention_mask": mask,
            "token_type_ids": token_type_ids,
        }
        # Tag head first, then POS head, each on the same batch.
        tag = self.bert_tag(ids, **shared_kwargs)
        pos = self.bert_pos(ids, **shared_kwargs)

        tag_loss = loss_fn(tag, target_tag, mask, self.num_tag)
        pos_loss = loss_fn(pos, target_pos, mask, self.num_pos)

        return tag, pos, (tag_loss + pos_loss) / 2

In [59]:
def train():
  """Fine-tune BERTNerModel on TRAINING_FILE and checkpoint the best epoch.

  Steps: read and encode the data, dump the label encoders to "meta.bin",
  hold out 10% of the sentences for validation, train for EPOCHS epochs with
  AdamW and a linear decay schedule, and save the state dict to MODEL_PATH
  whenever the validation loss improves.
  """
  sentences, pos, tag, enc_pos, enc_tag = read_data(TRAINING_FILE)

  # Persist the fitted encoders so predict() can map class ids back to labels.
  meta_data = {
    "enc_pos": enc_pos,
    "enc_tag": enc_tag
  }

  joblib.dump(meta_data, "meta.bin")

  num_pos = len(enc_pos.classes_)
  num_tag = len(enc_tag.classes_)

  (
    train_sentences,
    test_sentences,
    train_pos,
    test_pos,
    train_tag,
    test_tag
  ) = model_selection.train_test_split(
    sentences,
    pos,
    tag,
    random_state=42,
    test_size=0.1
  )

  train_dataset = BERTDataset(
    input_text=train_sentences, pos=train_pos, tag=train_tag
  )

  train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
  )

  valid_dataset = BERTDataset(
    input_text=test_sentences, pos=test_pos, tag=test_tag
  )

  valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1
  )

  # Fall back to the CPU instead of crashing when no GPU is attached.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = BERTNerModel(num_tag=num_tag, num_pos=num_pos)
  model.to(device)

  # Weight decay is applied to every parameter except biases and LayerNorm
  # parameters, which go into a second group with decay disabled.
  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(
                nd in n for nd in no_decay
            )
        ],
        "weight_decay": 0.0,
    },
  ]

  # train_fn steps the scheduler once per batch, so the schedule length must be
  # the loader's real batch count. Flooring len(sentences) / batch_size drops
  # the final partial batch and ends the schedule one step early.
  num_train_steps = len(train_data_loader) * EPOCHS
  optimizer = AdamW(optimizer_parameters, lr=3e-5)
  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
  )

  best_loss = np.inf
  for epoch in range(EPOCHS):
    train_loss = train_fn(
        train_data_loader,
        model,
        optimizer,
        device,
        scheduler
    )
    test_loss = eval_fn(
        valid_data_loader,
        model,
        device
    )
    print(f"Train Loss = {train_loss} Valid Loss = {test_loss}")
    # Keep only the checkpoint with the lowest validation loss seen so far.
    if test_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = test_loss

In [60]:
def predict():
  """Label a fixed sample sentence with the saved model and print the result.

  Loads the label encoders from "meta.bin" and the weights from MODEL_PATH,
  runs one forward pass in eval mode, and prints the decoded tag and POS
  predictions for the first len(tokenized_sentence) positions.
  """
  meta_data = joblib.load("meta.bin")
  enc_pos = meta_data["enc_pos"]
  enc_tag = meta_data["enc_tag"]

  num_pos = len(enc_pos.classes_)
  num_tag = len(enc_tag.classes_)

  sentence = "Ganga is a river in North India that flows from Himalayas to Bay of Bengal"
  tokenized_sentence = tokenizer.encode(sentence)

  sentence = sentence.split()
  print(sentence)

  print(tokenized_sentence)
  print(tokenizer.convert_ids_to_tokens(tokenized_sentence))

  # The dataset needs label lists, so placeholder zeros are supplied; the loss
  # computed from them is discarded below.
  test_dataset = BERTDataset(
    input_text=[sentence],
    pos=[[0] * len(sentence)],
    tag=[[0] * len(sentence)]
  )

  # Fall back to the CPU when no GPU is attached, and remap the checkpoint's
  # tensors onto whichever device is in use.
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = BERTNerModel(num_tag=num_tag, num_pos=num_pos)
  model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
  model.to(device)
  # Disable dropout; without this the predictions vary from run to run.
  model.eval()

  with torch.no_grad():
    data = test_dataset[0]
    for k, v in data.items():
      # Add the batch dimension the model expects.
      data[k] = v.to(device).unsqueeze(0)
    tag, pos, _ = model(**data)

    print(
        enc_tag.inverse_transform(
            tag[0].argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )
    print(
        enc_pos.inverse_transform(
            pos[0].argmax(2).cpu().numpy().reshape(-1)
        )[:len(tokenized_sentence)]
    )


In [61]:
# Fine-tune and checkpoint the model, then run the sample-sentence prediction
# using the weights that train() saved to MODEL_PATH.
train()
predict()

47959
47959
47959


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

HBox(children=(FloatProgress(value=0.0, max=5396.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))


Train Loss = 0.13304480957540457 Valid Loss = 0.08998058945406229
['Ganga', 'is', 'a', 'river', 'in', 'North', 'India', 'that', 'flows', 'from', 'Himalayas', 'to', 'Bay', 'of', 'Bengal']
[101, 28646, 2003, 1037, 2314, 1999, 2167, 2634, 2008, 6223, 2013, 26779, 2000, 3016, 1997, 8191, 102]
['[CLS]', 'ganga', 'is', 'a', 'river', 'in', 'north', 'india', 'that', 'flows', 'from', 'himalayas', 'to', 'bay', 'of', 'bengal', '[SEP]']


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

['B-art' 'B-geo' 'O' 'O' 'O' 'O' 'B-geo' 'I-geo' 'O' 'O' 'O' 'B-geo' 'O'
 'B-geo' 'I-geo' 'I-geo' 'B-art']
['$' 'NNP' 'VBZ' 'DT' 'NN' 'IN' 'NNP' 'NNP' 'WDT' 'VBZ' 'IN' 'NNP' 'TO'
 'NNP' 'IN' 'NNP' '$']
