# Named Entity Recognition Using BERT

In [39]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
import random

## Configurations

In [40]:
from configs import PathConfig, GlobalConfig, DeviceConfig, ModelConfig, OptimizerConfig, TrainConfig, EvalConfig

### Path configuration

In [41]:
path_config_dict = {"data_dir": "./data/NER/", 
                    "output_dir": "./NER_output/"}
path_config = PathConfig(path_config_dict)

### Global configuration

In [42]:
global_config_dict = {"fp16": False}
global_config = GlobalConfig(global_config_dict)

### Device configuration

In [None]:
device_config_dict = {"no_cuda": False}
device_config = DeviceConfig(device_config_dict)
print("device name: {}".format(torch.cuda.get_device_name(0)))
print("number of gpus: {}".format(device_config.n_gpu))

### Model configuration

In [44]:
model_config_dict = {"bert_model": "bert-base-uncased",
                     "max_seq_length": 75,
                     "num_labels": 18,
                     "model_type": "token"}
model_config = ModelConfig(model_config_dict)

### Optimizer configuration

In [45]:
optimizer_config_dict = {"no_decay_params": ['bias', 'gamma', 'beta'],
                         "learning_rate": 3e-5}
optimizer_config = OptimizerConfig(optimizer_config_dict)

### Train configuration

In [76]:
train_config_dict = {"train_batch_size": 32,
                     "num_train_epochs": 5}
train_config = TrainConfig(train_config_dict)

### Evaluation configuration

In [79]:
eval_config = EvalConfig({"eval_batch_size":32})

### Set random seeds

In [48]:
random.seed(global_config.seed)
np.random.seed(global_config.seed)
torch.manual_seed(global_config.seed)

<torch._C.Generator at 0x7fa7827121b0>

## Preprocess Data

### Create training and validation examples
KaggleNERProcessor is a dataset specific class that generates training and evaluation examples in the format accepted by all utility functions. 

In [49]:
from bert_data_utils import KaggleNERProcessor

In [50]:
kaggle_ner_processor = KaggleNERProcessor(data_dir="./data/NER/ner_dataset.csv", dev_percentage = 0.1)

In [51]:
train_examples = kaggle_ner_processor.get_train_examples(data_dir="./data/NER/ner_dataset.csv")
dev_examples = kaggle_ner_processor.get_dev_examples(data_dir="./data/NER/ner_dataset.csv")

In [86]:
a = set(kaggle_ner_processor.train_sentence_nums)
b = set(kaggle_ner_processor.dev_sentence_nums)
a.intersection(b)

In [52]:
label_list = kaggle_ner_processor.get_labels()

In [105]:
print('Sample sentence: \n{}\n'.format(train_examples[0].text_a))
print('Sample sentence labels: \n{}\n'.format(train_examples[1].label))

Sample sentence: 
Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .

Sample sentence labels: 
['O', 'O', 'B-tim', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'I-geo', 'O']



### Create dataloaders
These two utility functions convert training and evaluation examples to Pytorch dataloaders which can be used for model training and evaluation. The following steps are performed:
1. Tokenization
2. Convert token and labels to numerical values
3. Convert numpy arrays to Pytorch tensors
4. Create dataloader for sampling and serving data in batches

In [53]:
from bert_utils import create_train_dataloader, create_eval_dataloader

In [54]:
train_dataloader = create_train_dataloader(train_examples=train_examples,
                                           model_config=model_config,
                                           train_config=train_config,
                                           label_list=label_list,
                                           device_config=device_config)

In [80]:
valid_dataloader, _ = create_eval_dataloader(eval_examples=dev_examples, 
                                             model_config=model_config, 
                                             eval_config=eval_config, 
                                             label_list=label_list)

In [103]:
it = iter(train_dataloader)
first = next(it)
print("Sample token ids:\n{}\n".format(first[0][0]))
print("Sample attention mask:\n{}\n".format(first[1][0]))
print("Sample label ids:\n{}\n".format(first[3][0]))

Sample token ids:
tensor([ 9796,  2937,  8443,  2967,  2031,  2146,  4912,  2000, 16857,  1996,
         2406,  1005,  2015,  2343,  1010,  8909,  6935,  2015,  2139,  3762,
         1012,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])

Sample attention mask:
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

Sample label ids:
tensor([12, 17,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 17,  9,  9, 15, 17, 17,
         2,

## Load Model

In [57]:
from bert_utils import load_model

In [58]:
model = load_model(model_config=model_config, 
                   path_config=path_config, 
                   device_config=device_config,
                   global_config=global_config)

In [107]:
type(model)

pytorch_pretrained_bert.modeling.BertForTokenClassification

## Configure Optimizer

In [59]:
from bert_utils import configure_optimizer

In [60]:
optimizer, optimizer_config, _ = configure_optimizer(optimizer_config=optimizer_config,
                                                     global_config=global_config, 
                                                     train_config=train_config, 
                                                     device_config=device_config, 
                                                     model=model, 
                                                     num_train_examples=len(train_dataloader))

In [109]:
type(optimizer)

torch.optim.adam.Adam

## Train Model

In [61]:
from bert_utils import train_token_model

In [110]:
model, train_loss = train_token_model(model=model, 
                                      train_dataloader=train_dataloader, 
                                      optimizer=optimizer,
                                      train_config=train_config, 
                                      model_config=model_config, 
                                      optimizer_config=optimizer_config,
                                      device_config=device_config,
                                      global_config=global_config)

## Evaluate Model

In [63]:
from bert_utils import eval_token_model

In [64]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [82]:
preds, eval_loss, eval_accuracy = eval_token_model(model=model, 
                                                   eval_dataloader=valid_dataloader, 
                                                   model_config=model_config, 
                                                   device_config=device_config, 
                                                   label_list=label_list,
                                                   eval_func=flat_accuracy)

Validation loss: 0.2580980388323466
Validation Accuracy: 0.9766516460905346
Validation F1-Score: 0.7453125410892262
