# Named Entity Recognition Using BERT

### Required packages
* pytorch-pretrained-bert
* pandas
* seqeval
* unicode

In [3]:
import os
import sys
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import torch
import random

bert_utils_path = os.path.abspath('../../utils_nlp/bert')
if bert_utils_path not in sys.path:
    sys.path.insert(0, bert_utils_path)

## Configurations

In [4]:
from configs import PathConfig, GlobalConfig, DeviceConfig, ModelConfig, OptimizerConfig, TrainConfig, EvalConfig

### Path configuration

In [3]:
path_config_dict = {"data_dir": "./data/NER/", 
                    "output_dir": "./NER_output/"}
path_config = PathConfig(path_config_dict)

### Global configuration

In [4]:
global_config_dict = {"fp16": False}
global_config = GlobalConfig(global_config_dict)

### Device configuration

In [5]:
device_config_dict = {"no_cuda": False}
device_config = DeviceConfig(device_config_dict)
print("device name: {}".format(torch.cuda.get_device_name(0)))
print("number of gpus: {}".format(device_config.n_gpu))

device name: Tesla K80
number of gpus: 1


### Model configuration

In [6]:
model_config_dict = {"bert_model": "bert-base-uncased",
                     "max_seq_length": 75,
                     "num_labels": 18,
                     "model_type": "token"}
model_config = ModelConfig(model_config_dict)

### Optimizer configuration

In [7]:
optimizer_config_dict = {"no_decay_params": ['bias', 'gamma', 'beta'],
                         "learning_rate": 3e-5}
optimizer_config = OptimizerConfig(optimizer_config_dict)

### Train configuration

In [8]:
train_config_dict = {"train_batch_size": 32,
                     "num_train_epochs": 5}
train_config = TrainConfig(train_config_dict)

### Evaluation configuration

In [9]:
eval_config = EvalConfig({"eval_batch_size":32})

### Set random seeds

In [10]:
random.seed(global_config.seed)
np.random.seed(global_config.seed)
torch.manual_seed(global_config.seed)

<torch._C.Generator at 0x7fa16fad6150>

## Preprocess Data

### Create training and validation examples
KaggleNERProcessor is a dataset specific class that generates training and evaluation examples in the format accepted by all utility functions. 

In [11]:
from bert_data_utils import KaggleNERProcessor

In [12]:
kaggle_ner_processor = KaggleNERProcessor(data_dir="./data/NER/ner_dataset.csv", dev_percentage = 0.1)

In [13]:
train_examples = kaggle_ner_processor.get_train_examples(data_dir="./data/NER/ner_dataset.csv")
dev_examples = kaggle_ner_processor.get_dev_examples(data_dir="./data/NER/ner_dataset.csv")

In [14]:
a = set(kaggle_ner_processor.train_sentence_nums)
b = set(kaggle_ner_processor.dev_sentence_nums)
a.intersection(b)

set()

In [15]:
label_list = kaggle_ner_processor.get_labels()

In [16]:
print('Sample sentence: \n{}\n'.format(train_examples[0].text_a))
print('Sample sentence labels: \n{}\n'.format(train_examples[1].label))

Sample sentence: 
Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .

Sample sentence labels: 
['B-gpe', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'O', 'O', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O']



### Create dataloaders
These two utility functions convert training and evaluation examples to Pytorch dataloaders which can be used for model training and evaluation. The following steps are performed:
1. Tokenization
2. Convert tokens and labels to numerical values
3. Sequence padding or truncation
4. Convert numpy arrays to Pytorch tensors
5. Create dataloader for sampling and serving data in batches

In [17]:
from bert_utils import create_train_dataloader, create_eval_dataloader

In [18]:
train_dataloader = create_train_dataloader(train_examples=train_examples,
                                           model_config=model_config,
                                           train_config=train_config,
                                           label_list=label_list,
                                           device_config=device_config)

In [19]:
valid_dataloader, _ = create_eval_dataloader(eval_examples=dev_examples, 
                                             model_config=model_config, 
                                             eval_config=eval_config, 
                                             label_list=label_list)

In [20]:
it = iter(train_dataloader)
first = next(it)
print("Sample token ids:\n{}\n".format(first[0][0]))
print("Sample attention mask:\n{}\n".format(first[1][0]))
print("Sample label ids:\n{}\n".format(first[3][0]))

Sample token ids:
tensor([ 1057, 29625,  2015, 29625, 29624,  3709,  2749,  1999,  7041,  2360,
         2027,  2730,  2321, 17671,  2076,  2019, 11585,  3169,  1999,  1996,
         2264,  1010,  2096,  2334,  4584,  2758,  2216,  2730,  2020,  9272,
         1012,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])

Sample attention mask:
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

Sample label ids:
tensor([ 1, 17, 17, 17, 17, 17,  1,  1, 13,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,

## Load Model

In [21]:
from bert_utils import load_model

In [22]:
model = load_model(model_config=model_config, 
                   path_config=path_config, 
                   device_config=device_config,
                   global_config=global_config)

In [23]:
type(model)

pytorch_pretrained_bert.modeling.BertForTokenClassification

## Configure Optimizer

In [24]:
from bert_utils import configure_optimizer

In [25]:
optimizer, optimizer_config, _ = configure_optimizer(optimizer_config=optimizer_config,
                                                     global_config=global_config, 
                                                     train_config=train_config, 
                                                     device_config=device_config, 
                                                     model=model, 
                                                     num_train_examples=len(train_dataloader))

In [26]:
type(optimizer)

torch.optim.adam.Adam

In [None]:
optimizer

In [27]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 3e-05
    weight_decay: 0.01

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 3e-05
    weight_decay: 0.0
)

In [45]:
optimizer.param_groups[0]['weight_decay']

0.01

## Train Model

In [27]:
from bert_utils import train_token_model

In [28]:
model, train_loss = train_token_model(model=model, 
                                      train_dataloader=train_dataloader, 
                                      valid_dataloader=valid_dataloader,
                                      label_list=label_list,
                                      optimizer=optimizer,
                                      train_config=train_config, 
                                      model_config=model_config, 
                                      optimizer_config=optimizer_config,
                                      device_config=device_config,
                                      global_config=global_config)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 0.24524948639923594
Validation loss: 0.17619687835375467
Validation Accuracy: 0.6607947530864196


Epoch:  20%|██        | 1/5 [22:30<1:30:01, 1350.48s/it]

F1-Score: 0.5384509750068662
Train loss: 0.23885687313051557
Validation loss: 0.22382892300685248
Validation Accuracy: 0.9795953703703708


Epoch:  40%|████      | 2/5 [45:12<1:07:41, 1353.97s/it]

F1-Score: 0.773664250148345
Train loss: 0.2708684699437281
Validation loss: 0.26138194382190705
Validation Accuracy: 0.9756471193415636


Epoch:  60%|██████    | 3/5 [1:07:54<45:12, 1356.42s/it]

F1-Score: 0.7539385259446953
Train loss: 0.2847153783242202
Validation loss: 0.2587072236339251
Validation Accuracy: 0.9763917695473245


Epoch:  80%|████████  | 4/5 [1:30:36<22:38, 1358.03s/it]

F1-Score: 0.7518122751436396
Train loss: 0.2878860058476962
Validation loss: 0.28233432402213415
Validation Accuracy: 0.9749407407407401


Epoch: 100%|██████████| 5/5 [1:53:16<00:00, 1358.65s/it]

F1-Score: 0.7332092615906728





## Evaluate Model

In [63]:
from bert_utils import eval_token_model

In [64]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [82]:
preds, eval_loss, eval_accuracy = eval_token_model(model=model, 
                                                   eval_dataloader=valid_dataloader, 
                                                   model_config=model_config, 
                                                   device_config=device_config, 
                                                   label_list=label_list,
                                                   eval_func=flat_accuracy)

Validation loss: 0.2580980388323466
Validation Accuracy: 0.9766516460905346
Validation F1-Score: 0.7453125410892262
