In [1]:
# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
# !wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa

In [2]:
def parse(file):
    with open(file) as fopen:
        texts = fopen.read().split('\n')
    left, right = [], []
    for text in texts:
        if '-DOCSTART-' in text or not len(text):
            continue
        splitted = text.split()
        left.append(splitted[0])
        right.append(splitted[-1])
    return left, right

In [3]:
left_train, right_train = parse('eng.train')
left_test, right_test = parse('eng.testa')

In [4]:
import re
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [5]:
seq_len = 50
def iter_seq(x):
    return np.array([x[i: i+seq_len] for i in range(0, len(x)-seq_len, 1)])

def to_train_seq(*args):
    return [iter_seq(x) for x in args]

In [6]:
left_train, right_train = to_train_seq(left_train, right_train)
left_test, right_test = to_train_seq(left_test, right_test)

In [7]:
tag2idx = {'PAD': 0}
for no, u in enumerate(np.unique(right_train)):
    tag2idx[u] = no + 1
tag2idx

{'PAD': 0,
 'B-LOC': 1,
 'B-MISC': 2,
 'B-ORG': 3,
 'I-LOC': 4,
 'I-MISC': 5,
 'I-ORG': 6,
 'I-PER': 7,
 'O': 8}

In [8]:
left_train

array([['EU', 'rejects', 'German', ..., 'the', 'European', 'Union'],
       ['rejects', 'German', 'call', ..., 'European', 'Union', "'s"],
       ['German', 'call', 'to', ..., 'Union', "'s", 'veterinary'],
       ...,
       ['Peter', 'Hedblom', '(', ..., 'Division', 'three', 'Swansea'],
       ['Hedblom', '(', 'Sweden', ..., 'three', 'Swansea', '1'],
       ['(', 'Sweden', ')', ..., 'Swansea', '1', 'Lincoln']], dtype='<U61')

In [9]:
BERT_VOCAB = 'uncased_L-12_H-768_A-12/vocab.txt'
BERT_INIT_CHKPNT = 'uncased_L-12_H-768_A-12/bert_model.ckpt'
BERT_CONFIG = 'uncased_L-12_H-768_A-12/bert_config.json'

In [10]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from bert import modeling

tokenization.validate_case_matches_checkpoint(True,BERT_INIT_CHKPNT)
tokenizer = tokenization.FullTokenizer(
      vocab_file=BERT_VOCAB, do_lower_case=True)

In [11]:
def XY(left_train, right_train):
    X, Y = [], []
    for i in tqdm(range(len(left_train))):
        left = left_train[i]
        right = right_train[i]
        bert_tokens = ['[CLS]']
        y = ['PAD']
        for no, orig_token in enumerate(left):
            y.append(right[no])
            t = tokenizer.tokenize(orig_token)
            bert_tokens.extend(t)
            y.extend(['PAD'] * (len(t) - 1))
        bert_tokens.append("[SEP]")
        y.append('PAD')
        X.append(tokenizer.convert_tokens_to_ids(bert_tokens))
        Y.append([tag2idx[i] for i in y])
    return X, Y

In [12]:
train_X, train_Y = XY(left_train, right_train)

100%|██████████| 203571/203571 [03:32<00:00, 957.17it/s] 


In [13]:
import keras
train_X = keras.preprocessing.sequence.pad_sequences(train_X, padding='post')
train_Y = keras.preprocessing.sequence.pad_sequences(train_Y, padding='post')

Using TensorFlow backend.


In [14]:
test_X, test_Y = XY(left_test, right_test)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, padding='post')
test_Y = keras.preprocessing.sequence.pad_sequences(test_Y, padding='post')

100%|██████████| 51312/51312 [00:53<00:00, 961.84it/s] 


In [15]:
epoch = 3
batch_size = 16
warmup_proportion = 0.1
num_train_steps = int(len(train_X) / batch_size * epoch)
num_warmup_steps = int(num_train_steps * warmup_proportion)
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

In [16]:
class Model:
    def __init__(
        self,
        dimension_output,
        learning_rate = 2e-5,
    ):
        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None, None])
        self.maxlen = tf.shape(self.X)[1]
        self.lengths = tf.count_nonzero(self.X, 1)
        
        model = modeling.BertModel(
            config=bert_config,
            is_training=True,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        output_layer = model.get_sequence_output()
        logits = tf.layers.dense(output_layer, dimension_output)
        y_t = self.Y
        log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_t, self.lengths
        )
        self.cost = tf.reduce_mean(-log_likelihood)
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate = learning_rate
        ).minimize(self.cost)
        mask = tf.sequence_mask(self.lengths, maxlen = self.maxlen)
        self.tags_seq, tags_score = tf.contrib.crf.crf_decode(
            logits, transition_params, self.lengths
        )
        self.tags_seq = tf.identity(self.tags_seq, name = 'logits')

        y_t = tf.cast(y_t, tf.int32)
        self.prediction = tf.boolean_mask(self.tags_seq, mask)
        mask_label = tf.boolean_mask(y_t, mask)
        correct_pred = tf.equal(self.prediction, mask_label)
        correct_index = tf.cast(correct_pred, tf.float32)
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [17]:
dimension_output = len(tag2idx)
learning_rate = 2e-5

tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(
    dimension_output,
    learning_rate
)

sess.run(tf.global_variables_initializer())
var_lists = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = 'bert')
saver = tf.train.Saver(var_list = var_lists)
saver.restore(sess, BERT_INIT_CHKPNT)

Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from uncased_L-12_H-768_A-12/bert_model.ckpt


In [19]:
import time

for e in range(3):
    lasttime = time.time()
    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    pbar = tqdm(
        range(0, len(test_X), batch_size), desc = 'test minibatch loop'
    )
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
    
    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (e, train_loss, train_acc, test_loss, test_acc)
    )

train minibatch loop: 100%|██████████| 12724/12724 [1:09:16<00:00,  3.74it/s, accuracy=1, cost=0.657]     
test minibatch loop: 100%|██████████| 3207/3207 [06:37<00:00,  8.22it/s, accuracy=0.971, cost=5.79] 
train minibatch loop:   0%|          | 0/12724 [00:00<?, ?it/s]

time taken: 4553.980525493622
epoch: 0, training loss: 3.842892, training acc: 0.983601, valid loss: 4.545971, valid acc: 0.981644



train minibatch loop: 100%|██████████| 12724/12724 [1:09:12<00:00,  3.74it/s, accuracy=1, cost=0.231]     
test minibatch loop: 100%|██████████| 3207/3207 [06:36<00:00,  8.28it/s, accuracy=0.943, cost=10.6]  
train minibatch loop:   0%|          | 0/12724 [00:00<?, ?it/s]

time taken: 4548.632176876068
epoch: 1, training loss: 1.317753, training acc: 0.994542, valid loss: 4.296402, valid acc: 0.986370



train minibatch loop: 100%|██████████| 12724/12724 [1:09:07<00:00,  3.72it/s, accuracy=1, cost=0.00496]   
test minibatch loop: 100%|██████████| 3207/3207 [06:35<00:00,  8.28it/s, accuracy=0.991, cost=2.43]  

time taken: 4543.170719861984
epoch: 2, training loss: 0.576555, training acc: 0.997577, valid loss: 5.002001, valid acc: 0.986560






In [20]:
idx2tag = {i: w for w, i in tag2idx.items()}

In [21]:
def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p])
        out.append(out_i)
    return out

In [22]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
    batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
    predicted = pred2label(sess.run(model.tags_seq,
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y,
            },
    ))
    real = pred2label(batch_y)
    predict_Y.extend(predicted)
    real_Y.extend(real)

validation minibatch loop: 100%|██████████| 3207/3207 [06:12<00:00,  8.75it/s]


In [23]:
from sklearn.metrics import classification_report
print(classification_report(np.array(real_Y).ravel(), np.array(predict_Y).ravel()))

             precision    recall  f1-score   support

     B-MISC       0.00      0.00      0.00       200
      I-LOC       0.96      0.96      0.96    104662
     I-MISC       0.89      0.88      0.88     63129
      I-ORG       0.89      0.92      0.90    104387
      I-PER       0.98      0.98      0.98    157385
          O       0.99      0.99      0.99   2135837
        PAD       1.00      1.00      1.00   4104960

avg / total       0.99      0.99      0.99   6670560



  'precision', 'predicted', average, warn_for)
