### Resources

https://blog.codecentric.de/en/2020/11/take-control-of-named-entity-recognition-with-you-own-keras-model/

In [1]:
def split_text_label(filename):
    """Read a CoNLL-style file and group its tokens into sentences.

    Every non-blank line is split on single spaces; the first field is taken
    as the token and the last field as its label. A blank line, or a line that
    starts with ``-DOCSTART-``, closes the sentence currently being collected.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(out, unique)``. ``out`` is a list of ``[sentence, labels]``
        pairs, both space-joined strings with one label per token. ``unique``
        is the set of every label encountered in the file.
    """
    out = []
    labels = []
    words = []
    unique = set()
    with open(filename) as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("-DOCSTART-") or not line:
                # Sentence boundary: store what was collected so far (if any).
                if labels and words:
                    out.append([" ".join(words), " ".join(labels)])
                labels = []
                words = []
            else:
                items = line.split(" ")
                words.append(items[0])
                labels.append(items[-1])
                unique.add(items[-1])
    # A file that does not end with a blank line still has a pending sentence;
    # without this flush the final sentence would be silently dropped.
    if labels and words:
        out.append([" ".join(words), " ".join(labels)])
    return out, unique

def preprocess(samples, labels_to_index, embeddings_index):
    """Turn labelled sentences into fixed-size embedding and label arrays.

    Args:
        samples: Iterable of ``[sentence, labels]`` pairs, both whitespace
            separated strings with one label per token.
        labels_to_index: Mapping from label string to integer class id.
        embeddings_index: Mapping from token to its embedding vector.

    Returns:
        A tuple ``(X, y, labels_out)``:
        ``X`` is a float32 array of shape ``(len(samples), MAX_LEN, EMB_DIM)``,
        ``y`` is a uint8 array of shape ``(len(samples), MAX_LEN)``, and
        ``labels_out`` holds the full (untruncated) label list per sentence.

    Raises:
        KeyError: If a label is missing from ``labels_to_index``.
        IndexError: If a sentence has fewer labels than tokens.

    Note:
        Positions past the end of a sentence keep the zero initialisation, so
        in ``y`` they carry class id 0, which is also a real label id.
    """
    X = np.zeros((len(samples), MAX_LEN, EMB_DIM), dtype=np.float32)
    y = np.zeros((len(samples), MAX_LEN), dtype=np.uint8)
    # Out-of-vocabulary fallback. A fixed seed makes this vector identical on
    # every call; drawing it from the global RNG gave unknown tokens a
    # different embedding at training, evaluation and inference time.
    default = np.random.default_rng(0).random(EMB_DIM).astype('float32')
    labels_out = []
    for i, sample in enumerate(samples):
        sentence = sample[0].split()
        labels = sample[1].split()
        labels_out.append(labels)
        # Sentences longer than MAX_LEN are truncated.
        for j, token in enumerate(sentence[:MAX_LEN]):
            X[i, j] = embeddings_index.get(token, default)
            y[i, j] = labels_to_index[labels[j]]
    return X, y, labels_out

def preprocessInference(samples, labels_to_index, embeddings_index):
    """Embed raw sentences for prediction.

    Args:
        samples: Iterable of whitespace-separated sentence strings.
        labels_to_index: Unused; kept so the signature mirrors ``preprocess``.
        embeddings_index: Mapping from token to its embedding vector.

    Returns:
        A tuple ``(X, num_tokens)``. ``X`` is a float32 array of shape
        ``(len(samples), MAX_LEN, EMB_DIM)``; ``num_tokens`` lists the token
        count of every sentence before truncation to ``MAX_LEN``.
    """
    X = np.zeros((len(samples), MAX_LEN, EMB_DIM), dtype=np.float32)
    # Out-of-vocabulary fallback. A fixed seed keeps the vector identical on
    # every call instead of redrawing it from the global RNG each time.
    default = np.random.default_rng(0).random(EMB_DIM).astype('float32')
    num_tokens = []
    for i, sample in enumerate(samples):
        sentence = sample.split()
        num_tokens.append(len(sentence))
        # Tokens beyond MAX_LEN are dropped; remaining rows stay zero-padded.
        for j, token in enumerate(sentence[:MAX_LEN]):
            X[i, j] = embeddings_index.get(token, default)
    return X, num_tokens

In [2]:
# Parse the three data files into lists of [sentence, labels] pairs. Only the
# label set of the training file is kept (`unique`).
# NOTE: `validation` is not referenced again in the cells visible here.
# NOTE: the name `train` is rebound to a function by the later
# `def train(model, ...)` cell, so cells that need this list must run before it.
test, _= split_text_label("./eng.testa")
validation, _ = split_text_label("./eng.testb")
train, unique = split_text_label("./eng.train")

In [3]:
# Give every label a stable integer id by numbering the labels in sorted order.
labels_to_index = {label: idx for idx, label in enumerate(sorted(unique))}

In [27]:
labels_to_index

{'B-LOC': 0,
 'B-MISC': 1,
 'B-ORG': 2,
 'I-LOC': 3,
 'I-MISC': 4,
 'I-ORG': 5,
 'I-PER': 6,
 'O': 7}

In [4]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [5]:
EMBEDDING_FILE = "wiki-news-300d-1M.vec"  # path opened by getEmbeddingsIndex()
MAX_LEN = 124  # sentences are zero-padded / truncated to this many tokens
EMB_DIM = 300  # width of one token vector; presumably matches the embedding file -- confirm
NUM_LABELS = len(unique)  # number of distinct labels found in the training file

In [6]:
def getEmbeddingsIndex():
    """Load the word vectors stored in ``EMBEDDING_FILE``.

    The file is read as space-separated text, one entry per line: the first
    field is the token and the remaining fields are its vector components.

    Returns:
        A dict mapping each token to a float32 NumPy vector.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    out = {}
    # Stream the file line by line instead of materialising it with
    # readlines(); decode explicitly so the result is platform independent.
    with open(EMBEDDING_FILE, encoding="utf-8") as f:
        for line_no, line in enumerate(f):
            # rstrip drops the newline and any trailing separator.
            data = line.rstrip().split(" ")
            if (line_no == 0 and len(data) == 2
                    and data[0].isdigit() and data[1].isdigit()):
                # "<vocab size> <dimension>" header line, not a word vector.
                continue
            if len(data) < 2:
                # Blank or malformed line without any vector components.
                continue
            out[data[0]] = np.array(data[1:], dtype='float32')
    return out

embeddings_index = getEmbeddingsIndex()

In [7]:
# Build model inputs/targets for the training and evaluation sets.
# NOTE: `train` must still be the list returned by split_text_label here; the
# later `def train(...)` cell rebinds that name, so re-running this cell after
# it would pass a function instead of the data.
X_train, y_train, _ = preprocess(train, labels_to_index, embeddings_index)
X_test, y_test, test_labels = preprocess(test, labels_to_index, embeddings_index)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, TimeDistributed, Dense
from sklearn.metrics import classification_report
 
def build_model(nr_filters=128):
    """Assemble the sequence tagger: a bidirectional LSTM followed by a
    per-timestep softmax over ``NUM_LABELS`` classes.

    Args:
        nr_filters: Number of units of the LSTM that gets wrapped in
            ``Bidirectional``.

    Returns:
        An uncompiled ``Sequential`` model taking inputs of shape
        ``(MAX_LEN, EMB_DIM)`` per sample.
    """
    recurrent = Bidirectional(
        LSTM(nr_filters, return_sequences=True),
        input_shape=(MAX_LEN, EMB_DIM),
    )
    # The same dense softmax is applied independently at every timestep.
    per_token_softmax = TimeDistributed(Dense(NUM_LABELS, activation='softmax'))
    return Sequential([recurrent, per_token_softmax])

model = build_model()

In [9]:
def train(model, epochs=10, batch_size=32, validation_split=0.4):
    """Compile ``model`` and fit it on the notebook-level ``X_train``/``y_train``.

    Args:
        model: Object exposing Keras-style ``compile`` and ``fit`` methods.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update.
        validation_split: Fraction of the training data handed to ``fit`` as
            its ``validation_split`` argument.

    Returns:
        The ``history`` attribute of the object returned by ``model.fit``.
    """
    model.compile(optimizer='Adam',
                  loss='sparse_categorical_crossentropy',
                  # Keras documents `metrics` as a list; newer releases reject
                  # a bare string.
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train,
                        validation_split=validation_split,
                        epochs=epochs,
                        batch_size=batch_size)
    return history.history

history = train(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
def predict(model, X_test):
    """Return the highest-scoring class index along the last axis of
    ``model.predict(X_test)``.
    """
    return np.argmax(model.predict(X_test), axis=-1)

predictions = predict(model, X_test)

In [18]:
def evaluate(predictions, y_test, test_labels, labels_to_index):
    """Build a per-label classification report over the real (unpadded) tokens.

    Args:
        predictions: Per-sentence arrays of predicted class ids.
        y_test: Per-sentence arrays of gold class ids, aligned with
            ``predictions``.
        test_labels: Per-sentence label lists; only their lengths are used, to
            cut every row down to the sentence's own tokens.
        labels_to_index: Mapping from label string to class id.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``.
    """
    assert len(predictions) == len(y_test) == len(test_labels)
    index_to_label = dict(zip(labels_to_index.values(), labels_to_index.keys()))
    predicted_ids = []
    gold_ids = []
    for pred_row, gold_row, sentence_labels in zip(predictions, y_test, test_labels):
        length = len(sentence_labels)
        predicted_ids.extend(pred_row[:length].tolist())
        gold_ids.extend(gold_row[:length].tolist())

    # Translate class ids back to label strings for a readable report.
    predicted_names = [index_to_label[k] for k in predicted_ids]
    gold_names = [index_to_label[k] for k in gold_ids]
    return classification_report(gold_names, predicted_names, output_dict=True)

In [19]:
report = evaluate(predictions, y_test, test_labels, labels_to_index)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
pd.DataFrame.from_dict(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
B-LOC,0.0,0.0,0.0,0.0
B-MISC,0.0,0.0,0.0,4.0
I-LOC,0.941929,0.91404,0.927775,2094.0
I-MISC,0.798658,0.84731,0.822265,1264.0
I-ORG,0.88446,0.837954,0.860579,2092.0
I-PER,0.931936,0.97396,0.952484,3149.0
O,0.993515,0.992446,0.99298,42759.0
accuracy,0.978175,0.978175,0.978175,0.978175
macro avg,0.650071,0.652244,0.650869,51362.0
weighted avg,0.978322,0.978175,0.978168,51362.0


In [24]:
# Run the model on two hand-written sentences.
# NOTE: this rebinds `predictions` from the class-id array produced by
# predict() earlier to the raw output of model.predict(); re-running the
# evaluate() cell after this one would therefore use the wrong array.
ex = ["Texas is hot", "Luis lives in Lalaland"]
X, num_tokens = preprocessInference(ex, labels_to_index, embeddings_index)
predictions = model.predict(X)

In [25]:
def postProcess(predictions, num_tokens, labels_to_index):
    """Convert per-class scores into label strings, one list per sentence.

    Args:
        predictions: Array of class scores whose last axis runs over classes.
        num_tokens: Token count of every sentence; each row is cut to at most
            this many positions.
        labels_to_index: Mapping from label string to class id.

    Returns:
        A list with one list of label strings per sentence.
    """
    index_to_label = {idx: label for label, idx in labels_to_index.items()}
    assert len(predictions) == len(num_tokens)
    class_ids = np.argmax(predictions, axis=-1)
    return [
        [index_to_label[k] for k in class_ids[row][:count]]
        for row, count in enumerate(num_tokens)
    ]
postProcess(predictions, num_tokens, labels_to_index)

[['I-LOC', 'O', 'O'], ['I-PER', 'O', 'O', 'B-LOC']]

In [26]:
tf.saved_model.save(model, 'ner_model/1/')



INFO:tensorflow:Assets written to: ner_model/1/assets


INFO:tensorflow:Assets written to: ner_model/1/assets


In [2]:
import keras
model = keras.models.load_model('ner_model/1/')