# A NER approach using a LSTM/CRF neural network approach


## Reading dataset

In [1]:
import os
import string
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
def load_wikiner(path, token_only=False):
    """Load WikiNER dataset.
    
    Params:
        path: path to txt file if WikiNER dataset;
        token_only: if True return only the list of token, if false return
                    also pos tag for each token.
    Return:
        sentences: list of sentences, each sentences is a list of token
        tags: list of list of token tags
        output_labels: set of all the labels in the dataset
    """
    raw_sents = []
    with open(path, 'r', encoding='utf-8') as f1:
        for line in f1.readlines():
            if line != '\n':
                raw_sents.append(line)
    
    # Split tokens
    for sent_idx in range(len(raw_sents)):
        raw_sents[sent_idx] = raw_sents[sent_idx].split()
    
    # Extract features and separate them from tags
    sentences = []
    tags = []
    output_labels = set()
    for raw_sent in raw_sents:
        sent = []
        tag = []
        for word in raw_sent:
            features = word.split('|')
            ent = features.pop()
            tag.append(ent)
            output_labels.add(ent)
            if token_only:
                sent.append(features.pop(0))
            else:
                sent.append(tuple(features))
        sentences.append(sent)
        tags.append(tag)
    print(f'Read {len(sentences)} sentences.')
    return sentences, tags, output_labels

In [3]:
file_path = os.path.join('../data', 'wikiner-en-wp3-raw.txt')
raw, ner, output_labels = load_wikiner(file_path, token_only=True)

Read 142153 sentences.


In [4]:
print(raw[0])
print(ner[0])

['The', 'Oxford', 'Companion', 'to', 'Philosophy', 'says', ',', '"', 'there', 'is', 'no', 'single', 'defining', 'position', 'that', 'all', 'anarchists', 'hold', ',', 'and', 'those', 'considered', 'anarchists', 'at', 'best', 'share', 'a', 'certain', 'family', 'resemblance', '.', '"']
['I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


---

# Data Preparation
Prepare character- and word-level input for the model.

## Sentence encoding and padding
We use a Keras `Tokenizer` to extract the vocabulary and encode words. We pad sentences to a fixed length because it is required from LSTM.

In [5]:
# integer encode sequences of words
token_tokenizer = Tokenizer()    # Automatically lowers tokens
token_tokenizer.fit_on_texts(raw)
sequences = token_tokenizer.texts_to_sequences(raw)

# Label encoding
tag2idx = { tag: idx for idx, tag in enumerate(output_labels) }
idx2tag = { idx: tag for tag, idx in tag2idx.items() }
ner_sequences = [[tag2idx[tag] for tag in sentence] for sentence in ner]

In [6]:
vocabulary_size = len(token_tokenizer.word_counts)
print(vocabulary_size)

108276


In [7]:
max_sentence_len = 50
X_sent = pad_sequences(sequences, maxlen=max_sentence_len, padding='post', truncating='post')
Y = pad_sequences(ner_sequences, maxlen=max_sentence_len, value=tag2idx['O'], padding='post', truncating='post')

X_sent = np.array(X_sent)
Y = np.array(Y)

In [8]:
token_tokenizer.index_word[0] = '_PAD_'
token_tokenizer.word_index['_PAD_'] = 0
print(X_sent[0])
print([token_tokenizer.index_word[word] for word in X_sent[0]])

[   1 2653 4672    7  934 1437    2   10   68   12   92  369 6229  456
   16   62 7102 1284    2    6  171  229 7102   25  305 1332    8  688
  271 9659    3   10    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
['the', 'oxford', 'companion', 'to', 'philosophy', 'says', ',', '"', 'there', 'is', 'no', 'single', 'defining', 'position', 'that', 'all', 'anarchists', 'hold', ',', 'and', 'those', 'considered', 'anarchists', 'at', 'best', 'share', 'a', 'certain', 'family', 'resemblance', '.', '"', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_', '_PAD_']


---

## Character encoding and padding
In order to extract character-level informations, we have to:
* Encode characters with integers;
* Pad words to a fixed lengths;
* Use the 0 as padding integer both for sentence padding and for word padding.

We don't want to truncate words because prefix and suffix contains precious informations, so we take the longest words and we pad words to its length.

In [9]:
def to_char_list(data):
    '''Transform all the words of a dataset into lists of characters'''
    char_data = []
    for sentence in data:
        char_sent = []
        for word in sentence:
            char_sent.append(list(word))
        char_data.append(char_sent)
    return char_data

In [10]:
raw_char = to_char_list(raw)
print(raw_char[0])
print(len(raw_char))

[['T', 'h', 'e'], ['O', 'x', 'f', 'o', 'r', 'd'], ['C', 'o', 'm', 'p', 'a', 'n', 'i', 'o', 'n'], ['t', 'o'], ['P', 'h', 'i', 'l', 'o', 's', 'o', 'p', 'h', 'y'], ['s', 'a', 'y', 's'], [','], ['"'], ['t', 'h', 'e', 'r', 'e'], ['i', 's'], ['n', 'o'], ['s', 'i', 'n', 'g', 'l', 'e'], ['d', 'e', 'f', 'i', 'n', 'i', 'n', 'g'], ['p', 'o', 's', 'i', 't', 'i', 'o', 'n'], ['t', 'h', 'a', 't'], ['a', 'l', 'l'], ['a', 'n', 'a', 'r', 'c', 'h', 'i', 's', 't', 's'], ['h', 'o', 'l', 'd'], [','], ['a', 'n', 'd'], ['t', 'h', 'o', 's', 'e'], ['c', 'o', 'n', 's', 'i', 'd', 'e', 'r', 'e', 'd'], ['a', 'n', 'a', 'r', 'c', 'h', 'i', 's', 't', 's'], ['a', 't'], ['b', 'e', 's', 't'], ['s', 'h', 'a', 'r', 'e'], ['a'], ['c', 'e', 'r', 't', 'a', 'i', 'n'], ['f', 'a', 'm', 'i', 'l', 'y'], ['r', 'e', 's', 'e', 'm', 'b', 'l', 'a', 'n', 'c', 'e'], ['.'], ['"']]
142153


In [11]:
for sent_idx in range(len(raw)):
    if len(raw_char[sent_idx]) != len(sequences[sent_idx]):
        print('sequence len error')
        print(raw_char[sent_idx])
        print(sequences[sent_idx])
    for word_idx in range(len(raw[sent_idx])):
        if len(raw_char[sent_idx][word_idx]) != len(raw[sent_idx][word_idx]):
            print('word len error')

In [12]:
# NOTE: Tokenizer may take an argument char_level=True. We should try it in 
# order to get a cleaner code, but in this way we do not have a fixed length
# for words.
char_tokenizer = Tokenizer(lower=False, filters='')
# Build a list with all the characters
charset = string.ascii_letters + string.digits + string.punctuation
print(f'Charset dimension: {len(charset)}')
print(f'Charset: {charset}')
char_tokenizer.fit_on_texts(list(charset))

Charset dimension: 94
Charset: abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
# Add padding to the tokenizer with the 0 integer encoding
char_tokenizer.index_word[0] = '_PAD_'
char_tokenizer.word_index['_PAD_'] = 0

#### Pad sentences
Set the lengths to `max_sentence_len` (50) with padding and truncate.

In [14]:
for sent_idx in range(len(raw_char)):
    if len(raw_char[sent_idx]) > max_sentence_len:
        # Truncate long sentences
        raw_char[sent_idx] = raw_char[sent_idx][:max_sentence_len]
    while len(raw_char[sent_idx]) < max_sentence_len:
        # Pad sentences with '_PAD_' characters
        pad_word = []
        pad_word.append(char_tokenizer.index_word[0])
        raw_char[sent_idx].append(pad_word)

print(raw_char[10])

[['S', 'u', 'b', 's', 'e', 'q', 'u', 'e', 'n', 't', 'l', 'y'], [','], ['t', 'h', 'e'], ['I', 'n', 't', 'e', 'r', 'n', 'a', 't', 'i', 'o', 'n', 'a', 'l'], ['b', 'e', 'c', 'a', 'm', 'e'], ['p', 'o', 'l', 'a', 'r', 'i', 's', 'e', 'd'], ['i', 'n', 't', 'o'], ['t', 'w', 'o'], ['c', 'a', 'm', 'p', 's'], [','], ['w', 'i', 't', 'h'], ['M', 'a', 'r', 'x'], ['a', 'n', 'd'], ['B', 'a', 'k', 'u', 'n', 'i', 'n'], ['a', 's'], ['t', 'h', 'e', 'i', 'r'], ['r', 'e', 's', 'p', 'e', 'c', 't', 'i', 'v', 'e'], ['f', 'i', 'g', 'u', 'r', 'e', 'h', 'e', 'a', 'd', 's'], ['.'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_'], ['_PAD_']]


In [15]:
len(char_tokenizer.word_index)

95

Encode characters with integers:

In [16]:
char_seq = []
for sentence in raw_char:
    char_seq.append(char_tokenizer.texts_to_sequences(sentence))

In [22]:
print(char_seq[0])
for word in char_seq[0]:
    w = [char_tokenizer.index_word[letter] for letter in word]
    print(w)

[[46, 8, 5], [41, 24, 6, 15, 18, 4], [29, 15, 13, 16, 1, 14, 9, 15, 14], [20, 15], [42, 8, 9, 12, 15, 19, 15, 16, 8, 25], [19, 1, 25, 19], [74], [64], [20, 8, 5, 18, 5], [9, 19], [14, 15], [19, 9, 14, 7, 12, 5], [4, 5, 6, 9, 14, 9, 14, 7], [16, 15, 19, 9, 20, 9, 15, 14], [20, 8, 1, 20], [1, 12, 12], [1, 14, 1, 18, 3, 8, 9, 19, 20, 19], [8, 15, 12, 4], [74], [1, 14, 4], [20, 8, 15, 19, 5], [3, 15, 14, 19, 9, 4, 5, 18, 5, 4], [1, 14, 1, 18, 3, 8, 9, 19, 20, 19], [1, 20], [2, 5, 19, 20], [19, 8, 1, 18, 5], [1], [3, 5, 18, 20, 1, 9, 14], [6, 1, 13, 9, 12, 25], [18, 5, 19, 5, 13, 2, 12, 1, 14, 3, 5], [76], [64], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0]]
['T', 'h', 'e']
['O', 'x', 'f', 'o', 'r', 'd']
['C', 'o', 'm', 'p', 'a', 'n', 'i', 'o', 'n']
['t', 'o']
['P', 'h', 'i', 'l', 'o', 's', 'o', 'p', 'h', 'y']
['s', 'a', 'y', 's']
[',']
['"']
['t', 'h', 'e', 'r', 'e']
['i', 's']
['n', 'o']
['s', 'i', 'n', 'g', 'l', 'e']
['d', 'e', 'f', 'i', 'n', 'i

#### Pad words 
Set all the words to max_word_len with padding and (possibly without) truncate:

In [17]:
def pad_words(sentence, maxlen, pad=0):
    padded_sentence = []
    for word in sentence:
        new_word = word.copy()
        if len(word) > maxlen:
            new_word = word[:maxlen]
        else:
            while maxlen - len(new_word) > 1:
                new_word.append(pad)
                new_word.insert(0, pad)
            if maxlen - len(new_word) == 1:
                new_word.insert(0, pad)
        padded_sentence.append(new_word)
    
    return padded_sentence

In [18]:
max_word_len = max([len(word) for word in token_tokenizer.word_index.keys()])
max_word_len

93

What? A word of 93 characters? Let's get deeper:

In [19]:
words = list(token_tokenizer.word_index.keys())
sorted(words, key=lambda w:len(w))[-5]

'news://alt.games.video.tiger.game-com'

Now it makes more sense: the datatset contains URLs and an URL is a single token!

In [20]:
X_char = np.array([pad_words(sentence, maxlen=max_word_len) for sentence in char_seq])

In [21]:
for sentence in X_char:
    if len(sentence) != max_sentence_len:
        print('sentence error')
    for word in sentence:
        if len(word) != max_word_len:
            print(f'word error: {len(word)}')

---

# Model implementation

In [26]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Conv1D, TimeDistributed, Dropout, Input, \
    MaxPooling1D, Flatten, concatenate, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#### Hyperparameters of the model

In [27]:
USE_CHIU_CONFIG = False

In [28]:
if USE_CHIU_CONFIG:
    char_embedding_dim = 25
    cnn_window_size = 3
    cnn_filters_number = 53

    word_embedding_dim = 100
    hidden_cells = 275
    drop=0.68

    batch_size = 9
    epochs = 80
else:
    char_embedding_dim = 30
    cnn_window_size = 3
    cnn_filters_number = 30

    word_embedding_dim = 100
    hidden_cells = 200
    drop=0.5

    batch_size = 10
    epochs = 20

In [29]:
print(max_sentence_len)
print(max_word_len)

50
93


## CNN
We use a Convolutive Neural Network in order to extract pattern informations from the letters of the word. The CNN embedding is formed by:
* A `keras.layers.Embedding` layer, which is a lookup table that associate a vector to each character;
* A 1-dimensional convolution on the embedding vectors in order to capture patterns in letters;
* A MaxPool1d that transforms a series of vectors in a unique vectors which contains informations from the characters of the word. 

Thanks to the author of [this repo](https://github.com/kamalkraj/Named-Entity-Recognition-with-Bidirectional-LSTM-CNNs/blob/master/nn.py) that saved my work!

TODO: study where dropout is required, I missed some pieces.

In [30]:
cnn_input = Input(shape=(max_sentence_len, max_word_len,), name='char_encoding')
# We use TimeDistributed layer because we have two level of sequences:
# * The sentence is a sequence of words;
# * The word is a sequence of characters;
# We want to work on the lowest sequence. the sequence of characters, so the
# TimeDistributed layer allow us to apply this model to each word. 
cnn = TimeDistributed(Embedding(len(char_tokenizer.word_index), char_embedding_dim), name='cnn_Embedding')(cnn_input)
cnn = Dropout(drop)(cnn)
cnn = TimeDistributed(Conv1D(filters=cnn_filters_number, kernel_size=cnn_window_size, padding='same'), name='cnn_Convolution1d')(cnn)
cnn = TimeDistributed(MaxPooling1D(max_word_len), name='cnn_MaxPooling1d')(cnn)
# We finally obtain a 30-dimensional vector for each word which contains 
# char-level informations!
cnn_out = TimeDistributed(Flatten(), name='cnn_Flatten')(cnn)

## Glove
We load Glove embedding in order to embed tokens and capture word-level informations:

In [33]:
def load_glove_embedding_matrix(path, word_index, embed_dim):
    """Load Glove embeddings.
    
    More info here: 
    https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
    """
    embeddings_index = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    return embedding_matrix


In [34]:
glove_embedding_path = os.path.join('../embeddings', 'glove.6B.100d.txt')
embedding_dim = 100
embedding_matrix = load_glove_embedding_matrix(glove_embedding_path, token_tokenizer.word_index, embedding_dim)

Found 400001 word vectors.


In [None]:
word_input = Input(shape=(max_sentence_len,), name='word_encoding')
word_embed = Embedding(len(token_tokenizer.word_index)+1, word_embedding_dim, 
                       weights=[embedding_matrix], input_length=max_sentence_len,
                       trainable=True, mask_zero=True, 
                       name='Glove_Embedding')(word_input)

# BiLSTM + CRF
We concatenate character- and word-level informations and pass it to a bidirectional LSTM:

In [None]:
x = concatenate([word_embed, cnn_out], axis=-1)
x = Dropout(drop)(x)
x = Bidirectional(LSTM(hidden_cells, return_sequences=True, dropout=drop))(x)
x = Dense(len(output_labels), activation='relu', name='Dense_Layer')(x)
crf = CRF(len(output_labels), dtype='float32', name='CRF_Layer')
out = crf(x)

In [None]:
model = Model(
    inputs=[cnn_input, word_input],
    outputs=out
)

In [None]:
model.compile(
    loss=crf.loss, 
    optimizer='adam',
    metrics=[crf.accuracy]
)

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
char_encoding (InputLayer)      [(None, 50, 93)]     0                                            
__________________________________________________________________________________________________
cnn_Embedding (TimeDistributed) (None, 50, 93, 30)   2850        char_encoding[0][0]              
__________________________________________________________________________________________________
dropout (Dropout)               (None, 50, 93, 30)   0           cnn_Embedding[0][0]              
__________________________________________________________________________________________________
cnn_Convolution1d (TimeDistribu (None, 50, 93, 30)   2730        dropout[0][0]                    
_______________________________________________________________________________________

In [None]:
best_model_file = os.path.join('models','cnn-blstm-winer-best-model.h5')
checkpoint = ModelCheckpoint(
    best_model_file,
    save_weights_only=True,
    save_best_only=True
)
early_stopping_callback = EarlyStopping(monitor="val_loss",
                                        patience=3, min_delta=0.001, verbose=1, 
                                        restore_best_weights=True)
# early_stopping_callback = EarlyStopping(monitor="val_accuracy",
#                                         patience=3, min_delta=0.005, verbose=1, 
#                                         restore_best_weights=True)

# Training

In [None]:
from sklearn.model_selection import train_test_split


X_sent_train, X_sent_test, Y_train, Y_test = train_test_split(X_sent, Y, test_size=0.2, random_state=3791)
X_char_train, X_char_test, _, _ = train_test_split(X_char, Y, test_size=0.2, random_state=3791)

In [None]:
history = model.fit([X_char_train, X_sent_train],
    Y_train, 
    batch_size=batch_size, 
    epochs=epochs,
    verbose=1,
    callbacks=[checkpoint, early_stopping_callback],
    validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 00005: early stopping


---

# Evaluation
We evaluate three aspects of the model:
* Memory consumption;
* Latency in predictions;
* F1 score on test set.

In [None]:
kerasutils.print_model_memory_usage(batch_size, model)

Model size: 61.087 MB


In [None]:
print(f'Model latency in predictions: {modelutils.compute_prediction_latency([X_char_test, X_sent_test], model, n_instances=len(X_sent_test)):.3} s')

Model latency in predictions: 0.00729 s


In [None]:
from seqeval.metrics import classification_report


datasets = [('Training Set', X_char_train, X_sent_train, Y_train), 
            ('Test Set', X_char_test, X_sent_test, Y_test)]

for title, X_char, X_sent, Y in datasets:
    Y_pred = model.predict({'char_encoding': X_char, 'word_encoding': X_sent}, batch_size=batch_size)
    Y, Y_pred = kerasutils.remove_seq_padding(X_sent, Y, Y_pred)
    Y, Y_pred = modelutils.from_encode_to_literal_labels(Y, Y_pred, idx2tag)
    print(title)
    print(classification_report(Y, Y_pred, digits=3))
    print('\n')

Training Set
           precision    recall  f1-score   support

      LOC      0.830     0.901     0.864     68020
     MISC      0.770     0.757     0.763     58442
      ORG      0.869     0.703     0.777     39297
      PER      0.923     0.950     0.936     76219

micro avg      0.851     0.850     0.850    241978
macro avg      0.851     0.850     0.848    241978



Test Set
           precision    recall  f1-score   support

     MISC      0.757     0.743     0.750     14427
      ORG      0.855     0.686     0.761      9760
      PER      0.913     0.945     0.929     19192
      LOC      0.815     0.889     0.850     17119

micro avg      0.839     0.839     0.839     60498
macro avg      0.839     0.839     0.837     60498





In [None]:
model.save(filepath=os.path.join('models', 'trained_end2end_crf_model_winer.h5'))