# Named Entity Recognition using LSTMs

### 1. Preparing the dataset

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./data/ner_dataset.csv", encoding="latin1")
data = data.fillna(method="ffill")

In [2]:
print(data.head())

      Sentence           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1  Sentence: 1             of   IN   O
2  Sentence: 1  demonstrators  NNS   O
3  Sentence: 1           have  VBP   O
4  Sentence: 1        marched  VBN   O


In [3]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
n_words = len(words)
print(n_words)

35166


In [4]:
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags
print(n_tags)

17


(This is a helper class I found online)

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [6]:
getter = SentenceGetter(data)

In [7]:
sent = getter.get_next()

In [8]:
sentences = getter.sentences

In [9]:
max_len = 35
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [10]:
word2idx["sup"]

24054

In [11]:
from keras.preprocessing.sequence import pad_sequences
x_data = [[word2idx[w[0]] for w in s] for s in sentences]
x_data = pad_sequences(maxlen=max_len, sequences=x_data, padding="post", value=n_words - 1)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
y_data = [[tag2idx[w[2]] for w in s] for s in sentences]
y_data = pad_sequences(maxlen=max_len, sequences=y_data, padding="post", value=tag2idx["O"])

In [14]:
print(x_data[1])
print(y_data[1])

[27936 12264 33153 26607 28090  7846 13728 30280  7846 10164 32537 23022
 17320 18092  2445 12888  4403  7785 22440 20765   668 21076 29922  1035
  8475 35165 35165 35165 35165 35165 35165 35165 35165 35165 35165]
[ 2  8  8  8  8  8  8  8  8  8  8  8  8  8  8 14  8  8  8 16  8  8  8  8
  8  8  8  8  8  8  8  8  8  8  8]


In [18]:
from keras.utils import to_categorical
y_data = [to_categorical(i, num_classes=n_tags) for i in y_data]

In [19]:
print(x_data[1])
print(y_data[1])

[27936 12264 33153 26607 28090  7846 13728 30280  7846 10164 32537 23022
 17320 18092  2445 12888  4403  7785 22440 20765   668 21076 29922  1035
  8475 35165 35165 35165 35165 35165 35165 35165 35165 35165 35165]
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0

So, for every sentence, we have an integer-indexed matrix (which will be the input for the embedding layer). For each word, we have a corresponding one-hot-encoded class.

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2)

### 2. Building the model

In [25]:
from keras.models import Model, Sequential
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

model = Sequential()
model.add(Embedding(input_dim=n_words, output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [27]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 35, 100)           3516600   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 35, 200)           160800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 35, 17)            3417      
Total params: 3,680,817
Trainable params: 3,680,817
Non-trainable params: 0
_________________________________________________________________
None


### 3. Training 

In [29]:
model.fit(x_train, np.array(y_train), batch_size=30, epochs=5, validation_split=0.1, verbose=2)

Train on 34530 samples, validate on 3837 samples
Epoch 1/5
 - 194s - loss: 0.0689 - acc: 0.9799 - val_loss: 0.0697 - val_acc: 0.9792
Epoch 2/5
 - 194s - loss: 0.0583 - acc: 0.9828 - val_loss: 0.0654 - val_acc: 0.9803
Epoch 3/5
 - 194s - loss: 0.0522 - acc: 0.9846 - val_loss: 0.0635 - val_acc: 0.9809
Epoch 4/5
 - 194s - loss: 0.0478 - acc: 0.9858 - val_loss: 0.0642 - val_acc: 0.9803
Epoch 5/5
 - 193s - loss: 0.0441 - acc: 0.9869 - val_loss: 0.0643 - val_acc: 0.9808


<keras.callbacks.History at 0x159a7b09208>