In [22]:
import pandas as pd
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
import os

data = pd.read_csv("ner_dataset.csv", encoding="latin1")


In [23]:
import keras
print(keras.__version__)

2.2.4


In [24]:
data = data.fillna(method="ffill")

In [25]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [26]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
words.sort()
n_words = len(words); n_words

35179

In [27]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        
        
# class NERModel(object):
#     def __init__(self,data):
#         self.sentenceGetter = SentenceGetter(data)
#         self.model = modelSpec()
        
#     def modelSpec():
#         input = Input(shape=(max_len,))
#         model = Embedding(input_dim=n_words + 1, output_dim=20,input_length=max_len, mask_zero=True)(input)
#         model = Bidirectional(LSTM(units=50, return_sequences=True,recurrent_dropout=0.1))(model)
#         model = TimeDistributed(Dense(50, activation="relu"))(model)  
#         crf = CRF(n_tags)
#         out = crf(model)
#         model = Model(input, out)
#         model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
#         return model
        
        

In [28]:
tags = list(set(data["Tag"].values))
tags.sort()


In [29]:
n_tags = len(tags); n_tags

17

In [30]:
getter = SentenceGetter(data)
sent = getter.get_next()
print(sent)


[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [31]:
sentences = getter.sentences

In [32]:
max_len = 75
word2idx = {w: i + 1 for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}


In [33]:
word2idx["Obama"]

11522

In [34]:
tag2idx["B-geo"]

2

In [35]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]

In [36]:
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)

In [37]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [38]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [39]:
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

In [40]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers import CRF
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [46]:
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words + 1, output_dim=50,
                  input_length=max_len, mask_zero=True)(input)  # 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=0.1))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags)  # CRF layer
out = crf(model)  # output

In [47]:
model = Model(input, out)

In [48]:
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])



In [49]:
history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=20, validation_split=0.1, verbose=1)

Train on 38846 samples, validate on 4317 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [50]:
model.save('BiLSTM_CRF_NER.h5')

In [51]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
test_pred = model.predict(X_te, verbose=1)

idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)



In [52]:
print(classification_report(test_labels, pred_labels))

             precision    recall  f1-score   support

        org       0.62      0.68      0.65      2113
        per       0.76      0.72      0.74      1738
        geo       0.82      0.86      0.84      3928
        gpe       0.94      0.93      0.94      1561
        tim       0.84      0.85      0.85      1972
        art       0.06      0.17      0.09        35
        eve       0.22      0.32      0.26        25
        nat       0.43      0.45      0.44        20

avg / total       0.79      0.81      0.80     11392



In [97]:
for i in range(len(X_te)):
    p = model.predict(np.array([X_te[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_te[i], -1)
    fileName = "testFile"+str(i)
    file = open(fileName,'w')
    
    file.write("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    file.write(30 * "=")
    for w, t, pred in zip(X_te[i], true, p[0]):
        if w != 0:
            file.write("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))
    file.close()