In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv("ner_dataset.csv", encoding="latin1")


In [2]:
data = data.fillna(method="ffill")

In [3]:
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [4]:
words = list(set(data["Word"].values))
n_words = len(words); n_words
tags = list(set(data["Tag"].values))
n_tags = len(tags); n_tags

17

In [5]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None
        
# class NERModel(object):
#     def __init__(self,data):
#         self.sentenceGetter = SentenceGetter(data)
#         self.model = modelSpec()
        
#     def modelSpec():
#         input = Input(shape=(max_len,))
#         model = Embedding(input_dim=n_words + 1, output_dim=20,input_length=max_len, mask_zero=True)(input)
#         model = Bidirectional(LSTM(units=50, return_sequences=True,recurrent_dropout=0.1))(model)
#         model = TimeDistributed(Dense(50, activation="relu"))(model)  
#         crf = CRF(n_tags)
#         out = crf(model)
#         model = Model(input, out)
#         model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
#         return model

In [6]:
getter = SentenceGetter(data)
sentences = getter.sentences

max_len = 75
max_len_char = 10

word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1
word2idx["PAD"] = 0
idx2word = {i: w for w, i in word2idx.items()}
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: w for w, i in tag2idx.items()}


In [7]:
from keras.preprocessing.sequence import pad_sequences
X_word = [[word2idx[w[0]] for w in s] for s in sentences]

X_word = pad_sequences(maxlen=max_len, sequences=X_word, value=word2idx["PAD"], padding='post', truncating='post')

Using TensorFlow backend.


In [8]:
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print(n_chars)

char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

98


In [9]:
X_char = []
for sentence in sentences:
    sent_seq = []
    for i in range(max_len):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(sentence[i][0][j]))
            except:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char.append(np.array(sent_seq))

In [10]:
y = [[tag2idx[w[2]] for w in s] for s in sentences]

In [11]:
y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"], padding='post', truncating='post')

In [12]:
from sklearn.model_selection import train_test_split
X_word_tr, X_word_te, y_tr, y_te = train_test_split(X_word, y, test_size=0.1, random_state=2018)
X_char_tr, X_char_te, _, _ = train_test_split(X_char, y, test_size=0.1, random_state=2018)

In [13]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [14]:
# input and embedding for words
word_in = Input(shape=(max_len,))
emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                     input_length=max_len)(word_in)

# input and embeddings for characters
char_in = Input(shape=(max_len, max_len_char,))
emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                           input_length=max_len_char))(char_in)
# character LSTM to get word encodings by characters
char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                recurrent_dropout=0.5))(emb_char)

# main LSTM
x = concatenate([emb_word, char_enc])
x = SpatialDropout1D(0.3)(x)
main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.6))(x)
out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)

model = Model([word_in, char_in], out)

In [15]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])


In [16]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [EarlyStopping(monitor='val_loss', patience=5),
             ModelCheckpoint(filepath='best_charEmbedding_model.h5', monitor='val_loss', save_best_only=True)]

In [37]:
X_char_tr[0]

array([[ 4, 33,  0,  0,  0,  0,  0,  0,  0,  0],
       [78, 83, 91, 88, 33,  0,  0,  0,  0,  0],
       [74, 96, 21, 83,  0,  0,  0,  0,  0,  0],
       [47, 41, 78, 96, 19, 83, 29, 83, 43,  0],
       [17, 91, 21, 83,  0,  0,  0,  0,  0,  0],
       [92, 83, 83, 43,  0,  0,  0,  0,  0,  0],
       [79, 96, 78, 78, 83, 98,  0,  0,  0,  0],
       [96, 43,  0,  0,  0,  0,  0,  0,  0,  0],
       [91,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [29, 96, 78, 96, 33, 91, 43, 33,  0,  0],
       [91, 33, 33, 91, 19, 79,  0,  0,  0,  0],
       [41, 43,  0,  0,  0,  0,  0,  0,  0,  0],
       [91,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [88, 83, 19, 50,  3, 96, 33, 15,  0,  0],
       [19, 91, 29, 47,  0,  0,  0,  0,  0,  0],
       [96, 43,  0,  0,  0,  0,  0,  0,  0,  0],
       [42, 43, 98, 96, 91, 43, 73, 19, 41, 43],
       [52, 91, 88, 17, 29, 96,  3,  0,  0,  0],
       [71,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 

In [17]:
istory = model.fit([X_word_tr,np.array(X_char_tr).reshape((len(X_char_tr), max_len, max_len_char))],np.array(y_tr).reshape(len(y_tr), max_len, 1),batch_size=32, epochs=10, validation_split=0.1, callbacks=callbacks, verbose=1)


Train on 38846 samples, validate on 4317 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
y_pred = model.predict([X_word_te,
                        np.array(X_char_te).reshape((len(X_char_te),
                                                     max_len, max_len_char))])

In [30]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score
idx2tag = {i: w for w, i in tag2idx.items()}

def pred2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out
    
pred_labels = pred2label(y_pred)
test_labels = pred2label(y_te)

In [27]:
print(sentences[0])

print(pred_labels[0])

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O', 'B-per', 'I-per', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [36]:
accuracy_score(test_labels, pred_labels)

0.9546872393661384

In [97]:
for i in range(len(X_te)):
    p = model.predict(np.array([X_te[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_te[i], -1)
    fileName = "testFile"+str(i)
    file = open(fileName,'w')
    
    file.write("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    file.write(30 * "=")
    for w, t, pred in zip(X_te[i], true, p[0]):
        if w != 0:
            file.write("{:15}: {:5} {}".format(words[w-1], tags[t], tags[pred]))
    file.close()