In [1]:
#import librsries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#load the dataset

df = pd.read_csv("ner_dataset.csv", encoding = 'latin1')
df = df.drop(['POS'], axis =1)

In [3]:
df = df.fillna(method = 'ffill')
words = list(df['Word'].values)
words = set(words)
words.add('padding')

In [6]:
len(words)
print(df.head(10))

    Sentence #           Word    Tag
0  Sentence: 1      Thousands      O
1  Sentence: 1             of      O
2  Sentence: 1  demonstrators      O
3  Sentence: 1           have      O
4  Sentence: 1        marched      O
5  Sentence: 1        through      O
6  Sentence: 1         London  B-geo
7  Sentence: 1             to      O
8  Sentence: 1        protest      O
9  Sentence: 1            the      O


In [7]:
entity = list(set(df['Tag'].values))

In [8]:
len(entity)

17

In [9]:
df.tail(20)

Unnamed: 0,Sentence #,Word,Tag
1048555,Sentence: 47957,.,O
1048556,Sentence: 47958,They,O
1048557,Sentence: 47958,say,O
1048558,Sentence: 47958,not,O
1048559,Sentence: 47958,all,O
1048560,Sentence: 47958,of,O
1048561,Sentence: 47958,the,O
1048562,Sentence: 47958,rockets,O
1048563,Sentence: 47958,exploded,O
1048564,Sentence: 47958,upon,O


In [10]:
#convert sentence into tuples with respective Tags

tuple_list = list(zip(df.Word, df.Tag))


In [11]:
print(len(tuple_list))


1048575


In [12]:
sentences = df["Sentence #"].unique()
print(sentences)

['Sentence: 1' 'Sentence: 2' 'Sentence: 3' ... 'Sentence: 47957'
 'Sentence: 47958' 'Sentence: 47959']


In [23]:
class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["Word"].values.tolist(),s["Tag"].values.tolist())]
        self.grouped = self.dataset.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [24]:
getter = SentenceGetter(df)
sentences = getter.sentences

In [25]:
print(sentences[5])

[('Mr.', 'B-per'), ('Egeland', 'I-per'), ('said', 'O'), ('the', 'O'), ('latest', 'O'), ('figures', 'O'), ('show', 'O'), ('1.8', 'O'), ('million', 'O'), ('people', 'O'), ('are', 'O'), ('in', 'O'), ('need', 'O'), ('of', 'O'), ('food', 'O'), ('assistance', 'O'), ('-', 'O'), ('with', 'O'), ('the', 'O'), ('need', 'O'), ('greatest', 'O'), ('in', 'O'), ('Indonesia', 'B-tim'), (',', 'O'), ('Sri', 'B-per'), ('Lanka', 'B-gpe'), (',', 'O'), ('the', 'O'), ('Maldives', 'B-geo'), ('and', 'O'), ('India', 'B-geo'), ('.', 'O')]


In [28]:
words = list(set(df["Word"].values))
words.append("_PAD_")
n_words = len(words); n_words

35179

In [29]:
tags = list(set(df["Tag"].values))
n_tags = len(tags)
n_tags

17

In [30]:
word2idx = {w: i for i, w in enumerate(words)}
tag2idx = {t: i for i, t in enumerate(tags)}

In [31]:
from keras.preprocessing.sequence import pad_sequences
X = [[word2idx[w[0]] for w in s] for s in sentences]

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [32]:
X = pad_sequences(maxlen=50, sequences=X, padding="post",value=n_words - 1)

In [33]:
y = [[tag2idx[w[1]] for w in s] for s in sentences]
y = pad_sequences(maxlen=50, sequences=y, padding="post", value=tag2idx["O"])

In [34]:
from keras.utils import to_categorical
y = [to_categorical(i, num_classes=n_tags) for i in y]

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [36]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [43]:
input = Input(shape=(50,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=50)(input)
model = Dropout(0.2)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
model = Dropout(0.2)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
model = Dropout(0.5)(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer

In [44]:
model = Model(input, out)

In [45]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

In [46]:
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 50, 50)            1758950   
_________________________________________________________________
dropout_4 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 50, 200)           120800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 50, 200)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 50, 200)           240800    
_________________________________________________________________
dropout_6 (Dropout)          (None, 50, 200)           0   

In [47]:
training = model.fit(X_train, np.array(y_train), batch_size=32, epochs=3, validation_split=0.2, verbose=1)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 30693 samples, validate on 7674 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [103]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [85]:
i = 0
p = model.predict(np.array([X_test[i]]))
p = np.argmax(p, axis=-1)
print("{:14} ({:5}): ({:4})".format("Word", "True", "Pred"))
for w,pred in zip(X_test[i],p[0]):
    print("{:14}: {}".format(words[w],tags[pred]))

Word           (True ): (Pred)
A             : O
spokesman     : O
for           : O
the           : O
government    : O
of            : O
the           : O
Silesia       : O
region        : O
,             : O
where         : O
Katowice      : B-org
is            : O
located       : O
,             : O
said          : O
another       : O
130           : O
were          : O
injured       : O
.             : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O
_PAD_         : O


In [64]:
predictions = model.predict(np.array(X_test))
predictions = np.argmax(predictions, axis=-1)

In [86]:
predictions

array([[16, 16, 16, ..., 16, 16, 16],
       [16, 16, 16, ..., 16, 16, 16],
       [ 1,  4, 16, ..., 16, 16, 16],
       ...,
       [16, 16, 16, ..., 16, 16, 16],
       [ 1, 16,  3, ..., 16, 16, 16],
       [16, 16, 16, ..., 16, 16, 16]], dtype=int64)

In [69]:
predictions.shape

(9592, 50)

In [70]:
y_test_arr = np.array(y_test)
y_test_result=np.argmax(y_test_arr, axis=-1)
y_test_result.shape

(9592, 50)

In [67]:
y_test_result[0]

array([16, 16, 16, 16, 16, 16, 16,  5, 16, 16, 16,  5, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
      dtype=int64)

In [68]:
predictions[0]

array([16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  3, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16],
      dtype=int64)

In [87]:
# Data Visualization
from pandas_profiling import ProfileReport

In [98]:
profile = ProfileReport(df, title="Pandas Profiling Report")

In [99]:
profile.to_file("output.html")