<a href="https://colab.research.google.com/github/mriyank/NamedEntityRecognition_LSTM_CNN/blob/main/NER_Bidirectional_LSTM_ELMo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# __Document Annotator using Deep Recurrent Neural Networks__

### Suyash Agarwal
### Mriyank Singh

## Importing the dataset

In [None]:
%tensorflow_version 1.15

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive at /content/gdrive; the dataset cell reads its CSV from a
# path under gdrive/MyDrive.
drive.mount('/content/gdrive')

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1.15`. This will be interpreted as: `1.x`.


TensorFlow 1.x selected.
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the NER corpus (latin1-encoded CSV) from the mounted Drive folder.
# The POS column is dropped because nothing below uses it. Missing cells are
# forward-filled; NOTE(review): presumably "Sentence #" is only populated on the
# first token of each sentence, so this propagates the sentence id to every
# token row - confirm against the raw CSV.
dataset = (
    pd.read_csv("gdrive/MyDrive/NLPProj/ner_dataset.csv", encoding="latin1")
    .drop(columns=["POS"])
    # DataFrame.ffill() is the non-deprecated spelling of fillna(method="ffill").
    .ffill()
)
dataset.head(15)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 1,Thousands,O
1,Sentence: 1,of,O
2,Sentence: 1,demonstrators,O
3,Sentence: 1,have,O
4,Sentence: 1,marched,O
5,Sentence: 1,through,O
6,Sentence: 1,London,B-geo
7,Sentence: 1,to,O
8,Sentence: 1,protest,O
9,Sentence: 1,the,O


In [None]:
# Vocabulary: the distinct surface forms that occur in the "Word" column.
words = set(dataset["Word"].values)
n_words = len(words)
n_words

35178

In [None]:
# Tag inventory. It is sorted so that the tag -> index assignment derived from
# this list is identical in every session: iterating a bare set of strings can
# yield a different order from one interpreter run to the next, which would make
# the class indices of a saved model impossible to decode after a restart.
tags = sorted(set(dataset["Tag"].values))
n_tags = len(tags)
n_tags

17

In [None]:
class SentenceExtractor(object):
    """Group a token-per-row table into sentences of ``(word, tag)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``"Sentence #"``, ``"Word"`` and ``"Tag"``.

    Attributes
    ----------
    grouped : pandas.Series
        One entry per distinct ``"Sentence #"`` value; each entry is the list
        of ``(word, tag)`` tuples of that sentence.
    sentences : list
        The entries of ``grouped`` as a plain list, in groupby order.
    n_sent : int
        1-based number of the sentence that ``get_next`` will return next.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        # Not read anywhere in this class; kept so existing callers that
        # inspect the attribute keep working.
        self.empty = False
        self.grouped = self.data.groupby("Sentence #").apply(self._to_pairs)
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_pairs(group):
        """Return the rows of one sentence group as a list of (word, tag) tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the next sentence, or ``None`` once the labels are exhausted.

        Sentences are looked up by the label ``"Sentence: <n_sent>"``; the
        counter only advances when the lookup succeeds.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            sentence = self.grouped[key]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return sentence

In [None]:
# Build the extractor, step past the first sentence and show the second one.
extractor = SentenceExtractor(dataset)
extractor.get_next()
sen = extractor.get_next()
print(sen)

[('Families', 'O'), ('of', 'O'), ('soldiers', 'O'), ('killed', 'O'), ('in', 'O'), ('the', 'O'), ('conflict', 'O'), ('joined', 'O'), ('the', 'O'), ('protesters', 'O'), ('who', 'O'), ('carried', 'O'), ('banners', 'O'), ('with', 'O'), ('such', 'O'), ('slogans', 'O'), ('as', 'O'), ('"', 'O'), ('Bush', 'B-per'), ('Number', 'O'), ('One', 'O'), ('Terrorist', 'O'), ('"', 'O'), ('and', 'O'), ('"', 'O'), ('Stop', 'O'), ('the', 'O'), ('Bombings', 'O'), ('.', 'O'), ('"', 'O')]


In [None]:
# Every sentence in the corpus, each as a list of (word, tag) tuples.
total_sen = extractor.sentences
n_sentences = len(total_sen)
print(n_sentences)

47959


In [None]:
# Lookup tables: word -> integer id and tag -> integer id, numbered in the
# iteration order of `words` and `tags`.
words2index = dict(zip(words, range(len(words))))
tags2index = dict(zip(tags, range(len(tags))))
print(words2index['Israeli'])
print(tags2index['B-gpe'])

33263
15


In [None]:
# Every sentence is cut or padded to exactly max_len tokens: the first max_len
# words are kept and shorter sentences are filled up with the "PADword" token.
max_len = 50
X = [[w[0] for w in s] for s in total_sen]
# Slicing handles the truncation; the multiplier is <= 0 for long sentences, so
# list repetition adds nothing there. This replaces the previous IndexError-
# driven loop that relied on a bare `except`.
new_X = [seq[:max_len] + ["PADword"] * (max_len - len(seq)) for seq in X]
new_X[15]

['Israeli',
 'officials',
 'say',
 'Prime',
 'Minister',
 'Ariel',
 'Sharon',
 'will',
 'undergo',
 'a',
 'medical',
 'procedure',
 'Thursday',
 'to',
 'close',
 'a',
 'tiny',
 'hole',
 'in',
 'his',
 'heart',
 'discovered',
 'during',
 'treatment',
 'for',
 'a',
 'minor',
 'stroke',
 'suffered',
 'last',
 'month',
 '.',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword',
 'PADword']

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encode each sentence's tags as integer ids.
y = [[tags2index[w[1]] for w in s] for s in total_sen]
# Pad with the "O" tag at the end, and also truncate at the end.
# pad_sequences truncates from the *front* by default, whereas the word
# sequences in new_X keep the first max_len tokens; without truncating="post"
# the labels of every sentence longer than max_len would be shifted relative
# to its words.
y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                  truncating="post", value=tags2index["O"])
y[15]

array([15,  7,  7,  4, 16, 16, 16,  7,  7,  7,  7,  7, 12,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
        7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7],
      dtype=int32)

In [None]:
# Batch size shared by the whole notebook. ElmoEmbedding builds its
# sequence_len constant with exactly batch_size entries, and the train /
# validation / test arrays are sliced to multiples of this value.
batch_size=32

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the padded sentences for testing; the seed is fixed so the
# split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    new_X,
    y,
    test_size=0.15,
    random_state=10,
)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import add
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
# Create an explicit TF1 session and register it as the Keras backend session;
# the same `sess` is used in the next cell to run the variable/table initializers.
sess = tf.Session()
K.set_session(sess)

In [None]:
# Load the ELMo v2 module from TF Hub with its weights frozen (trainable=False),
# then run the global-variable and lookup-table initializers in the shared session.
elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())

In [None]:
def ElmoEmbedding(x):
    """Run a batch of tokenised sentences through the ELMo hub module.

    `x` is cast to tf.string and squeezed, then passed to the module's
    "tokens" signature. The sequence lengths are a constant: `batch_size`
    entries, each equal to `max_len`.
    NOTE(review): because of that constant, every batch presumably has to
    contain exactly `batch_size` rows, and padding tokens are counted as part
    of each sentence - confirm this is intended.

    Returns the module's "elmo" output.
    """
    tokens = tf.squeeze(tf.cast(x, tf.string))
    lengths = tf.constant([max_len] * batch_size)
    outputs = elmo_model(
        inputs={"tokens": tokens, "sequence_len": lengths},
        signature="tokens",
        as_dict=True,
    )
    return outputs["elmo"]

In [None]:
def _bilstm():
    """Return a fresh bidirectional LSTM layer (512 units per direction, 0.2 dropout)."""
    return Bidirectional(
        LSTM(units=512, return_sequences=True, recurrent_dropout=0.2, dropout=0.2)
    )


# String tokens -> ELMo embeddings (declared as max_len x 1024 per sentence).
input_text = Input(shape=(max_len,), dtype=tf.string)
embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)

# Two stacked BiLSTMs; the second one's output is added back onto the first
# one's output (residual connection).
x = _bilstm()(embedding)
x_rnn = _bilstm()(x)
x = add([x, x_rnn])

# Per-token softmax over the tag inventory.
out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [None]:
# Wrap the graph in a Keras model. The targets are integer tag ids, hence the
# sparse categorical cross-entropy loss.
model = Model(inputs=input_text, outputs=out)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (32, None, 1024)     0           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (32, None, 1024)     6295552     lambda[0][0]                     
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (32, None, 1024)     6295552     bidirectional[0][0]              
______________________________________________________________________________________________

In [None]:
# Split the training portion into train / validation sets made of whole batches.
#
# The previous hard-coded sizes (1213 training batches + 135 validation batches
# = 43136 rows) exceed the training portion: 85% of the 47959 sentences is about
# 40765 rows. The validation tail therefore overlapped the training head and
# validation scores were measured partly on training data. The training size is
# now derived from what is left after the validation tail is reserved, so the
# two sets are always disjoint.
#
# NOTE: this cell rebinds X_train / y_train; re-run the train_test_split cell
# before running it a second time.
VAL_BATCHES = 135
n_val = VAL_BATCHES * batch_size
n_train = ((len(X_train) - n_val) // batch_size) * batch_size
assert n_train > 0, (
    "training split is too small to hold out {} validation batches".format(VAL_BATCHES)
)

X_train, X_val = X_train[:n_train], X_train[-n_val:]
y_train, y_val = y_train[:n_train], y_train[-n_val:]

# sparse_categorical_crossentropy expects a trailing axis of size 1 on the targets.
y_train = y_train.reshape(y_train.shape[0], y_train.shape[1], 1)
y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

In [None]:
# Train for five epochs, evaluating on the validation set after each one.
# The token lists are converted to arrays before being fed to the model.
train_tokens = np.array(X_train)
val_tokens = np.array(X_val)
history = model.fit(
    train_tokens,
    y_train,
    validation_data=(val_tokens, y_val),
    batch_size=batch_size,
    epochs=5,
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 38816 samples, validate on 4320 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Write the trained model to an .h5 file in the current working directory.
model.save("ELMO_trained_model.h5")

In [None]:
# Keep 149 whole batches of the test split so every batch fed to the model is
# full, then predict per-token tag probabilities.
n_test = 149 * batch_size
X_test = X_test[:n_test]
test_pred = model.predict(np.array(X_test), verbose=1)



In [None]:
!pip install seqeval
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# Inverse of tags2index: integer id -> tag string.
idx2tag = {index: tag for tag, index in tags2index.items()}

def pred2label(pred, index_to_tag=None):
    """Convert per-token score vectors into tag strings.

    Parameters
    ----------
    pred : iterable of sentences, each an iterable of score vectors
        The highest-scoring index of every vector is looked up in the mapping.
    index_to_tag : dict, optional
        Mapping from integer index to tag string. Defaults to the notebook's
        global ``idx2tag``, which is what the original function always used.

    Returns
    -------
    list of list of str
        One list of tags per sentence; any "PADword" in a tag is replaced by "O".
    """
    mapping = idx2tag if index_to_tag is None else index_to_tag
    return [
        [mapping[int(np.argmax(scores))].replace("PADword", "O") for scores in sentence]
        for sentence in pred
    ]

def test2label(pred):
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            out_i.append(idx2tag[p].replace("PADword", "O"))
        out.append(out_i)
    return out
    
# Decode predictions and gold labels into tag strings. The gold labels are cut
# to the number of predicted sentences so the two lists always line up, instead
# of repeating the batch arithmetic with a hard-coded batch size.
pred_labels = pred2label(test_pred)
test_labels = test2label(y_test[:len(test_pred)])

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 22.4 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 21.7 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 11.1 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 9.1 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 1.0 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16181 sha256=d134efec32544d0278e002899df362163114b064719bfa5a1bce3b4072a69e78
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
# Print seqeval's precision / recall / F1 report for the decoded test labels
# (gold first, predictions second).
print(classification_report(test_labels, pred_labels))

              precision    recall  f1-score   support

         art       0.38      0.18      0.25        49
         eve       0.50      0.35      0.42        31
         geo       0.87      0.86      0.86      3761
         gpe       0.96      0.93      0.95      1534
         nat       0.38      0.35      0.36        23
         org       0.65      0.70      0.68      1929
         per       0.75      0.79      0.77      1716
         tim       0.84      0.86      0.85      2099

   micro avg       0.81      0.83      0.82     11142
   macro avg       0.67      0.63      0.64     11142
weighted avg       0.82      0.83      0.82     11142



In [None]:
# Show predicted vs. true tags for one test sentence. A full batch starting at
# index i is fed to the model and only the first row of the result is kept.
i = 390
batch_probs = model.predict(np.array(X_test[i:i+batch_size]))
p = np.argmax(batch_probs[0], axis=-1)

print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_test[i], y_test[i], p):
    if w == "PADword":
        continue  # padding positions are not displayed
    print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))

Word            Pred : (True)
Diplomats      :O     (O)
at             :O     (O)
the            :O     (O)
U.N.           :B-geo (B-geo)
nuclear        :O     (O)
agency         :O     (O)
say            :O     (O)
the            :O     (O)
United         :B-geo (B-geo)
States         :I-geo (I-geo)
and            :O     (O)
its            :O     (O)
European       :O     (O)
allies         :O     (O)
have           :O     (O)
agreed         :O     (O)
to             :O     (O)
suspend        :O     (O)
their          :O     (O)
push           :O     (O)
to             :O     (O)
refer          :O     (O)
Iran           :B-geo (B-geo)
to             :O     (O)
the            :O     (O)
Security       :B-org (B-org)
Council        :I-org (I-org)
for            :O     (O)
possible       :O     (O)
sanctions      :O     (O)
over           :O     (O)
its            :O     (O)
nuclear        :O     (O)
activities     :O     (O)
.              :O     (O)
