# Problem Statement: Recognize the Named Entity

In [34]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
import json

### Loading and pre-processing the data

In [35]:
# Collect the path of every ".tags" file below the GMB data directory.
# os.walk yields entries in an arbitrary, filesystem-dependent order, and the
# train/test split later in this file is positional, so the paths are sorted
# to make the resulting dataset order reproducible between runs.
data_dir = "../input/gmb-v220/gmb-2.2.0/data"
file_names = sorted(
    os.path.join(root, filename)
    for root, _dirs, files in os.walk(data_dir)
    for filename in files
    if filename.endswith(".tags")
)


In [36]:
# Notebook display only: peek at the first two collected file paths.
file_names[:2]

In [37]:
# Formatting the Named Entity Tokens in IOB format
def iob_formatter(ners):
    """Prefix a sequence of raw entity tags with IOB markers.

    'O' tags are passed through unchanged. Any other tag becomes "I-<tag>"
    when the tag immediately before it in ``ners`` is the same raw tag, and
    "B-<tag>" otherwise (including at the very start of the sequence).

    Args:
        ners: Sequence of raw tag strings for one sentence.

    Returns:
        A new list with one IOB-formatted tag per input tag.
    """
    iob_tokens = []
    previous = None  # raw (unprefixed) tag of the preceding position
    for tag in ners:
        if tag == 'O':
            iob_tokens.append(tag)
        else:
            prefix = "I-" if tag == previous else "B-"
            iob_tokens.append(prefix + tag)
        previous = tag
    return iob_tokens

In [38]:
# Cleaning the NER tokens
def strip_ner(tag):
    """Return the part of ``tag`` before its first hyphen (all of it if none)."""
    head, _sep, _rest = tag.partition("-")
    return head

In [39]:
# Parse every ".tags" file into [sentence text, IOB tag string] pairs.
# Within a file, sentences are separated by a blank line and each token line
# is tab-separated, with the word in column 0 and the raw entity tag in
# column 3.
all_data = []
for file in file_names:
    with open(file, 'rb') as content:
        data = content.read().decode('utf-8').strip()  # Reading from the files
    for sentence in data.split("\n\n"):  # Splitting the sentences
        words, ner = [], []
        for tok in sentence.split('\n'):
            # Separating the "Named Entity" column from the "Words" column
            t = tok.split("\t")
            if len(t) < 4:
                # Blank or malformed line (e.g. an empty file yields one empty
                # "sentence"): there is no tag column to read, so skip it
                # instead of failing with an IndexError.
                continue
            words.append(t[0])
            ner.append(strip_ner(t[3]))
        if not words:
            # Nothing usable in this sentence; do not add an empty row.
            continue
        all_data.append([" ".join(words),
                         " ".join(iob_formatter(ner))])

In [41]:
# Build the two Keras tokenizers: one for the sentence text and one for the
# IOB tag strings. Both keep the original case, split on a single space and
# use '<OOV>' as the out-of-vocabulary token; only the filtered characters
# differ between them.
text_tokenizer = Tokenizer(
    oov_token='<OOV>',
    split=' ',
    lower=False,
    filters='[\\]^\t\n',
)

ner_tokenizer = Tokenizer(
    oov_token='<OOV>',
    split=' ',
    lower=False,
    filters='\t\n',
)

In [42]:
# Wrap the parsed [text, tags] pairs in a two-column DataFrame.
df = pd.DataFrame(data=all_data, columns=["words", "ner"])

In [43]:
# Notebook display only: show the first rows of the parsed dataset.
df.head()

In [44]:
# Build each tokenizer's vocabulary from its own column: sentence text for the
# word tokenizer, IOB tag strings for the tag tokenizer.
text_tokenizer.fit_on_texts(df['words'])
ner_tokenizer.fit_on_texts(df['ner'])

In [45]:
# Serialize both fitted tokenizers so they can be written to disk below.
text_tokenizer_json = text_tokenizer.to_json()
ner_tokenizer_json = ner_tokenizer.to_json()

In [46]:
# Persist both tokenizers next to the notebook.
# NOTE: to_json() already produced a JSON string, so each file holds that
# string JSON-encoded a second time; a reader has to json.load() the file
# first to get the tokenizer's JSON string back.
with open('txt_tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(text_tokenizer_json, f, ensure_ascii=False)

with open('ner_tokenizer.json', 'w', encoding='utf-8') as f:
    json.dump(ner_tokenizer_json, f, ensure_ascii=False)

In [47]:
# Getting the config information from the tokenizer
# The configs are read below to recover each tokenizer's index -> token table.
ner_config = ner_tokenizer.get_config()
text_config = text_tokenizer.get_config()

In [48]:
# The tokenizer config stores 'index_word' as a JSON-encoded string. Decode it
# with the JSON parser instead of eval(): eval() executes whatever the string
# contains and does not understand JSON-only syntax (null/true/false,
# surrogate-pair escapes for characters outside the BMP).
text_vocab = json.loads(text_config['index_word'])
ner_vocab = json.loads(ner_config['index_word'])

In [49]:
# Converting to sequences
# Each sentence and each tag string is mapped through its own fitted tokenizer.
x =  text_tokenizer.texts_to_sequences(df['words'])
y =  ner_tokenizer.texts_to_sequences(df['ner'])

In [50]:
# Bring every sentence (and its tag sequence) to one fixed length so the data
# can be fed to the model as a rectangular array; padding goes after the real
# tokens.
max_len = 50
x_pad = sequence.pad_sequences(x, maxlen=max_len, padding='post')
y_pad = sequence.pad_sequences(y, maxlen=max_len, padding='post')
print(x_pad.shape, y_pad.shape)

In [51]:
# Converting the labels to binary vectors
# One class per entry in the tag vocabulary plus one extra slot.
# NOTE(review): the extra slot is presumably for index 0 used by padding —
# confirm that tokenizer indices start at 1.
num_classes = len(ner_vocab) + 1
Y = tf.keras.utils.to_categorical(y_pad, num_classes=num_classes)
Y.shape

In [52]:
# Notebook display only: inspect the first padded input sequence.
x_pad[0]

In [53]:
# Hyperparameters
vocab_size = len(text_vocab) + 1  # word vocabulary size plus one extra index
embedding_dim = 64  # width of each word embedding vector
rnn_units = 100  # LSTM units per direction; also the hidden Dense layer width
BATCH_SIZE=90  # used for the train/test split, the model input shape and fit()
num_classes = len(ner_vocab)+1  # tag vocabulary size plus one extra index
dropout=0.2  # read as a module-level value inside build_model_bilstm

In [54]:
# Hold out roughly 20% of the sentences for validation. `test_size` is the
# number of whole batches (not sentences) in the held-out part, so that part
# is always a multiple of BATCH_SIZE. The held-out rows are taken from the
# front of the data and everything after them is used for training.
X = x_pad
total_sentences = X.shape[0]
test_size = round(total_sentences / BATCH_SIZE * 0.2)
X_test, Y_test = X[:BATCH_SIZE * test_size], Y[:BATCH_SIZE * test_size]
X_train, Y_train = X[BATCH_SIZE * test_size:], Y[BATCH_SIZE * test_size:]


## Model Building

In [55]:
def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size, classes):
    """Build an uncompiled bidirectional-LSTM sequence tagger.

    Layers, in order: a masked embedding, a bidirectional LSTM that returns
    the full sequence, a per-timestep ReLU dense layer, and a softmax dense
    layer producing one distribution over tags per timestep.

    Args:
        vocab_size: Number of distinct input ids for the embedding layer.
        embedding_dim: Width of each embedding vector.
        rnn_units: Units per LSTM direction; also the hidden dense width.
        batch_size: Fixed batch dimension of the model's input shape.
        classes: Number of output tags (width of the final softmax layer).

    Returns:
        The assembled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        # mask_zero=True makes downstream layers ignore id 0 (padding).
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  batch_input_shape=[batch_size, None]),
        # NOTE: the dropout rate is read from the module-level `dropout`.
        Bidirectional(LSTM(units=rnn_units,
                           return_sequences=True,
                           dropout=dropout)),
        TimeDistributed(Dense(rnn_units, activation='relu')),
        # Size the output from the `classes` argument; the previous code
        # ignored it and silently used the global `num_classes` instead.
        Dense(classes, activation="softmax")
    ])
    return model

In [56]:
# Instantiate the tagger from the hyperparameters above, print its layer
# summary, and compile it for one-hot (categorical) targets.
model = build_model_bilstm(
    vocab_size = vocab_size,
    embedding_dim=embedding_dim, rnn_units=rnn_units, batch_size=BATCH_SIZE,
    classes=num_classes)
model.summary()
model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])

In [57]:
# Stop training once validation loss has failed to improve for 3 epochs in a
# row. restore_best_weights=True rolls the model back to its best epoch when
# that happens; without it the model keeps (and later saves) the weights from
# `patience` epochs after the best validation loss.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, restore_best_weights=True)

In [58]:
# Train for at most 15 epochs, evaluating on the held-out split after each
# epoch; the early-stopping callback may end training sooner.
history = model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    epochs=15,
    validation_data=(X_test, Y_test),
    callbacks=[early_stopping_cb],
)

In [59]:
# Write the trained model to a single HDF5 file in the working directory.
model.save("NER.h5")

In [63]:
# Plot training vs. validation loss per epoch and save the figure as a PDF.
plt.plot(history.history["loss"], label="loss")
plt.plot(history.history["val_loss"], label="val_loss")
plt.ylabel("Loss")
plt.xlabel("Epochs")
plt.legend()
plt.savefig("Loss.pdf")

In [64]:
# Plot training vs. validation accuracy per epoch and save the figure as a PDF.
# Start a fresh figure first: outside a notebook the current figure is reused,
# so without this the accuracy curves would be drawn on top of whatever was
# plotted before and end up together with it in Acc.pdf.
plt.figure()
plt.plot(history.history["accuracy"], label="accuracy")
plt.plot(history.history["val_accuracy"], label="val_accuracy")
plt.ylabel("Acc")
plt.xlabel("Epochs")
plt.legend()
plt.savefig("Acc.pdf")