<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Named-Entity-Recognition-Beginner/blob/main/NLP_Named_Entity_Recognition_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import nltk
import numpy
import pandas as pd

In [2]:
!pip install trax
import trax
from trax import layers as tl
from trax.supervised import training
from trax.fastmath import numpy as fastnp

Collecting trax
  Downloading trax-1.4.1-py2.py3-none-any.whl (637 kB)
[K     |████████████████████████████████| 637 kB 7.6 MB/s 
Collecting tensorflow-text
  Downloading tensorflow_text-2.8.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 42.0 MB/s 
Collecting funcsigs
  Downloading funcsigs-1.0.2-py2.py3-none-any.whl (17 kB)
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 53.5 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text, funcsigs, trax
Successfully installed funcsigs-1.0.2 tensorflow-text-2.8.2 tf-estimator-nightly-2.8.0.dev2021122109 trax-1.4.1


In [5]:
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    """Yield an endless stream of padded ``(X, Y)`` batches.

    Each batch holds ``batch_size`` examples taken from ``x``/``y`` in order
    (or in a shuffled order when ``shuffle`` is true). When the data runs out
    the generator wraps around to the start, reshuffling if requested, so it
    never terminates on its own.

    Args:
        batch_size: Number of examples per yielded batch.
        x: Indexable collection of input sequences.
        y: Indexable collection of label sequences, parallel to ``x``.
        pad: Value used to fill positions past the end of each sequence,
            in both ``X`` and ``Y``.
        shuffle: If true, visit the examples in a random order and reshuffle
            on every wrap-around.
        verbose: If true, print the running example index after each batch.

    Yields:
        Tuple ``(X, Y)`` of numpy arrays of shape ``(batch_size, max_len)``,
        where ``max_len`` is the length of the longest input in that batch.

    Raises:
        ValueError: If ``x`` is empty.
    """
    num_lines = len(x)
    if num_lines == 0:
        raise ValueError("data_generator needs at least one example in x")

    lines_index = list(range(num_lines))
    if shuffle:
        numpy.random.shuffle(lines_index)

    index = 0
    while True:
        buffer_x = []
        buffer_y = []
        # Reset per batch so each batch is padded only to its own longest line.
        max_len = 0

        for _ in range(batch_size):
            # Wrap around (and reshuffle) once every example has been used.
            if index >= num_lines:
                index = 0
                if shuffle:
                    numpy.random.shuffle(lines_index)

            line = lines_index[index]
            buffer_x.append(x[line])
            buffer_y.append(y[line])
            max_len = max(max_len, len(x[line]))
            index += 1

        X = numpy.full((batch_size, max_len), pad)
        Y = numpy.full((batch_size, max_len), pad)

        for i, (x_i, y_i) in enumerate(zip(buffer_x, buffer_y)):
            # Labels are copied only up to the input length, as before.
            n = len(x_i)
            X[i, :n] = x_i
            Y[i, :n] = y_i[:n]

        if verbose:
            print("index=", index)
        yield (X, Y)

In [8]:
def NER(tags, vocab_size=35181, d_model=50):
    """Build the sequence-tagging model as a ``tl.Serial`` stack.

    The stack is: embedding -> LSTM -> dense projection -> log-softmax.

    Args:
        tags: Sized collection of tags; only ``len(tags)`` is used, as the
            width of the final dense layer.
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding width, also passed to the LSTM as its size.

    Returns:
        The uninitialised ``tl.Serial`` model.
    """
    n_tags = len(tags)
    return tl.Serial(
        tl.Embedding(vocab_size, d_model),
        tl.LSTM(d_model),
        tl.Dense(n_tags),
        tl.LogSoftmax(),
    )

In [9]:
from trax.supervised import training

# Number of sentences per batch for both the training and validation streams.
batch_size = 64

# NOTE: t_sentences / t_labels / v_sentences / v_labels / vocab are expected to
# come from a data-loading cell that is not part of this notebook as shown.

# `add_loss_weights` lives in `trax.data.inputs` in the installed trax (1.4.1);
# the older `trax.supervised.inputs` path is no longer available there.
# It attaches a weights array to every batch, masking out the <PAD> id.
train_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, t_sentences, t_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

eval_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, v_sentences, v_labels, vocab['<PAD>'], True),
    id_to_mask=vocab['<PAD>'])

In [None]:
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):
    """Train a model with a trax training loop and return the loop.

    Args:
        NER: The model to train; passed straight to ``training.Loop`` as its
            model argument (note: this parameter shadows the ``NER`` builder
            function inside this body).
        train_generator: Labeled-data stream for the training task.
        eval_generator: Labeled-data stream for the evaluation task.
        train_steps: Number of steps handed to ``Loop.run``.
        output_dir: Directory handed to ``training.Loop`` as ``output_dir``.

    Returns:
        The ``training.Loop`` instance after ``run`` has completed.
    """
    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=trax.optimizers.Adam(0.01),
    )

    eval_task = training.EvalTask(
        labeled_data=eval_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )

    # The installed trax (1.4.1) takes a list via `eval_tasks`; the singular
    # `eval_task` keyword from early trax releases is rejected with TypeError.
    training_loop = training.Loop(
        NER,
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )

    training_loop.run(n_steps=train_steps)
    return training_loop

In [10]:
# Build the architecture. `NER` requires the tag collection as its first
# argument; `tag_map` is assumed to be defined by an earlier data-loading cell
# (it is also used by the prediction cells below).
model = NER(tag_map)
# Only `numpy` (not the alias `np`) is imported in this notebook.
model.init(trax.shapes.ShapeDtype((1, 1), dtype=numpy.int32))

# Replace the freshly initialised weights with the ones stored on disk.
model.init_from_file('model.pkl.gz', weights_only=True)

In [None]:
# Take a single batch whose size equals the number of test sentences,
# i.e. the first batch covers every test example once, padded to a common length.
x, y = next(
    data_generator(
        batch_size=len(test_sentences),
        x=test_sentences,
        y=test_labels,
        pad=vocab['<PAD>'],
    )
)

In [11]:
def evaluate(pred, labels, pad):
    """Return prediction accuracy measured over non-padding positions only.

    Args:
        pred: Array of per-class scores; classes are on axis 2 (the argmax is
            taken along that axis).
        labels: Array of gold label ids, element-wise comparable with the
            argmax of ``pred``.
        pad: Label id that marks padding; positions where ``labels == pad``
            are excluded from both the numerator and the denominator.

    Returns:
        Number of correct non-padding predictions divided by the number of
        non-padding positions, or ``0.0`` when every position is padding.
    """
    outputs = numpy.argmax(pred, axis=2)
    mask = labels != pad

    n_real = numpy.sum(mask)
    if n_real == 0:
        # Nothing to score; avoid a 0/0 division.
        return 0.0

    # Apply the mask to the matches as well, so a prediction that happens to
    # equal the pad id on a padded position is not counted as correct.
    n_correct = numpy.sum((outputs == labels) & mask)
    return n_correct / float(n_real)

In [None]:
# Run the model on the test batch and score it, ignoring <PAD> positions.
accuracy = evaluate(pred=model(x), labels=y, pad=vocab['<PAD>'])
print("accuracy: ", accuracy)

In [12]:
def predict(sentence, model, vocab, tag_map):
    """Tag each space-separated token of ``sentence`` with the model.

    Args:
        sentence: Text to tag; it is split on single spaces.
        model: Callable that takes an integer array of shape ``(1, n_tokens)``
            and returns scores with the class dimension on axis 2.
        vocab: Mapping from token to id. Tokens not present are mapped to
            ``vocab['UNK']``.
        tag_map: Mapping whose keys are the tag names. The key at position
            ``k`` (in iteration order) is used as the name of class ``k``.

    Returns:
        List with one tag name per token, in token order.
    """
    token_ids = [
        vocab[token] if token in vocab else vocab['UNK']
        for token in sentence.split(' ')
    ]
    # A batch of one sentence, as integers (the model consumes token ids).
    batch = numpy.array([token_ids], dtype=int)

    output = model(batch)
    outputs = numpy.argmax(output, axis=2)

    labels = list(tag_map.keys())
    return [labels[int(idx)] for idx in outputs[0]]

In [None]:
sentence = "Peter Navarro, the White House director of trade and manufacturing policy of U.S, said in an interview on Sunday morning that the White House was working to prepare for the possibility of a second wave of the coronavirus in the fall, though he said it wouldn’t necessarily come"
predictions = predict(sentence, model, vocab, tag_map)
# Show only the tokens that received a tag other than 'O'.
# Loop names are deliberately not `x`/`y`, so the test batch held in those
# variables is not overwritten when this cell runs.
for token, tag in zip(sentence.split(' '), predictions):
    if tag != 'O':
        print(token, tag)