**Named Entity Recognition (NER)** is a task in natural language processing (NLP) where the goal is to find and label specific pieces of information in text with their corresponding categories. These categories often include things like names of people, organizations, locations, dates, and sometimes more specialized categories depending on the application, like medical terms or product names.

Imagine you're reading a newspaper, and you highlight all the names of people, places, and dates. That's essentially what NER does, but automatically.

In [22]:
#sentences = [ "John Doe went to Paris","Jane was born in 1990","Microsoft is based in Redmond"]

# For each sentence, we specify whether each word is a Person (PER), Location (LOC), Organization (ORG), or Other (O)
#tags = [ ["B-PER", "I-PER", "O", "O", "B-LOC"],["B-PER", "O", "O", "O", "O", "B-TIME"],["B-ORG", "O", "O", "O", "B-LOC"]]

import random

# Entities and other vocabulary
names = ["Rahul Kodarapu", "Aamir Khan", "MS Dhoni", "Steve Jobs", "Carol Danvers"]
companies = ["Shoonya", "Satyamev Jayate", "Ritu Sports", "Apple", "Marvel Inc"]
locations = ["Hyderabad", "Mumbai", "Delhi", "Bangalore", "Chennai"]
years = ["1990", "2000", "2010", "2020", "1985"]
actions = ["moved to", "born in", "working at", "founded in", "visited"]

# Generate sentences
sentences = []
tags = []

for _ in range(100):
    name = random.choice(names)
    company = random.choice(companies)
    location = random.choice(locations)
    year = random.choice(years)
    action = random.choice(actions)
    
    if action == "moved to" or action == "visited":
        sentence = f"{name} {action} {location}"
        tag = ["B-PER", "I-PER", "O", "B-LOC"]
    elif action == "born in":
        sentence = f"{name} was {action} {year}"
        tag = ["B-PER", "I-PER", "O", "O", "O", "B-TIME"]
    elif action == "working at":
        sentence = f"{name} is {action} {company}"
        tag = ["B-PER", "I-PER", "O", "O", "B-ORG"]
    elif action == "founded in":
        sentence = f"{company} was {action} {location} in {year}"
        tag = ["B-ORG", "O", "O", "B-LOC", "O", "B-TIME"]
    
    sentences.append(sentence)
    tags.append(tag)

# Check some samples
for i in range(5):
    print(sentences[i])
    print(tags[i])


# Vocabulary
# Initialize vocab with 'PAD' token
vocab = {"PAD": 0}

# Update vocab with words from the new sentences
for sentence in sentences:
    for word in sentence.split():
        if word not in vocab:
            vocab[word] = len(vocab)

# The tag_index remains the same as your entity types haven't changed.
tag_index = {"O": 0, "B-PER": 1, "I-PER": 2, "B-LOC": 3, "B-TIME": 4, "B-ORG": 5, "PAD": 6}

# Just to check
print(f"Size of vocab: {len(vocab)}")
print(f"Sample vocab entries: {list(vocab.items())[:10]}")
print(f"Tag Index: {tag_index}")



Carol Danvers is working at Satyamev Jayate
['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
MS Dhoni is working at Shoonya
['B-PER', 'I-PER', 'O', 'O', 'B-ORG']
MS Dhoni moved to Delhi
['B-PER', 'I-PER', 'O', 'B-LOC']
MS Dhoni moved to Mumbai
['B-PER', 'I-PER', 'O', 'B-LOC']
Aamir Khan visited Bangalore
['B-PER', 'I-PER', 'O', 'B-LOC']
Size of vocab: 39
Sample vocab entries: [('PAD', 0), ('Carol', 1), ('Danvers', 2), ('is', 3), ('working', 4), ('at', 5), ('Satyamev', 6), ('Jayate', 7), ('MS', 8), ('Dhoni', 9)]
Tag Index: {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'B-TIME': 4, 'B-ORG': 5, 'PAD': 6}


In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

max_len = max([len(s.split()) for s in sentences])

X = [[vocab[w] for w in s.split()] for s in sentences]
X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=vocab["PAD"])

y = [[tag_index[w] for w in s] for s in tags]
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag_index["PAD"])
y = [to_categorical(i, num_classes=len(tag_index)) for i in y]


In [24]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Input, Bidirectional

input = Input(shape=(max_len,))
model = Embedding(input_dim=len(vocab), output_dim=50)(input)
model = Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(len(tag_index), activation="softmax"))(model)

model = Model(input, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()


In [None]:
model.fit(X, np.array(y), batch_size=32, epochs=1000, validation_split=0.1, verbose=1)


Epoch 1/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 199ms/step - accuracy: 0.2952 - loss: 1.9354 - val_accuracy: 0.4500 - val_loss: 1.9069
Epoch 2/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4830 - loss: 1.8968 - val_accuracy: 0.4750 - val_loss: 1.8629
Epoch 3/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4939 - loss: 1.8480 - val_accuracy: 0.4625 - val_loss: 1.8028
Epoch 4/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.4970 - loss: 1.7785 - val_accuracy: 0.4500 - val_loss: 1.7201
Epoch 5/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.4799 - loss: 1.6815 - val_accuracy: 0.4500 - val_loss: 1.6108
Epoch 6/1000
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.4731 - loss: 1.5654 - val_accuracy: 0.4625 - val_loss: 1.4782
Epoch 7/1000
[1m3/3[0m [32m━━━

In [20]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def preds_to_labels(predictions, index_tag):
    pred_tags = []
    for pred in predictions:
        pred_tags.append([index_tag[np.argmax(p)] for p in pred])
    return pred_tags

# Predict
preds = model.predict(X)

# Convert predictions and true labels to tags
pred_labels = preds_to_labels(preds, {i: t for t, i in tag_index.items()})
true_labels = preds_to_labels(y, {i: t for t, i in tag_index.items()})

print('Pred Labels', pred_labels)

print('True Labels', true_labels)

# Calculate and print metrics
print(classification_report(true_labels, pred_labels))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Pred Labels [['B-PER', 'I-PER', 'O', 'O', 'B-LOC'], ['O', 'O', 'O', 'O', 'B-TIME'], ['I-PER', 'O', 'O', 'O', 'B-TIME']]
True Labels [['B-PER', 'I-PER', 'O', 'O', 'B-LOC'], ['O', 'O', 'O', 'O', 'B-TIME'], ['B-ORG', 'O', 'O', 'O', 'B-LOC']]
              precision    recall  f1-score   support

         LOC       1.00      0.50      0.67         2
         ORG       0.00      0.00      0.00         1
         PER       0.50      1.00      0.67         1
        TIME       0.50      1.00      0.67         1

   micro avg       0.60      0.60      0.60         5
   macro avg       0.50      0.62      0.50         5
weighted avg       0.60      0.60      0.53         5

