<a href="https://colab.research.google.com/github/nivetharaja26/Google_Colab/blob/main/NER_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, Bidirectional

# -----------------------
# 1. Sample dataset (CoNLL-style sentences)
# -----------------------
# Toy corpus: every sentence is a list of tokens.
sentences = [
    ["John", "lives", "in", "New", "York", "."],
    ["Mary", "works", "at", "Google", "."],
    ["Paris", "is", "in", "France", "."]
]

# BIO entity tags, aligned one-to-one with the tokens in `sentences`.
tags = [
    ["B-PER", "O", "O", "B-LOC", "I-LOC", "O"],
    ["B-PER", "O", "O", "B-ORG", "O"],
    ["B-LOC", "O", "O", "B-LOC", "O"]
]

# -----------------------
# 2. Build vocabularies
# -----------------------
# Sort the unique tokens/tags instead of iterating a bare set: the iteration
# order of a set of strings changes between interpreter runs (hash
# randomisation), which would assign different ids on every run and make
# results and saved artefacts impossible to reproduce.
words = sorted({w for s in sentences for w in s})
tags_flat = sorted({t for ts in tags for t in ts})

# Real words start at id 2; ids 0 and 1 are reserved for the padding and
# unknown-word placeholders assigned just below.
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

tag2idx = {t: i for i, t in enumerate(tags_flat)}

# Vocabulary sizes (n_words counts the two reserved entries as well).
n_words = len(word2idx)
n_tags = len(tag2idx)

# -----------------------
# 3. Encode words and tags
# -----------------------
# Map tokens to integer ids; anything missing from the vocabulary falls back
# to the reserved UNK id (named lookup instead of a magic number).
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in sentences]
y = [[tag2idx[t] for t in ts] for ts in tags]

# Fixed sequence length fed to the network. Shorter sentences are padded at
# the end; NOTE: pad_sequences also truncates anything longer than MAX_LEN,
# so raise this value for corpora with longer sentences.
MAX_LEN = 10
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])
# Padded label positions are filled with the "O" tag id.
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["O"])

# One-hot encode each label sequence and stack the results into one array.
y = np.array([to_categorical(seq, num_classes=n_tags) for seq in y])

# -----------------------
# 4. Train-test split
# -----------------------
# A fixed random_state keeps train/test membership identical across runs;
# without it every execution trains and evaluates on different sentences.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------
# 5. Build BiLSTM model
# -----------------------
model = Sequential()
# mask_zero=True treats id 0 (PAD) as a masked timestep for the layers that
# follow. The `input_length` argument is deprecated in Keras 3, so the
# sequence length is supplied through model.build() below instead.
model.add(Embedding(input_dim=n_words, output_dim=64, mask_zero=True))
# return_sequences=True keeps one output vector per token, which per-token
# tagging requires.
model.add(Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)))
# Apply the same softmax classifier over the tag set at every timestep.
model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
# Build explicitly so summary() can report output shapes and parameter counts
# before the first call to fit().
model.build(input_shape=(None, MAX_LEN))
model.summary()

# -----------------------
# 6. Train model
# -----------------------
# Fit on the training split and evaluate on the held-out split after every
# epoch; the returned History object records the per-epoch metrics.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=16,
    verbose=1,
)

# -----------------------
# 7. Prediction example
# -----------------------
# Reverse lookups: integer id -> tag string / word string.
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

i = 0  # index of the test sentence to inspect
# Slicing (rather than indexing) keeps the leading batch axis for predict().
p = model.predict(X_test[i:i + 1])
# Collapse the per-tag probabilities / one-hot labels to tag ids.
p = p.argmax(axis=-1)
true = y_test[i].argmax(axis=-1)

print("\nWord  ----  Predicted / True")
for w, pred, true_tag in zip(X_test[i], p[0], true):
    if w == 0:  # PAD position, nothing to report
        continue
    print(f"{idx2word[w]:10}  -->  {idx2tag[pred]} / {idx2tag[true_tag]}")




Epoch 1/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 9s 9s/step - accuracy: 0.0000e+00 - loss: 1.6151 - val_accuracy: 0.6000 - val_loss: 1.6044
Epoch 2/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 188ms/step - accuracy: 0.5000 - loss: 1.6070 - val_accuracy: 0.8000 - val_loss: 1.6013
Epoch 3/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 147ms/step - accuracy: 0.8000 - loss: 1.5982 - val_accuracy: 0.8000 - val_loss: 1.5981
Epoch 4/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 112ms/step - accuracy: 0.7500 - loss: 1.5899 - val_accuracy: 0.8000 - val_loss: 1.5950
Epoch 5/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 180ms/step - accuracy: 0.7500 - loss: 1.5812 - val_accuracy: 0.8000 - val_loss: 1.5917
Epoch 6/10
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 162ms/step - accuracy: 0.7500 - loss: 1.5731 - val_accuracy: 0.8000 - val_loss: 1.5884
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━