# Home Exercise on Named Entity Recognition

Implement a **Recurrent Neural Network model** (**[Bidirectional LSTM-CRF Models for Sequence Tagging](https://arxiv.org/pdf/1508.01991)**) to extract named entities from text. Entity labels are encoded using the **BIO notation**, where the label of each word in an entity is prefixed with **B-** (Beginning) or **I-** (Inside). The **B-** tag marks the first word of an entity, while the **I-** tag marks the following words inside the same entity.

These tags help identify multi-word entities. For example, in the phrase **"World War II"**, the labels would be: **(B-eve, I-eve, I-eve)**. Words that do not belong to any entity are labeled as **O** (Outside).

- **Data**: [Annotated GMB Corpus](https://www.kaggle.com/datasets/shoumikgoswami/annotated-gmb-corpus?select=GMB_dataset.txt) *(the last 10% of rows serve as the test set).*

**Note**: Submit only a **single Jupyter Notebook file** that can handle all tasks, including data downloading, preprocessing, model training, and model evaluation. *(Submissions that do not follow the guidelines will receive a score of 0.)*

## Grading Criteria

For valid submissions, scores will be assigned based on the **leaderboard ranking** (**strictly greater**):

- **Top 25%** → **10 points**
- **25% - 50%** → **9.0 points**
- **50% - 75%** → **8.0 points**
- **75% - 100%** → **7.0 points**


In [None]:
# Install the third-party packages imported by the cells below.
%pip install numpy pandas gdown tensorflow keras-crf matplotlib scikit-learn


In [None]:
import os

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras_crf import CRF
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, TimeDistributed, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical



In [None]:
# Local directory that receives the downloaded files
output_dir = "./NLP_Data_GDrive"

# Shared Google Drive folder to download
folder_url = "https://drive.google.com/drive/folders/1K1sEol_XTBWkcUwU7Yhi42--L0FyviMK?usp=sharing"

# Fetch the whole folder into `output_dir`, printing progress and without
# using cookies
gdown.download_folder(
    folder_url,
    output=output_dir,
    quiet=False,
    use_cookies=False,
)

In [None]:
# Load the dataset file from the download directory (`output_dir`).
file_path = os.path.join(output_dir, "GMB_dataset.txt")  # Adjust file path if needed

# Fail early with an explicit error if the expected file is not on disk.
if not os.path.exists(file_path):
    raise FileNotFoundError("File not found. Check the download process and file paths.")

# The file is read as tab-separated; its first row is skipped and the four
# columns are named explicitly.
df = pd.read_csv(
    file_path,
    delimiter="\t",
    names=["Sentence#", "Word", "POS", "Tag"],
    skiprows=1,
)

# Forward-fill missing Sentence# values so every row carries its sentence id.
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in
# pandas 2.x.
df["Sentence#"] = df["Sentence#"].ffill()

# Display first few rows
print(df.head())


In [None]:
# Group words and tags by sentence. Both selections come from the same
# groupby, so sentences[i] and tags[i] describe the same sentence.
by_sentence = df.groupby("Sentence#")
sentences = by_sentence["Word"].apply(list).values
tags = by_sentence["Tag"].apply(list).values

# Build the vocabulary and the tag inventory in first-appearance order.
# dict.fromkeys() de-duplicates while keeping insertion order, so the indices
# are the same on every run; list(set(...)) ordering can change between
# interpreter sessions because string hashing is randomized.
words = list(dict.fromkeys(df["Word"].values))
words.append("PAD")  # Padding token, placed last in the vocabulary
n_words = len(words)

tags_set = list(dict.fromkeys(df["Tag"].values))
n_tags = len(tags_set)

# Word-to-index and index-to-word mappings
word2idx = {w: i for i, w in enumerate(words)}
idx2word = {i: w for w, i in word2idx.items()}

# Tag-to-index and index-to-tag mappings
tag2idx = {t: i for i, t in enumerate(tags_set)}
idx2tag = {i: t for t, i in tag2idx.items()}

print(f"Vocabulary size: {n_words}, Number of Tags: {n_tags}")


In [None]:
MAX_LEN = 50  # Fixed sequence length after padding/truncation; adjust as needed

# Convert words to indices and pad every sentence to MAX_LEN with the PAD index
X = [[word2idx[w] for w in s] for s in sentences]
X = pad_sequences(maxlen=MAX_LEN, sequences=X, padding="post", value=word2idx["PAD"])

# Convert tags to indices; padded positions are labelled "O"
y = [[tag2idx[t] for t in s] for s in tags]
y = pad_sequences(maxlen=MAX_LEN, sequences=y, padding="post", value=tag2idx["O"])

# Convert labels to categorical (one-hot encoding), one matrix per sentence
y = [to_categorical(i, num_classes=n_tags) for i in y]

# Train-test split (90% train, 10% test).
# The assignment designates the LAST 10% of the data as the test set, so the
# split must not shuffle: with shuffle=False the final 10% of sequences are
# held out. This holds out whole sentences, approximating a row-level split.
# NOTE(review): assumes `sentences` is in corpus order — confirm that the
# groupby key order matches the file order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

print(f"Training Samples: {len(X_train)}, Testing Samples: {len(X_test)}")


In [None]:
EMBEDDING_DIM = 100  # Length of each word vector

# Input: one padded sequence of MAX_LEN word indices per sample
input_layer = Input(shape=(MAX_LEN,))

# Word indices -> dense vectors
embedding_layer = Embedding(input_dim=n_words, output_dim=EMBEDDING_DIM, input_length=MAX_LEN)
embedding = embedding_layer(input_layer)

# Bidirectional LSTM that returns an output for every time step
recurrent_layer = LSTM(units=64, return_sequences=True, dropout=0.5, recurrent_dropout=0.25)
bi_lstm = Bidirectional(recurrent_layer)(embedding)

# Per-time-step projection to one value per tag
tag_projection = Dense(n_tags, activation="relu")
dense = TimeDistributed(tag_projection)(bi_lstm)

# CRF layer on top of the per-tag projections
# NOTE(review): `loss_function` and `accuracy` are assumed to be attributes of
# the imported CRF class — confirm against the installed keras_crf version.
crf = CRF(n_tags)
output_layer = crf(dense)

# Assemble and compile the model with the CRF's own loss and accuracy
model = Model(input_layer, output_layer)
model.compile(
    optimizer="adam",
    loss=crf.loss_function,
    metrics=[crf.accuracy],
)

model.summary()


In [None]:
# Training hyper-parameters
BATCH_SIZE = 32
EPOCHS = 5

# Stack the training labels into a single array before handing them to fit()
train_labels = np.array(y_train)

# validation_split=0.1 reserves a tenth of the training data for validation
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
)

history = model.fit(X_train, train_labels, **fit_options)


In [None]:
# Evaluate on Test Set
test_pred = model.predict(X_test)

# Convert per-token vectors to tag labels by taking the arg-max over tags
pred_tags = [[idx2tag[np.argmax(tag)] for tag in sentence] for sentence in test_pred]
true_tags = [[idx2tag[np.argmax(tag)] for tag in sentence] for sentence in y_test]

# Token-level accuracy over real tokens only. Padded positions hold the "PAD"
# word index in X_test and were labelled "O" during preprocessing; counting
# them would inflate the score, so they are skipped.
pad_idx = word2idx["PAD"]
n_correct = 0
n_tokens = 0
for word_ids, predicted, expected in zip(X_test, pred_tags, true_tags):
    for word_id, pred, true in zip(word_ids, predicted, expected):
        if word_id == pad_idx:
            continue
        n_tokens += 1
        if pred == true:
            n_correct += 1

# Guard against an empty test set to avoid dividing by zero
accuracy = n_correct / n_tokens if n_tokens else 0.0
print(f"Test Accuracy: {accuracy:.4f}")
