<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/NER-WikiANN-Dataset/NER_WikiANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Dataset

In [10]:
!wget https://raw.githubusercontent.com/davidsbatista/NER-datasets/refs/heads/master/WikiANN/en/dev
!wget https://raw.githubusercontent.com/davidsbatista/NER-datasets/refs/heads/master/WikiANN/en/test
!wget https://raw.githubusercontent.com/davidsbatista/NER-datasets/refs/heads/master/WikiANN/en/train

--2026-02-18 22:04:24--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/refs/heads/master/WikiANN/en/dev
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002261 (979K) [text/plain]
Saving to: ‘dev.1’


2026-02-18 22:04:24 (31.1 MB/s) - ‘dev.1’ saved [1002261/1002261]

--2026-02-18 22:04:24--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/refs/heads/master/WikiANN/en/test
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1000256 (977K) [text/plain]
Saving to: ‘test.1’


2026-02-18 22:04:24 (26.8 MB/s) - ‘

# Imports

In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import numpy as np

# Functions

In [21]:
def load_dataset(file_path):
    """Loads a dataset from a given file path into a list of lines.

    Args:
        file_path (str): The path to the dataset file.

    Returns:
        list: A list of strings, where each string is a line from the file.
    """
    with open(file_path) as f:
        file_set = f.readlines()
    return file_set

def prepare_split(file_set):
    """Processes a raw dataset (list of lines) into tokenized sentences and their corresponding labels.
    Each sentence is separated by a blank line in the input file.

    Args:
        file_set (list): A list of strings, each representing a line from the raw dataset file.

    Returns:
        tuple: A tuple containing two lists:
            - sentences_tokens (list[list[str]]): A list of sentences, where each sentence is a list of tokens.
            - sentences_labels (list[list[str]]): A list of sentences, where each sentence is a list of labels.
    """
    sentences_tokens = []
    sentences_labels = []
    current_sentence_tokens = []
    current_sentence_labels = []

    for line in file_set:
        line = line.strip()
        if not line:  # An empty line signals the end of a sentence
            if current_sentence_tokens:  # Add the collected sentence if it's not empty
                sentences_tokens.append(current_sentence_tokens)
                sentences_labels.append(current_sentence_labels)
            # Reset for the next sentence
            current_sentence_tokens = []
            current_sentence_labels = []
        else:
            # Ensure the line has the expected 'lang:token label' format before splitting
            parts = line.replace("\t", " ").split(" ", 1)
            if len(parts) == 2 and ':' in parts[0]:
                token_part = parts[0]
                label_part = parts[1]

                # Extract the token, removing the language prefix and any surrounding quotes
                token = token_part.split(":", 1)[1].strip("'\".")
                label = label_part

                current_sentence_tokens.append(token)
                current_sentence_labels.append(label)

    # Add the very last sentence if the file doesn't end with an empty line
    if current_sentence_tokens:
        sentences_tokens.append(current_sentence_tokens)
        sentences_labels.append(current_sentence_labels)

    return sentences_tokens, sentences_labels

def create_vocabulary(split):
    """Generates a sorted list of unique tokens (vocabulary) from a list of tokenized sentences.

    Args:
        split (list[list[str]]): A list of sentences, where each sentence is a list of tokens.

    Returns:
        list: A sorted list of unique tokens, representing the vocabulary.
    """
    # Flatten the list of lists into a single list of tokens for vocabulary creation
    all_tokens = [token for sentence in split for token in sentence]
    return sorted(list(set(all_tokens)))

def get_tag_list(train_tags):
    """Extracts and sorts a list of all unique named entity tags from a list of tag sequences.

    Args:
        train_tags (list[list[str]]): A list of tag sequences, where each sequence is a list of tags for a sentence.

    Returns:
        list: A sorted list of unique named entity tags.
    """
    # Flatten the list of lists into a single list of tags to find all unique tags
    all_tags = [tag for sentence_tags in train_tags for tag in sentence_tags]
    return sorted(list(set(all_tags)))

def create_word_indices(vocab):
    """Creates word-to-index and index-to-word mappings for the vocabulary.
    Special tokens [PAD] and [UNK] are added at indices 0 and 1, respectively.

    Args:
        vocab (list): A sorted list of unique words (the vocabulary).

    Returns:
        tuple: A tuple containing two dictionaries:
            - word2idx (dict): Maps words to their integer indices.
            - idx2word (dict): Maps integer indices back to words.
    """
    # Assign indices to words, starting from 2 to reserve 0 and 1 for special tokens
    word2idx = {word: idx + 2 for idx, word in enumerate(vocab)}
    word2idx["[PAD]"] = 0  # Padding token for shorter sequences
    word2idx["[UNK]"] = 1  # Unknown token for words not in the vocabulary
    idx2word = {idx: word for word, idx in word2idx.items()}
    return word2idx, idx2word

def create_tag_indices(tags):
    """Creates tag-to-index and index-to-tag mappings for the named entity tags.

    Args:
        tags (list): A sorted list of unique named entity tags.

    Returns:
        tuple: A tuple containing two dictionaries:
            - tag2idx (dict): Maps tags to their integer indices.
            - idx2tag (dict): Maps integer indices back to tags.
    """
    tag2idx = {tag: idx for idx, tag in enumerate(tags)}
    idx2tag = {idx: tag for tag, idx in tag2idx.items()}
    return tag2idx, idx2tag

def preprocess_and_padding(tokens_list_of_sentences, tags_list_of_sentences, word2idx, tag2idx, maxlen):
    """Converts token and tag sequences to numerical representations and applies padding/truncation.

    Args:
        tokens_list_of_sentences (list[list[str]]): A list of tokenized sentences.
        tags_list_of_sentences (list[list[str]]): A list of tag sequences for the sentences.
        word2idx (dict): Dictionary mapping words to integer indices.
        tag2idx (dict): Dictionary mapping tags to integer indices.
        maxlen (int): The maximum length for sequences. Shorter sequences are padded; longer ones are truncated.

    Returns:
        tuple: A tuple containing two numpy arrays:
            - X_padded (np.array): Padded and numerical token sequences.
            - y_padded (np.array): Padded and numerical tag sequences.
    """
    X_padded = []
    for sentence_tokens in tokens_list_of_sentences:
        # Convert tokens to their integer IDs, using [UNK] for unknown words
        encoded_tokens = [word2idx.get(token, word2idx["[UNK]"]) for token in sentence_tokens]

        if len(encoded_tokens) > maxlen:
            # Truncate if the sequence is longer than maxlen
            X_padded.append(encoded_tokens[:maxlen])
        else:
            # Pad with [PAD] token index (0) if the sequence is shorter
            padded_tokens = encoded_tokens + [word2idx["[PAD]"]] * (maxlen - len(encoded_tokens))
            X_padded.append(padded_tokens)

    y_padded = []
    # The 'O' (Outside) tag is commonly used for padding labels. Find its index.
    pad_tag_idx = tag2idx['O']  # Assuming 'O' is always present and appropriate for padding

    for sentence_tags in tags_list_of_sentences:
        # Convert tags to their integer IDs
        encoded_tags = [tag2idx[tag] for tag in sentence_tags]

        if len(encoded_tags) > maxlen:
            # Truncate if the sequence is longer than maxlen
            y_padded.append(encoded_tags[:maxlen])
        else:
            # Pad with 'O' tag index if the sequence is shorter
            padded_tags = encoded_tags + [pad_tag_idx] * (maxlen - len(encoded_tags))
            y_padded.append(padded_tags)

    return np.array(X_padded), np.array(y_padded)

def build_model(vocab_size, tag_output_dim, embedding_dim, lstm_units, maxlen):
    """Constructs a Bidirectional LSTM model for Named Entity Recognition.

    The model consists of:
    - An Embedding layer to convert integer-encoded words into dense vectors.
    - A Bidirectional LSTM layer to capture context from both directions of a sequence.
    - A Dense layer with softmax activation for multi-class classification (tag prediction).

    Args:
        vocab_size (int): The total number of unique words in the vocabulary (including [PAD] and [UNK]).
        tag_output_dim (int): The total number of unique named entity tags.
        embedding_dim (int): The dimensionality of the word embedding vectors.
        lstm_units (int): The number of units in the LSTM layer.
        maxlen (int): The maximum length of input sequences.

    Returns:
        tensorflow.keras.Model: The compiled Keras model ready for training.
    """
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Bidirectional(LSTM(lstm_units, return_sequences=True)),
        Dense(tag_output_dim, activation='softmax')
    ])
    # Explicitly build the model to determine output shapes and parameter counts
    model.build(input_shape=(None, maxlen))  # (batch_size, sequence_length)

    model.compile(
        optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

def train_model(model, X_train, y_train, X_dev, y_dev, callbacks, epochs=5, batch_size=32):
    """Trains the provided Keras model using the training and development datasets.

    Args:
        model (tensorflow.keras.Model): The compiled Keras model to be trained.
        X_train (np.array): Training data (numerical token sequences).
        y_train (np.array): Training labels (numerical tag sequences).
        X_dev (np.array): Development/validation data (numerical token sequences).
        y_dev (np.array): Development/validation labels (numerical tag sequences).
        callbacks (list): A list of Keras callbacks to apply during training (e.g., ModelCheckpoint, EarlyStopping).
        epochs (int, optional): The number of training epochs. Defaults to 5.
        batch_size (int, optional): The number of samples per gradient update. Defaults to 32.

    Returns:
        tensorflow.keras.callbacks.History: A History object containing training metrics.
    """
    history = model.fit(X_train, y_train,
              validation_data=(X_dev, y_dev),
              epochs=epochs,
              batch_size=batch_size,
              callbacks=callbacks,
              verbose=1)
    return model

# Execution

In [13]:
# Load Datasets
train = load_dataset("/content/train")
test = load_dataset("/content/test")
dev = load_dataset("/content/dev")

# Split datasets into sentences
train_sentences, train_tags = prepare_split(train)
test_sentences, test_tags = prepare_split(test)
dev_sentences, dev_tags = prepare_split(dev)

print(f"Train Sentences: {len(train_sentences)} \tTrain Tags: {len(train_tags)}")
print(f"Test Sentences: {len(test_sentences)} \tTest Tags: {len(test_tags)}")
print(f"Dev Sentences: {len(dev_sentences)} \tDev Tags: {len(dev_tags)}")

Train Sentences: 20000 	Train Tags: 20000
Test Sentences: 10000 	Test Tags: 10000
Dev Sentences: 10000 	Dev Tags: 10000


In [14]:
# Get tags list
tags = get_tag_list(train_tags)
print(f"Number of Tags: {len(tags)}")
# Create vocabulary
vocab = create_vocabulary(train_sentences)
print(f"Length of Vocabulary: {len(vocab)}")

Number of Tags: 7
Length of Vocabulary: 33393


In [15]:
# Generate word2idx and idx2word
word2idx, idx2word = create_word_indices(vocab)

# Generate tag2idx and idx2tag
tag2idx, idx2tag = create_tag_indices(tags)

print(f"[PAD] index: {word2idx.get('[PAD]', 'Not Found')}")
print(f"[UNK] index: {word2idx.get('[UNK]', 'Not Found')}")

# Creating X/ y sets for all splits
X_train, y_train = preprocess_and_padding(train_sentences, train_tags, word2idx, tag2idx, maxlen=100)
X_test, y_test = preprocess_and_padding(test_sentences, test_tags, word2idx, tag2idx, maxlen=100)
X_dev, y_dev = preprocess_and_padding(dev_sentences, dev_tags, word2idx, tag2idx, maxlen=100)

# Check shapes
print(f"X_Train shape: {X_train.shape}\ty_Train shape: {y_train.shape}")
print(f"X_test Shape: {X_test.shape}\ty_test shape: {y_test.shape}")
print(f"X_dev Shape: {X_dev.shape}\ty_dev Shape: {y_dev.shape}")

[PAD] index: 0
[UNK] index: 1
X_Train shape: (20000, 100)	y_Train shape: (20000, 100)
X_test Shape: (10000, 100)	y_test shape: (10000, 100)
X_dev Shape: (10000, 100)	y_dev Shape: (10000, 100)


In [30]:
# Define constants
vocab_size = len(vocab) + 2 # +2 for [PAD] and [UNK] tokens
tag_output_dim = len(tags)
maxlen = 100
embedding_dim = 100
lstm_units = 128

# Build the model using the function
model = build_model(vocab_size, tag_output_dim, embedding_dim, lstm_units, maxlen)

# Display the model summary
model.summary()

In [31]:
# Set up callbacks to control training behavior and save the best model
model_callbacks = [
    # Save the best model based on validation loss
    ModelCheckpoint(
        filepath='model.weights.keras',
        save_best_only=True,
        monitor='val_loss',
    ),
    # Stop training early if validation loss doesn't improve for 3 epochs
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
    )
]

# Train the model using our prepared data and callbacks
history = train_model(
    model, X_train, y_train,
    X_dev, y_dev, model_callbacks, epochs=12, batch_size=32)

Epoch 1/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 18ms/step - accuracy: 0.9541 - loss: 0.1541 - val_accuracy: 0.9825 - val_loss: 0.0532
Epoch 2/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9880 - loss: 0.0387 - val_accuracy: 0.9866 - val_loss: 0.0402
Epoch 3/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9953 - loss: 0.0165 - val_accuracy: 0.9878 - val_loss: 0.0382
Epoch 4/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 18ms/step - accuracy: 0.9976 - loss: 0.0084 - val_accuracy: 0.9875 - val_loss: 0.0438
Epoch 5/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 17ms/step - accuracy: 0.9987 - loss: 0.0050 - val_accuracy: 0.9870 - val_loss: 0.0486
Epoch 6/12
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - accuracy: 0.9991 - loss: 0.0034 - val_accuracy: 0.9870 - val_loss: 0.0523


# Check Classification Report

In [32]:
# Get model predictions on the test set
y_pred_probs = model.predict(X_test)

# Convert predicted probabilities to actual tag labels
y_pred = np.argmax(y_pred_probs, axis=-1)

# Get the index of the 'O' (Outside) tag, used for padding
O_tag_idx = tag2idx['O']

# Prepare true and predicted labels, ignoring padding tokens for evaluation
y_true_flat = []
y_pred_flat = []

for i in range(len(y_test)):
    for j in range(len(y_test[i])):
        # Only evaluate non-padding tokens
        if y_test[i][j] != O_tag_idx:
            y_true_flat.append(y_test[i][j])
            y_pred_flat.append(y_pred[i][j])

# Get tag names and their numerical indices for the classification report (excluding 'O')
true_tags = [idx2tag[i] for i in sorted(tag2idx.values()) if idx2tag[i] != 'O']
report_labels = [i for i in sorted(tag2idx.values()) if i != O_tag_idx]

# Print the detailed classification report
print("\nClassification Report on Test Set (excluding padding tokens):\n")
print(classification_report(y_true_flat, y_pred_flat, target_names=true_tags, labels=report_labels))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step

Classification Report on Test Set (excluding padding tokens):

              precision    recall  f1-score   support

       B-LOC       0.78      0.74      0.76      4657
       B-ORG       0.73      0.65      0.69      4745
       B-PER       0.79      0.85      0.82      4521
       I-LOC       0.80      0.71      0.76      6447
       I-ORG       0.83      0.73      0.78     11607
       I-PER       0.84      0.79      0.82      7437

   micro avg       0.81      0.74      0.77     39414
   macro avg       0.80      0.75      0.77     39414
weighted avg       0.81      0.74      0.77     39414

