<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/NER-CoNLL-Dataset/ner_conll_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [1]:
!pip install -q datasets==3.6.0 seqeval

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


# Libs

In [17]:
import tensorflow as tf
import numpy as np

from tensorflow.keras import (layers, models,
                              regularizers, callbacks,
                              losses)
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Define Pipeline

In [18]:
def load_and_prepare_dataset():
    """
    Loads the standard CoNLL-2003 dataset.
    Tags: 0:O, 1:B-PER, 2:I-PER, 3:B-ORG, 4:I-ORG, 5:B-LOC, 6:I-LOC, 7:B-MISC, 8:I-MISC
    """
    print(f"[INFO] - Loading CoNLL dataset from HF...")
    dataset = load_dataset("conll2003")

    # Load and split datasets
    train_data = (dataset["train"]['tokens'], dataset["train"]["ner_tags"])
    test_data = (dataset["test"]['tokens'], dataset["test"]["ner_tags"])
    val_data = (dataset["validation"]['tokens'], dataset["validation"]["ner_tags"])

    # Get tags
    tags = dataset["train"].features['ner_tags'].feature.names

    return train_data, test_data, val_data, tags


def create_vocab(token_list):
    """
    Creates a mapping from word tokens to unique integers.
    Index 0 is reserved for [PAD], Index 1 for [UNK].
    """
    print(f"[INFO] - Creating vocab and word2idx from train dataset...")
    vocab = set(word for sentence in token_list for word in sentence)
    # Build lookup dict
    word2idx = {word: id + 2 for id, word in enumerate(vocab)}
    word2idx["[PAD]"] = 0
    word2idx["[UNK]"] = 1
    return word2idx

class BiLstmNerModel(tf.keras.Model):
    """
    Bidirectional LSTM architecture for Sequence Tagging.
    Uses Masking to ignore padded time-steps.
    """
    def __init__(self, vocab_size, num_tags, embedding_dim=128, units=128):
        super(BiLstmNerModel, self).__init__()
        self.embedding = layers.Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.bi_lstm = layers.Bidirectional(
            layers.LSTM(units, return_sequences=True)
        )
        self.dropout = layers.Dropout(0.3)
        self.classifier = layers.Dense(num_tags)

    def call(self, inputs, training=False):
        x = self.embedding(inputs)
        x = self.bi_lstm(x)

        if training:
            x = self.dropout(x, training=training)
        return self.classifier(x)


def get_stable_weighted_loss(class_weights):
    """
    Weighted Sparse Categorical Crossentropy normalized by batch weight sum.
    Prevents gradient explosion and metric collapse.
    """
    def loss_fn(y_true, y_pred):
        # Calculate raw cross entropy (per token)
        # from_logits=True is mandatory because we didn't add Softmax to the model
        cce = losses.SparseCategoricalCrossentropy(from_logits=True, reduction=None)
        raw_loss = cce(y_true, y_pred)

        # Assign weights to each ground-truth tag
        weights = tf.gather(class_weights, tf.cast(y_true, tf.int32))

        # Normalize: total_weighted_loss / sum_of_weights
        weighted_loss = tf.reduce_sum(raw_loss * weights)
        total_weight = tf.reduce_sum(weights)

        return weighted_loss / (total_weight + 1e-8)

    return loss_fn

In [19]:
def main():
    # Setup data
    (train_s, train_t), (val_s, val_t), (test_s, test_t), tag_names = load_and_prepare_dataset()
    word2idx = create_vocab(train_s)

    # Configuration
    MAX_LEN = 64
    VOCAB_SIZE = len(word2idx)
    NUM_TAGS = len(tag_names)
    BATCH_SIZE = 32

    print(f'[INFO] - Vocab Size: {VOCAB_SIZE} | Num Tags: {NUM_TAGS}')

    def vectorize_and_pad(sentences, tags_lists):
        X = [[word2idx.get(w, 1) for w in s] for s in sentences]
        X_p = pad_sequences(X, maxlen=MAX_LEN, padding="post")
        y_p = pad_sequences(tags_lists, maxlen=MAX_LEN, padding="post")
        return X_p, np.array(y_p)

    X_train, y_train = vectorize_and_pad(train_s, train_t)
    X_val, y_val = vectorize_and_pad(val_s, val_t)
    X_test, y_test = vectorize_and_pad(test_s, test_t)

    print(f'[INFO] X_train shape: {X_train.shape} | y_train shape: {y_train.shape}')

    # Handling Imabalce (Weigheting
    weights = np.ones(NUM_TAGS)
    weights[1:] = 5.0
    # class_weights = tf.constant(weights, dtype=tf.float32)

    # Instantiating the Model
    model = BiLstmNerModel(VOCAB_SIZE, NUM_TAGS)
    print("[INFO] - BiLSTM NER Model Instantiated...")
    weights = tf.constant([1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], dtype=tf.float32)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=get_stable_weighted_loss(weights),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
    )
    print("[INFO] - Model Summary:")
    model.summary()

    # Training
    print("[INFO] - Training Started...")
    model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=12,
        batch_size=BATCH_SIZE,
        callbacks = [
            callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1)
        ]
    )


    # Evaluation
    print("[INFO] - Evaluating Model...")
    y_pred = model.predict(X_test)
    pred_indices = np.argmax(y_pred, axis=-1)


    # Convert numeric IDs back to strings for seqeval
    def idx_to_tag_seq(indices_batch, original_sentences):
        converted = []
        for i, sentence in enumerate(original_sentences):
            length = len(sentence)
            # Only keep tags for actual words (ignore padding)
            tags = [tag_names[idx] for idx in indices_batch[i][:length]]
            converted.append(tags)
        return converted

    y_true_str = idx_to_tag_seq(y_test, test_s)
    y_pred_str = idx_to_tag_seq(pred_indices, test_s)

    # Industry standard NER Classification Report
    print("\nFinal Classification Report (seqeval):")
    print(classification_report(y_true_str, y_pred_str))

# Execution
main()

[INFO] - Loading CoNLL dataset from HF...
[INFO] - Creating vocab and word2idx from train dataset...
[INFO] - Vocab Size: 23625 | Num Tags: 9
[INFO] X_train shape: (14041, 64) | y_train shape: (14041, 64)
[INFO] - BiLSTM NER Model Instantiated...
[INFO] - Model Summary:


[INFO] - Training Started...
Epoch 1/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.6830 - loss: 0.3106 - val_accuracy: 0.1916 - val_loss: 0.1070 - learning_rate: 0.0010
Epoch 2/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.2239 - loss: 0.0419 - val_accuracy: 0.1951 - val_loss: 0.0968 - learning_rate: 0.0010
Epoch 3/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.2262 - loss: 0.0139 - val_accuracy: 0.1955 - val_loss: 0.0984 - learning_rate: 0.0010
Epoch 4/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.2255 - loss: 0.0070 - val_accuracy: 0.1949 - val_loss: 0.1033 - learning_rate: 5.0000e-04
[INFO] - Evaluating Model...
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step

Final Classification Report (seqeval):
              precision    recall  f1-score   support

         LOC       

# Functional Implementation

In [21]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, losses, callbacks, initializers
from datasets import load_dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import classification_report
import requests
import zipfile

## Functions

In [27]:
def download_glove():
    """Download GloVe 100D vectors"""
    # Define the directory where GloVe files will be extracted
    glove_dir = "glove_data"
    glove_file_path = os.path.join(glove_dir, "glove.6B.100d.txt")

    if not os.path.exists(glove_file_path):
        print(">> Downloading GloVe embeddings (please wait)...")
        if not os.path.exists(glove_dir):
            os.makedirs(glove_dir) # Create the directory if it doesn't exist

        url = "https://nlp.stanford.edu/data/glove.6B.zip"
        zip_file_name = os.path.join(glove_dir, "glove.6B.zip")
        r = requests.get(url)
        with open(zip_file_name, "wb") as f:
            f.write(r.content)
        with zipfile.ZipFile(zip_file_name, "r") as zip_ref:
            # Extract to the created directory
            zip_ref.extractall(glove_dir)
        print(">> GloVe embeddings downloaded and extracted.")
    return glove_file_path


def load_embedding_matrix(word2idx, embedding_path, embedding_dim=100):
    """Load GloVe file and map it to out specific vocabulary."""
    print(">>> Processing GloVe file...")
    embeddings_index = {}
    with open(embedding_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            embeddings_index[word] = coefs

    vocab_size = len(word2idx)
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    hits, misses = 0, 0
    for word, i in word2idx.items():
        embedding_vector = embeddings_index.get(word.lower())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            embedding_matrix[i] = np.random.normal(scale=0.1, size=(embedding_dim,))
            misses += 1

    print(f">>> Loaded {hits} words. {misses} words initialized randomly.")
    return embedding_matrix

def prepare_data():
    """Load CoNLL-2003 and perform tokenization/padding"""
    dataset = load_dataset("conll2003")
    tag_names = dataset["train"].features["ner_tags"].feature.names

    # Build Vocab
    train_tokens = dataset["train"]["tokens"]
    vocab = set(w for s in train_tokens for w in s)
    word2idx = {word: i + 2 for i, word in enumerate(sorted(list(vocab)))}
    word2idx["[PAD]"] = 0
    word2idx["[UNK]"] = 1

    MAX_LEN = 64

    def vectorize(split):
        X = [[word2idx.get(w, 1) for w in s] for s in dataset[split]["tokens"]]
        X_p = pad_sequences(X, maxlen=MAX_LEN, padding="post")
        y_p = pad_sequences(dataset[split]["ner_tags"], maxlen=MAX_LEN, padding="post", value=0)
        return X_p, np.asarray(y_p)

    X_train, y_train = vectorize('train')
    X_val, y_val     = vectorize('validation')
    X_test, y_test   = vectorize('test')

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), word2idx, tag_names, dataset['test']['tokens']


def get_stable_weighted_loss(class_weights):
    """
    Weighted Sparse Categorical Crossentropy normalized by batch weight sum.
    Prevents gradient explosion and metric collapse.
    """
    def loss_fn(y_true, y_pred):
        # Calculate raw cross entropy (per token)
        # from_logits=True is mandatory because we didn't add Softmax to the model
        cce = losses.SparseCategoricalCrossentropy(from_logits=True, reduction=None)
        raw_loss = cce(y_true, y_pred)

        # Assign weights to each ground-truth tag
        weights = tf.gather(class_weights, tf.cast(y_true, tf.int32))

        # Normalize: total_weighted_loss / sum_of_weights
        weighted_loss = tf.reduce_sum(raw_loss * weights)
        total_weight = tf.reduce_sum(weights)

        return weighted_loss / (total_weight + 1e-8)

    return loss_fn

def build_functional_ner(vocab_size, num_tags, maxlen, emb_matrix):
    """Construct Bi-LSTM model using Keras Functional API."""
    input_layer = layers.Input(shape=(maxlen,), name="input_ids")

    # Pre-trained Embedding Layer
    # mask_zero=True tells the model to ignore index 0 during all calculations
    x = layers.Embedding(
        input_dim=vocab_size,
        output_dim=100,
        embeddings_initializer=initializers.Constant(emb_matrix),
        trainable=False, # Freeze GloVe weights initially
        mask_zero=True,
        name="glove_embeddings"
    )(input_layer)

    # SpatialDropout1D is superior for sequential data (drops whole features)
    x = layers.SpatialDropout1D(0.3)(x)

    # Bidirectional LSTM for context awareness
    x = layers.Bidirectional(
        layers.LSTM(128, return_sequences=True),
        name="bidirectional_lstm"
    )(x)

    x = layers.Dropout(0.4)(x)

    # Classification Head: Outputting Logits (Linear activation)
    # This is more stable when paired with from_logits=True loss
    outputs = layers.Dense(num_tags, activation=None, name="logits_output")(x)

    model = models.Model(inputs=input_layer, outputs=outputs, name="Functional_NER_Model")
    return model

def main():
    # Load Data
    (X_train, y_train), (X_val, y_val), (X_test, y_test), word2idx, tag_names, test_tokens = prepare_data()

    # Setup Embeddings
    glove_path = download_glove()
    # glove_path = "glove.6B.100d.txt" # This line is redundant and should be removed
    if os.path.exists(glove_path):
        emb_matrix = load_embedding_matrix(word2idx, glove_path)
    else:
        print("!! GloVe file missing, using random initialization for training demonstration.")
        emb_matrix = np.random.uniform(-0.05, 0.05, (len(word2idx), 100))

    # Define Class Weights to boost Recall (Tag 0 is 'O', 1-8 are entities)
    # Higher weights (3.0) for entities forces the model to focus on them
    weights = tf.constant([1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0], dtype=tf.float32)

    # Build and Compile
    model = build_functional_ner(len(word2idx), len(tag_names), 64, emb_matrix)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005), # Slower LR for stability
        loss=get_stable_weighted_loss(weights),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy")]
    )

    model.summary()

    # Training with Callbacks
    print("\n>> Starting training...")
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=32,
        epochs=12,
        callbacks=[
            callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
            callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1)
        ]
    )

    # ======================================================================================
    # 6. EVALUATION
    # ======================================================================================
    print("\n>> Running final evaluation on test set...")
    raw_preds = model.predict(X_test)
    pred_ids = np.argmax(raw_preds, axis=-1)

    # Convert numeric IDs back to string labels for the seqeval report
    def decode_tags(ids_batch, tokens_batch):
        decoded_results = []
        for i, sentence in enumerate(tokens_batch):
            # Only decode tags for actual tokens (length-aware decoding)
            length = len(sentence)
            decoded_results.append([tag_names[idx] for idx in ids_batch[i][:length]])
        return decoded_results

    y_true_str = decode_tags(y_test, test_tokens)
    y_pred_str = decode_tags(pred_ids, test_tokens)

    print("\nClassification Report (Entity-Level):")
    print(classification_report(y_true_str, y_pred_str))


main()

>> Downloading GloVe embeddings (please wait)...
>> GloVe embeddings downloaded and extracted.
>>> Processing GloVe file...
>>> Loaded 21009 words. 2616 words initialized randomly.



>> Starting training...
Epoch 1/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 18ms/step - accuracy: 0.8313 - loss: 0.2960 - val_accuracy: 0.9789 - val_loss: 0.1522 - learning_rate: 5.0000e-04
Epoch 2/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9452 - loss: 0.1415 - val_accuracy: 0.9838 - val_loss: 0.1211 - learning_rate: 5.0000e-04
Epoch 3/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9821 - loss: 0.1172 - val_accuracy: 0.9860 - val_loss: 0.1059 - learning_rate: 5.0000e-04
Epoch 4/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.9854 - loss: 0.0961 - val_accuracy: 0.9873 - val_loss: 0.0977 - learning_rate: 5.0000e-04
Epoch 5/12
[1m439/439[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9870 - loss: 0.0860 - val_accuracy: 0.9880 - val_loss: 0.0912 - learning_rate: 5.0000e-04
Epoch 6/12
[1m439