<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/NER%20Identification%20-%20Annotated%20GMB%20Corpus/ner_dentification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Download Dataset

In [2]:
import os
from google.colab import userdata

os.environ["KAGGLE_USERNAME"] = userdata.get("KAGGLE_USERNAME")
os.environ["KAGGLE_KEY"] = userdata.get("KAGGLE_KEY")

!mkdir ~/kaggle
!cp kaggle.json ~/kaggle
!chmod 600 ~/kaggle/kaggle.json
#!/bin/bash
!kaggle datasets download shoumikgoswami/annotated-gmb-corpus
!unzip annotated-gmb-corpus.zip
!rm annotated-gmb-corpus.zip

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/kaggle/kaggle.json': No such file or directory
Dataset URL: https://www.kaggle.com/datasets/shoumikgoswami/annotated-gmb-corpus
License(s): DbCL-1.0
Downloading annotated-gmb-corpus.zip to /content
  0% 0.00/462k [00:00<?, ?B/s]
100% 462k/462k [00:00<00:00, 930MB/s]
Archive:  annotated-gmb-corpus.zip
  inflating: GMB_dataset.txt         


# Import Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load Data

In [4]:
df = pd.read_csv(
    "GMB_dataset.txt",
    sep='\t',
    header=0,
    encoding='latin1',
    names=['Index', 'Sentence #', 'Word', 'POS', 'Tag']
)

In [5]:
df.head()

Unnamed: 0,Index,Sentence #,Word,POS,Tag
0,0,1.0,Thousands,NNS,O
1,1,1.0,of,IN,O
2,2,1.0,demonstrators,NNS,O
3,3,1.0,have,VBP,O
4,4,1.0,marched,VBN,O


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66161 entries, 0 to 66160
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Index       66161 non-null  int64  
 1   Sentence #  66161 non-null  float64
 2   Word        66161 non-null  object 
 3   POS         66161 non-null  object 
 4   Tag         66161 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.5+ MB


# Preprocess Dataset

In [7]:
# Remove NaNs
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66161 entries, 0 to 66160
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Index       66161 non-null  int64  
 1   Sentence #  66161 non-null  float64
 2   Word        66161 non-null  object 
 3   POS         66161 non-null  object 
 4   Tag         66161 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 2.5+ MB


In [8]:
# Ensure only integers are in Sentence #
# Convert 'Sentence #' to string type before using .str accessor
df = df[df['Sentence #'].astype(str).str.contains('Sentence #', na=False) == False]

# Convert 'Sentence #' to integer, coercing errors and dropping resulting NaNs
df['Sentence #'] = pd.to_numeric(df['Sentence #'], errors='coerce')
df = df.dropna(subset=['Sentence #'])
df['Sentence #'] = df['Sentence #'].astype(int)

In [9]:
# Group the DataFrame by 'Sentence #' and aggregate 'Word' and 'Tag' into lists
grouped_data = df.groupby('Sentence #')[['Word', 'Tag']].agg(list)

# Create a list of tuples, where each tuple contains the list of words and list of tags for a sentence
all_sentences = [
    (row['Word'], row['Tag'])
    for index, row in grouped_data.iterrows()
]
print("Grouping complete.")

# Print the total number of sentences extracted
print(f"\nTotal number of sentences extracted: {len(all_sentences)}")

# If there are extracted sentences, print the first 3 and their lengths
if all_sentences:
    print("\nFirst 3 sentences:")
    print(all_sentences[:3])

    # Get the words and tags of the first sentence
    first_words, first_tags = all_sentences[0]
    # Print the length of words and tags in the first sentence
    print(f"\nLength of words in the first sentence: {len(first_words)}")
    print(f"Length of tags in the first sentence: {len(first_tags)}")

Grouping complete.

Total number of sentences extracted: 2999

First 3 sentences:
[(['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']), (['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['They', 'marched', 'from', 'the', 'Houses', 'of', 'Parliament', 'to', 'a', 'rally', 'in', 'Hyde', 'Park', '.'], ['O', 'O', 'O', 'O', 'O', 'O', '

# Vocab & Tokenization

In [10]:
# Extract all unique words and tags
all_words = set(word for sentence, tags in all_sentences for word in sentence)
all_tags = set(tag for sentence, tags in all_sentences for tag in tags)
print(f"Total unique words: {len(all_words)}")
print(f"Total unique tags: {len(all_tags)}")

Total unique words: 8766
Total unique tags: 17


In [11]:
# Mapping Dict
# Create a mapping from each unique word to an integer index.
# Start indexing from 2 to reserve 0 for padding and 1 for unknown words.
word_to_index = {word: i + 2 for i, word in enumerate(sorted(list(all_words)))}
word_to_index["<PAD>"] = 0 # Add a mapping for the padding token
word_to_index["<UNK>"] = 1 # Add a mapping for unknown words

# Create a mapping from each unique tag to an integer index.
# Start indexing from 1 to reserve 0 for padding.
tag_to_index = {tag: i + 1 for i, tag in enumerate(sorted(list(all_tags)))}
tag_to_index["<PAD>"] = 0 # Add a mapping for the padding tag

# Inveresed Dict for inference model
# Create an inverse mapping from integer index back to tag, useful for converting model predictions back to tags.
index_to_tag = {i: t for t, i in tag_to_index.items()}


# Split data to X/y
# Convert the sentences (list of words) into a list of lists of integer indices using the word_to_index mapping.
# If a word is not found in the mapping, use the index for the unknown token (1).
X = [[word_to_index.get(word, 1) for word in sentence] for sentence, tags in all_sentences]
# Convert the tags (list of tags) into a list of lists of integer indices using the tag_to_index mapping.
y = [[tag_to_index[tag] for tag in tags] for sentence, tags in all_sentences]

print("\nExample of Conversion")
print("Original sentence (words):", all_sentences[0][0])
print("Converted sentence (integers):", X[0])
print("-" * 30)
print("Original tags:", all_sentences[0][1])
print("Converted tags (integers):", y[0])


Example of Conversion
Original sentence (words): ['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
Converted sentence (integers): [2676, 6377, 4341, 5329, 6015, 8237, 1745, 8257, 6934, 8193, 8594, 5530, 1471, 3187, 4325, 8193, 8702, 6377, 745, 8360, 5121, 8191, 4143, 14]
------------------------------
Original tags: ['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
Converted tags (integers): [17, 17, 17, 17, 17, 17, 3, 17, 17, 17, 17, 17, 3, 17, 17, 17, 17, 17, 4, 17, 17, 17, 17, 17]


# Padding & Data Spliting

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


sentence_lengths = [len(s) for s in X]
MAX_LEN = int(np.percentile(sentence_lengths, 95))
print(f"Using MAX_LEN = {MAX_LEN} (Covers 95% of sentences)")

# Padding
X_padded = pad_sequences(
    sequences=X,
    maxlen=MAX_LEN,
    padding="post",
    value=word_to_index["<PAD>"]
)

y_padded = pad_sequences(
    sequences=y,
    maxlen=MAX_LEN,
    padding='post',
    value=tag_to_index['<PAD>']
)

# Split data to Train/Test
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_padded,
    test_size=0.2,
    random_state=42
)

# Increase dimension since "SparseCategoricalCrossentropys" expects it.
y_train = np.expand_dims(y_train, axis=-1)
y_test = np.expand_dims(y_test, axis=-1)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

Using MAX_LEN = 36 (Covers 95% of sentences)
X_train shape: (2399, 36)
y_train shape: (2399, 36, 1)
X_test shape: (600, 36)
y_test shape: (600, 36, 1)


# Download and Create GloVe Matrix

In [14]:
VOCAB_SIZE = len(word_to_index)  # Size of the vocabulary (number of unique words + padding and unknown tokens)
NUM_TAGS = len(tag_to_index)      # Number of unique tags (including padding tag)
# EMBEDDING_DIM = 200              # Dimension of the word embedding vectors
# Define the path for the GloVe embeddings file
GLOVE_PATH = 'glove.6B.100d.txt' # Use the 100-dimensional vectors
GLOVE_ZIP_PATH = 'glove.6B.zip'

# Check if the GloVe file already exists
if not os.path.exists(GLOVE_PATH):
    print("GloVe embeddings not found, downloading...")
    # Download GloVe embeddings
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    # Unzip the downloaded file
    !unzip glove.6B.zip
    # Clean up the zip file
    !rm glove.6B.zip
else:
    print("GloVe embeddings found, skipping download.")

# Load GloVe vectors
glove_vectors = {}
with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_vectors[word] = vector
print(f"Loaded {len(glove_vectors)} word vectors.")

# Create Embedding Matrix
print("--- Creating Embedding Matrix ---")
# Correcting EMBEDDING_DIM to match the loaded GloVe vectors (100)
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in word_to_index.items():
    vector = glove_vectors.get(word)
    if vector is not None:
        # Words present in both our vocabulary and GloVe
        embedding_matrix[i] = vector
    # Words not in GloVe (like <UNK> and <PAD>) will have zero vectors.
print("Embedding Matrix created.")

GloVe embeddings found, skipping download.
Loaded 400000 word vectors.
--- Creating Embedding Matrix ---
Embedding Matrix created.


# Model Definition

In [15]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Bidirectional,
                                     LSTM, TimeDistributed, Dense,
                                     Dropout)

# Define constants for model architecture
LSTM_UNITS = 256                 # Number of units in the LSTM layers

# Model Architecture
input_layer = Input(shape=(MAX_LEN,)) # Input layer that takes sequences of length MAX_LEN

# Embedding layer to convert input word indices into dense vectors
embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True,  # Masking is enabled to ignore padding (value 0)
    name="embedding_layer"
)(input_layer)

# First Bidirectional LSTM layer
# Processes the sequence in both forward and backward directions
# return_sequences=True ensures that the output is a sequence
# recurrent_dropout applies dropout to the recurrent connections
bilstm_layer_1 = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=True, recurrent_dropout=0.2),
    name="bilstm_layer_1"
)(embedding_layer)

# Second Bidirectional LSTM layer
# Stacks another BiLSTM layer on top of the first one
bilstm_layer_2 = Bidirectional(
    LSTM(units=LSTM_UNITS, return_sequences=True, recurrent_dropout=0.2),
    name="bilstm_layer_2"
)(bilstm_layer_1)

# Dropout layer to prevent overfitting
dropout_layer = Dropout(0.5, name="dropout_layer")(bilstm_layer_2)

# TimeDistributed Dense layer for output
# Applies the Dense layer to each timestep of the sequence
# The Dense layer has NUM_TAGS units with softmax activation for multi-class classification at each timestep
output_layer = TimeDistributed(
    Dense(NUM_TAGS, activation='softmax'),
    name="output_layer"
)(dropout_layer)

# Define the model by specifying the input and output layers
model_improved = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
# optimizer: Adam optimizer
# loss: SparseCategoricalCrossentropy is used for integer targets (tags)
# metrics: Accuracy is used to evaluate the model's performance
model_improved.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Display the model summary
model_improved.summary()

# Training

In [16]:
from tensorflow.keras.callbacks import EarlyStopping

# Define Early Stopping callback to prevent overfitting
# monitor: quantity to be monitored (validation loss)
# patience: number of epochs with no improvement after which training will be stopped
# restore_best_weights: whether to restore model weights from the epoch with the best value of the monitored quantity
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True
)

# Create sample weights to ignore padding in the loss calculation
# We assign a weight of 1.0 to non-padding tags (y_train > 0) and 0.0 to padding tags (y_train == 0)
sample_weights = np.where(y_train > 0, 1.0, 0.0)

# Train the model
# X_train, y_train: training data
# epochs: number of training epochs
# batch_size: number of samples per gradient update
# validation_data: data on which to evaluate the loss and any model metrics at the end of each epoch
# sample_weight: optional numpy array of weights for the training samples, used to weight the loss function
# callbacks: list of callbacks to apply during training
history = model_improved.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_test, y_test),
    sample_weight=sample_weights,
    callbacks=[early_stopping]
)

Epoch 1/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 566ms/step - accuracy: 0.5069 - loss: 0.9199 - val_accuracy: 0.5330 - val_loss: 0.4029
Epoch 2/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 510ms/step - accuracy: 0.5364 - loss: 0.3904 - val_accuracy: 0.5465 - val_loss: 0.3214
Epoch 3/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 524ms/step - accuracy: 0.5408 - loss: 0.3248 - val_accuracy: 0.5519 - val_loss: 0.2981
Epoch 4/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 520ms/step - accuracy: 0.5455 - loss: 0.2939 - val_accuracy: 0.5556 - val_loss: 0.2825
Epoch 5/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 517ms/step - accuracy: 0.5544 - loss: 0.2651 - val_accuracy: 0.5553 - val_loss: 0.2729
Epoch 6/20
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 519ms/step - accuracy: 0.5525 - loss: 0.2611 - val_accuracy: 0.5593 - val_loss: 0.2600
Epoch 7/20
[1m75/75[

# Evaluation

In [17]:
# Evaluate the model on the test data
loss, accuracy = model_improved.evaluate(X_test, y_test, verbose=0)

# Print the test loss and accuracy
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Loss: 0.2471
Test Accuracy: 56.06%


In [32]:
from sklearn.metrics import classification_report

# Make predictions on the test data
predictions = model_improved.predict(X_test)

# Get the index of the tag with the highest probability for each word in each sentence
predicted_tags_indices = np.argmax(predictions, axis=-1)

# Remove the extra dimension from the true tags for easier comparison
true_tags_indices = np.squeeze(y_test)

# Flatten the true and predicted tags, excluding padding
y_true_flat = []
y_pred_flat = []

# Iterate through each sentence in the test set
for i in range(len(true_tags_indices)): # For each sentence
    # Iterate through each token (word) in the current sentence
    for j in range(len(true_tags_indices[i])): # For each token in the sentence
        # Check if the true tag is not a padding tag
        if true_tags_indices[i][j] != tag_to_index['<PAD>']:
            # Append the true tag (converted back to string) to the flattened list
            y_true_flat.append(index_to_tag[true_tags_indices[i][j]])
            # Append the predicted tag (converted back to string) to the flattened list
            y_pred_flat.append(index_to_tag[predicted_tags_indices[i][j]])

# Get the names of the tags, excluding the padding tag
tag_names = [tag for tag, index in tag_to_index.items() if index != 0]

# Print the detailed classification report
print("\n--- Detailed Classification Report ---")
print(classification_report(y_true_flat, y_pred_flat, labels=tag_names))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 81ms/step

--- Detailed Classification Report ---
              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        13
       B-eve       1.00      0.10      0.18        10
       B-geo       0.60      0.74      0.66       409
       B-gpe       0.51      0.47      0.49       257
       B-nat       0.00      0.00      0.00         8
       B-org       0.40      0.28      0.33       248
       B-per       0.58      0.62      0.60       215
       B-tim       0.69      0.58      0.63       219
       I-art       0.00      0.00      0.00         7
       I-eve       1.00      0.20      0.33        10
       I-geo       0.46      0.47      0.46        77
       I-gpe       0.00      0.00      0.00        10
       I-nat       0.00      0.00      0.00         5
       I-org       0.53      0.54      0.54       182
       I-per       0.70      0.79      0.74       252
       I-tim       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
