In [2]:
from charset_normalizer import detect

with open('ner_dataset.csv', 'rb') as f:
    result = detect(f.read(50000))
    print(result)  # This will suggest the file's encoding

{'encoding': 'windows-1250', 'language': 'English', 'confidence': 0.9926}


In [3]:
# =============================================================================
# Import Libraries
# =============================================================================
import pandas as pd                              # For data loading and manipulation
from sklearn.model_selection import train_test_split  # To split the data into train/val/test sets
from collections import defaultdict, Counter     # For building a frequency-based baseline

import tensorflow as tf                          # TensorFlow and Keras for model building
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.models import Model
import tensorflow_addons as tfa                   # For CRF layer implementation

from seqeval.metrics import classification_report  # For evaluation (entity-level)

# =============================================================================
# 1. Data Preprocessing
# =============================================================================

# Read the dataset
df = pd.read_csv("ner_dataset.csv", encoding="windows-1250")     # Charset codecs Windows-1252 is a legacy single-byte 
df.head()                                                        # ..character encoding that is used by default in MS-Windows

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
# Fill missing sentence numbers (if needed)
df['Sentence #'] = df['Sentence #'].fillna(method='ffill')
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
print(df.shape)
df.isnull().sum()

(1048575, 4)


Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [6]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,...,Sentence: 47958,Sentence: 47958,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [8]:
# Group words and their tags by sentence
sentences = df.groupby('Sentence #').apply(
    lambda x: list(zip(x['Word'].values.tolist(), x['Tag'].values.tolist()))
).tolist()

print(sentences[:3])                         # list of list of tuples (word,tag) in sentences

[[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')], [('Iranian', 'B-gpe'), ('officials', 'O'), ('say', 'O'), ('they', 'O'), ('expect', 'O'), ('to', 'O'), ('get', 'O'), ('access', 'O'), ('to', 'O'), ('sealed', 'O'), ('sensitive', 'O'), ('parts', 'O'), ('of', 'O'), ('the', 'O'), ('plant', 'O'), ('Wednesday', 'B-tim'), (',', 'O'), ('after', 'O'), ('an', 'O'), ('IAEA', 'B-org'), ('surveillance', 'O'), ('system', 'O'), ('begins', 'O'), ('functioning', 'O'), ('.', 'O')], [('Helicopter', 'O'), ('gunships', 'O'), ('Saturday', 'B-tim'), ('pounded', 'O'), ('militant', 'O'), ('hideouts', 'O'), ('in', 'O'), ('the', 'O'), ('Orakzai', 'B-geo'), ('tribal', '

In [9]:
df['Sentence #'].nunique(), df.Word.nunique(), df.POS.nunique(), df.Tag.nunique()

(47959, 35178, 42, 17)

In [12]:
df.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [13]:
# =============================================================================
# 2. Splitting the Dataset
# =============================================================================

# First, split into train+validation and test (test is 20% of the data).
train_val, test = train_test_split(sentences, test_size=0.2, random_state=25)

# Then, split train+validation into training and validation sets.
train, validation = train_test_split(train_val, test_size=0.1, random_state=25)

In [14]:
len(train), len(validation), len(test)

(34530, 3837, 9592)

In [16]:
# =============================================================================
# 3. Baseline Model: Frequency-based Lookup
# =============================================================================

# Build a dictionary mapping each word (from training data) to its most frequent NER tag.
word_tag_freq = defaultdict(Counter)
for sentence in train:
    for word, tag in sentence:
        word_tag_freq[word][tag] += 1
        
word_tag_freq

defaultdict(collections.Counter,
            {'He': Counter({'O': 1182}),
             'is': Counter({'O': 4906}),
             'said': Counter({'O': 3790}),
             'to': Counter({'O': 16657, 'B-tim': 9, 'I-tim': 70, 'I-org': 5}),
             'be': Counter({'O': 1807}),
             'living': Counter({'O': 102}),
             'in': Counter({'O': 18926, 'B-tim': 47, 'I-tim': 42, 'I-org': 1}),
             'exile': Counter({'O': 37, 'I-tim': 1}),
             'Israel': Counter({'B-geo': 639,
                      'B-org': 30,
                      'I-geo': 6,
                      'B-gpe': 12}),
             '.': Counter({'O': 34395}),
             'Taiwanese': Counter({'B-gpe': 10}),
             'Defense': Counter({'B-org': 58,
                      'O': 56,
                      'I-org': 51,
                      'B-per': 4}),
             'Ministry': Counter({'I-org': 214, 'B-org': 11}),
             'spokesman': Counter({'O': 516}),
             'Wang': Counter({'B-per': 3, '

In [17]:
baseline_mapping = {word: counter.most_common(1)[0][0] for word, counter in word_tag_freq.items()}
baseline_mapping

{'He': 'O',
 'is': 'O',
 'said': 'O',
 'to': 'O',
 'be': 'O',
 'living': 'O',
 'in': 'O',
 'exile': 'O',
 'Israel': 'B-geo',
 '.': 'O',
 'Taiwanese': 'B-gpe',
 'Defense': 'B-org',
 'Ministry': 'I-org',
 'spokesman': 'O',
 'Wang': 'B-per',
 'Shih-chien': 'I-per',
 'says': 'O',
 'the': 'O',
 'white': 'O',
 'paper': 'O',
 'shows': 'O',
 'that': 'O',
 'China': 'B-geo',
 'emphasizing': 'O',
 'buildup': 'O',
 'of': 'O',
 'its': 'O',
 'air': 'O',
 'and': 'O',
 'amphibious': 'O',
 'forces': 'O',
 ',': 'O',
 'proving': 'O',
 'Beijing': 'B-geo',
 'targeting': 'O',
 'island': 'O',
 'Nooyi': 'B-per',
 'was': 'O',
 'born': 'O',
 'Madras': 'B-geo',
 'earned': 'O',
 'her': 'O',
 'undergraduate': 'O',
 'master': 'O',
 "'s": 'O',
 'degrees': 'O',
 'India': 'B-geo',
 'A': 'O',
 'Russian': 'B-gpe',
 'spacecraft': 'O',
 'carrying': 'O',
 'an': 'O',
 'American': 'B-gpe',
 'space': 'O',
 'tourist': 'O',
 'two': 'O',
 'cosmonauts': 'O',
 'has': 'O',
 'docked': 'O',
 'with': 'O',
 'International': 'B-org',
 '

In [21]:
unique_tags = set(baseline_mapping.values())
print("Unique Tags:", unique_tags)
print("Count of Unique Tags:", len(unique_tags))

Unique Tags: {'I-art', 'I-gpe', 'I-tim', 'B-eve', 'B-nat', 'B-art', 'B-gpe', 'I-nat', 'I-org', 'B-org', 'I-eve', 'B-per', 'I-per', 'B-tim', 'I-geo', 'B-geo', 'O'}
Count of Unique Tags: 17


In [22]:
from collections import Counter

tag_counts = Counter(baseline_mapping.values())
print("Tag Counts:")
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

Tag Counts:
O: 20111
B-geo: 1950
B-gpe: 258
B-org: 1406
I-org: 1391
B-per: 1615
I-per: 2573
I-geo: 380
B-tim: 496
I-tim: 114
I-gpe: 7
B-eve: 29
B-nat: 12
B-art: 91
I-art: 59
I-eve: 17
I-nat: 5


In [23]:
# tag	count
# O	20111
# art	150
# eve	46
# geo	2330
# gpe	265
# nat	17
# org	2797
# per	4188
# tim	610

In [18]:
def baseline_predict(sentence_words):
    """
    Predicts NER tags for a list of words using the frequency-based baseline.
    If a word is unseen, it returns 'O' (outside).
    """
    return [baseline_mapping.get(word, 'O') for word in sentence_words]

In [19]:
# Evaluate the baseline on the test set.
true_tags_baseline = []
pred_tags_baseline = []
for sentence in test:
    words = [w for w, t in sentence]
    true = [t for w, t in sentence]
    pred = baseline_predict(words)
    true_tags_baseline.append(true)
    pred_tags_baseline.append(pred)

In [20]:
print("Baseline Model Evaluation:")
print(classification_report(true_tags_baseline, pred_tags_baseline))

Baseline Model Evaluation:
              precision    recall  f1-score   support

         art       0.28      0.10      0.14        82
         eve       0.29      0.28      0.29        67
         geo       0.74      0.83      0.79      7674
         gpe       0.94      0.94      0.94      3151
         nat       0.33      0.38      0.35        37
         org       0.44      0.49      0.46      3990
         per       0.54      0.60      0.57      3397
         tim       0.72      0.68      0.70      4057

   micro avg       0.67      0.72      0.70     22455
   macro avg       0.53      0.54      0.53     22455
weighted avg       0.68      0.72      0.70     22455



Shortcomings of the Baseline Model
Context Ignorance: The baseline only uses the word itself and ignores its surrounding context, which is crucial for disambiguating tags.
Handling Unseen Words: Defaulting to O for unknown words can lead to a high number of false negatives.
No Sequence Modeling: The model does not take into account dependencies between tags (e.g., a tag following B-PER is more likely to be I-PER).
Limited to Surface Features: Does not utilize any additional features (such as capitalization, word shape, suffixes/prefixes) that could help in better prediction.

Shortcomings of the Baseline Model
Lack of Context:
The baseline model does not account for context. For instance, the word “Washington” could be a person (e.g., “George Washington”) or a location (e.g., “Washington, D.C.”). Context is essential to disambiguate such cases.

Inability to Capture Sequential Dependencies:
Named entities often span multiple words (e.g., “New York City”). The baseline model treats each word independently and may miss the boundary information encoded in the IOB scheme.

Poor Generalization:
If a word was not seen in the training set (Out-of-Vocabulary), the baseline defaults to a generic tag (often O), which can hurt performance.

No Learning of Semantic or Syntactic Features:
The model does not learn features such as word embeddings, capitalization patterns, or neighboring words that are important for robust NER.

In [None]:
# =============================================================================
# 4. Preparing Data for the BiLSTM-CRF Model (TensorFlow/Keras)
# =============================================================================

# Build vocabulary mappings for words and tags using the training data.
word2idx = {}  # Maps word to unique index (reserve 0 for padding)
tag2idx = {}   # Maps tag to unique index

for sentence in train:
    for word, tag in sentence:
        if word not in word2idx:
            # Reserve index 0 for padding; start indexing from 1.
            word2idx[word] = len(word2idx) + 1
        if tag not in tag2idx:
            tag2idx[tag] = len(tag2idx)

# Create the inverse mapping for tags (to decode predictions later).
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Helper function: Convert a sequence of items into indices.
def prepare_sequence(seq, mapping, unk_value=0):
    """
    Converts a sequence of tokens (words or tags) into a sequence of indices.
    If a token is not in the mapping, returns unk_value (default is 0).
    """
    return [mapping.get(token, unk_value) for token in seq]

# Prepare datasets (list of sentences) as lists of word indices and tag indices.
def create_dataset(data):
    word_sequences = []
    tag_sequences = []
    for sentence in data:
        words = [w for w, t in sentence]
        tags = [t for w, t in sentence]
        word_idx_seq = prepare_sequence(words, word2idx)
        tag_idx_seq = prepare_sequence(tags, tag2idx)
        word_sequences.append(word_idx_seq)
        tag_sequences.append(tag_idx_seq)
    return word_sequences, tag_sequences

train_words, train_tags = create_dataset(train)
val_words, val_tags = create_dataset(validation)

# Pad the sequences so that all sequences in a batch have the same length.
# For words, we use padding value 0 (which is reserved). For tags, we can also use 0.
max_len = max(max(len(s) for s in train_words), max(len(s) for s in val_words))

train_words_padded = pad_sequences(train_words, maxlen=max_len, padding='post', value=0)
train_tags_padded  = pad_sequences(train_tags,  maxlen=max_len, padding='post', value=0)

val_words_padded = pad_sequences(val_words, maxlen=max_len, padding='post', value=0)
val_tags_padded  = pad_sequences(val_tags,  maxlen=max_len, padding='post', value=0)

# =============================================================================
# 5. Building the BiLSTM-CRF Model using TensorFlow/Keras
# =============================================================================

# Define hyperparameters.
VOCAB_SIZE = len(word2idx) + 1  # +1 for the padding index (0)
TAGSET_SIZE = len(tag2idx)      # Number of distinct NER tags
EMBEDDING_DIM = 100
HIDDEN_DIM = 128

# Input layer: sequences of word indices.
input = Input(shape=(max_len,), dtype='int32', name='words_input')

# Embedding layer: maps word indices to dense vectors.
embedding = Embedding(input_dim=VOCAB_SIZE,
                      output_dim=EMBEDDING_DIM,
                      mask_zero=True,      # Important for variable sequence lengths
                      name='word_embedding')(input)

# BiLSTM layer: processes the embeddings in both forward and backward directions.
bilstm = Bidirectional(LSTM(units=HIDDEN_DIM // 2,
                            return_sequences=True))(embedding)

# Dense layer: projects the BiLSTM outputs to the tag space.
dense = Dense(TAGSET_SIZE)(bilstm)

In [None]:
# =============================================================================
# 4. Preparing Data for the BiLSTM-CRF Model (TensorFlow/Keras)
# =============================================================================

# Build vocabulary mappings for words and tags using the training data.
word2idx = {}  # Maps word to unique index (reserve 0 for padding)
tag2idx = {}   # Maps tag to unique index

for sentence in train:
    for word, tag in sentence:
        if word not in word2idx:
            # Reserve index 0 for padding; start indexing from 1.
            word2idx[word] = len(word2idx) + 1
        if tag not in tag2idx:
            tag2idx[tag] = len(tag2idx)

# Create the inverse mapping for tags (to decode predictions later).
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Helper function: Convert a sequence of items into indices.
def prepare_sequence(seq, mapping, unk_value=0):
    """
    Converts a sequence of tokens (words or tags) into a sequence of indices.
    If a token is not in the mapping, returns unk_value (default is 0).
    """
    return [mapping.get(token, unk_value) for token in seq]

# Prepare datasets (list of sentences) as lists of word indices and tag indices.
def create_dataset(data):
    word_sequences = []
    tag_sequences = []
    for sentence in data:
        words = [w for w, t in sentence]
        tags = [t for w, t in sentence]
        word_idx_seq = prepare_sequence(words, word2idx)
        tag_idx_seq = prepare_sequence(tags, tag2idx)
        word_sequences.append(word_idx_seq)
        tag_sequences.append(tag_idx_seq)
    return word_sequences, tag_sequences

train_words, train_tags = create_dataset(train)
val_words, val_tags = create_dataset(validation)

# Pad the sequences so that all sequences in a batch have the same length.
# For words, we use padding value 0 (which is reserved). For tags, we can also use 0.
max_len = max(max(len(s) for s in train_words), max(len(s) for s in val_words))

train_words_padded = pad_sequences(train_words, maxlen=max_len, padding='post', value=0)
train_tags_padded  = pad_sequences(train_tags,  maxlen=max_len, padding='post', value=0)

val_words_padded = pad_sequences(val_words, maxlen=max_len, padding='post', value=0)
val_tags_padded  = pad_sequences(val_tags,  maxlen=max_len, padding='post', value=0)

# =============================================================================
# 5. Building the BiLSTM-CRF Model using TensorFlow/Keras
# =============================================================================

# Define hyperparameters.
VOCAB_SIZE = len(word2idx) + 1  # +1 for the padding index (0)
TAGSET_SIZE = len(tag2idx)      # Number of distinct NER tags
EMBEDDING_DIM = 100
HIDDEN_DIM = 128

# Input layer: sequences of word indices.
input = Input(shape=(max_len,), dtype='int32', name='words_input')

# Embedding layer: maps word indices to dense vectors.
embedding = Embedding(input_dim=VOCAB_SIZE,
                      output_dim=EMBEDDING_DIM,
                      mask_zero=True,      # Important for variable sequence lengths
                      name='word_embedding')(input)

# BiLSTM layer: processes the embeddings in both forward and backward directions.
bilstm = Bidirectional(LSTM(units=HIDDEN_DIM // 2,
                            return_sequences=True))(embedding)

In [29]:
# Dense layer: projects the BiLSTM outputs to the tag space.
dense = Dense(TAGSET_SIZE)(bilstm)

# -----------------------------------------------------------------------------
# CRF Layer: Instead of returning decoded (int32) sequences, we set it to return 
# the unnormalized potentials (float32), which the CRF loss function expects.
# -----------------------------------------------------------------------------
crf = tfa.layers.CRF(TAGSET_SIZE, return_decoded_sequence=False)
potentials = crf(dense)  # This output has type float32.

TypeError: ('Keyword argument not understood:', 'return_decoded_sequence')

In [28]:
# Build the model. We use decoded_sequence as the output.
model = Model(inputs=input, outputs=decoded_sequence)

ValueError: too many values to unpack (expected 2)

In [27]:
# Compile the model.
# Use the CRF layer’s loss function and accuracy metric.
# model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])

# Print a summary of the model.
model.summary()

# =============================================================================
# 6. Training the BiLSTM-CRF Model
# =============================================================================

# Train the model using model.fit.
# We provide the padded training sequences and their corresponding tags.
history = model.fit(
    train_words_padded,              # Input: padded word sequences
    train_tags_padded,               # Output: padded tag sequences
    batch_size=16,
    epochs=5,
    validation_data=(val_words_padded, val_tags_padded)
)

# =============================================================================
# 7. Evaluation of the BiLSTM-CRF Model on the Validation Set
# =============================================================================

def decode_predictions(predictions, idx2tag):
    """
    Converts a 2D array of predicted indices into a list of tag sequences.
    Ignores padded zeros.
    """
    pred_tags = []
    for seq in predictions:
        # Convert indices to tag strings and filter out the padded indices (0)
        tags = [idx2tag.get(idx) for idx in seq if idx != 0]
        pred_tags.append(tags)
    return pred_tags

# Get predictions on the validation set.
val_preds = model.predict(val_words_padded)

# The model output (val_preds) is a 2D numpy array of shape (num_sentences, max_len)
# Decode the predicted indices into tag names.
pred_tags_val = decode_predictions(val_preds, idx2tag)

# Prepare the true tags (decode padded sequences into tag strings, ignoring padding).
true_tags_val = []
for seq in val_tags_padded:
    tags = [idx2tag.get(idx) for idx in seq if idx != 0]
    true_tags_val.append(tags)

# Print the evaluation report using seqeval.
print("BiLSTM-CRF Model Evaluation on Validation Set:")
print(classification_report(true_tags_val, pred_tags_val))

# =============================================================================
# End of Script
# =============================================================================

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 words_input (InputLayer)    [(None, 104)]             0         
                                                                 
 word_embedding (Embedding)  (None, 104, 100)          3051500   
                                                                 
 bidirectional (Bidirection  (None, 104, 128)          84480     
 al)                                                             
                                                                 
 dense (Dense)               (None, 104, 17)           2193      
                                                                 
 crf (CRF)                   [(None, 104),             629       
                              (None, 104, 17),                   
                              (None,),                           
                              (17, 17)]                      

TypeError: in user code:

    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\engine\compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\losses.py", line 143, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\losses.py", line 270, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\losses.py", line 2454, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "C:\Users\Abhinav\anaconda3\Lib\site-packages\keras\src\backend.py", line 5777, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    TypeError: Value passed to parameter 'features' has DataType int32 not in list of allowed values: float16, bfloat16, float32, float64
