In [1]:
from charset_normalizer import detect

# Sniff the file's encoding from its first 50,000 bytes rather than reading it whole.
with open('ner_dataset.csv', 'rb') as raw_file:
    head_bytes = raw_file.read(50000)
    result = detect(head_bytes)
    print(result)                          # Suggested encoding, language and confidence

{'encoding': 'windows-1250', 'language': 'English', 'confidence': 0.9926}


In [2]:
# Import Libraries

import pickle
import pandas as pd                                                      # For data loading and manipulation
import numpy as np                                                       # For numerical operations
import sklearn_crfsuite
from sklearn_crfsuite import CRF                                         # scikit-learn-compatible CRF estimator
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.model_selection import train_test_split                     # To split the data into train/val/test sets
from collections import defaultdict, Counter                             # For building a frequency-based baseline
from sklearn.metrics import accuracy_score
from seqeval.metrics import classification_report

In [3]:
# Step 1: Load and Preprocess Data

# Read the CSV using windows-1250, the encoding reported by the charset
# detection cell, then preview the first rows.
df = pd.read_csv("ner_dataset.csv", encoding="windows-1250")
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
# Fill missing sentence numbers with forward fill.
# Only the first row of each sentence carries its "Sentence: N" label, so the
# label is propagated down to the following rows of the same sentence.
# Series.ffill() is used instead of fillna(method='ffill'), which pandas has deprecated.
df['Sentence #'] = df['Sentence #'].ffill()
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
print(df.shape)           # (rows, columns) of the dataframe
df.isnull().sum()         # count of null values in each column

(1048575, 4)


Sentence #    0
Word          0
POS           0
Tag           0
dtype: int64

In [6]:
df.T                      # Transposed view: one row per original column, one column per original row

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,...,Sentence: 47958,Sentence: 47958,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [7]:
# Group words and their tags by sentence
def _word_tag_pairs(sentence_rows):
    """Return the rows of one sentence as a list of (word, tag) tuples."""
    words = sentence_rows['Word'].values.tolist()
    tags = sentence_rows['Tag'].values.tolist()
    return list(zip(words, tags))


sentences = df.groupby('Sentence #').apply(_word_tag_pairs).tolist()

print(sentences[:3])                         # each sentence is a list of (word, tag) tuples

[[('Thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('London', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('Iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('British', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')], [('Iranian', 'B-gpe'), ('officials', 'O'), ('say', 'O'), ('they', 'O'), ('expect', 'O'), ('to', 'O'), ('get', 'O'), ('access', 'O'), ('to', 'O'), ('sealed', 'O'), ('sensitive', 'O'), ('parts', 'O'), ('of', 'O'), ('the', 'O'), ('plant', 'O'), ('Wednesday', 'B-tim'), (',', 'O'), ('after', 'O'), ('an', 'O'), ('IAEA', 'B-org'), ('surveillance', 'O'), ('system', 'O'), ('begins', 'O'), ('functioning', 'O'), ('.', 'O')], [('Helicopter', 'O'), ('gunships', 'O'), ('Saturday', 'B-tim'), ('pounded', 'O'), ('militant', 'O'), ('hideouts', 'O'), ('in', 'O'), ('the', 'O'), ('Orakzai', 'B-geo'), ('tribal', '

In [8]:
df['Sentence #'].nunique(), df.Word.nunique(), df.POS.nunique(), df.Tag.nunique()        # distinct sentences, words, POS tags and NER tags

(47959, 35178, 42, 17)

In [9]:
df.Tag.value_counts()                 # number of rows carrying each Tag value

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

In [10]:
# 2. Splitting the Dataset

# First, split into train+validation and test (test is 20% of the sentences).
# The split operates on whole sentences, so a sentence's tokens never straddle two sets.
train_val_set, test_set = train_test_split(sentences, test_size=0.2, random_state=25)

# Then, split train+validation into training and validation sets
# (validation is 20% of train+validation; fixed random_state for reproducibility).
train_set, val_set = train_test_split(train_val_set, test_size=0.2, random_state=25)

In [11]:
len(train_val_set), len(train_set), len(val_set), len(test_set)           # number of sentences in each split

(38367, 30693, 7674, 9592)

In [12]:
# 3. Baseline Model: Frequency-based Lookup

# For every word in the train+validation sentences, count how often it occurs with each NER tag.
word_tag_freq = defaultdict(Counter)
for token, ner_tag in (pair for sent in train_val_set for pair in sent):
    word_tag_freq[token][ner_tag] += 1

word_tag_freq

defaultdict(collections.Counter,
            {'Some': Counter({'O': 172}),
             '80': Counter({'O': 90}),
             'percent': Counter({'O': 525}),
             'of': Counter({'O': 20064,
                      'I-org': 396,
                      'B-org': 20,
                      'I-tim': 309,
                      'I-geo': 142,
                      'I-per': 13,
                      'B-tim': 78,
                      'B-geo': 24,
                      'I-eve': 2,
                      'I-gpe': 2,
                      'I-art': 4}),
             'the': Counter({'O': 41873,
                      'B-geo': 167,
                      'B-tim': 10,
                      'I-geo': 4,
                      'I-org': 6,
                      'I-tim': 3}),
             'world': Counter({'O': 605}),
             "'s": Counter({'O': 8422,
                      'I-org': 248,
                      'B-org': 25,
                      'I-eve': 1,
                      'B-tim': 18,
           

In [13]:
# Map every word to the tag it was seen with most often
baseline_mapping = dict(
    (token, tag_freqs.most_common(1)[0][0])
    for token, tag_freqs in word_tag_freq.items()
)
baseline_mapping

{'Some': 'O',
 '80': 'O',
 'percent': 'O',
 'of': 'O',
 'the': 'O',
 'world': 'O',
 "'s": 'O',
 'rough': 'O',
 'diamonds': 'O',
 'are': 'O',
 'bought': 'O',
 'and': 'O',
 'sold': 'O',
 'in': 'O',
 'Antwerp': 'B-org',
 '.': 'O',
 'At': 'O',
 'time': 'O',
 'assault': 'O',
 ',': 'O',
 'police': 'O',
 'said': 'O',
 'a': 'O',
 'gunman': 'O',
 'seized': 'O',
 'rifle': 'O',
 'from': 'O',
 'an': 'O',
 'Israeli': 'B-gpe',
 'security': 'O',
 'officer': 'O',
 'at': 'O',
 'work': 'O',
 'zone': 'O',
 'then': 'O',
 'shot': 'O',
 'killed': 'O',
 'two': 'O',
 'Palestinians': 'B-gpe',
 'he': 'O',
 'had': 'O',
 'driven': 'O',
 'to': 'O',
 'job': 'O',
 'site': 'O',
 'Police': 'O',
 'believe': 'O',
 'men': 'O',
 'rode': 'O',
 'train': 'O',
 'London': 'B-geo',
 '(': 'O',
 'King': 'B-per',
 'Cross': 'I-org',
 'station': 'O',
 ')': 'O',
 'split': 'O',
 'up': 'O',
 'carry': 'O',
 'out': 'O',
 'bombings': 'O',
 'on': 'O',
 'three': 'O',
 'subway': 'O',
 'trains': 'O',
 'one': 'O',
 'bus': 'O',
 'Mr.': 'B-per',

In [14]:
# Count how many distinct words the baseline lookup assigns to each tag.
# (This is a count over baseline_mapping's values, i.e. per word, not per token of the dataset.)
# Counter is already imported in the notebook's import cell, so it is not re-imported here.
tag_counts = Counter(baseline_mapping.values())
print("Tag Counts:")
for tag, count in tag_counts.items():
    print(f"{tag}: {count}")

Tag Counts:
O: 20988
B-org: 1467
B-gpe: 264
B-geo: 2038
B-per: 1695
I-org: 1489
I-per: 2740
B-tim: 527
B-eve: 24
I-geo: 389
I-tim: 110
B-art: 102
B-nat: 13
I-gpe: 8
I-eve: 16
I-art: 62
I-nat: 6


In [15]:
def baseline_predict(sentence_words, mapping=None, default_tag='O'):
    """
    Predicts NER tags for a list of words using the frequency-based baseline.

    Parameters
    ----------
    sentence_words : iterable of str
        The words of one sentence, in order.
    mapping : dict, optional
        Word -> tag lookup table. When omitted, the notebook-level
        ``baseline_mapping`` is used, which keeps existing calls working.
    default_tag : str, optional
        Tag returned for a word missing from the lookup; defaults to 'O' (outside).

    Returns
    -------
    list of str
        One predicted tag per input word, in the same order.
    """
    if mapping is None:
        # Resolved at call time so the function follows the current notebook-level mapping.
        mapping = baseline_mapping
    return [mapping.get(word, default_tag) for word in sentence_words]

In [16]:
# 4. Evaluate the baseline on the test set.
# Gold tags and baseline predictions are kept per sentence (one inner list per test sentence).
true_tags_baseline = [
    [tag for _word, tag in sentence]
    for sentence in test_set
]
pred_tags_baseline = [
    baseline_predict([word for word, _tag in sentence])
    for sentence in test_set
]

In [17]:
print("Baseline Model Evaluation:")
# Per-tag precision / recall / F1 over the tokens of the test sentences.
# zero_division=0 -> report 0 when a label has no predicted samples.
print(flat_classification_report(true_tags_baseline, pred_tags_baseline, zero_division=0))

Baseline Model Evaluation:
              precision    recall  f1-score   support

       B-art       0.38      0.13      0.20        82
       B-eve       0.46      0.31      0.37        67
       B-geo       0.79      0.85      0.82      7674
       B-gpe       0.94      0.94      0.94      3151
       B-nat       0.36      0.41      0.38        37
       B-org       0.67      0.52      0.59      3990
       B-per       0.75      0.68      0.71      3397
       B-tim       0.86      0.78      0.82      4057
       I-art       0.00      0.00      0.00        56
       I-eve       0.42      0.16      0.23        51
       I-geo       0.73      0.56      0.63      1559
       I-gpe       0.56      0.56      0.56        34
       I-nat       0.00      0.00      0.00         5
       I-org       0.68      0.55      0.61      3265
       I-per       0.73      0.67      0.70      3449
       I-tim       0.63      0.15      0.24      1310
           O       0.98      0.99      0.98    177230


In [18]:
# 5. Flatten the per-sentence label lists and check accuracy
y_true_flat_bl = []
for sentence_tags in true_tags_baseline:
    y_true_flat_bl.extend(sentence_tags)

y_pred_flat_bl = []
for sentence_tags in pred_tags_baseline:
    y_pred_flat_bl.extend(sentence_tags)

# Token-level accuracy of the baseline model (every tag, 'O' included)
accuracy = accuracy_score(y_true_flat_bl, y_pred_flat_bl)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.95


In [19]:
# Check accuracy and Classification Report without label 'O'
# Only positions whose TRUE tag is not 'O' are kept; the prediction at such a
# position is kept as-is (it may still be 'O').
filtered_true = []
filtered_pred = []

for gold_seq, guess_seq in zip(true_tags_baseline, pred_tags_baseline):
    kept_pairs = [(gold, guess) for gold, guess in zip(gold_seq, guess_seq) if gold != 'O']
    if not kept_pairs:
        continue  # Sentence has no entity tokens: drop it entirely
    filtered_true.append([gold for gold, _guess in kept_pairs])
    filtered_pred.append([guess for _gold, guess in kept_pairs])

# Generate the classification report
print(flat_classification_report(filtered_true, filtered_pred, zero_division=0))

              precision    recall  f1-score   support

       B-art       0.41      0.13      0.20        82
       B-eve       0.54      0.31      0.40        67
       B-geo       0.80      0.85      0.82      7674
       B-gpe       0.95      0.94      0.94      3151
       B-nat       0.56      0.41      0.47        37
       B-org       0.73      0.52      0.61      3990
       B-per       0.77      0.68      0.72      3397
       B-tim       0.88      0.78      0.83      4057
       I-art       0.00      0.00      0.00        56
       I-eve       0.42      0.16      0.23        51
       I-geo       0.74      0.56      0.64      1559
       I-gpe       0.56      0.56      0.56        34
       I-nat       0.00      0.00      0.00         5
       I-org       0.76      0.55      0.64      3265
       I-per       0.74      0.67      0.70      3449
       I-tim       0.80      0.15      0.25      1310
           O       0.00      0.00      0.00         0

    accuracy              

In [20]:
# Flatten the 'O'-filtered label lists and check accuracy.
# NOTE: this rebinds y_true_flat_bl / y_pred_flat_bl and accuracy, replacing the
# values computed from the unfiltered baseline evaluation.
y_true_flat_bl = [item for sublist in filtered_true for item in sublist]
y_pred_flat_bl = [item for sublist in filtered_pred for item in sublist]

# Accuracy of the baseline model restricted to tokens whose true tag is not 'O'
accuracy = accuracy_score(y_true_flat_bl, y_pred_flat_bl)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.69


In [30]:
# Weighted F1 score of the baseline, computed on the per-sentence
# filtered_true / filtered_pred lists (tokens whose true tag is not 'O').
f1_score = metrics.flat_f1_score(filtered_true, filtered_pred, average='weighted')
print("F1 Score: {:.2f}".format(f1_score))

F1 Score: 0.73


# Advanced Model

In [21]:
# Feature extraction function
def word2features(sent, i):
    """
    Build the CRF feature dict for the i-th token of a sentence.

    Parameters
    ----------
    sent : list of (word, tag) tuples
        One sentence; only the word (element 0 of each tuple) is read.
    i : int
        Position of the token to describe.

    Returns
    -------
    dict
        Features of the token itself (lower-cased form, first/last three
        characters, upper/title/digit flags, constant bias), plus the
        lower-cased form and title/upper flags of the previous and next
        words. 'BOS' is set when there is no previous word and 'EOS' when
        there is no next word.
    """
    token = sent[i][0]
    features = {
        'bias': 1.0,                          # constant feature, acts as an intercept
        'word.lower()': token.lower(),
        'prefix[:3]': token[:3],              # first 3 characters
        'suffix[-3:]': token[-3:],            # last 3 characters
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context: previous word, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        features['-1:word.lower()'] = prev_token.lower()
        features['-1:word.istitle()'] = prev_token.istitle()
        features['-1:word.isupper()'] = prev_token.isupper()

    # Right context: next word, or an end-of-sentence marker.
    if i >= len(sent) - 1:
        features['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        features['+1:word.lower()'] = next_token.lower()
        features['+1:word.istitle()'] = next_token.istitle()
        features['+1:word.isupper()'] = next_token.isupper()

    return features

In [22]:
# Convert sentences into features
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in sentence order."""
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features

def sent2labels(sent):
    """Return the tag of every (token, tag) pair in ``sent``, in sentence order."""
    labels_in_order = []
    for _token, label in sent:
        labels_in_order.append(label)
    return labels_in_order

# Feature dicts (X) and tag sequences (y) for every sentence of the train+validation split
X = [sent2features(s) for s in train_val_set]
y = [sent2labels(s) for s in train_val_set]

In [23]:
train_val_set[0]          # first train+validation sentence as (word, tag) pairs

[('Some', 'O'),
 ('80', 'O'),
 ('percent', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('world', 'O'),
 ("'s", 'O'),
 ('rough', 'O'),
 ('diamonds', 'O'),
 ('are', 'O'),
 ('bought', 'O'),
 ('and', 'O'),
 ('sold', 'O'),
 ('in', 'O'),
 ('Antwerp', 'B-org'),
 ('.', 'O')]

In [24]:
X[0]                      # feature dicts of the first train+validation sentence, one per token

[{'bias': 1.0,
  'word.lower()': 'some',
  'prefix[:3]': 'Som',
  'suffix[-3:]': 'ome',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': '80',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': '80',
  'prefix[:3]': '80',
  'suffix[-3:]': '80',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': True,
  '-1:word.lower()': 'some',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'percent',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'percent',
  'prefix[:3]': 'per',
  'suffix[-3:]': 'ent',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  '-1:word.lower()': '80',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '+1:word.lower()': 'of',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'of'

In [25]:
print(y[0])               # tag sequence of the first train+validation sentence

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-org', 'O']


In [26]:
# Split the train+validation features/labels into training and validation parts
# (validation is 20% of the sentences; the held-out test_set is not involved here).
# NOTE(review): same test_size and random_state as the earlier train_set/val_set
# split of train_val_set — presumably the same partition; confirm if they must agree.

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=25)

In [27]:
# Hyperparameters for the CRF tagger
crf_params = {
    'algorithm': 'lbfgs',                 # Optimization algorithm
    'c1': 0.1,                            # Coefficient for L1 regularization
    'c2': 0.1,                            # Coefficient for L2 regularization
    'max_iterations': 100,                # Maximum number of iterations for the optimizer
    'all_possible_transitions': False,    # Only consider transitions present in the training data
}
crf = CRF(**crf_params)

crf.fit(X_train, y_train)

In [47]:
# Evaluate the CRF on the validation split. 'O' is removed from the label list so
# the report and the weighted F1 cover entity tags only.
labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_val)

# Classification Report
print(flat_classification_report(y_val, y_pred, labels=labels, zero_division=0))

# F1 Score (weighted over the non-'O' labels)
f1_score = metrics.flat_f1_score(y_val, y_pred, average='weighted', labels=labels)
print("F1 Score: {:.2f}".format(f1_score))

# Flatten the lists of true and predicted labels
y_val_true_flat = [item for sublist in y_val for item in sublist]
y_val_pred_flat = [item for sublist in y_pred for item in sublist]

# Filter out 'O' tags from both true and predicted labels.
# Positions are selected by the TRUE tag only: a prediction of 'O' at a kept
# position stays in filtered_pred and counts as an error.
# NOTE: filtered_true / filtered_pred are flat lists here, unlike the
# per-sentence lists built in the baseline filtering cell.
filtered_true = [true_label for true_label, pred_label in zip(y_val_true_flat, y_val_pred_flat) if true_label != 'O']
filtered_pred = [pred_label for true_label, pred_label in zip(y_val_true_flat, y_val_pred_flat) if true_label != 'O']

# Calculate the accuracy excluding 'O' labels
accuracy = accuracy_score(filtered_true, filtered_pred)
print("Accuracy: {:.2f}".format(accuracy))

              precision    recall  f1-score   support

       B-geo       0.86      0.91      0.88      6040
       B-per       0.84      0.83      0.83      2661
       I-per       0.85      0.89      0.87      2732
       B-tim       0.92      0.87      0.90      3234
       B-gpe       0.96      0.94      0.95      2491
       B-org       0.80      0.72      0.76      3232
       I-org       0.81      0.78      0.79      2760
       I-tim       0.85      0.73      0.79      1086
       B-eve       0.48      0.29      0.36        49
       I-eve       0.31      0.22      0.26        41
       I-geo       0.78      0.79      0.79      1142
       B-art       0.33      0.11      0.16        75
       I-art       0.15      0.03      0.05        60
       I-gpe       0.87      0.68      0.76        40
       B-nat       0.80      0.50      0.62        32
       I-nat       0.80      0.44      0.57         9

   micro avg       0.85      0.84      0.85     25684
   macro avg       0.71   

In [41]:
# Flatten the per-sentence validation labels into token-level lists
y_true_flat = []
for sentence_tags in y_val:
    y_true_flat.extend(sentence_tags)

y_pred_flat = []
for sentence_tags in y_pred:
    y_pred_flat.extend(sentence_tags)

# Token-level accuracy over every tag, 'O' included
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.97


In [33]:
# Persist the trained CRF, then read it back to confirm the round trip.
# (pickle is already imported in the notebook's import cell, so it is not re-imported here.)

# Save model to disk
with open("ner_crf_model.pkl", "wb") as model_file:
    pickle.dump(crf, model_file)

# Load model for later use.
# NOTE: pickle.load can execute arbitrary code while deserializing; only unpickle
# files you created yourself or otherwise trust (here, the file written just above).
with open("ner_crf_model.pkl", "rb") as model_file:
    loaded_crf = pickle.load(model_file)

print("\nModel successfully saved and loaded!")


Model successfully saved and loaded!


# Checking Model on Test Dataset

In [48]:
# Extract features and labels for the held-out test set, using the same
# sent2features / sent2labels helpers as for the training data
X_test = [sent2features(s) for s in test_set]
y_test = [sent2labels(s) for s in test_set]

In [49]:
# Evaluate the reloaded (unpickled) CRF on the test set. 'O' is removed from the
# label list so the report and the weighted F1 cover entity tags only.
labels = list(loaded_crf.classes_)
labels.remove('O')

# NOTE: y_pred is rebound here and now holds test-set predictions.
y_pred = loaded_crf.predict(X_test)

# Classification Report
print(flat_classification_report(y_test, y_pred, labels=labels, zero_division=0))

# F1 Score (weighted over the non-'O' labels)
f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print("F1 Score: {:.2f}".format(f1_score))

              precision    recall  f1-score   support

       B-geo       0.86      0.90      0.88      7674
       B-per       0.83      0.82      0.82      3397
       I-per       0.84      0.89      0.86      3449
       B-tim       0.93      0.88      0.90      4057
       B-gpe       0.96      0.95      0.95      3151
       B-org       0.79      0.72      0.75      3990
       I-org       0.80      0.79      0.79      3265
       I-tim       0.86      0.74      0.79      1310
       B-eve       0.62      0.52      0.57        67
       I-eve       0.51      0.41      0.46        51
       I-geo       0.84      0.78      0.81      1559
       B-art       0.38      0.12      0.19        82
       I-art       0.08      0.02      0.03        56
       I-gpe       0.73      0.65      0.69        34
       B-nat       0.71      0.46      0.56        37
       I-nat       0.80      0.80      0.80         5

   micro avg       0.85      0.84      0.85     32184
   macro avg       0.72   

In [50]:
# Flatten the per-sentence test labels into token-level lists
y_true_flat = []
for sentence_tags in y_test:
    y_true_flat.extend(sentence_tags)

y_pred_flat = []
for sentence_tags in y_pred:
    y_pred_flat.extend(sentence_tags)

# Token-level accuracy over every tag, 'O' included
accuracy = accuracy_score(y_true_flat, y_pred_flat)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.97


In [51]:
# Flatten the lists of true and predicted labels.
# NOTE: despite the "val" in their names, these variables hold TEST-set labels
# here (they are built from y_test and the current y_pred).
y_val_true_flat = [item for sublist in y_test for item in sublist]
y_val_pred_flat = [item for sublist in y_pred for item in sublist]

# Filter out 'O' tags from both true and predicted labels.
# Positions are selected by the TRUE tag only: a prediction of 'O' at a kept
# position stays in filtered_pred and counts as an error.
filtered_true = [true_label for true_label, pred_label in zip(y_val_true_flat, y_val_pred_flat) if true_label != 'O']
filtered_pred = [pred_label for true_label, pred_label in zip(y_val_true_flat, y_val_pred_flat) if true_label != 'O']

# Calculate the accuracy excluding 'O' labels
accuracy = accuracy_score(filtered_true, filtered_pred)
print("Accuracy: {:.2f}".format(accuracy))

Accuracy: 0.84
