In [1]:
import kagglehub
import os
import cupy as cp
import numpy as np 
import pandas as pd 
import ast
import spacy
from spacy.training.example import Example
from sklearn.model_selection import train_test_split
import random

In [None]:
#  spacy.require_gpu()


In [3]:
path = kagglehub.dataset_download("naseralqaydeh/named-entity-recognition-ner-corpus")
print("Files in dataset folder:")
print(os.listdir(path))


Files in dataset folder:
['ner.csv']


In [4]:
df = pd.read_csv(os.path.join(path, "ner.csv"))  # replace with actual filename
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [5]:
def convert_sentence_row_to_spacy_format(sentence, tags_raw):
    tags = ast.literal_eval(tags_raw)
    words = sentence.split()

    if len(words) != len(tags):
        return None  # or raise warning/log

    entities = []
    start = 0
    i = 0
    while i < len(words):
        word = words[i]
        tag = tags[i]
        word_start = sentence.find(word, start)
        word_end = word_start + len(word)

        if tag.startswith("B-"):
            ent_type = tag[2:]
            ent_start = word_start
            ent_end = word_end
            j = i + 1
            while j < len(tags) and tags[j] == f"I-{ent_type}":
                next_word = words[j]
                next_start = sentence.find(next_word, ent_end)
                ent_end = next_start + len(next_word)
                j += 1
            entities.append((ent_start, ent_end, ent_type))
            i = j
            start = ent_end
        else:
            i += 1
            start = word_end

    return (sentence, {"entities": entities})


In [6]:
TRAIN_DATA  = []
for _, row in df.iterrows():
    item = convert_sentence_row_to_spacy_format(row["Sentence"], row["Tag"])
    if item:
        TRAIN_DATA .append(item)

In [7]:
train_data, val_data = train_test_split(TRAIN_DATA, test_size=0.2, random_state=42)


In [8]:
nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")


In [9]:
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

In [10]:
optimizer = nlp.resume_training()


In [11]:
epochs = 10
batch_size = 8
for epoch in range(epochs):
    losses = {}
    random.shuffle(train_data)
    for i in range(0, len(train_data), batch_size):
        batch = train_data[i:i + batch_size]
        examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in batch]
        nlp.update(examples, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch+1}: Loss = {losses}")

Epoch 1: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 36772.81610529137}
Epoch 2: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 28590.879614144964}
Epoch 3: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 25736.831421646577}
Epoch 4: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 23576.559229979594}
Epoch 5: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 22179.320753644923}
Epoch 6: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 20820.874487552646}
Epoch 7: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 19541.59834955089}
Epoch 8: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 18526.303503396768}
Epoch 9: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 17484.77621702419}
Epoch 10: Loss = {'tok2vec': 0.0, 'tagger': 0.0, 'parser': 0.0, 'ner': 16859.61960660386}


In [12]:
print("\nValidation example outputs:")
for text, annotations in random.sample(val_data, 3):
    doc = nlp(text)
    print(f"\nText: {text}")
    print("Predicted entities:", [(ent.text, ent.label_) for ent in doc.ents])
    print("True entities:", annotations["entities"])


Validation example outputs:

Text: U.S. Secretary of State Condoleezza Rice , who is on a tour of the Middle East , met with Egyptian President Hosni Mubarak Wednesday .
Predicted entities: [('U.S.', 'org'), ('State', 'org'), ('Condoleezza Rice', 'per'), ('Middle East', 'geo'), ('Egyptian', 'gpe'), ('President Hosni Mubarak', 'per'), ('Wednesday', 'tim')]
True entities: [(0, 4, 'org'), (24, 40, 'per'), (67, 78, 'geo'), (90, 98, 'gpe'), (99, 122, 'per'), (123, 132, 'tim')]

Text: President Maumoon Abdul GAYOOM dominated the islands ' political scene for 30 years , elected to six successive terms by single-party referendums .
Predicted entities: [('President Maumoon Abdul GAYOOM', 'per'), ('30', 'tim')]
True entities: [(0, 30, 'per'), (75, 77, 'tim')]

Text: Ms. Rice noted that President Bush 's Emergency Plan for AIDS Relief is the largest international initiative ever undertaken by a single nation to combat a disease .
Predicted entities: [('Ms. Rice', 'per'), ('President Bush', 'per'